{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss_13": 11.519832849502563, "ce_loss_26": 11.473536491394043, "ce_loss_39": 11.263565063476562, "ce_loss_52": 1.3852829337120056, "ce_loss_7": 11.56409740447998, "epoch": 0.0001, "grad_norm": 22.293954988093517, "kl_loss_13": 20864.0, "kl_loss_26": 20736.0, "kl_loss_39": 20320.0, "kl_loss_7": 20992.0, "learning_rate": 1e-05, "loss": 41440.0, "step": 1 }, { "ce_loss_13": 11.513921552234226, "ce_loss_26": 11.469077931510078, "ce_loss_39": 11.246019893222385, "ce_loss_52": 1.4558950497044458, "ce_loss_7": 11.55848307079739, "epoch": 0.001, "grad_norm": 23.161174410395507, "kl_loss_13": 20800.0, "kl_loss_26": 20696.88888888889, "kl_loss_39": 20227.555555555555, "kl_loss_7": 20881.777777777777, "learning_rate": 0.0001, "loss": 41384.0, "step": 10 }, { "ce_loss_13": 11.426759386062622, "ce_loss_26": 11.41293363571167, "ce_loss_39": 11.229010510444642, "ce_loss_52": 1.4324860751628876, "ce_loss_7": 11.462353825569153, "epoch": 0.002, "grad_norm": 38.50857397395853, "kl_loss_13": 20668.8, "kl_loss_26": 20640.0, "kl_loss_39": 20256.0, "kl_loss_7": 20745.6, "learning_rate": 0.0002, "loss": 41179.2, "step": 20 }, { "ce_loss_13": 10.954976797103882, "ce_loss_26": 11.087984251976014, "ce_loss_39": 11.10265805721283, "ce_loss_52": 1.4276391446590424, "ce_loss_7": 10.940680837631225, "epoch": 0.003, "grad_norm": 58.08626567131467, "kl_loss_13": 19702.4, "kl_loss_26": 19977.6, "kl_loss_39": 20028.8, "kl_loss_7": 19680.0, "learning_rate": 0.0003, "loss": 39668.0, "step": 30 }, { "ce_loss_13": 10.308717799186706, "ce_loss_26": 10.375125074386597, "ce_loss_39": 10.562542247772218, "ce_loss_52": 1.455844309926033, "ce_loss_7": 10.312761902809143, "epoch": 0.004, "grad_norm": 30.451988937114738, "kl_loss_13": 18307.2, "kl_loss_26": 18438.4, "kl_loss_39": 18832.0, "kl_loss_7": 18313.6, "learning_rate": 0.0004, "loss": 36999.2, "step": 40 }, { "ce_loss_13": 10.173566651344299, "ce_loss_26": 10.188505339622498, "ce_loss_39": 10.173172044754029, "ce_loss_52": 1.4577810317277908, "ce_loss_7": 10.182775902748109, "epoch": 0.005, "grad_norm": 37.851028798241174, "kl_loss_13": 18006.4, "kl_loss_26": 18028.8, "kl_loss_39": 18012.8, "kl_loss_7": 18022.4, "learning_rate": 0.0005, "loss": 36191.2, "step": 50 }, { "ce_loss_13": 10.071360087394714, "ce_loss_26": 10.092596936225892, "ce_loss_39": 10.072972583770753, "ce_loss_52": 1.428243064880371, "ce_loss_7": 10.105072927474975, "epoch": 0.006, "grad_norm": 45.98715921867029, "kl_loss_13": 17872.0, "kl_loss_26": 17907.2, "kl_loss_39": 17878.4, "kl_loss_7": 17936.0, "learning_rate": 0.0006, "loss": 35728.0, "step": 60 }, { "ce_loss_13": 9.994266033172607, "ce_loss_26": 10.000447010993957, "ce_loss_39": 9.958984637260437, "ce_loss_52": 1.392430166900158, "ce_loss_7": 10.041595196723938, "epoch": 0.007, "grad_norm": 53.56384816487026, "kl_loss_13": 17750.4, "kl_loss_26": 17763.2, "kl_loss_39": 17667.2, "kl_loss_7": 17846.4, "learning_rate": 0.0007, "loss": 35411.2, "step": 70 }, { "ce_loss_13": 9.870160865783692, "ce_loss_26": 9.870406699180602, "ce_loss_39": 9.815650677680969, "ce_loss_52": 1.4188331544399262, "ce_loss_7": 9.92598659992218, "epoch": 0.008, "grad_norm": 58.363906192597035, "kl_loss_13": 17475.2, "kl_loss_26": 17484.8, "kl_loss_39": 17366.4, "kl_loss_7": 17587.2, "learning_rate": 0.0008, "loss": 35010.4, "step": 80 }, { "ce_loss_13": 9.786613607406617, "ce_loss_26": 9.77514407634735, "ce_loss_39": 9.697125172615051, "ce_loss_52": 1.4261163920164108, "ce_loss_7": 9.84011538028717, "epoch": 0.009, "grad_norm": 57.597510936184484, "kl_loss_13": 17267.2, "kl_loss_26": 17232.0, "kl_loss_39": 17065.6, "kl_loss_7": 17376.0, "learning_rate": 0.0009000000000000001, "loss": 34545.6, "step": 90 }, { "ce_loss_13": 9.70496118068695, "ce_loss_26": 9.680623888969421, "ce_loss_39": 9.57672963142395, "ce_loss_52": 1.4332450866699218, "ce_loss_7": 9.75693221092224, "epoch": 0.01, "grad_norm": 56.92102830978135, "kl_loss_13": 17075.2, "kl_loss_26": 17030.4, "kl_loss_39": 16814.4, "kl_loss_7": 17187.2, "learning_rate": 0.001, "loss": 34141.6, "step": 100 }, { "ce_loss_13": 9.62608094215393, "ce_loss_26": 9.584086346626282, "ce_loss_39": 9.469242024421693, "ce_loss_52": 1.4119121626019477, "ce_loss_7": 9.681734418869018, "epoch": 0.011, "grad_norm": 55.54751251170693, "kl_loss_13": 16956.8, "kl_loss_26": 16856.0, "kl_loss_39": 16632.0, "kl_loss_7": 17075.2, "learning_rate": 0.0009999974825027757, "loss": 33673.2, "step": 110 }, { "ce_loss_13": 9.557737636566163, "ce_loss_26": 9.502408647537232, "ce_loss_39": 9.373179388046264, "ce_loss_52": 1.420964427292347, "ce_loss_7": 9.613711476325989, "epoch": 0.012, "grad_norm": 55.463113229450904, "kl_loss_13": 16777.6, "kl_loss_26": 16667.2, "kl_loss_39": 16393.6, "kl_loss_7": 16905.6, "learning_rate": 0.0009999899300364532, "loss": 33335.2, "step": 120 }, { "ce_loss_13": 9.474759387969971, "ce_loss_26": 9.408789944648742, "ce_loss_39": 9.266374969482422, "ce_loss_52": 1.4124270409345627, "ce_loss_7": 9.538655042648315, "epoch": 0.013, "grad_norm": 54.241543470395335, "kl_loss_13": 16628.8, "kl_loss_26": 16492.8, "kl_loss_39": 16193.6, "kl_loss_7": 16766.4, "learning_rate": 0.0009999773426770863, "loss": 32999.6, "step": 130 }, { "ce_loss_13": 9.420424246788025, "ce_loss_26": 9.348571801185608, "ce_loss_39": 9.1972074508667, "ce_loss_52": 1.4392782002687454, "ce_loss_7": 9.492741465568542, "epoch": 0.014, "grad_norm": 54.10933362933205, "kl_loss_13": 16476.8, "kl_loss_26": 16324.8, "kl_loss_39": 16014.4, "kl_loss_7": 16638.4, "learning_rate": 0.0009999597205514296, "loss": 32751.6, "step": 140 }, { "ce_loss_13": 9.388373732566833, "ce_loss_26": 9.308008575439453, "ce_loss_39": 9.153336524963379, "ce_loss_52": 1.4420859813690186, "ce_loss_7": 9.46445541381836, "epoch": 0.015, "grad_norm": 55.15236350542743, "kl_loss_13": 16382.4, "kl_loss_26": 16219.2, "kl_loss_39": 15888.0, "kl_loss_7": 16542.4, "learning_rate": 0.0009999370638369377, "loss": 32525.2, "step": 150 }, { "ce_loss_13": 9.301919007301331, "ce_loss_26": 9.212661600112915, "ce_loss_39": 9.050238633155823, "ce_loss_52": 1.4233157366514206, "ce_loss_7": 9.383031058311463, "epoch": 0.016, "grad_norm": 55.03653973388566, "kl_loss_13": 16278.4, "kl_loss_26": 16092.8, "kl_loss_39": 15755.2, "kl_loss_7": 16440.0, "learning_rate": 0.000999909372761763, "loss": 32209.6, "step": 160 }, { "ce_loss_13": 9.24642186164856, "ce_loss_26": 9.14689018726349, "ce_loss_39": 8.978599190711975, "ce_loss_52": 1.429112258553505, "ce_loss_7": 9.331455826759338, "epoch": 0.017, "grad_norm": 54.90625528920335, "kl_loss_13": 16142.4, "kl_loss_26": 15931.2, "kl_loss_39": 15580.8, "kl_loss_7": 16315.2, "learning_rate": 0.0009998766476047546, "loss": 31964.8, "step": 170 }, { "ce_loss_13": 9.187117385864259, "ce_loss_26": 9.076116013526917, "ce_loss_39": 8.902227759361267, "ce_loss_52": 1.3885775536298752, "ce_loss_7": 9.275272035598755, "epoch": 0.018, "grad_norm": 54.60426962776646, "kl_loss_13": 16072.0, "kl_loss_26": 15844.8, "kl_loss_39": 15480.0, "kl_loss_7": 16262.4, "learning_rate": 0.0009998388886954545, "loss": 31645.2, "step": 180 }, { "ce_loss_13": 9.131648278236389, "ce_loss_26": 9.008217167854308, "ce_loss_39": 8.831042790412903, "ce_loss_52": 1.4482133895158769, "ce_loss_7": 9.224077129364014, "epoch": 0.019, "grad_norm": 53.93299711922953, "kl_loss_13": 15870.4, "kl_loss_26": 15609.6, "kl_loss_39": 15232.0, "kl_loss_7": 16067.2, "learning_rate": 0.0009997960964140947, "loss": 31408.4, "step": 190 }, { "ce_loss_13": 9.050732731819153, "ce_loss_26": 8.918284726142883, "ce_loss_39": 8.738019919395446, "ce_loss_52": 1.4300477087497712, "ce_loss_7": 9.145042276382446, "epoch": 0.02, "grad_norm": 53.732589384741736, "kl_loss_13": 15728.0, "kl_loss_26": 15449.6, "kl_loss_39": 15064.0, "kl_loss_7": 15928.0, "learning_rate": 0.0009997482711915926, "loss": 31145.6, "step": 200 }, { "ce_loss_13": 8.988229060173035, "ce_loss_26": 8.844111323356628, "ce_loss_39": 8.654908394813537, "ce_loss_52": 1.4580651924014092, "ce_loss_7": 9.090379095077514, "epoch": 0.021, "grad_norm": 53.04072542826613, "kl_loss_13": 15550.4, "kl_loss_26": 15251.2, "kl_loss_39": 14854.4, "kl_loss_7": 15771.2, "learning_rate": 0.0009996954135095479, "loss": 30853.2, "step": 210 }, { "ce_loss_13": 8.945707607269288, "ce_loss_26": 8.79644329547882, "ce_loss_39": 8.601721858978271, "ce_loss_52": 1.4154451981186866, "ce_loss_7": 9.052277135849, "epoch": 0.022, "grad_norm": 53.309057416596275, "kl_loss_13": 15544.0, "kl_loss_26": 15219.2, "kl_loss_39": 14809.6, "kl_loss_7": 15761.6, "learning_rate": 0.0009996375239002368, "loss": 30606.8, "step": 220 }, { "ce_loss_13": 8.933341026306152, "ce_loss_26": 8.773744869232178, "ce_loss_39": 8.57140781879425, "ce_loss_52": 1.4167777329683304, "ce_loss_7": 9.04187982082367, "epoch": 0.023, "grad_norm": 53.383646235412414, "kl_loss_13": 15486.4, "kl_loss_26": 15156.8, "kl_loss_39": 14726.4, "kl_loss_7": 15721.6, "learning_rate": 0.0009995746029466072, "loss": 30406.4, "step": 230 }, { "ce_loss_13": 8.869291019439697, "ce_loss_26": 8.693119740486145, "ce_loss_39": 8.476832008361816, "ce_loss_52": 1.4153499186038971, "ce_loss_7": 8.980035424232483, "epoch": 0.024, "grad_norm": 52.917730504652305, "kl_loss_13": 15353.6, "kl_loss_26": 14985.6, "kl_loss_39": 14528.0, "kl_loss_7": 15588.8, "learning_rate": 0.0009995066512822719, "loss": 30148.4, "step": 240 }, { "ce_loss_13": 8.81713101863861, "ce_loss_26": 8.636789417266845, "ce_loss_39": 8.412476801872254, "ce_loss_52": 1.4529948115348816, "ce_loss_7": 8.929914593696594, "epoch": 0.025, "grad_norm": 53.98592158305785, "kl_loss_13": 15196.8, "kl_loss_26": 14820.8, "kl_loss_39": 14340.8, "kl_loss_7": 15438.4, "learning_rate": 0.000999433669591504, "loss": 29860.8, "step": 250 }, { "ce_loss_13": 8.748036527633667, "ce_loss_26": 8.558162140846253, "ce_loss_39": 8.331628108024598, "ce_loss_52": 1.4311662405729293, "ce_loss_7": 8.864733743667603, "epoch": 0.026, "grad_norm": 52.415587650337294, "kl_loss_13": 15088.0, "kl_loss_26": 14683.2, "kl_loss_39": 14200.0, "kl_loss_7": 15329.6, "learning_rate": 0.000999355658609228, "loss": 29636.0, "step": 260 }, { "ce_loss_13": 8.692949771881104, "ce_loss_26": 8.494869589805603, "ce_loss_39": 8.259693837165832, "ce_loss_52": 1.4384984374046326, "ce_loss_7": 8.814060854911805, "epoch": 0.027, "grad_norm": 53.356303831580306, "kl_loss_13": 14976.0, "kl_loss_26": 14555.2, "kl_loss_39": 14054.4, "kl_loss_7": 15230.4, "learning_rate": 0.0009992726191210138, "loss": 29438.0, "step": 270 }, { "ce_loss_13": 8.67082085609436, "ce_loss_26": 8.463814663887025, "ce_loss_39": 8.215038228034974, "ce_loss_52": 1.4267250567674636, "ce_loss_7": 8.794116616249084, "epoch": 0.028, "grad_norm": 52.94359037736481, "kl_loss_13": 14902.4, "kl_loss_26": 14476.8, "kl_loss_39": 13944.0, "kl_loss_7": 15174.4, "learning_rate": 0.0009991845519630679, "loss": 29276.4, "step": 280 }, { "ce_loss_13": 8.61563618183136, "ce_loss_26": 8.402152299880981, "ce_loss_39": 8.144541609287263, "ce_loss_52": 1.4250373497605324, "ce_loss_7": 8.742781138420105, "epoch": 0.029, "grad_norm": 51.566336997103136, "kl_loss_13": 14817.6, "kl_loss_26": 14369.6, "kl_loss_39": 13816.0, "kl_loss_7": 15089.6, "learning_rate": 0.0009990914580222257, "loss": 29010.0, "step": 290 }, { "ce_loss_13": 8.572561240196228, "ce_loss_26": 8.35531551837921, "ce_loss_39": 8.092759764194488, "ce_loss_52": 1.4600662559270858, "ce_loss_7": 8.698393726348877, "epoch": 0.03, "grad_norm": 53.1076582900563, "kl_loss_13": 14704.0, "kl_loss_26": 14251.2, "kl_loss_39": 13689.6, "kl_loss_7": 14974.4, "learning_rate": 0.0009989933382359422, "loss": 28776.8, "step": 300 }, { "ce_loss_13": 8.491887974739075, "ce_loss_26": 8.263946199417115, "ce_loss_39": 7.987871313095093, "ce_loss_52": 1.4451197743415833, "ce_loss_7": 8.625923323631287, "epoch": 0.031, "grad_norm": 52.58460107915946, "kl_loss_13": 14537.6, "kl_loss_26": 14054.4, "kl_loss_39": 13464.0, "kl_loss_7": 14825.6, "learning_rate": 0.0009988901935922825, "loss": 28548.4, "step": 310 }, { "ce_loss_13": 8.474869418144227, "ce_loss_26": 8.245952117443085, "ce_loss_39": 7.974157309532165, "ce_loss_52": 1.4602727562189102, "ce_loss_7": 8.610798478126526, "epoch": 0.032, "grad_norm": 52.331617741046635, "kl_loss_13": 14480.0, "kl_loss_26": 14003.2, "kl_loss_39": 13427.2, "kl_loss_7": 14760.0, "learning_rate": 0.0009987820251299122, "loss": 28364.4, "step": 320 }, { "ce_loss_13": 8.44498426914215, "ce_loss_26": 8.20645843744278, "ce_loss_39": 7.912872779369354, "ce_loss_52": 1.4554857224225999, "ce_loss_7": 8.583872628211974, "epoch": 0.033, "grad_norm": 50.87832937020315, "kl_loss_13": 14411.2, "kl_loss_26": 13900.8, "kl_loss_39": 13281.6, "kl_loss_7": 14700.8, "learning_rate": 0.0009986688339380862, "loss": 28109.2, "step": 330 }, { "ce_loss_13": 8.38349392414093, "ce_loss_26": 8.133478546142578, "ce_loss_39": 7.828318297863007, "ce_loss_52": 1.425998830795288, "ce_loss_7": 8.525882768630982, "epoch": 0.034, "grad_norm": 51.153122440976325, "kl_loss_13": 14328.0, "kl_loss_26": 13809.6, "kl_loss_39": 13163.2, "kl_loss_7": 14633.6, "learning_rate": 0.0009985506211566387, "loss": 27878.0, "step": 340 }, { "ce_loss_13": 8.349605464935303, "ce_loss_26": 8.097514569759369, "ce_loss_39": 7.785721278190612, "ce_loss_52": 1.4315812528133391, "ce_loss_7": 8.496586155891418, "epoch": 0.035, "grad_norm": 51.20256734387486, "kl_loss_13": 14254.4, "kl_loss_26": 13721.6, "kl_loss_39": 13062.4, "kl_loss_7": 14561.6, "learning_rate": 0.0009984273879759713, "loss": 27693.2, "step": 350 }, { "ce_loss_13": 8.273482608795167, "ce_loss_26": 8.020016944408416, "ce_loss_39": 7.711479115486145, "ce_loss_52": 1.4499147981405258, "ce_loss_7": 8.422512984275818, "epoch": 0.036, "grad_norm": 52.12411775413645, "kl_loss_13": 14088.0, "kl_loss_26": 13556.8, "kl_loss_39": 12892.8, "kl_loss_7": 14403.2, "learning_rate": 0.0009982991356370402, "loss": 27442.0, "step": 360 }, { "ce_loss_13": 8.215481567382813, "ce_loss_26": 7.953275382518768, "ce_loss_39": 7.628855121135712, "ce_loss_52": 1.411750042438507, "ce_loss_7": 8.368350863456726, "epoch": 0.037, "grad_norm": 51.0215332330724, "kl_loss_13": 14024.0, "kl_loss_26": 13470.4, "kl_loss_39": 12779.2, "kl_loss_7": 14348.8, "learning_rate": 0.0009981658654313456, "loss": 27348.0, "step": 370 }, { "ce_loss_13": 8.217882227897643, "ce_loss_26": 7.9462348341941835, "ce_loss_39": 7.613846385478974, "ce_loss_52": 1.4831970453262329, "ce_loss_7": 8.373853397369384, "epoch": 0.038, "grad_norm": 50.4571010346743, "kl_loss_13": 13913.6, "kl_loss_26": 13345.6, "kl_loss_39": 12646.4, "kl_loss_7": 14243.2, "learning_rate": 0.000998027578700917, "loss": 27082.8, "step": 380 }, { "ce_loss_13": 8.11286985874176, "ce_loss_26": 7.831897294521331, "ce_loss_39": 7.493152487277984, "ce_loss_52": 1.4128454998135567, "ce_loss_7": 8.275482225418092, "epoch": 0.039, "grad_norm": 51.75236464910614, "kl_loss_13": 13798.4, "kl_loss_26": 13212.8, "kl_loss_39": 12491.2, "kl_loss_7": 14139.2, "learning_rate": 0.0009978842768382998, "loss": 26835.6, "step": 390 }, { "ce_loss_13": 8.092873919010163, "ce_loss_26": 7.80686913728714, "ce_loss_39": 7.462458717823028, "ce_loss_52": 1.449526023864746, "ce_loss_7": 8.253998827934264, "epoch": 0.04, "grad_norm": 50.588198000151046, "kl_loss_13": 13680.0, "kl_loss_26": 13083.2, "kl_loss_39": 12355.2, "kl_loss_7": 14019.2, "learning_rate": 0.0009977359612865424, "loss": 26670.0, "step": 400 }, { "ce_loss_13": 8.073025333881379, "ce_loss_26": 7.792924261093139, "ce_loss_39": 7.448240423202515, "ce_loss_52": 1.4590945556759833, "ce_loss_7": 8.234287071228028, "epoch": 0.041, "grad_norm": 50.388842117598266, "kl_loss_13": 13630.4, "kl_loss_26": 13032.0, "kl_loss_39": 12302.4, "kl_loss_7": 13969.6, "learning_rate": 0.0009975826335391806, "loss": 26457.2, "step": 410 }, { "ce_loss_13": 7.959017169475556, "ce_loss_26": 7.667092227935791, "ce_loss_39": 7.308180010318756, "ce_loss_52": 1.3903952419757843, "ce_loss_7": 8.129511964321136, "epoch": 0.042, "grad_norm": 50.86911767207133, "kl_loss_13": 13540.8, "kl_loss_26": 12916.8, "kl_loss_39": 12163.2, "kl_loss_7": 13888.0, "learning_rate": 0.0009974242951402235, "loss": 26197.6, "step": 420 }, { "ce_loss_13": 7.940323996543884, "ce_loss_26": 7.629641830921173, "ce_loss_39": 7.2696495175361635, "ce_loss_52": 1.4528310179710389, "ce_loss_7": 8.112548959255218, "epoch": 0.043, "grad_norm": 49.24163454624527, "kl_loss_13": 13374.4, "kl_loss_26": 12728.0, "kl_loss_39": 11958.4, "kl_loss_7": 13744.0, "learning_rate": 0.0009972609476841367, "loss": 25992.4, "step": 430 }, { "ce_loss_13": 7.8985715508461, "ce_loss_26": 7.592843997478485, "ce_loss_39": 7.205817592144013, "ce_loss_52": 1.4196556687355042, "ce_loss_7": 8.078336155414581, "epoch": 0.044, "grad_norm": 49.811998634305894, "kl_loss_13": 13345.6, "kl_loss_26": 12699.2, "kl_loss_39": 11881.6, "kl_loss_7": 13718.4, "learning_rate": 0.0009970925928158272, "loss": 25854.4, "step": 440 }, { "ce_loss_13": 7.867163848876953, "ce_loss_26": 7.55372383594513, "ce_loss_39": 7.173475623130798, "ce_loss_52": 1.4358820408582686, "ce_loss_7": 8.0431494474411, "epoch": 0.045, "grad_norm": 48.311473749243106, "kl_loss_13": 13248.0, "kl_loss_26": 12595.2, "kl_loss_39": 11792.0, "kl_loss_7": 13614.4, "learning_rate": 0.000996919232230627, "loss": 25620.0, "step": 450 }, { "ce_loss_13": 7.794478893280029, "ce_loss_26": 7.470960378646851, "ce_loss_39": 7.075640022754669, "ce_loss_52": 1.4313764542341232, "ce_loss_7": 7.97986272573471, "epoch": 0.046, "grad_norm": 49.75600981611632, "kl_loss_13": 13113.6, "kl_loss_26": 12433.6, "kl_loss_39": 11588.8, "kl_loss_7": 13496.0, "learning_rate": 0.0009967408676742752, "loss": 25367.2, "step": 460 }, { "ce_loss_13": 7.780554842948914, "ce_loss_26": 7.450528597831726, "ce_loss_39": 7.06024489402771, "ce_loss_52": 1.4271526962518692, "ce_loss_7": 7.96198604106903, "epoch": 0.047, "grad_norm": 49.73198686766462, "kl_loss_13": 13089.6, "kl_loss_26": 12406.4, "kl_loss_39": 11576.0, "kl_loss_7": 13475.2, "learning_rate": 0.0009965575009429006, "loss": 25186.4, "step": 470 }, { "ce_loss_13": 7.771422934532166, "ce_loss_26": 7.443638646602631, "ce_loss_39": 7.045053339004516, "ce_loss_52": 1.4691366642713546, "ce_loss_7": 7.956920957565307, "epoch": 0.048, "grad_norm": 48.6898889709016, "kl_loss_13": 12971.2, "kl_loss_26": 12286.4, "kl_loss_39": 11440.0, "kl_loss_7": 13366.4, "learning_rate": 0.0009963691338830043, "loss": 25028.4, "step": 480 }, { "ce_loss_13": 7.7170240640640255, "ce_loss_26": 7.3867839813232425, "ce_loss_39": 6.986623299121857, "ce_loss_52": 1.4700770109891892, "ce_loss_7": 7.900947248935699, "epoch": 0.049, "grad_norm": 47.968754476102596, "kl_loss_13": 12884.8, "kl_loss_26": 12195.2, "kl_loss_39": 11332.8, "kl_loss_7": 13273.6, "learning_rate": 0.0009961757683914405, "loss": 24808.8, "step": 490 }, { "ce_loss_13": 7.612575709819794, "ce_loss_26": 7.270983147621155, "ce_loss_39": 6.851053369045258, "ce_loss_52": 1.4072588831186295, "ce_loss_7": 7.807804656028748, "epoch": 0.05, "grad_norm": 49.18975121944083, "kl_loss_13": 12780.8, "kl_loss_26": 12060.8, "kl_loss_39": 11161.6, "kl_loss_7": 13190.4, "learning_rate": 0.0009959774064153978, "loss": 24615.6, "step": 500 }, { "ce_loss_13": 7.6113405585289, "ce_loss_26": 7.257396864891052, "ce_loss_39": 6.8364926934242245, "ce_loss_52": 1.405586513876915, "ce_loss_7": 7.807830440998077, "epoch": 0.051, "grad_norm": 48.36038036613293, "kl_loss_13": 12753.6, "kl_loss_26": 12017.6, "kl_loss_39": 11121.6, "kl_loss_7": 13161.6, "learning_rate": 0.0009957740499523787, "loss": 24452.0, "step": 510 }, { "ce_loss_13": 7.562924301624298, "ce_loss_26": 7.205751180648804, "ce_loss_39": 6.7729793906211855, "ce_loss_52": 1.441327565908432, "ce_loss_7": 7.762964737415314, "epoch": 0.052, "grad_norm": 48.52091531249349, "kl_loss_13": 12577.6, "kl_loss_26": 11833.6, "kl_loss_39": 10912.0, "kl_loss_7": 12990.4, "learning_rate": 0.0009955657010501807, "loss": 24214.4, "step": 520 }, { "ce_loss_13": 7.5027553796768185, "ce_loss_26": 7.149892139434814, "ce_loss_39": 6.725725698471069, "ce_loss_52": 1.4616976886987687, "ce_loss_7": 7.700168478488922, "epoch": 0.053, "grad_norm": 47.609892122251686, "kl_loss_13": 12451.2, "kl_loss_26": 11710.4, "kl_loss_39": 10804.8, "kl_loss_7": 12872.0, "learning_rate": 0.000995352361806875, "loss": 24037.2, "step": 530 }, { "ce_loss_13": 7.525733006000519, "ce_loss_26": 7.1605717778205875, "ce_loss_39": 6.703774988651276, "ce_loss_52": 1.42885320186615, "ce_loss_7": 7.727836930751801, "epoch": 0.054, "grad_norm": 47.0111007198644, "kl_loss_13": 12556.8, "kl_loss_26": 11790.4, "kl_loss_39": 10828.8, "kl_loss_7": 12976.0, "learning_rate": 0.0009951340343707852, "loss": 23845.2, "step": 540 }, { "ce_loss_13": 7.423776483535766, "ce_loss_26": 7.0557411193847654, "ce_loss_39": 6.604493200778961, "ce_loss_52": 1.4447590827941894, "ce_loss_7": 7.626477897167206, "epoch": 0.055, "grad_norm": 49.45361296474082, "kl_loss_13": 12328.0, "kl_loss_26": 11540.8, "kl_loss_39": 10584.0, "kl_loss_7": 12747.2, "learning_rate": 0.0009949107209404665, "loss": 23664.0, "step": 550 }, { "ce_loss_13": 7.434036374092102, "ce_loss_26": 7.059368348121643, "ce_loss_39": 6.6053709268569945, "ce_loss_52": 1.4645269870758058, "ce_loss_7": 7.6432753801345825, "epoch": 0.056, "grad_norm": 47.673368470799254, "kl_loss_13": 12291.2, "kl_loss_26": 11512.0, "kl_loss_39": 10547.2, "kl_loss_7": 12726.4, "learning_rate": 0.0009946824237646824, "loss": 23469.6, "step": 560 }, { "ce_loss_13": 7.307004892826081, "ce_loss_26": 6.927345609664917, "ce_loss_39": 6.4629304051399235, "ce_loss_52": 1.437305434048176, "ce_loss_7": 7.522386133670807, "epoch": 0.057, "grad_norm": 46.80952508481597, "kl_loss_13": 12094.4, "kl_loss_26": 11300.8, "kl_loss_39": 10308.8, "kl_loss_7": 12539.2, "learning_rate": 0.0009944491451423828, "loss": 23249.6, "step": 570 }, { "ce_loss_13": 7.349401378631592, "ce_loss_26": 6.964329659938812, "ce_loss_39": 6.480949449539184, "ce_loss_52": 1.4452718168497085, "ce_loss_7": 7.564259791374207, "epoch": 0.058, "grad_norm": 46.22867294627436, "kl_loss_13": 12145.6, "kl_loss_26": 11340.8, "kl_loss_39": 10315.2, "kl_loss_7": 12592.0, "learning_rate": 0.0009942108874226813, "loss": 23091.2, "step": 580 }, { "ce_loss_13": 7.254886651039124, "ce_loss_26": 6.858836472034454, "ce_loss_39": 6.3856946468353275, "ce_loss_52": 1.4449717432260514, "ce_loss_7": 7.473185133934021, "epoch": 0.059, "grad_norm": 45.84422579202554, "kl_loss_13": 11969.6, "kl_loss_26": 11147.2, "kl_loss_39": 10136.0, "kl_loss_7": 12424.0, "learning_rate": 0.00099396765300483, "loss": 22886.4, "step": 590 }, { "ce_loss_13": 7.248957896232605, "ce_loss_26": 6.855677163600921, "ce_loss_39": 6.3774519801139835, "ce_loss_52": 1.477000206708908, "ce_loss_7": 7.465912497043609, "epoch": 0.06, "grad_norm": 46.37348014710593, "kl_loss_13": 11888.0, "kl_loss_26": 11064.0, "kl_loss_39": 10044.8, "kl_loss_7": 12347.2, "learning_rate": 0.0009937194443381972, "loss": 22708.0, "step": 600 }, { "ce_loss_13": 7.210493552684784, "ce_loss_26": 6.8088652968406675, "ce_loss_39": 6.325126445293426, "ce_loss_52": 1.444644930958748, "ce_loss_7": 7.429195690155029, "epoch": 0.061, "grad_norm": 44.92499922138711, "kl_loss_13": 11859.2, "kl_loss_26": 11019.2, "kl_loss_39": 9995.2, "kl_loss_7": 12320.0, "learning_rate": 0.0009934662639222412, "loss": 22544.8, "step": 610 }, { "ce_loss_13": 7.1185362339019775, "ce_loss_26": 6.714106225967408, "ce_loss_39": 6.223515486717224, "ce_loss_52": 1.4858893424272537, "ce_loss_7": 7.341545379161834, "epoch": 0.062, "grad_norm": 46.45143938897793, "kl_loss_13": 11601.6, "kl_loss_26": 10750.4, "kl_loss_39": 9708.8, "kl_loss_7": 12072.0, "learning_rate": 0.000993208114306486, "loss": 22270.0, "step": 620 }, { "ce_loss_13": 7.0913821935653685, "ce_loss_26": 6.689675974845886, "ce_loss_39": 6.203632855415345, "ce_loss_52": 1.4506051570177079, "ce_loss_7": 7.311544299125671, "epoch": 0.063, "grad_norm": 45.34630221197193, "kl_loss_13": 11592.0, "kl_loss_26": 10752.0, "kl_loss_39": 9720.0, "kl_loss_7": 12067.2, "learning_rate": 0.0009929449980904952, "loss": 22153.2, "step": 630 }, { "ce_loss_13": 7.083522534370422, "ce_loss_26": 6.665987038612366, "ce_loss_39": 6.162657225131989, "ce_loss_52": 1.4658448547124863, "ce_loss_7": 7.312107050418854, "epoch": 0.064, "grad_norm": 45.471744941742365, "kl_loss_13": 11552.0, "kl_loss_26": 10675.2, "kl_loss_39": 9596.8, "kl_loss_7": 12032.0, "learning_rate": 0.0009926769179238466, "loss": 21949.2, "step": 640 }, { "ce_loss_13": 6.994167017936706, "ce_loss_26": 6.563658082485199, "ce_loss_39": 6.042373907566071, "ce_loss_52": 1.4207285180687905, "ce_loss_7": 7.2311041235923765, "epoch": 0.065, "grad_norm": 43.84127734363621, "kl_loss_13": 11489.6, "kl_loss_26": 10593.6, "kl_loss_39": 9488.0, "kl_loss_7": 11980.8, "learning_rate": 0.000992403876506104, "loss": 21796.8, "step": 650 }, { "ce_loss_13": 6.9931820154190065, "ce_loss_26": 6.566097593307495, "ce_loss_39": 6.045178306102753, "ce_loss_52": 1.4772068083286285, "ce_loss_7": 7.2253869533538815, "epoch": 0.066, "grad_norm": 43.29636197313948, "kl_loss_13": 11363.2, "kl_loss_26": 10462.4, "kl_loss_39": 9350.4, "kl_loss_7": 11856.0, "learning_rate": 0.0009921258765867918, "loss": 21581.2, "step": 660 }, { "ce_loss_13": 6.907565414905548, "ce_loss_26": 6.471543419361114, "ce_loss_39": 5.933564639091491, "ce_loss_52": 1.4364299774169922, "ce_loss_7": 7.147727394104004, "epoch": 0.067, "grad_norm": 45.37835002704289, "kl_loss_13": 11259.2, "kl_loss_26": 10348.8, "kl_loss_39": 9200.0, "kl_loss_7": 11766.4, "learning_rate": 0.0009918429209653662, "loss": 21394.0, "step": 670 }, { "ce_loss_13": 6.9164858102798465, "ce_loss_26": 6.482825660705567, "ce_loss_39": 5.9588632702827455, "ce_loss_52": 1.4493420034646989, "ce_loss_7": 7.152165937423706, "epoch": 0.068, "grad_norm": 44.49853682897619, "kl_loss_13": 11238.4, "kl_loss_26": 10336.0, "kl_loss_39": 9201.6, "kl_loss_7": 11729.6, "learning_rate": 0.0009915550124911866, "loss": 21260.8, "step": 680 }, { "ce_loss_13": 6.871931791305542, "ce_loss_26": 6.440780913829803, "ce_loss_39": 5.910830950736999, "ce_loss_52": 1.4238717705011368, "ce_loss_7": 7.117027842998505, "epoch": 0.069, "grad_norm": 44.632228248662486, "kl_loss_13": 11209.6, "kl_loss_26": 10307.2, "kl_loss_39": 9184.0, "kl_loss_7": 11716.8, "learning_rate": 0.0009912621540634887, "loss": 21100.4, "step": 690 }, { "ce_loss_13": 6.761080467700959, "ce_loss_26": 6.306544578075409, "ce_loss_39": 5.754528117179871, "ce_loss_52": 1.378117674589157, "ce_loss_7": 7.018208122253418, "epoch": 0.07, "grad_norm": 45.73542626422545, "kl_loss_13": 11040.0, "kl_loss_26": 10088.0, "kl_loss_39": 8913.6, "kl_loss_7": 11569.6, "learning_rate": 0.0009909643486313534, "loss": 20851.6, "step": 700 }, { "ce_loss_13": 6.78661539554596, "ce_loss_26": 6.327802836894989, "ce_loss_39": 5.761592519283295, "ce_loss_52": 1.411187869310379, "ce_loss_7": 7.041204571723938, "epoch": 0.071, "grad_norm": 42.217908581540186, "kl_loss_13": 11076.8, "kl_loss_26": 10112.0, "kl_loss_39": 8905.6, "kl_loss_7": 11606.4, "learning_rate": 0.000990661599193678, "loss": 20727.2, "step": 710 }, { "ce_loss_13": 6.725618660449982, "ce_loss_26": 6.256143915653229, "ce_loss_39": 5.684507942199707, "ce_loss_52": 1.4021562442183495, "ce_loss_7": 6.98309029340744, "epoch": 0.072, "grad_norm": 41.87540936995742, "kl_loss_13": 10950.4, "kl_loss_26": 9966.4, "kl_loss_39": 8756.0, "kl_loss_7": 11489.6, "learning_rate": 0.0009903539087991462, "loss": 20494.0, "step": 720 }, { "ce_loss_13": 6.704308211803436, "ce_loss_26": 6.2481373190879825, "ce_loss_39": 5.691672837734222, "ce_loss_52": 1.4353317350149155, "ce_loss_7": 6.9648723125457765, "epoch": 0.073, "grad_norm": 41.50232307107669, "kl_loss_13": 10825.6, "kl_loss_26": 9868.8, "kl_loss_39": 8681.6, "kl_loss_7": 11364.8, "learning_rate": 0.0009900412805461966, "loss": 20435.6, "step": 730 }, { "ce_loss_13": 6.661628067493439, "ce_loss_26": 6.196605837345123, "ce_loss_39": 5.6325979948043825, "ce_loss_52": 1.4341969668865204, "ce_loss_7": 6.919101679325104, "epoch": 0.074, "grad_norm": 40.95865426481825, "kl_loss_13": 10744.0, "kl_loss_26": 9771.2, "kl_loss_39": 8564.0, "kl_loss_7": 11276.8, "learning_rate": 0.0009897237175829927, "loss": 20203.6, "step": 740 }, { "ce_loss_13": 6.600095963478088, "ce_loss_26": 6.1362119793891905, "ce_loss_39": 5.5736886858940125, "ce_loss_52": 1.408122679591179, "ce_loss_7": 6.857535183429718, "epoch": 0.075, "grad_norm": 39.711086876656914, "kl_loss_13": 10672.0, "kl_loss_26": 9696.0, "kl_loss_39": 8488.0, "kl_loss_7": 11211.2, "learning_rate": 0.0009894012231073895, "loss": 20039.6, "step": 750 }, { "ce_loss_13": 6.570180189609528, "ce_loss_26": 6.098634576797485, "ce_loss_39": 5.526832151412964, "ce_loss_52": 1.4715266615152358, "ce_loss_7": 6.833914375305175, "epoch": 0.076, "grad_norm": 43.2480305118225, "kl_loss_13": 10513.6, "kl_loss_26": 9529.6, "kl_loss_39": 8309.6, "kl_loss_7": 11064.0, "learning_rate": 0.0009890738003669028, "loss": 19880.0, "step": 760 }, { "ce_loss_13": 6.567346775531769, "ce_loss_26": 6.0874533414840695, "ce_loss_39": 5.5076407313346865, "ce_loss_52": 1.4363629996776581, "ce_loss_7": 6.83351217508316, "epoch": 0.077, "grad_norm": 40.54611020558553, "kl_loss_13": 10534.4, "kl_loss_26": 9550.4, "kl_loss_39": 8325.6, "kl_loss_7": 11091.2, "learning_rate": 0.0009887414526586764, "loss": 19717.2, "step": 770 }, { "ce_loss_13": 6.519154870510102, "ce_loss_26": 6.030243515968323, "ce_loss_39": 5.4369661688804625, "ce_loss_52": 1.4315312415361405, "ce_loss_7": 6.7882112741470335, "epoch": 0.078, "grad_norm": 40.32433812744511, "kl_loss_13": 10425.6, "kl_loss_26": 9401.6, "kl_loss_39": 8148.0, "kl_loss_7": 10985.6, "learning_rate": 0.0009884041833294476, "loss": 19528.4, "step": 780 }, { "ce_loss_13": 6.458490109443664, "ce_loss_26": 5.970031499862671, "ce_loss_39": 5.392513406276703, "ce_loss_52": 1.4178608924150466, "ce_loss_7": 6.729024958610535, "epoch": 0.079, "grad_norm": 41.89461078246612, "kl_loss_13": 10355.2, "kl_loss_26": 9337.6, "kl_loss_39": 8099.2, "kl_loss_7": 10920.0, "learning_rate": 0.000988061995775515, "loss": 19441.6, "step": 790 }, { "ce_loss_13": 6.432489657402039, "ce_loss_26": 5.9446264743804935, "ce_loss_39": 5.3700491189956665, "ce_loss_52": 1.448669496178627, "ce_loss_7": 6.70353993177414, "epoch": 0.08, "grad_norm": 41.284533949329585, "kl_loss_13": 10246.4, "kl_loss_26": 9230.4, "kl_loss_39": 8004.0, "kl_loss_7": 10811.2, "learning_rate": 0.0009877148934427035, "loss": 19206.8, "step": 800 }, { "ce_loss_13": 6.43521283864975, "ce_loss_26": 5.939107716083527, "ce_loss_39": 5.3414135575294495, "ce_loss_52": 1.4238088309764863, "ce_loss_7": 6.7135733485221865, "epoch": 0.081, "grad_norm": 39.49115349094681, "kl_loss_13": 10304.0, "kl_loss_26": 9264.0, "kl_loss_39": 8010.4, "kl_loss_7": 10881.6, "learning_rate": 0.0009873628798263297, "loss": 19058.0, "step": 810 }, { "ce_loss_13": 6.375123608112335, "ce_loss_26": 5.867140221595764, "ce_loss_39": 5.2596719622612, "ce_loss_52": 1.4445373743772507, "ce_loss_7": 6.642891383171081, "epoch": 0.082, "grad_norm": 39.14925963953466, "kl_loss_13": 10124.8, "kl_loss_26": 9064.0, "kl_loss_39": 7779.2, "kl_loss_7": 10689.6, "learning_rate": 0.0009870059584711668, "loss": 18891.2, "step": 820 }, { "ce_loss_13": 6.2935021877288815, "ce_loss_26": 5.791378605365753, "ce_loss_39": 5.201059639453888, "ce_loss_52": 1.424940624833107, "ce_loss_7": 6.564577507972717, "epoch": 0.083, "grad_norm": 39.604616846798116, "kl_loss_13": 9998.4, "kl_loss_26": 8953.6, "kl_loss_39": 7687.2, "kl_loss_7": 10563.2, "learning_rate": 0.000986644132971409, "loss": 18704.0, "step": 830 }, { "ce_loss_13": 6.274489688873291, "ce_loss_26": 5.771354067325592, "ce_loss_39": 5.155186474323273, "ce_loss_52": 1.429549178481102, "ce_loss_7": 6.554718315601349, "epoch": 0.084, "grad_norm": 38.382382142673, "kl_loss_13": 9971.2, "kl_loss_26": 8913.6, "kl_loss_39": 7593.6, "kl_loss_7": 10548.8, "learning_rate": 0.0009862774069706345, "loss": 18644.4, "step": 840 }, { "ce_loss_13": 6.2195475697517395, "ce_loss_26": 5.717883968353272, "ce_loss_39": 5.13158141374588, "ce_loss_52": 1.4267238914966582, "ce_loss_7": 6.497097527980804, "epoch": 0.085, "grad_norm": 39.67456709505246, "kl_loss_13": 9820.8, "kl_loss_26": 8772.8, "kl_loss_39": 7518.4, "kl_loss_7": 10401.6, "learning_rate": 0.000985905784161771, "loss": 18443.2, "step": 850 }, { "ce_loss_13": 6.253428983688354, "ce_loss_26": 5.746143198013305, "ce_loss_39": 5.127990126609802, "ce_loss_52": 1.4239614009857178, "ce_loss_7": 6.538207769393921, "epoch": 0.086, "grad_norm": 39.46700749652253, "kl_loss_13": 9912.0, "kl_loss_26": 8857.6, "kl_loss_39": 7540.0, "kl_loss_7": 10500.8, "learning_rate": 0.000985529268287055, "loss": 18336.4, "step": 860 }, { "ce_loss_13": 6.181728804111481, "ce_loss_26": 5.661606287956237, "ce_loss_39": 5.042867851257324, "ce_loss_52": 1.427235585451126, "ce_loss_7": 6.472940897941589, "epoch": 0.087, "grad_norm": 38.42645346767181, "kl_loss_13": 9798.4, "kl_loss_26": 8700.0, "kl_loss_39": 7378.4, "kl_loss_7": 10409.6, "learning_rate": 0.0009851478631379982, "loss": 18167.2, "step": 870 }, { "ce_loss_13": 6.113723492622375, "ce_loss_26": 5.586072051525116, "ce_loss_39": 4.9395282626152035, "ce_loss_52": 1.3542900115251542, "ce_loss_7": 6.410606110095978, "epoch": 0.088, "grad_norm": 37.03254074803745, "kl_loss_13": 9768.0, "kl_loss_26": 8668.8, "kl_loss_39": 7310.4, "kl_loss_7": 10384.0, "learning_rate": 0.0009847615725553456, "loss": 18092.4, "step": 880 }, { "ce_loss_13": 6.182212936878204, "ce_loss_26": 5.651630616188049, "ce_loss_39": 5.018951749801635, "ce_loss_52": 1.4187958374619485, "ce_loss_7": 6.474197280406952, "epoch": 0.089, "grad_norm": 36.66962934304384, "kl_loss_13": 9784.0, "kl_loss_26": 8682.4, "kl_loss_39": 7329.6, "kl_loss_7": 10400.0, "learning_rate": 0.0009843704004290394, "loss": 18005.6, "step": 890 }, { "ce_loss_13": 6.1027270436286924, "ce_loss_26": 5.578388214111328, "ce_loss_39": 4.954142212867737, "ce_loss_52": 1.433653001487255, "ce_loss_7": 6.3896349430084225, "epoch": 0.09, "grad_norm": 37.895271797404135, "kl_loss_13": 9592.0, "kl_loss_26": 8488.8, "kl_loss_39": 7161.6, "kl_loss_7": 10198.4, "learning_rate": 0.0009839743506981783, "loss": 17767.6, "step": 900 }, { "ce_loss_13": 6.134694254398346, "ce_loss_26": 5.600531077384948, "ce_loss_39": 4.956934368610382, "ce_loss_52": 1.4616568014025688, "ce_loss_7": 6.426614594459534, "epoch": 0.091, "grad_norm": 35.5095046076979, "kl_loss_13": 9608.0, "kl_loss_26": 8489.6, "kl_loss_39": 7093.6, "kl_loss_7": 10217.6, "learning_rate": 0.0009835734273509786, "loss": 17664.2, "step": 910 }, { "ce_loss_13": 6.059712076187134, "ce_loss_26": 5.526252567768097, "ce_loss_39": 4.893229007720947, "ce_loss_52": 1.4395650416612624, "ce_loss_7": 6.356565976142884, "epoch": 0.092, "grad_norm": 34.796923941159505, "kl_loss_13": 9540.8, "kl_loss_26": 8412.8, "kl_loss_39": 7049.6, "kl_loss_7": 10158.4, "learning_rate": 0.0009831676344247342, "loss": 17511.4, "step": 920 }, { "ce_loss_13": 6.012007105350494, "ce_loss_26": 5.470919144153595, "ce_loss_39": 4.816142636537552, "ce_loss_52": 1.3806764528155326, "ce_loss_7": 6.3036043524742125, "epoch": 0.093, "grad_norm": 34.520449300223, "kl_loss_13": 9520.0, "kl_loss_26": 8383.2, "kl_loss_39": 6990.4, "kl_loss_7": 10132.8, "learning_rate": 0.0009827569760057755, "loss": 17476.2, "step": 930 }, { "ce_loss_13": 6.020643877983093, "ce_loss_26": 5.4863135576248165, "ce_loss_39": 4.843132376670837, "ce_loss_52": 1.4196506530046462, "ce_loss_7": 6.318761503696441, "epoch": 0.094, "grad_norm": 33.89098596439575, "kl_loss_13": 9425.6, "kl_loss_26": 8303.2, "kl_loss_39": 6938.4, "kl_loss_7": 10048.0, "learning_rate": 0.000982341456229428, "loss": 17230.8, "step": 940 }, { "ce_loss_13": 5.974271166324615, "ce_loss_26": 5.436639845371246, "ce_loss_39": 4.804401755332947, "ce_loss_52": 1.4617832124233245, "ce_loss_7": 6.261663150787354, "epoch": 0.095, "grad_norm": 33.71164019490533, "kl_loss_13": 9278.4, "kl_loss_26": 8148.0, "kl_loss_39": 6788.0, "kl_loss_7": 9880.0, "learning_rate": 0.000981921079279971, "loss": 17111.2, "step": 950 }, { "ce_loss_13": 5.966051030158996, "ce_loss_26": 5.422689366340637, "ce_loss_39": 4.77484347820282, "ce_loss_52": 1.4197688490152358, "ce_loss_7": 6.259041047096252, "epoch": 0.096, "grad_norm": 33.446193055983784, "kl_loss_13": 9321.6, "kl_loss_26": 8191.2, "kl_loss_39": 6796.8, "kl_loss_7": 9932.8, "learning_rate": 0.0009814958493905962, "loss": 17055.2, "step": 960 }, { "ce_loss_13": 5.8990898609161375, "ce_loss_26": 5.356628429889679, "ce_loss_39": 4.708657902479172, "ce_loss_52": 1.4259506687521935, "ce_loss_7": 6.204090213775634, "epoch": 0.097, "grad_norm": 32.73849681683031, "kl_loss_13": 9192.0, "kl_loss_26": 8055.2, "kl_loss_39": 6684.0, "kl_loss_7": 9836.8, "learning_rate": 0.0009810657708433637, "loss": 16837.6, "step": 970 }, { "ce_loss_13": 5.879060399532318, "ce_loss_26": 5.346331930160522, "ce_loss_39": 4.690547597408295, "ce_loss_52": 1.4302369862794877, "ce_loss_7": 6.184376800060273, "epoch": 0.098, "grad_norm": 32.85977379524165, "kl_loss_13": 9155.2, "kl_loss_26": 8032.0, "kl_loss_39": 6616.8, "kl_loss_7": 9792.0, "learning_rate": 0.0009806308479691594, "loss": 16832.6, "step": 980 }, { "ce_loss_13": 5.816424036026001, "ce_loss_26": 5.264147555828094, "ce_loss_39": 4.6131664395332335, "ce_loss_52": 1.4319524437189102, "ce_loss_7": 6.118786966800689, "epoch": 0.099, "grad_norm": 33.426081767419625, "kl_loss_13": 8992.0, "kl_loss_26": 7842.4, "kl_loss_39": 6442.4, "kl_loss_7": 9635.2, "learning_rate": 0.0009801910851476522, "loss": 16728.4, "step": 990 }, { "ce_loss_13": 5.814940357208252, "ce_loss_26": 5.2667844772338865, "ce_loss_39": 4.6347626686096195, "ce_loss_52": 1.4415421515703202, "ce_loss_7": 6.12279201745987, "epoch": 0.1, "grad_norm": 33.016085914188245, "kl_loss_13": 8960.0, "kl_loss_26": 7812.8, "kl_loss_39": 6444.0, "kl_loss_7": 9588.8, "learning_rate": 0.0009797464868072487, "loss": 16535.6, "step": 1000 }, { "ce_loss_13": 5.8194945573806764, "ce_loss_26": 5.259810090065002, "ce_loss_39": 4.592129653692245, "ce_loss_52": 1.4136200681328774, "ce_loss_7": 6.127060306072235, "epoch": 0.101, "grad_norm": 31.039827427941336, "kl_loss_13": 9033.6, "kl_loss_26": 7855.2, "kl_loss_39": 6432.0, "kl_loss_7": 9683.2, "learning_rate": 0.0009792970574250492, "loss": 16416.6, "step": 1010 }, { "ce_loss_13": 5.752316701412201, "ce_loss_26": 5.182761800289154, "ce_loss_39": 4.4902693152427675, "ce_loss_52": 1.37675661444664, "ce_loss_7": 6.068772268295288, "epoch": 0.102, "grad_norm": 30.262078958434195, "kl_loss_13": 8969.6, "kl_loss_26": 7772.0, "kl_loss_39": 6300.8, "kl_loss_7": 9622.4, "learning_rate": 0.0009788428015268028, "loss": 16337.4, "step": 1020 }, { "ce_loss_13": 5.781488347053528, "ce_loss_26": 5.2409987449646, "ce_loss_39": 4.596257948875428, "ce_loss_52": 1.4595280766487122, "ce_loss_7": 6.078718197345734, "epoch": 0.103, "grad_norm": 31.79275929639243, "kl_loss_13": 8864.0, "kl_loss_26": 7715.2, "kl_loss_39": 6324.0, "kl_loss_7": 9491.2, "learning_rate": 0.0009783837236868609, "loss": 16174.0, "step": 1030 }, { "ce_loss_13": 5.71779420375824, "ce_loss_26": 5.146243929862976, "ce_loss_39": 4.4791832447052, "ce_loss_52": 1.4291063606739045, "ce_loss_7": 6.026824104785919, "epoch": 0.104, "grad_norm": 30.963349252235982, "kl_loss_13": 8771.2, "kl_loss_26": 7573.6, "kl_loss_39": 6160.8, "kl_loss_7": 9416.0, "learning_rate": 0.0009779198285281327, "loss": 16072.0, "step": 1040 }, { "ce_loss_13": 5.742816948890686, "ce_loss_26": 5.189597749710083, "ce_loss_39": 4.527956926822663, "ce_loss_52": 1.4477199196815491, "ce_loss_7": 6.058120143413544, "epoch": 0.105, "grad_norm": 32.72479335973728, "kl_loss_13": 8772.8, "kl_loss_26": 7620.0, "kl_loss_39": 6205.6, "kl_loss_7": 9438.4, "learning_rate": 0.0009774511207220368, "loss": 15932.8, "step": 1050 }, { "ce_loss_13": 5.728980660438538, "ce_loss_26": 5.167429828643799, "ce_loss_39": 4.511177510023117, "ce_loss_52": 1.4759073287248612, "ce_loss_7": 6.031959307193756, "epoch": 0.106, "grad_norm": 31.226954775299472, "kl_loss_13": 8736.8, "kl_loss_26": 7550.4, "kl_loss_39": 6153.6, "kl_loss_7": 9363.2, "learning_rate": 0.0009769776049884564, "loss": 15802.0, "step": 1060 }, { "ce_loss_13": 5.7382616877555845, "ce_loss_26": 5.175736773014068, "ce_loss_39": 4.510921847820282, "ce_loss_52": 1.4512410640716553, "ce_loss_7": 6.050414621829987, "epoch": 0.107, "grad_norm": 30.630877517383688, "kl_loss_13": 8788.0, "kl_loss_26": 7622.4, "kl_loss_39": 6202.4, "kl_loss_7": 9452.8, "learning_rate": 0.0009764992860956889, "loss": 15822.0, "step": 1070 }, { "ce_loss_13": 5.645794451236725, "ce_loss_26": 5.088676834106446, "ce_loss_39": 4.417802548408508, "ce_loss_52": 1.4182049363851548, "ce_loss_7": 5.959036731719971, "epoch": 0.108, "grad_norm": 30.457487960812127, "kl_loss_13": 8681.6, "kl_loss_26": 7509.6, "kl_loss_39": 6084.8, "kl_loss_7": 9332.8, "learning_rate": 0.0009760161688604008, "loss": 15627.0, "step": 1080 }, { "ce_loss_13": 5.568986439704895, "ce_loss_26": 5.009959697723389, "ce_loss_39": 4.367926681041718, "ce_loss_52": 1.4609902381896973, "ce_loss_7": 5.883221137523651, "epoch": 0.109, "grad_norm": 29.989545158997807, "kl_loss_13": 8458.4, "kl_loss_26": 7279.2, "kl_loss_39": 5895.2, "kl_loss_7": 9112.0, "learning_rate": 0.0009755282581475768, "loss": 15555.0, "step": 1090 }, { "ce_loss_13": 5.6291534304618835, "ce_loss_26": 5.068492126464844, "ce_loss_39": 4.403285652399063, "ce_loss_52": 1.445840133726597, "ce_loss_7": 5.945274484157562, "epoch": 0.11, "grad_norm": 30.66383917654591, "kl_loss_13": 8576.8, "kl_loss_26": 7396.8, "kl_loss_39": 5972.8, "kl_loss_7": 9244.8, "learning_rate": 0.0009750355588704727, "loss": 15472.0, "step": 1100 }, { "ce_loss_13": 5.522909152507782, "ce_loss_26": 4.949995934963226, "ce_loss_39": 4.270336884260177, "ce_loss_52": 1.407256692647934, "ce_loss_7": 5.840477633476257, "epoch": 0.111, "grad_norm": 29.782295400473814, "kl_loss_13": 8460.8, "kl_loss_26": 7250.4, "kl_loss_39": 5788.8, "kl_loss_7": 9118.4, "learning_rate": 0.0009745380759905647, "loss": 15294.0, "step": 1110 }, { "ce_loss_13": 5.517545366287232, "ce_loss_26": 4.932116758823395, "ce_loss_39": 4.250882798433304, "ce_loss_52": 1.3833064809441566, "ce_loss_7": 5.846422612667084, "epoch": 0.112, "grad_norm": 28.73414895119145, "kl_loss_13": 8486.4, "kl_loss_26": 7264.0, "kl_loss_39": 5804.0, "kl_loss_7": 9168.0, "learning_rate": 0.0009740358145174998, "loss": 15318.0, "step": 1120 }, { "ce_loss_13": 5.50923570394516, "ce_loss_26": 4.938242793083191, "ce_loss_39": 4.259438300132752, "ce_loss_52": 1.430792199075222, "ce_loss_7": 5.8306269407272335, "epoch": 0.113, "grad_norm": 28.895019773736312, "kl_loss_13": 8355.2, "kl_loss_26": 7152.8, "kl_loss_39": 5700.8, "kl_loss_7": 9024.0, "learning_rate": 0.0009735287795090455, "loss": 15192.0, "step": 1130 }, { "ce_loss_13": 5.401988506317139, "ce_loss_26": 4.814380037784576, "ce_loss_39": 4.133068162202835, "ce_loss_52": 1.3917075648903847, "ce_loss_7": 5.732739126682281, "epoch": 0.114, "grad_norm": 28.473930821013894, "kl_loss_13": 8231.2, "kl_loss_26": 7010.4, "kl_loss_39": 5549.6, "kl_loss_7": 8921.6, "learning_rate": 0.0009730169760710386, "loss": 15030.2, "step": 1140 }, { "ce_loss_13": 5.538087117671966, "ce_loss_26": 4.94068056344986, "ce_loss_39": 4.253137022256851, "ce_loss_52": 1.4375049352645874, "ce_loss_7": 5.861488628387451, "epoch": 0.115, "grad_norm": 30.345832823062537, "kl_loss_13": 8408.0, "kl_loss_26": 7160.0, "kl_loss_39": 5690.4, "kl_loss_7": 9096.0, "learning_rate": 0.0009725004093573342, "loss": 14951.8, "step": 1150 }, { "ce_loss_13": 5.385800528526306, "ce_loss_26": 4.801717817783356, "ce_loss_39": 4.136020374298096, "ce_loss_52": 1.4078958943486213, "ce_loss_7": 5.722076547145844, "epoch": 0.116, "grad_norm": 30.297186235009132, "kl_loss_13": 8177.6, "kl_loss_26": 6956.0, "kl_loss_39": 5514.4, "kl_loss_7": 8880.0, "learning_rate": 0.0009719790845697534, "loss": 14867.6, "step": 1160 }, { "ce_loss_13": 5.426836037635804, "ce_loss_26": 4.832395279407502, "ce_loss_39": 4.153381270170212, "ce_loss_52": 1.426569977402687, "ce_loss_7": 5.755613851547241, "epoch": 0.117, "grad_norm": 31.87589996223516, "kl_loss_13": 8220.8, "kl_loss_26": 6976.0, "kl_loss_39": 5544.0, "kl_loss_7": 8909.6, "learning_rate": 0.0009714530069580309, "loss": 14745.8, "step": 1170 }, { "ce_loss_13": 5.359652185440064, "ce_loss_26": 4.760751461982727, "ce_loss_39": 4.048358517885208, "ce_loss_52": 1.3907365471124649, "ce_loss_7": 5.693908452987671, "epoch": 0.118, "grad_norm": 27.539122306915463, "kl_loss_13": 8126.4, "kl_loss_26": 6876.0, "kl_loss_39": 5376.0, "kl_loss_7": 8822.4, "learning_rate": 0.0009709221818197624, "loss": 14704.2, "step": 1180 }, { "ce_loss_13": 5.350854313373565, "ce_loss_26": 4.768227469921112, "ce_loss_39": 4.099184954166413, "ce_loss_52": 1.4248775228857995, "ce_loss_7": 5.680926930904389, "epoch": 0.119, "grad_norm": 28.933475617899816, "kl_loss_13": 8027.2, "kl_loss_26": 6818.4, "kl_loss_39": 5372.0, "kl_loss_7": 8727.2, "learning_rate": 0.0009703866145003512, "loss": 14583.0, "step": 1190 }, { "ce_loss_13": 5.372988939285278, "ce_loss_26": 4.778137028217316, "ce_loss_39": 4.089074891805649, "ce_loss_52": 1.4195260405540466, "ce_loss_7": 5.714505088329315, "epoch": 0.12, "grad_norm": 26.60053169419214, "kl_loss_13": 8131.2, "kl_loss_26": 6885.6, "kl_loss_39": 5404.8, "kl_loss_7": 8840.8, "learning_rate": 0.0009698463103929542, "loss": 14513.0, "step": 1200 }, { "ce_loss_13": 5.392605185508728, "ce_loss_26": 4.798673605918884, "ce_loss_39": 4.126504504680634, "ce_loss_52": 1.4746148020029068, "ce_loss_7": 5.7245006442070006, "epoch": 0.121, "grad_norm": 26.989592045291893, "kl_loss_13": 8018.4, "kl_loss_26": 6775.2, "kl_loss_39": 5339.2, "kl_loss_7": 8718.4, "learning_rate": 0.0009693012749384279, "loss": 14383.2, "step": 1210 }, { "ce_loss_13": 5.319035434722901, "ce_loss_26": 4.736347317695618, "ce_loss_39": 4.0659150838851925, "ce_loss_52": 1.4397204488515853, "ce_loss_7": 5.642805421352387, "epoch": 0.122, "grad_norm": 29.38130003686817, "kl_loss_13": 7946.4, "kl_loss_26": 6723.2, "kl_loss_39": 5294.4, "kl_loss_7": 8631.2, "learning_rate": 0.0009687515136252732, "loss": 14375.6, "step": 1220 }, { "ce_loss_13": 5.3430128455162045, "ce_loss_26": 4.744695138931275, "ce_loss_39": 4.071437209844589, "ce_loss_52": 1.4354561120271683, "ce_loss_7": 5.681857228279114, "epoch": 0.123, "grad_norm": 25.479885446063793, "kl_loss_13": 8008.0, "kl_loss_26": 6754.4, "kl_loss_39": 5308.8, "kl_loss_7": 8709.6, "learning_rate": 0.0009681970319895803, "loss": 14273.4, "step": 1230 }, { "ce_loss_13": 5.331636953353882, "ce_loss_26": 4.735000967979431, "ce_loss_39": 4.073598688840866, "ce_loss_52": 1.470110397040844, "ce_loss_7": 5.655930757522583, "epoch": 0.124, "grad_norm": 28.379117971457468, "kl_loss_13": 7929.6, "kl_loss_26": 6684.0, "kl_loss_39": 5260.8, "kl_loss_7": 8612.8, "learning_rate": 0.0009676378356149733, "loss": 14150.8, "step": 1240 }, { "ce_loss_13": 5.1874682068824765, "ce_loss_26": 4.582801806926727, "ce_loss_39": 3.900476610660553, "ce_loss_52": 1.4180996417999268, "ce_loss_7": 5.519565558433532, "epoch": 0.125, "grad_norm": 27.465496459767188, "kl_loss_13": 7764.8, "kl_loss_26": 6504.0, "kl_loss_39": 5041.6, "kl_loss_7": 8455.2, "learning_rate": 0.0009670739301325534, "loss": 13985.0, "step": 1250 }, { "ce_loss_13": 5.221911752223969, "ce_loss_26": 4.625354039669037, "ce_loss_39": 3.9350062906742096, "ce_loss_52": 1.3881619155406952, "ce_loss_7": 5.562334418296814, "epoch": 0.126, "grad_norm": 26.021158683557974, "kl_loss_13": 7847.2, "kl_loss_26": 6599.2, "kl_loss_39": 5120.0, "kl_loss_7": 8558.4, "learning_rate": 0.0009665053212208426, "loss": 13978.8, "step": 1260 }, { "ce_loss_13": 5.201095879077911, "ce_loss_26": 4.594125282764435, "ce_loss_39": 3.8948924005031587, "ce_loss_52": 1.4181114554405212, "ce_loss_7": 5.552536249160767, "epoch": 0.127, "grad_norm": 26.300188898530276, "kl_loss_13": 7783.2, "kl_loss_26": 6515.2, "kl_loss_39": 5012.0, "kl_loss_7": 8515.2, "learning_rate": 0.0009659320146057262, "loss": 13927.6, "step": 1270 }, { "ce_loss_13": 5.186312806606293, "ce_loss_26": 4.5913723587989805, "ce_loss_39": 3.912171256542206, "ce_loss_52": 1.4045341789722443, "ce_loss_7": 5.535152721405029, "epoch": 0.128, "grad_norm": 25.74310395170922, "kl_loss_13": 7757.6, "kl_loss_26": 6509.6, "kl_loss_39": 5065.6, "kl_loss_7": 8486.4, "learning_rate": 0.0009653540160603955, "loss": 13929.0, "step": 1280 }, { "ce_loss_13": 5.17168892621994, "ce_loss_26": 4.572988575696945, "ce_loss_39": 3.902406334877014, "ce_loss_52": 1.4593060314655304, "ce_loss_7": 5.513335394859314, "epoch": 0.129, "grad_norm": 26.464934956114348, "kl_loss_13": 7624.0, "kl_loss_26": 6372.8, "kl_loss_39": 4939.2, "kl_loss_7": 8331.2, "learning_rate": 0.0009647713314052896, "loss": 13720.2, "step": 1290 }, { "ce_loss_13": 5.167734289169312, "ce_loss_26": 4.5827870786190035, "ce_loss_39": 3.9150634109973907, "ce_loss_52": 1.4295778691768646, "ce_loss_7": 5.517308306694031, "epoch": 0.13, "grad_norm": 26.41573865043221, "kl_loss_13": 7627.2, "kl_loss_26": 6407.2, "kl_loss_39": 4972.0, "kl_loss_7": 8360.0, "learning_rate": 0.0009641839665080363, "loss": 13644.6, "step": 1300 }, { "ce_loss_13": 5.155666828155518, "ce_loss_26": 4.557369256019593, "ce_loss_39": 3.8985717594623566, "ce_loss_52": 1.4532029300928115, "ce_loss_7": 5.49907066822052, "epoch": 0.131, "grad_norm": 28.09061259972242, "kl_loss_13": 7600.0, "kl_loss_26": 6340.8, "kl_loss_39": 4909.6, "kl_loss_7": 8327.2, "learning_rate": 0.0009635919272833937, "loss": 13575.0, "step": 1310 }, { "ce_loss_13": 5.079627573490143, "ce_loss_26": 4.46733387708664, "ce_loss_39": 3.7981239676475527, "ce_loss_52": 1.4149045318365097, "ce_loss_7": 5.424402499198914, "epoch": 0.132, "grad_norm": 29.96516682014439, "kl_loss_13": 7483.2, "kl_loss_26": 6202.4, "kl_loss_39": 4773.6, "kl_loss_7": 8210.4, "learning_rate": 0.0009629952196931902, "loss": 13547.6, "step": 1320 }, { "ce_loss_13": 5.090298974514008, "ce_loss_26": 4.495900344848633, "ce_loss_39": 3.821765500307083, "ce_loss_52": 1.4328191310167313, "ce_loss_7": 5.431738471984863, "epoch": 0.133, "grad_norm": 26.14827995707597, "kl_loss_13": 7504.0, "kl_loss_26": 6259.2, "kl_loss_39": 4809.6, "kl_loss_7": 8225.6, "learning_rate": 0.0009623938497462645, "loss": 13496.2, "step": 1330 }, { "ce_loss_13": 5.099011301994324, "ce_loss_26": 4.4848466455936435, "ce_loss_39": 3.794223016500473, "ce_loss_52": 1.416146419942379, "ce_loss_7": 5.462341606616974, "epoch": 0.134, "grad_norm": 24.84289392950202, "kl_loss_13": 7542.4, "kl_loss_26": 6259.2, "kl_loss_39": 4785.2, "kl_loss_7": 8304.8, "learning_rate": 0.0009617878234984055, "loss": 13395.2, "step": 1340 }, { "ce_loss_13": 5.097129952907562, "ce_loss_26": 4.498237466812133, "ce_loss_39": 3.8250812292099, "ce_loss_52": 1.4416721731424331, "ce_loss_7": 5.444403338432312, "epoch": 0.135, "grad_norm": 26.607564330476333, "kl_loss_13": 7480.8, "kl_loss_26": 6224.8, "kl_loss_39": 4782.4, "kl_loss_7": 8202.4, "learning_rate": 0.0009611771470522907, "loss": 13240.2, "step": 1350 }, { "ce_loss_13": 5.043468415737152, "ce_loss_26": 4.443963885307312, "ce_loss_39": 3.7804897725582123, "ce_loss_52": 1.4098651513457299, "ce_loss_7": 5.407905113697052, "epoch": 0.136, "grad_norm": 27.568927473511277, "kl_loss_13": 7464.8, "kl_loss_26": 6209.6, "kl_loss_39": 4773.6, "kl_loss_7": 8221.6, "learning_rate": 0.0009605618265574251, "loss": 13312.2, "step": 1360 }, { "ce_loss_13": 5.1134570121765135, "ce_loss_26": 4.510142356157303, "ce_loss_39": 3.844588041305542, "ce_loss_52": 1.4824964210391045, "ce_loss_7": 5.455760145187378, "epoch": 0.137, "grad_norm": 26.586450808382164, "kl_loss_13": 7431.2, "kl_loss_26": 6176.0, "kl_loss_39": 4748.0, "kl_loss_7": 8151.2, "learning_rate": 0.0009599418682100792, "loss": 13171.6, "step": 1370 }, { "ce_loss_13": 4.993066036701203, "ce_loss_26": 4.390464246273041, "ce_loss_39": 3.7108724772930146, "ce_loss_52": 1.3996128499507905, "ce_loss_7": 5.354221343994141, "epoch": 0.138, "grad_norm": 24.51436324855179, "kl_loss_13": 7388.0, "kl_loss_26": 6120.0, "kl_loss_39": 4660.8, "kl_loss_7": 8145.6, "learning_rate": 0.0009593172782532268, "loss": 13135.2, "step": 1380 }, { "ce_loss_13": 4.9749324202537535, "ce_loss_26": 4.36228443980217, "ce_loss_39": 3.71304127573967, "ce_loss_52": 1.4259393498301507, "ce_loss_7": 5.328954219818115, "epoch": 0.139, "grad_norm": 25.448579155888293, "kl_loss_13": 7284.8, "kl_loss_26": 6005.6, "kl_loss_39": 4599.2, "kl_loss_7": 8029.6, "learning_rate": 0.0009586880629764817, "loss": 13023.4, "step": 1390 }, { "ce_loss_13": 5.021213936805725, "ce_loss_26": 4.392004972696304, "ce_loss_39": 3.695616716146469, "ce_loss_52": 1.3939141556620598, "ce_loss_7": 5.386792302131653, "epoch": 0.14, "grad_norm": 27.169552009752685, "kl_loss_13": 7436.0, "kl_loss_26": 6132.8, "kl_loss_39": 4655.2, "kl_loss_7": 8205.6, "learning_rate": 0.0009580542287160348, "loss": 13043.6, "step": 1400 }, { "ce_loss_13": 5.006197059154511, "ce_loss_26": 4.410893344879151, "ce_loss_39": 3.74367755651474, "ce_loss_52": 1.4519873589277268, "ce_loss_7": 5.36526129245758, "epoch": 0.141, "grad_norm": 24.865151038825246, "kl_loss_13": 7283.2, "kl_loss_26": 6027.2, "kl_loss_39": 4615.2, "kl_loss_7": 8029.6, "learning_rate": 0.0009574157818545901, "loss": 12913.8, "step": 1410 }, { "ce_loss_13": 4.958354568481445, "ce_loss_26": 4.367397904396057, "ce_loss_39": 3.7062928318977355, "ce_loss_52": 1.4099174112081527, "ce_loss_7": 5.317552924156189, "epoch": 0.142, "grad_norm": 24.898155460709848, "kl_loss_13": 7277.6, "kl_loss_26": 6040.8, "kl_loss_39": 4623.6, "kl_loss_7": 8032.8, "learning_rate": 0.0009567727288213005, "loss": 12929.6, "step": 1420 }, { "ce_loss_13": 4.984454607963562, "ce_loss_26": 4.392505377531052, "ce_loss_39": 3.7512011885643006, "ce_loss_52": 1.473440769314766, "ce_loss_7": 5.340136766433716, "epoch": 0.143, "grad_norm": 24.34585690638739, "kl_loss_13": 7221.6, "kl_loss_26": 5972.8, "kl_loss_39": 4602.0, "kl_loss_7": 7973.6, "learning_rate": 0.0009561250760917027, "loss": 12830.2, "step": 1430 }, { "ce_loss_13": 4.917237496376037, "ce_loss_26": 4.313831263780594, "ce_loss_39": 3.6587266325950623, "ce_loss_52": 1.4092496067285538, "ce_loss_7": 5.2786689639091495, "epoch": 0.144, "grad_norm": 25.288024521189875, "kl_loss_13": 7198.4, "kl_loss_26": 5931.2, "kl_loss_39": 4527.2, "kl_loss_7": 7960.8, "learning_rate": 0.0009554728301876525, "loss": 12688.6, "step": 1440 }, { "ce_loss_13": 4.95776047706604, "ce_loss_26": 4.340852671861649, "ce_loss_39": 3.657728981971741, "ce_loss_52": 1.4168317198753357, "ce_loss_7": 5.322667574882507, "epoch": 0.145, "grad_norm": 26.641005752286592, "kl_loss_13": 7228.8, "kl_loss_26": 5940.0, "kl_loss_39": 4485.2, "kl_loss_7": 8003.2, "learning_rate": 0.0009548159976772592, "loss": 12683.8, "step": 1450 }, { "ce_loss_13": 4.831417870521546, "ce_loss_26": 4.231567287445069, "ce_loss_39": 3.581255227327347, "ce_loss_52": 1.4485478460788728, "ce_loss_7": 5.20115841627121, "epoch": 0.146, "grad_norm": 24.920691081516484, "kl_loss_13": 6952.0, "kl_loss_26": 5699.2, "kl_loss_39": 4308.4, "kl_loss_7": 7733.6, "learning_rate": 0.0009541545851748186, "loss": 12574.8, "step": 1460 }, { "ce_loss_13": 4.8803037166595455, "ce_loss_26": 4.2741272211074826, "ce_loss_39": 3.598990321159363, "ce_loss_52": 1.4145199984312058, "ce_loss_7": 5.244284570217133, "epoch": 0.147, "grad_norm": 25.90739775261194, "kl_loss_13": 7076.8, "kl_loss_26": 5806.4, "kl_loss_39": 4372.0, "kl_loss_7": 7841.6, "learning_rate": 0.0009534885993407473, "loss": 12558.0, "step": 1470 }, { "ce_loss_13": 4.854452967643738, "ce_loss_26": 4.251825517416, "ce_loss_39": 3.5948518395423887, "ce_loss_52": 1.428754985332489, "ce_loss_7": 5.219927191734314, "epoch": 0.148, "grad_norm": 24.48718194678669, "kl_loss_13": 7008.8, "kl_loss_26": 5743.2, "kl_loss_39": 4354.4, "kl_loss_7": 7777.6, "learning_rate": 0.0009528180468815154, "loss": 12484.4, "step": 1480 }, { "ce_loss_13": 4.884927380084991, "ce_loss_26": 4.2919243454933165, "ce_loss_39": 3.642289215326309, "ce_loss_52": 1.465419703722, "ce_loss_7": 5.241849565505982, "epoch": 0.149, "grad_norm": 24.903440253335923, "kl_loss_13": 7001.6, "kl_loss_26": 5763.2, "kl_loss_39": 4366.0, "kl_loss_7": 7752.0, "learning_rate": 0.0009521429345495787, "loss": 12486.6, "step": 1490 }, { "ce_loss_13": 4.82739794254303, "ce_loss_26": 4.228940737247467, "ce_loss_39": 3.5767277657985685, "ce_loss_52": 1.4382753789424896, "ce_loss_7": 5.209846138954163, "epoch": 0.15, "grad_norm": 25.291080092187237, "kl_loss_13": 6960.8, "kl_loss_26": 5698.4, "kl_loss_39": 4297.6, "kl_loss_7": 7764.0, "learning_rate": 0.0009514632691433108, "loss": 12420.2, "step": 1500 }, { "ce_loss_13": 4.828064477443695, "ce_loss_26": 4.213042998313904, "ce_loss_39": 3.5323724269866945, "ce_loss_52": 1.3961022228002549, "ce_loss_7": 5.196406292915344, "epoch": 0.151, "grad_norm": 25.466425081780237, "kl_loss_13": 7046.4, "kl_loss_26": 5772.8, "kl_loss_39": 4315.2, "kl_loss_7": 7825.6, "learning_rate": 0.0009507790575069346, "loss": 12387.6, "step": 1510 }, { "ce_loss_13": 4.786497128009796, "ce_loss_26": 4.188397663831711, "ce_loss_39": 3.536989223957062, "ce_loss_52": 1.4404057756066322, "ce_loss_7": 5.156642246246338, "epoch": 0.152, "grad_norm": 22.488994996506335, "kl_loss_13": 6872.8, "kl_loss_26": 5620.8, "kl_loss_39": 4225.6, "kl_loss_7": 7650.4, "learning_rate": 0.0009500903065304539, "loss": 12265.4, "step": 1520 }, { "ce_loss_13": 4.79404227733612, "ce_loss_26": 4.1956378519535065, "ce_loss_39": 3.539849889278412, "ce_loss_52": 1.447507870197296, "ce_loss_7": 5.170107614994049, "epoch": 0.153, "grad_norm": 24.979481722705945, "kl_loss_13": 6864.0, "kl_loss_26": 5609.6, "kl_loss_39": 4213.6, "kl_loss_7": 7656.8, "learning_rate": 0.0009493970231495835, "loss": 12182.2, "step": 1530 }, { "ce_loss_13": 4.754118239879608, "ce_loss_26": 4.16424406170845, "ce_loss_39": 3.5151414275169373, "ce_loss_52": 1.423200336098671, "ce_loss_7": 5.132595348358154, "epoch": 0.154, "grad_norm": 24.139218625352445, "kl_loss_13": 6807.2, "kl_loss_26": 5573.6, "kl_loss_39": 4190.4, "kl_loss_7": 7594.4, "learning_rate": 0.0009486992143456792, "loss": 12152.0, "step": 1540 }, { "ce_loss_13": 4.745328724384308, "ce_loss_26": 4.135520172119141, "ce_loss_39": 3.4818262457847595, "ce_loss_52": 1.4286953419446946, "ce_loss_7": 5.114579677581787, "epoch": 0.155, "grad_norm": 24.426109316342576, "kl_loss_13": 6791.2, "kl_loss_26": 5516.8, "kl_loss_39": 4120.0, "kl_loss_7": 7567.2, "learning_rate": 0.0009479968871456679, "loss": 12128.4, "step": 1550 }, { "ce_loss_13": 4.7574557065963745, "ce_loss_26": 4.145674997568131, "ce_loss_39": 3.476013499498367, "ce_loss_52": 1.4235228240489959, "ce_loss_7": 5.133380055427551, "epoch": 0.156, "grad_norm": 25.100926583342837, "kl_loss_13": 6843.2, "kl_loss_26": 5556.0, "kl_loss_39": 4126.0, "kl_loss_7": 7627.2, "learning_rate": 0.0009472900486219768, "loss": 12082.2, "step": 1560 }, { "ce_loss_13": 4.735032224655152, "ce_loss_26": 4.128789341449737, "ce_loss_39": 3.4694815576076508, "ce_loss_52": 1.4237273722887038, "ce_loss_7": 5.11258887052536, "epoch": 0.157, "grad_norm": 25.10370372986473, "kl_loss_13": 6792.0, "kl_loss_26": 5520.0, "kl_loss_39": 4095.6, "kl_loss_7": 7585.6, "learning_rate": 0.000946578705892462, "loss": 11936.2, "step": 1570 }, { "ce_loss_13": 4.741922962665558, "ce_loss_26": 4.132791459560394, "ce_loss_39": 3.482679557800293, "ce_loss_52": 1.4294559836387635, "ce_loss_7": 5.117163801193238, "epoch": 0.158, "grad_norm": 21.844394510796377, "kl_loss_13": 6799.2, "kl_loss_26": 5517.6, "kl_loss_39": 4118.0, "kl_loss_7": 7581.6, "learning_rate": 0.0009458628661203367, "loss": 11944.8, "step": 1580 }, { "ce_loss_13": 4.741668605804444, "ce_loss_26": 4.1376284003257755, "ce_loss_39": 3.478295695781708, "ce_loss_52": 1.415444830060005, "ce_loss_7": 5.117357003688812, "epoch": 0.159, "grad_norm": 25.4671883290825, "kl_loss_13": 6812.0, "kl_loss_26": 5548.0, "kl_loss_39": 4136.8, "kl_loss_7": 7601.6, "learning_rate": 0.0009451425365140996, "loss": 11952.4, "step": 1590 }, { "ce_loss_13": 4.723819291591644, "ce_loss_26": 4.128834217786789, "ce_loss_39": 3.47242848277092, "ce_loss_52": 1.429117676615715, "ce_loss_7": 5.096058523654937, "epoch": 0.16, "grad_norm": 25.14078013617688, "kl_loss_13": 6768.0, "kl_loss_26": 5519.2, "kl_loss_39": 4101.6, "kl_loss_7": 7547.2, "learning_rate": 0.0009444177243274617, "loss": 11862.0, "step": 1600 }, { "ce_loss_13": 4.648782467842102, "ce_loss_26": 4.0394273698329926, "ce_loss_39": 3.377221292257309, "ce_loss_52": 1.4151206001639367, "ce_loss_7": 5.0250336050987245, "epoch": 0.161, "grad_norm": 24.128253336718885, "kl_loss_13": 6640.0, "kl_loss_26": 5364.0, "kl_loss_39": 3953.2, "kl_loss_7": 7436.8, "learning_rate": 0.0009436884368592739, "loss": 11833.0, "step": 1610 }, { "ce_loss_13": 4.695314359664917, "ce_loss_26": 4.099924111366272, "ce_loss_39": 3.466512751579285, "ce_loss_52": 1.4766929775476456, "ce_loss_7": 5.064470827579498, "epoch": 0.162, "grad_norm": 23.68843577414951, "kl_loss_13": 6614.4, "kl_loss_26": 5368.8, "kl_loss_39": 3996.8, "kl_loss_7": 7387.2, "learning_rate": 0.0009429546814534529, "loss": 11713.8, "step": 1620 }, { "ce_loss_13": 4.7040504813194275, "ce_loss_26": 4.104205197095871, "ce_loss_39": 3.4451481282711027, "ce_loss_52": 1.4428326219320298, "ce_loss_7": 5.08098030090332, "epoch": 0.163, "grad_norm": 23.332187460756046, "kl_loss_13": 6673.6, "kl_loss_26": 5408.0, "kl_loss_39": 4000.0, "kl_loss_7": 7468.8, "learning_rate": 0.0009422164654989072, "loss": 11730.0, "step": 1630 }, { "ce_loss_13": 4.6945901870727536, "ce_loss_26": 4.092492777109146, "ce_loss_39": 3.4360527455806733, "ce_loss_52": 1.4436773255467414, "ce_loss_7": 5.079924070835114, "epoch": 0.164, "grad_norm": 25.877563512298988, "kl_loss_13": 6666.4, "kl_loss_26": 5404.8, "kl_loss_39": 4011.6, "kl_loss_7": 7479.2, "learning_rate": 0.0009414737964294635, "loss": 11645.0, "step": 1640 }, { "ce_loss_13": 4.614939618110657, "ce_loss_26": 4.018665736913681, "ce_loss_39": 3.3586190402507783, "ce_loss_52": 1.4472161442041398, "ce_loss_7": 4.990367615222931, "epoch": 0.165, "grad_norm": 24.534381720947415, "kl_loss_13": 6511.2, "kl_loss_26": 5264.0, "kl_loss_39": 3849.6, "kl_loss_7": 7293.6, "learning_rate": 0.000940726681723791, "loss": 11568.6, "step": 1650 }, { "ce_loss_13": 4.539776319265366, "ce_loss_26": 3.943451428413391, "ce_loss_39": 3.281195378303528, "ce_loss_52": 1.4070941284298897, "ce_loss_7": 4.923744630813599, "epoch": 0.166, "grad_norm": 23.51720209782485, "kl_loss_13": 6449.6, "kl_loss_26": 5196.0, "kl_loss_39": 3786.0, "kl_loss_7": 7249.6, "learning_rate": 0.0009399751289053266, "loss": 11569.4, "step": 1660 }, { "ce_loss_13": 4.590777164697647, "ce_loss_26": 3.9918887853622436, "ce_loss_39": 3.328452670574188, "ce_loss_52": 1.4019996047019958, "ce_loss_7": 4.978202056884766, "epoch": 0.167, "grad_norm": 22.82794096581106, "kl_loss_13": 6550.4, "kl_loss_26": 5291.2, "kl_loss_39": 3877.2, "kl_loss_7": 7354.4, "learning_rate": 0.0009392191455421988, "loss": 11557.4, "step": 1670 }, { "ce_loss_13": 4.534084904193878, "ce_loss_26": 3.9383736848831177, "ce_loss_39": 3.290838527679443, "ce_loss_52": 1.3803422033786774, "ce_loss_7": 4.9193053364753725, "epoch": 0.168, "grad_norm": 22.01316358613574, "kl_loss_13": 6469.6, "kl_loss_26": 5224.0, "kl_loss_39": 3837.2, "kl_loss_7": 7273.6, "learning_rate": 0.0009384587392471515, "loss": 11454.2, "step": 1680 }, { "ce_loss_13": 4.5477269172668455, "ce_loss_26": 3.9558385491371153, "ce_loss_39": 3.3075734674930573, "ce_loss_52": 1.410713329911232, "ce_loss_7": 4.9310637474060055, "epoch": 0.169, "grad_norm": 24.025001534080104, "kl_loss_13": 6453.6, "kl_loss_26": 5223.2, "kl_loss_39": 3830.0, "kl_loss_7": 7244.0, "learning_rate": 0.0009376939176774678, "loss": 11355.2, "step": 1690 }, { "ce_loss_13": 4.580456328392029, "ce_loss_26": 3.996914601325989, "ce_loss_39": 3.3568237483501435, "ce_loss_52": 1.4514233976602555, "ce_loss_7": 4.956906342506409, "epoch": 0.17, "grad_norm": 24.061048820242437, "kl_loss_13": 6424.8, "kl_loss_26": 5199.2, "kl_loss_39": 3822.0, "kl_loss_7": 7210.4, "learning_rate": 0.0009369246885348925, "loss": 11365.4, "step": 1700 }, { "ce_loss_13": 4.5829225301742555, "ce_loss_26": 3.973430114984512, "ce_loss_39": 3.3136274456977843, "ce_loss_52": 1.4179346442222596, "ce_loss_7": 4.9602553129196165, "epoch": 0.171, "grad_norm": 21.925882863353518, "kl_loss_13": 6508.0, "kl_loss_26": 5225.6, "kl_loss_39": 3821.2, "kl_loss_7": 7300.0, "learning_rate": 0.0009361510595655545, "loss": 11427.8, "step": 1710 }, { "ce_loss_13": 4.597618329524994, "ce_loss_26": 4.014382421970367, "ce_loss_39": 3.3793311297893522, "ce_loss_52": 1.4502436846494675, "ce_loss_7": 4.970238649845124, "epoch": 0.172, "grad_norm": 21.861723684559113, "kl_loss_13": 6463.2, "kl_loss_26": 5241.6, "kl_loss_39": 3880.8, "kl_loss_7": 7242.4, "learning_rate": 0.0009353730385598887, "loss": 11300.4, "step": 1720 }, { "ce_loss_13": 4.474293851852417, "ce_loss_26": 3.8755543529987335, "ce_loss_39": 3.212596780061722, "ce_loss_52": 1.4004584550857544, "ce_loss_7": 4.856262743473053, "epoch": 0.173, "grad_norm": 23.168666460490822, "kl_loss_13": 6318.4, "kl_loss_26": 5065.6, "kl_loss_39": 3658.4, "kl_loss_7": 7116.8, "learning_rate": 0.0009345906333525581, "loss": 11205.0, "step": 1730 }, { "ce_loss_13": 4.5212029337883, "ce_loss_26": 3.9314939856529234, "ce_loss_39": 3.301447206735611, "ce_loss_52": 1.422508242726326, "ce_loss_7": 4.894874656200409, "epoch": 0.174, "grad_norm": 25.870791070867757, "kl_loss_13": 6358.4, "kl_loss_26": 5133.6, "kl_loss_39": 3775.2, "kl_loss_7": 7142.4, "learning_rate": 0.0009338038518223745, "loss": 11159.2, "step": 1740 }, { "ce_loss_13": 4.551153075695038, "ce_loss_26": 3.96026993393898, "ce_loss_39": 3.326349085569382, "ce_loss_52": 1.4542756617069243, "ce_loss_7": 4.919975602626801, "epoch": 0.175, "grad_norm": 23.828468964880035, "kl_loss_13": 6352.8, "kl_loss_26": 5109.6, "kl_loss_39": 3757.6, "kl_loss_7": 7129.6, "learning_rate": 0.0009330127018922195, "loss": 11089.0, "step": 1750 }, { "ce_loss_13": 4.469128930568695, "ce_loss_26": 3.8787964940071107, "ce_loss_39": 3.2338991940021513, "ce_loss_52": 1.4316335827112199, "ce_loss_7": 4.848294925689697, "epoch": 0.176, "grad_norm": 24.772424094235244, "kl_loss_13": 6252.8, "kl_loss_26": 5015.2, "kl_loss_39": 3643.2, "kl_loss_7": 7041.6, "learning_rate": 0.0009322171915289634, "loss": 11050.6, "step": 1760 }, { "ce_loss_13": 4.515468680858612, "ce_loss_26": 3.9264565110206604, "ce_loss_39": 3.2920862257480623, "ce_loss_52": 1.46503643989563, "ce_loss_7": 4.88274484872818, "epoch": 0.177, "grad_norm": 24.580027558725412, "kl_loss_13": 6243.2, "kl_loss_26": 5015.2, "kl_loss_39": 3668.0, "kl_loss_7": 7024.0, "learning_rate": 0.0009314173287433873, "loss": 11083.0, "step": 1770 }, { "ce_loss_13": 4.563484919071198, "ce_loss_26": 3.988471633195877, "ce_loss_39": 3.3576016187667848, "ce_loss_52": 1.4738382428884507, "ce_loss_7": 4.929107296466827, "epoch": 0.178, "grad_norm": 23.727065102019264, "kl_loss_13": 6340.0, "kl_loss_26": 5134.4, "kl_loss_39": 3774.0, "kl_loss_7": 7108.0, "learning_rate": 0.0009306131215901003, "loss": 11053.2, "step": 1780 }, { "ce_loss_13": 4.485390210151673, "ce_loss_26": 3.9024369359016418, "ce_loss_39": 3.277720022201538, "ce_loss_52": 1.4684919208288192, "ce_loss_7": 4.849484694004059, "epoch": 0.179, "grad_norm": 24.140381804707665, "kl_loss_13": 6222.4, "kl_loss_26": 4996.0, "kl_loss_39": 3639.6, "kl_loss_7": 6991.2, "learning_rate": 0.0009298045781674596, "loss": 10948.8, "step": 1790 }, { "ce_loss_13": 4.485648030042649, "ce_loss_26": 3.8959447860717775, "ce_loss_39": 3.255040627717972, "ce_loss_52": 1.41890487074852, "ce_loss_7": 4.864380013942719, "epoch": 0.18, "grad_norm": 25.753548379396687, "kl_loss_13": 6269.6, "kl_loss_26": 5029.6, "kl_loss_39": 3653.2, "kl_loss_7": 7068.8, "learning_rate": 0.0009289917066174886, "loss": 10940.4, "step": 1800 }, { "ce_loss_13": 4.4491588294506075, "ce_loss_26": 3.862889313697815, "ce_loss_39": 3.203300213813782, "ce_loss_52": 1.4129745751619338, "ce_loss_7": 4.8373774766921995, "epoch": 0.181, "grad_norm": 23.580007870242206, "kl_loss_13": 6251.2, "kl_loss_26": 5015.2, "kl_loss_39": 3609.2, "kl_loss_7": 7063.2, "learning_rate": 0.0009281745151257945, "loss": 10831.6, "step": 1810 }, { "ce_loss_13": 4.4796471238136295, "ce_loss_26": 3.9034676015377046, "ce_loss_39": 3.27801650762558, "ce_loss_52": 1.470898449420929, "ce_loss_7": 4.846702206134796, "epoch": 0.182, "grad_norm": 21.825066910706077, "kl_loss_13": 6129.6, "kl_loss_26": 4921.6, "kl_loss_39": 3590.4, "kl_loss_7": 6907.2, "learning_rate": 0.0009273530119214868, "loss": 10852.6, "step": 1820 }, { "ce_loss_13": 4.397759801149368, "ce_loss_26": 3.809650295972824, "ce_loss_39": 3.163052296638489, "ce_loss_52": 1.4123397037386893, "ce_loss_7": 4.76624493598938, "epoch": 0.183, "grad_norm": 23.028395579089935, "kl_loss_13": 6109.6, "kl_loss_26": 4884.8, "kl_loss_39": 3520.0, "kl_loss_7": 6886.4, "learning_rate": 0.0009265272052770935, "loss": 10776.6, "step": 1830 }, { "ce_loss_13": 4.409473043680191, "ce_loss_26": 3.825248968601227, "ce_loss_39": 3.174910306930542, "ce_loss_52": 1.4039017781615257, "ce_loss_7": 4.799298018217087, "epoch": 0.184, "grad_norm": 22.60594476207274, "kl_loss_13": 6165.6, "kl_loss_26": 4934.4, "kl_loss_39": 3543.2, "kl_loss_7": 6977.6, "learning_rate": 0.0009256971035084784, "loss": 10733.4, "step": 1840 }, { "ce_loss_13": 4.3755183041095735, "ce_loss_26": 3.797974693775177, "ce_loss_39": 3.1725789427757265, "ce_loss_52": 1.4232216864824294, "ce_loss_7": 4.739054465293885, "epoch": 0.185, "grad_norm": 23.627865972104136, "kl_loss_13": 6060.0, "kl_loss_26": 4843.2, "kl_loss_39": 3517.2, "kl_loss_7": 6827.2, "learning_rate": 0.0009248627149747573, "loss": 10698.4, "step": 1850 }, { "ce_loss_13": 4.422569459676742, "ce_loss_26": 3.822605752944946, "ce_loss_39": 3.1763491451740267, "ce_loss_52": 1.427757203578949, "ce_loss_7": 4.793670791387558, "epoch": 0.186, "grad_norm": 22.345780165109367, "kl_loss_13": 6140.0, "kl_loss_26": 4902.0, "kl_loss_39": 3525.6, "kl_loss_7": 6920.0, "learning_rate": 0.0009240240480782129, "loss": 10688.6, "step": 1860 }, { "ce_loss_13": 4.390002739429474, "ce_loss_26": 3.8117696583271026, "ce_loss_39": 3.193506735563278, "ce_loss_52": 1.4390262439846992, "ce_loss_7": 4.754710161685944, "epoch": 0.187, "grad_norm": 24.270272909834983, "kl_loss_13": 6056.0, "kl_loss_26": 4841.6, "kl_loss_39": 3523.2, "kl_loss_7": 6828.0, "learning_rate": 0.0009231811112642122, "loss": 10605.8, "step": 1870 }, { "ce_loss_13": 4.347514522075653, "ce_loss_26": 3.774775582551956, "ce_loss_39": 3.1524779438972472, "ce_loss_52": 1.4184574037790298, "ce_loss_7": 4.711814332008362, "epoch": 0.188, "grad_norm": 23.060486415907942, "kl_loss_13": 6006.4, "kl_loss_26": 4801.6, "kl_loss_39": 3474.8, "kl_loss_7": 6776.0, "learning_rate": 0.0009223339130211192, "loss": 10599.8, "step": 1880 }, { "ce_loss_13": 4.280169582366943, "ce_loss_26": 3.6960571646690368, "ce_loss_39": 3.0768611639738084, "ce_loss_52": 1.4011510267853737, "ce_loss_7": 4.650495028495788, "epoch": 0.189, "grad_norm": 23.308893883500843, "kl_loss_13": 5916.0, "kl_loss_26": 4693.6, "kl_loss_39": 3365.6, "kl_loss_7": 6688.8, "learning_rate": 0.0009214824618802108, "loss": 10510.0, "step": 1890 }, { "ce_loss_13": 4.426742446422577, "ce_loss_26": 3.835584044456482, "ce_loss_39": 3.1762202858924864, "ce_loss_52": 1.435165250301361, "ce_loss_7": 4.8001045942306515, "epoch": 0.19, "grad_norm": 24.259267724718942, "kl_loss_13": 6154.4, "kl_loss_26": 4914.4, "kl_loss_39": 3499.2, "kl_loss_7": 6933.6, "learning_rate": 0.0009206267664155906, "loss": 10574.0, "step": 1900 }, { "ce_loss_13": 4.317660903930664, "ce_loss_26": 3.736785036325455, "ce_loss_39": 3.102087676525116, "ce_loss_52": 1.4297346964478492, "ce_loss_7": 4.69397531747818, "epoch": 0.191, "grad_norm": 23.3562329011761, "kl_loss_13": 5937.6, "kl_loss_26": 4723.2, "kl_loss_39": 3373.2, "kl_loss_7": 6731.2, "learning_rate": 0.0009197668352441024, "loss": 10503.4, "step": 1910 }, { "ce_loss_13": 4.334453409910202, "ce_loss_26": 3.7587957322597503, "ce_loss_39": 3.123855656385422, "ce_loss_52": 1.4094826728105545, "ce_loss_7": 4.709676373004913, "epoch": 0.192, "grad_norm": 24.22470876078119, "kl_loss_13": 5996.8, "kl_loss_26": 4786.4, "kl_loss_39": 3430.0, "kl_loss_7": 6772.8, "learning_rate": 0.0009189026770252437, "loss": 10471.0, "step": 1920 }, { "ce_loss_13": 4.351706159114838, "ce_loss_26": 3.7773966193199158, "ce_loss_39": 3.1497732281684874, "ce_loss_52": 1.4320787012577056, "ce_loss_7": 4.7191231608390805, "epoch": 0.193, "grad_norm": 23.447904782586527, "kl_loss_13": 5997.6, "kl_loss_26": 4794.4, "kl_loss_39": 3448.4, "kl_loss_7": 6763.2, "learning_rate": 0.000918034300461078, "loss": 10433.4, "step": 1930 }, { "ce_loss_13": 4.307104933261871, "ce_loss_26": 3.7206166088581085, "ce_loss_39": 3.091316682100296, "ce_loss_52": 1.4110687702894211, "ce_loss_7": 4.676908355951309, "epoch": 0.194, "grad_norm": 23.93372642527522, "kl_loss_13": 5951.2, "kl_loss_26": 4727.6, "kl_loss_39": 3376.0, "kl_loss_7": 6721.6, "learning_rate": 0.0009171617142961477, "loss": 10442.2, "step": 1940 }, { "ce_loss_13": 4.3363093614578245, "ce_loss_26": 3.750982850790024, "ce_loss_39": 3.111935979127884, "ce_loss_52": 1.431942057609558, "ce_loss_7": 4.707539451122284, "epoch": 0.195, "grad_norm": 23.910939036749266, "kl_loss_13": 5967.2, "kl_loss_26": 4743.2, "kl_loss_39": 3382.8, "kl_loss_7": 6752.0, "learning_rate": 0.0009162849273173857, "loss": 10366.8, "step": 1950 }, { "ce_loss_13": 4.271794074773789, "ce_loss_26": 3.7012794077396394, "ce_loss_39": 3.0882445216178893, "ce_loss_52": 1.4407746940851212, "ce_loss_7": 4.636665797233581, "epoch": 0.196, "grad_norm": 23.30649566244444, "kl_loss_13": 5844.8, "kl_loss_26": 4652.8, "kl_loss_39": 3337.2, "kl_loss_7": 6608.8, "learning_rate": 0.0009154039483540273, "loss": 10313.0, "step": 1960 }, { "ce_loss_13": 4.3892871856689455, "ce_loss_26": 3.8117631673812866, "ce_loss_39": 3.1672019243240355, "ce_loss_52": 1.4633917301893233, "ce_loss_7": 4.75022611618042, "epoch": 0.197, "grad_norm": 22.823988656575857, "kl_loss_13": 5992.0, "kl_loss_26": 4784.0, "kl_loss_39": 3421.6, "kl_loss_7": 6749.6, "learning_rate": 0.0009145187862775209, "loss": 10294.2, "step": 1970 }, { "ce_loss_13": 4.251708203554154, "ce_loss_26": 3.68521209359169, "ce_loss_39": 3.0638678431510926, "ce_loss_52": 1.4189983233809471, "ce_loss_7": 4.615437304973602, "epoch": 0.198, "grad_norm": 22.115400900183356, "kl_loss_13": 5814.4, "kl_loss_26": 4623.2, "kl_loss_39": 3291.2, "kl_loss_7": 6581.6, "learning_rate": 0.0009136294500014386, "loss": 10194.8, "step": 1980 }, { "ce_loss_13": 4.36306391954422, "ce_loss_26": 3.7900101482868194, "ce_loss_39": 3.1458350718021393, "ce_loss_52": 1.431062677502632, "ce_loss_7": 4.733654403686524, "epoch": 0.199, "grad_norm": 21.64648055888152, "kl_loss_13": 6001.6, "kl_loss_26": 4801.6, "kl_loss_39": 3444.4, "kl_loss_7": 6778.4, "learning_rate": 0.000912735948481387, "loss": 10217.4, "step": 1990 }, { "ce_loss_13": 4.2783638596534725, "ce_loss_26": 3.7015498995780947, "ce_loss_39": 3.079295587539673, "ce_loss_52": 1.4367393761873246, "ce_loss_7": 4.643443429470063, "epoch": 0.2, "grad_norm": 22.667053535414237, "kl_loss_13": 5844.8, "kl_loss_26": 4641.2, "kl_loss_39": 3314.8, "kl_loss_7": 6607.2, "learning_rate": 0.0009118382907149164, "loss": 10108.9, "step": 2000 }, { "ce_loss_13": 4.298079961538315, "ce_loss_26": 3.7112639427185057, "ce_loss_39": 3.0900191485881807, "ce_loss_52": 1.4447624236345291, "ce_loss_7": 4.659080803394318, "epoch": 0.201, "grad_norm": 21.421967222285037, "kl_loss_13": 5860.8, "kl_loss_26": 4647.6, "kl_loss_39": 3308.4, "kl_loss_7": 6620.8, "learning_rate": 0.0009109364857414306, "loss": 10210.1, "step": 2010 }, { "ce_loss_13": 4.298530715703964, "ce_loss_26": 3.7281334936618804, "ce_loss_39": 3.103990191221237, "ce_loss_52": 1.445554968714714, "ce_loss_7": 4.667031800746917, "epoch": 0.202, "grad_norm": 22.186513808055555, "kl_loss_13": 5863.2, "kl_loss_26": 4661.6, "kl_loss_39": 3329.2, "kl_loss_7": 6627.2, "learning_rate": 0.0009100305426420956, "loss": 10090.6, "step": 2020 }, { "ce_loss_13": 4.2117482125759125, "ce_loss_26": 3.6511631190776823, "ce_loss_39": 3.0435081899166105, "ce_loss_52": 1.4112246841192246, "ce_loss_7": 4.573570990562439, "epoch": 0.203, "grad_norm": 23.22208055275699, "kl_loss_13": 5758.4, "kl_loss_26": 4581.6, "kl_loss_39": 3281.6, "kl_loss_7": 6522.4, "learning_rate": 0.0009091204705397484, "loss": 10094.4, "step": 2030 }, { "ce_loss_13": 4.2797119140625, "ce_loss_26": 3.7096898019313813, "ce_loss_39": 3.0815310001373293, "ce_loss_52": 1.4514381274580956, "ce_loss_7": 4.634703290462494, "epoch": 0.204, "grad_norm": 22.77691157290275, "kl_loss_13": 5768.8, "kl_loss_26": 4568.8, "kl_loss_39": 3229.6, "kl_loss_7": 6520.8, "learning_rate": 0.0009082062785988049, "loss": 10052.8, "step": 2040 }, { "ce_loss_13": 4.228492313623429, "ce_loss_26": 3.649516838788986, "ce_loss_39": 3.017675918340683, "ce_loss_52": 1.4015851855278014, "ce_loss_7": 4.5945727050304415, "epoch": 0.205, "grad_norm": 24.043992652900016, "kl_loss_13": 5788.0, "kl_loss_26": 4580.8, "kl_loss_39": 3234.0, "kl_loss_7": 6555.2, "learning_rate": 0.0009072879760251679, "loss": 10047.6, "step": 2050 }, { "ce_loss_13": 4.153064209222793, "ce_loss_26": 3.5778062403202058, "ce_loss_39": 2.964171904325485, "ce_loss_52": 1.4066181004047393, "ce_loss_7": 4.5237502455711365, "epoch": 0.206, "grad_norm": 23.14612831170837, "kl_loss_13": 5666.4, "kl_loss_26": 4462.4, "kl_loss_39": 3156.0, "kl_loss_7": 6428.0, "learning_rate": 0.0009063655720661341, "loss": 10022.0, "step": 2060 }, { "ce_loss_13": 4.162076050043106, "ce_loss_26": 3.5850139617919923, "ce_loss_39": 2.9763170003890993, "ce_loss_52": 1.4114871382713319, "ce_loss_7": 4.525000536441803, "epoch": 0.207, "grad_norm": 23.335931334507656, "kl_loss_13": 5684.8, "kl_loss_26": 4486.4, "kl_loss_39": 3172.4, "kl_loss_7": 6440.8, "learning_rate": 0.000905439076010301, "loss": 9910.8, "step": 2070 }, { "ce_loss_13": 4.203662091493607, "ce_loss_26": 3.6453104853630065, "ce_loss_39": 3.031943756341934, "ce_loss_52": 1.4498305425047875, "ce_loss_7": 4.567969477176666, "epoch": 0.208, "grad_norm": 22.17297694250979, "kl_loss_13": 5653.6, "kl_loss_26": 4474.4, "kl_loss_39": 3178.4, "kl_loss_7": 6404.0, "learning_rate": 0.0009045084971874737, "loss": 9890.1, "step": 2080 }, { "ce_loss_13": 4.1253215074539185, "ce_loss_26": 3.5510447442531587, "ce_loss_39": 2.9255994498729705, "ce_loss_52": 1.383307683467865, "ce_loss_7": 4.503264659643174, "epoch": 0.209, "grad_norm": 21.83866221628796, "kl_loss_13": 5675.2, "kl_loss_26": 4462.8, "kl_loss_39": 3118.0, "kl_loss_7": 6452.0, "learning_rate": 0.0009035738449685707, "loss": 9916.2, "step": 2090 }, { "ce_loss_13": 4.232741326093674, "ce_loss_26": 3.6592664182186128, "ce_loss_39": 3.0402495503425597, "ce_loss_52": 1.4619457066059112, "ce_loss_7": 4.595292699337006, "epoch": 0.21, "grad_norm": 23.1172808852491, "kl_loss_13": 5709.6, "kl_loss_26": 4522.8, "kl_loss_39": 3192.0, "kl_loss_7": 6460.8, "learning_rate": 0.0009026351287655293, "loss": 9882.4, "step": 2100 }, { "ce_loss_13": 4.196552646160126, "ce_loss_26": 3.6394916236400605, "ce_loss_39": 3.029403269290924, "ce_loss_52": 1.431785149872303, "ce_loss_7": 4.555036389827729, "epoch": 0.211, "grad_norm": 22.472192746858727, "kl_loss_13": 5678.4, "kl_loss_26": 4513.2, "kl_loss_39": 3201.6, "kl_loss_7": 6423.2, "learning_rate": 0.0009016923580312113, "loss": 9778.0, "step": 2110 }, { "ce_loss_13": 4.267941182851791, "ce_loss_26": 3.6975386798381806, "ce_loss_39": 3.0839960873126984, "ce_loss_52": 1.4794423222541808, "ce_loss_7": 4.60888249874115, "epoch": 0.212, "grad_norm": 23.91558691594894, "kl_loss_13": 5697.6, "kl_loss_26": 4502.8, "kl_loss_39": 3191.2, "kl_loss_7": 6425.6, "learning_rate": 0.0009007455422593077, "loss": 9764.0, "step": 2120 }, { "ce_loss_13": 4.170402336120605, "ce_loss_26": 3.5973034620285036, "ce_loss_39": 2.9883872270584106, "ce_loss_52": 1.4295871376991272, "ce_loss_7": 4.519614219665527, "epoch": 0.213, "grad_norm": 22.812352077428177, "kl_loss_13": 5644.0, "kl_loss_26": 4456.4, "kl_loss_39": 3137.2, "kl_loss_7": 6387.2, "learning_rate": 0.0008997946909842425, "loss": 9755.6, "step": 2130 }, { "ce_loss_13": 4.10284715294838, "ce_loss_26": 3.5395869314670563, "ce_loss_39": 2.9268704533576964, "ce_loss_52": 1.4086133271455765, "ce_loss_7": 4.462130695581436, "epoch": 0.214, "grad_norm": 22.23702862817867, "kl_loss_13": 5524.8, "kl_loss_26": 4340.4, "kl_loss_39": 3036.0, "kl_loss_7": 6288.8, "learning_rate": 0.0008988398137810777, "loss": 9645.0, "step": 2140 }, { "ce_loss_13": 4.079320967197418, "ce_loss_26": 3.496495473384857, "ce_loss_39": 2.8816928565502167, "ce_loss_52": 1.3819068521261215, "ce_loss_7": 4.434896755218506, "epoch": 0.215, "grad_norm": 22.79915028059786, "kl_loss_13": 5541.6, "kl_loss_26": 4350.0, "kl_loss_39": 3045.2, "kl_loss_7": 6288.8, "learning_rate": 0.0008978809202654162, "loss": 9686.6, "step": 2150 }, { "ce_loss_13": 4.069333535432816, "ce_loss_26": 3.5059276044368746, "ce_loss_39": 2.8949272632598877, "ce_loss_52": 1.409129326045513, "ce_loss_7": 4.434026664495468, "epoch": 0.216, "grad_norm": 22.908702660837623, "kl_loss_13": 5464.0, "kl_loss_26": 4286.8, "kl_loss_39": 2986.8, "kl_loss_7": 6228.0, "learning_rate": 0.0008969180200933046, "loss": 9665.2, "step": 2160 }, { "ce_loss_13": 4.164896643161773, "ce_loss_26": 3.594840294122696, "ce_loss_39": 2.9824715733528135, "ce_loss_52": 1.4398185968399049, "ce_loss_7": 4.5187140583992, "epoch": 0.217, "grad_norm": 22.2992725858673, "kl_loss_13": 5602.4, "kl_loss_26": 4409.6, "kl_loss_39": 3101.6, "kl_loss_7": 6338.4, "learning_rate": 0.0008959511229611376, "loss": 9611.1, "step": 2170 }, { "ce_loss_13": 4.1424953758716585, "ce_loss_26": 3.5846896708011626, "ce_loss_39": 2.9776509165763856, "ce_loss_52": 1.4638631641864777, "ce_loss_7": 4.499098914861679, "epoch": 0.218, "grad_norm": 22.569206560566755, "kl_loss_13": 5520.8, "kl_loss_26": 4360.4, "kl_loss_39": 3039.2, "kl_loss_7": 6262.4, "learning_rate": 0.0008949802386055581, "loss": 9598.7, "step": 2180 }, { "ce_loss_13": 4.124321860074997, "ce_loss_26": 3.559172648191452, "ce_loss_39": 2.942674660682678, "ce_loss_52": 1.4182981908321382, "ce_loss_7": 4.483241724967956, "epoch": 0.219, "grad_norm": 22.517460780417444, "kl_loss_13": 5559.2, "kl_loss_26": 4374.8, "kl_loss_39": 3066.4, "kl_loss_7": 6304.0, "learning_rate": 0.0008940053768033609, "loss": 9610.7, "step": 2190 }, { "ce_loss_13": 4.136555308103562, "ce_loss_26": 3.5674175798892973, "ce_loss_39": 2.9441113233566285, "ce_loss_52": 1.4381250411272049, "ce_loss_7": 4.481418180465698, "epoch": 0.22, "grad_norm": 23.1169100672147, "kl_loss_13": 5545.6, "kl_loss_26": 4354.4, "kl_loss_39": 3035.2, "kl_loss_7": 6270.4, "learning_rate": 0.0008930265473713938, "loss": 9621.3, "step": 2200 }, { "ce_loss_13": 4.116154849529266, "ce_loss_26": 3.5370861172676085, "ce_loss_39": 2.9127448469400408, "ce_loss_52": 1.3838467657566071, "ce_loss_7": 4.479850220680237, "epoch": 0.221, "grad_norm": 23.327101471626264, "kl_loss_13": 5613.6, "kl_loss_26": 4407.2, "kl_loss_39": 3084.0, "kl_loss_7": 6376.8, "learning_rate": 0.0008920437601664579, "loss": 9580.3, "step": 2210 }, { "ce_loss_13": 4.081603097915649, "ce_loss_26": 3.533859223127365, "ce_loss_39": 2.9178696632385255, "ce_loss_52": 1.4558427572250365, "ce_loss_7": 4.428021937608719, "epoch": 0.222, "grad_norm": 24.571271492626643, "kl_loss_13": 5416.8, "kl_loss_26": 4265.2, "kl_loss_39": 2949.2, "kl_loss_7": 6150.4, "learning_rate": 0.0008910570250852097, "loss": 9535.0, "step": 2220 }, { "ce_loss_13": 4.016762095689773, "ce_loss_26": 3.453756958246231, "ce_loss_39": 2.8424510210752487, "ce_loss_52": 1.382763533294201, "ce_loss_7": 4.369659447669983, "epoch": 0.223, "grad_norm": 22.39109803193224, "kl_loss_13": 5417.6, "kl_loss_26": 4255.6, "kl_loss_39": 2951.6, "kl_loss_7": 6149.6, "learning_rate": 0.0008900663520640604, "loss": 9449.7, "step": 2230 }, { "ce_loss_13": 4.07697583436966, "ce_loss_26": 3.5205394327640533, "ce_loss_39": 2.9201291859149934, "ce_loss_52": 1.4419079095125198, "ce_loss_7": 4.432818019390107, "epoch": 0.224, "grad_norm": 29.33071925320992, "kl_loss_13": 5431.2, "kl_loss_26": 4260.4, "kl_loss_39": 2980.0, "kl_loss_7": 6176.8, "learning_rate": 0.0008890717510790764, "loss": 9471.4, "step": 2240 }, { "ce_loss_13": 4.099857300519943, "ce_loss_26": 3.5491108179092405, "ce_loss_39": 2.946030503511429, "ce_loss_52": 1.4482155337929725, "ce_loss_7": 4.446840679645538, "epoch": 0.225, "grad_norm": 24.393145108562546, "kl_loss_13": 5448.0, "kl_loss_26": 4290.4, "kl_loss_39": 3004.4, "kl_loss_7": 6176.0, "learning_rate": 0.0008880732321458784, "loss": 9429.4, "step": 2250 }, { "ce_loss_13": 4.008820396661759, "ce_loss_26": 3.466299217939377, "ce_loss_39": 2.8751066744327547, "ce_loss_52": 1.4349601715803146, "ce_loss_7": 4.357817393541336, "epoch": 0.226, "grad_norm": 23.790321486003762, "kl_loss_13": 5306.4, "kl_loss_26": 4167.6, "kl_loss_39": 2894.0, "kl_loss_7": 6034.4, "learning_rate": 0.0008870708053195413, "loss": 9349.3, "step": 2260 }, { "ce_loss_13": 4.0613229155540465, "ce_loss_26": 3.495078670978546, "ce_loss_39": 2.889867639541626, "ce_loss_52": 1.417461496591568, "ce_loss_7": 4.40853306055069, "epoch": 0.227, "grad_norm": 24.394028059861938, "kl_loss_13": 5412.8, "kl_loss_26": 4243.2, "kl_loss_39": 2960.8, "kl_loss_7": 6130.4, "learning_rate": 0.0008860644806944918, "loss": 9352.6, "step": 2270 }, { "ce_loss_13": 4.178533679246902, "ce_loss_26": 3.622497373819351, "ce_loss_39": 3.006651484966278, "ce_loss_52": 1.4494876891374588, "ce_loss_7": 4.527337849140167, "epoch": 0.228, "grad_norm": 22.806373163177923, "kl_loss_13": 5580.0, "kl_loss_26": 4405.2, "kl_loss_39": 3104.0, "kl_loss_7": 6317.6, "learning_rate": 0.0008850542684044079, "loss": 9441.9, "step": 2280 }, { "ce_loss_13": 4.018020331859589, "ce_loss_26": 3.458746635913849, "ce_loss_39": 2.8657322227954865, "ce_loss_52": 1.4288572728633882, "ce_loss_7": 4.375551146268845, "epoch": 0.229, "grad_norm": 22.45015355344987, "kl_loss_13": 5303.2, "kl_loss_26": 4136.0, "kl_loss_39": 2856.8, "kl_loss_7": 6051.2, "learning_rate": 0.0008840401786221159, "loss": 9343.7, "step": 2290 }, { "ce_loss_13": 4.0382424116134645, "ce_loss_26": 3.4842948436737062, "ce_loss_39": 2.894715803861618, "ce_loss_52": 1.4393651276826858, "ce_loss_7": 4.377675461769104, "epoch": 0.23, "grad_norm": 23.31839583792026, "kl_loss_13": 5351.2, "kl_loss_26": 4188.4, "kl_loss_39": 2912.0, "kl_loss_7": 6064.8, "learning_rate": 0.000883022221559489, "loss": 9246.3, "step": 2300 }, { "ce_loss_13": 4.038966596126556, "ce_loss_26": 3.488220602273941, "ce_loss_39": 2.8812575459480287, "ce_loss_52": 1.441010195016861, "ce_loss_7": 4.381568449735641, "epoch": 0.231, "grad_norm": 22.254622557882463, "kl_loss_13": 5335.2, "kl_loss_26": 4171.2, "kl_loss_39": 2887.2, "kl_loss_7": 6052.8, "learning_rate": 0.0008820004074673434, "loss": 9220.3, "step": 2310 }, { "ce_loss_13": 3.9854084312915803, "ce_loss_26": 3.4409989297389982, "ce_loss_39": 2.84497589468956, "ce_loss_52": 1.4122451767325401, "ce_loss_7": 4.340046459436417, "epoch": 0.232, "grad_norm": 21.074813671439337, "kl_loss_13": 5296.8, "kl_loss_26": 4147.6, "kl_loss_39": 2864.8, "kl_loss_7": 6036.8, "learning_rate": 0.0008809747466353355, "loss": 9279.8, "step": 2320 }, { "ce_loss_13": 4.115998637676239, "ce_loss_26": 3.5584963142871855, "ce_loss_39": 2.9395908057689666, "ce_loss_52": 1.465473085641861, "ce_loss_7": 4.470573830604553, "epoch": 0.233, "grad_norm": 22.26955688088229, "kl_loss_13": 5459.2, "kl_loss_26": 4288.0, "kl_loss_39": 2960.4, "kl_loss_7": 6197.6, "learning_rate": 0.0008799452493918585, "loss": 9213.2, "step": 2330 }, { "ce_loss_13": 3.9350290656089784, "ce_loss_26": 3.3855203211307527, "ce_loss_39": 2.799959135055542, "ce_loss_52": 1.4289379581809043, "ce_loss_7": 4.274723726511001, "epoch": 0.234, "grad_norm": 22.04159638849698, "kl_loss_13": 5186.8, "kl_loss_26": 4026.0, "kl_loss_39": 2752.2, "kl_loss_7": 5904.4, "learning_rate": 0.0008789119261039385, "loss": 9222.9, "step": 2340 }, { "ce_loss_13": 3.977653867006302, "ce_loss_26": 3.4353197515010834, "ce_loss_39": 2.8274969339370726, "ce_loss_52": 1.400892499089241, "ce_loss_7": 4.322875905036926, "epoch": 0.235, "grad_norm": 25.32335755706349, "kl_loss_13": 5282.4, "kl_loss_26": 4139.6, "kl_loss_39": 2849.2, "kl_loss_7": 6001.6, "learning_rate": 0.0008778747871771292, "loss": 9101.4, "step": 2350 }, { "ce_loss_13": 3.9699031889438627, "ce_loss_26": 3.4155133664608, "ce_loss_39": 2.8127492308616637, "ce_loss_52": 1.4119284138083459, "ce_loss_7": 4.322866821289063, "epoch": 0.236, "grad_norm": 24.250283920991954, "kl_loss_13": 5259.2, "kl_loss_26": 4092.8, "kl_loss_39": 2816.4, "kl_loss_7": 6000.8, "learning_rate": 0.0008768338430554083, "loss": 9104.0, "step": 2360 }, { "ce_loss_13": 3.928439366817474, "ce_loss_26": 3.382099211215973, "ce_loss_39": 2.789314305782318, "ce_loss_52": 1.3916988223791122, "ce_loss_7": 4.277575564384461, "epoch": 0.237, "grad_norm": 23.978839586298704, "kl_loss_13": 5212.0, "kl_loss_26": 4068.4, "kl_loss_39": 2797.2, "kl_loss_7": 5938.4, "learning_rate": 0.0008757891042210713, "loss": 9141.7, "step": 2370 }, { "ce_loss_13": 3.9462322175502775, "ce_loss_26": 3.397873044013977, "ce_loss_39": 2.8039229214191437, "ce_loss_52": 1.4037385553121566, "ce_loss_7": 4.290230017900467, "epoch": 0.238, "grad_norm": 23.01247346605362, "kl_loss_13": 5215.2, "kl_loss_26": 4079.6, "kl_loss_39": 2813.2, "kl_loss_7": 5942.4, "learning_rate": 0.0008747405811946271, "loss": 9055.8, "step": 2380 }, { "ce_loss_13": 3.98149796128273, "ce_loss_26": 3.442757821083069, "ce_loss_39": 2.84624342918396, "ce_loss_52": 1.445407471060753, "ce_loss_7": 4.318572920560837, "epoch": 0.239, "grad_norm": 22.74122664649998, "kl_loss_13": 5226.4, "kl_loss_26": 4089.6, "kl_loss_39": 2827.6, "kl_loss_7": 5934.4, "learning_rate": 0.0008736882845346905, "loss": 9110.6, "step": 2390 }, { "ce_loss_13": 3.9661067545413973, "ce_loss_26": 3.4294336676597594, "ce_loss_39": 2.836567336320877, "ce_loss_52": 1.442602628469467, "ce_loss_7": 4.3087667465209964, "epoch": 0.24, "grad_norm": 23.333126009298994, "kl_loss_13": 5196.0, "kl_loss_26": 4051.6, "kl_loss_39": 2790.0, "kl_loss_7": 5911.2, "learning_rate": 0.0008726322248378774, "loss": 9064.8, "step": 2400 }, { "ce_loss_13": 3.988937532901764, "ce_loss_26": 3.4361318945884705, "ce_loss_39": 2.833483111858368, "ce_loss_52": 1.4274606987833978, "ce_loss_7": 4.3361672222614285, "epoch": 0.241, "grad_norm": 21.55988300492865, "kl_loss_13": 5244.8, "kl_loss_26": 4091.2, "kl_loss_39": 2809.2, "kl_loss_7": 5974.4, "learning_rate": 0.0008715724127386971, "loss": 9048.5, "step": 2410 }, { "ce_loss_13": 3.93166036605835, "ce_loss_26": 3.3924847066402437, "ce_loss_39": 2.8114346325397492, "ce_loss_52": 1.433423739671707, "ce_loss_7": 4.277747517824173, "epoch": 0.242, "grad_norm": 21.76629675806892, "kl_loss_13": 5152.8, "kl_loss_26": 4024.4, "kl_loss_39": 2775.6, "kl_loss_7": 5876.0, "learning_rate": 0.0008705088589094458, "loss": 8950.6, "step": 2420 }, { "ce_loss_13": 4.0298320889472965, "ce_loss_26": 3.4784019589424133, "ce_loss_39": 2.8909366130828857, "ce_loss_52": 1.4593130856752397, "ce_loss_7": 4.372854852676392, "epoch": 0.243, "grad_norm": 22.782714549711034, "kl_loss_13": 5275.2, "kl_loss_26": 4132.0, "kl_loss_39": 2868.8, "kl_loss_7": 6000.0, "learning_rate": 0.0008694415740600988, "loss": 8979.7, "step": 2430 }, { "ce_loss_13": 3.957322496175766, "ce_loss_26": 3.391196775436401, "ce_loss_39": 2.7942449331283568, "ce_loss_52": 1.429965654015541, "ce_loss_7": 4.3002465009689335, "epoch": 0.244, "grad_norm": 22.108343623695664, "kl_loss_13": 5175.2, "kl_loss_26": 4000.0, "kl_loss_39": 2735.2, "kl_loss_7": 5894.4, "learning_rate": 0.0008683705689382025, "loss": 8983.5, "step": 2440 }, { "ce_loss_13": 3.914830905199051, "ce_loss_26": 3.371804046630859, "ce_loss_39": 2.789295125007629, "ce_loss_52": 1.4458730816841125, "ce_loss_7": 4.244754731655121, "epoch": 0.245, "grad_norm": 22.68476073719735, "kl_loss_13": 5094.4, "kl_loss_26": 3954.8, "kl_loss_39": 2705.0, "kl_loss_7": 5792.8, "learning_rate": 0.0008672958543287666, "loss": 8971.0, "step": 2450 }, { "ce_loss_13": 3.910190373659134, "ce_loss_26": 3.3697587728500364, "ce_loss_39": 2.7743864953517914, "ce_loss_52": 1.4169176414608955, "ce_loss_7": 4.245863050222397, "epoch": 0.246, "grad_norm": 23.78530061144511, "kl_loss_13": 5117.6, "kl_loss_26": 3979.2, "kl_loss_39": 2714.8, "kl_loss_7": 5818.4, "learning_rate": 0.0008662174410541554, "loss": 8871.3, "step": 2460 }, { "ce_loss_13": 3.905332827568054, "ce_loss_26": 3.3677509129047394, "ce_loss_39": 2.785578554868698, "ce_loss_52": 1.4284577563405036, "ce_loss_7": 4.243521982431412, "epoch": 0.247, "grad_norm": 21.62010382710404, "kl_loss_13": 5076.0, "kl_loss_26": 3951.2, "kl_loss_39": 2706.0, "kl_loss_7": 5774.4, "learning_rate": 0.0008651353399739787, "loss": 8827.8, "step": 2470 }, { "ce_loss_13": 3.9418592929840086, "ce_loss_26": 3.399972987174988, "ce_loss_39": 2.7943135529756544, "ce_loss_52": 1.4213671818375588, "ce_loss_7": 4.285894882678986, "epoch": 0.248, "grad_norm": 21.67451956689309, "kl_loss_13": 5164.0, "kl_loss_26": 4026.4, "kl_loss_39": 2746.4, "kl_loss_7": 5886.4, "learning_rate": 0.0008640495619849821, "loss": 8908.6, "step": 2480 }, { "ce_loss_13": 3.9583646595478057, "ce_loss_26": 3.418367612361908, "ce_loss_39": 2.8201009154319765, "ce_loss_52": 1.4697474852204322, "ce_loss_7": 4.3033524513244625, "epoch": 0.249, "grad_norm": 23.94241052015279, "kl_loss_13": 5140.0, "kl_loss_26": 4010.0, "kl_loss_39": 2731.6, "kl_loss_7": 5860.0, "learning_rate": 0.0008629601180209381, "loss": 8796.4, "step": 2490 }, { "ce_loss_13": 3.9134137570858, "ce_loss_26": 3.3699698984622954, "ce_loss_39": 2.781053990125656, "ce_loss_52": 1.4334075331687928, "ce_loss_7": 4.242314898967743, "epoch": 0.25, "grad_norm": 22.621772280297588, "kl_loss_13": 5100.0, "kl_loss_26": 3960.0, "kl_loss_39": 2699.6, "kl_loss_7": 5796.0, "learning_rate": 0.000861867019052535, "loss": 8802.5, "step": 2500 }, { "ce_loss_13": 3.975496470928192, "ce_loss_26": 3.4352354168891908, "ce_loss_39": 2.8324910700321198, "ce_loss_52": 1.469119620323181, "ce_loss_7": 4.319353139400482, "epoch": 0.251, "grad_norm": 24.031546669852546, "kl_loss_13": 5152.4, "kl_loss_26": 4019.6, "kl_loss_39": 2733.2, "kl_loss_7": 5876.8, "learning_rate": 0.0008607702760872678, "loss": 8791.0, "step": 2510 }, { "ce_loss_13": 3.970981556177139, "ce_loss_26": 3.4194670915603638, "ce_loss_39": 2.826087462902069, "ce_loss_52": 1.457669761776924, "ce_loss_7": 4.311935073137283, "epoch": 0.252, "grad_norm": 22.68902245721029, "kl_loss_13": 5173.6, "kl_loss_26": 4025.2, "kl_loss_39": 2753.2, "kl_loss_7": 5881.6, "learning_rate": 0.0008596699001693256, "loss": 8797.8, "step": 2520 }, { "ce_loss_13": 3.9163833260536194, "ce_loss_26": 3.378202974796295, "ce_loss_39": 2.789392131567001, "ce_loss_52": 1.4278187423944473, "ce_loss_7": 4.251722925901413, "epoch": 0.253, "grad_norm": 23.732161584585768, "kl_loss_13": 5073.6, "kl_loss_26": 3944.0, "kl_loss_39": 2692.8, "kl_loss_7": 5776.8, "learning_rate": 0.0008585659023794818, "loss": 8730.9, "step": 2530 }, { "ce_loss_13": 3.8952399492263794, "ce_loss_26": 3.3577997207641603, "ce_loss_39": 2.761990362405777, "ce_loss_52": 1.421005728840828, "ce_loss_7": 4.233315163850785, "epoch": 0.254, "grad_norm": 23.217787871644095, "kl_loss_13": 5080.0, "kl_loss_26": 3941.2, "kl_loss_39": 2674.0, "kl_loss_7": 5788.8, "learning_rate": 0.0008574582938349817, "loss": 8689.0, "step": 2540 }, { "ce_loss_13": 3.9610107481479644, "ce_loss_26": 3.423109310865402, "ce_loss_39": 2.8520358502864838, "ce_loss_52": 1.4856882840394974, "ce_loss_7": 4.297859001159668, "epoch": 0.255, "grad_norm": 24.36417114927956, "kl_loss_13": 5090.4, "kl_loss_26": 3959.2, "kl_loss_39": 2721.2, "kl_loss_7": 5797.6, "learning_rate": 0.0008563470856894315, "loss": 8682.7, "step": 2550 }, { "ce_loss_13": 3.9392871856689453, "ce_loss_26": 3.4012055695056915, "ce_loss_39": 2.808671069145203, "ce_loss_52": 1.472413820028305, "ce_loss_7": 4.270819437503815, "epoch": 0.256, "grad_norm": 22.260198047396518, "kl_loss_13": 5046.4, "kl_loss_26": 3928.4, "kl_loss_39": 2676.0, "kl_loss_7": 5743.2, "learning_rate": 0.0008552322891326845, "loss": 8696.9, "step": 2560 }, { "ce_loss_13": 3.920336198806763, "ce_loss_26": 3.379600703716278, "ce_loss_39": 2.7945084452629088, "ce_loss_52": 1.4492182582616806, "ce_loss_7": 4.2536624610424045, "epoch": 0.257, "grad_norm": 21.726891625639418, "kl_loss_13": 5079.2, "kl_loss_26": 3942.4, "kl_loss_39": 2686.4, "kl_loss_7": 5777.6, "learning_rate": 0.0008541139153907296, "loss": 8637.5, "step": 2570 }, { "ce_loss_13": 3.8855518221855165, "ce_loss_26": 3.3411356985569, "ce_loss_39": 2.7515164047479628, "ce_loss_52": 1.444500783085823, "ce_loss_7": 4.214444124698639, "epoch": 0.258, "grad_norm": 21.147965943731403, "kl_loss_13": 5012.8, "kl_loss_26": 3868.4, "kl_loss_39": 2618.4, "kl_loss_7": 5708.0, "learning_rate": 0.0008529919757255782, "loss": 8639.8, "step": 2580 }, { "ce_loss_13": 3.906326872110367, "ce_loss_26": 3.3759279191493987, "ce_loss_39": 2.794377237558365, "ce_loss_52": 1.4845586121082306, "ce_loss_7": 4.233397454023361, "epoch": 0.259, "grad_norm": 23.06645203237802, "kl_loss_13": 4970.4, "kl_loss_26": 3854.4, "kl_loss_39": 2612.0, "kl_loss_7": 5654.4, "learning_rate": 0.0008518664814351503, "loss": 8576.9, "step": 2590 }, { "ce_loss_13": 3.7816513538360597, "ce_loss_26": 3.2462404370307922, "ce_loss_39": 2.6499598264694213, "ce_loss_52": 1.391408371925354, "ce_loss_7": 4.128740018606186, "epoch": 0.26, "grad_norm": 23.574382647594796, "kl_loss_13": 4927.2, "kl_loss_26": 3814.8, "kl_loss_39": 2554.8, "kl_loss_7": 5640.8, "learning_rate": 0.0008507374438531607, "loss": 8563.9, "step": 2600 }, { "ce_loss_13": 3.9538560569286347, "ce_loss_26": 3.4095658123493195, "ce_loss_39": 2.8113979279994963, "ce_loss_52": 1.4748313665390014, "ce_loss_7": 4.2874442756176, "epoch": 0.261, "grad_norm": 22.486126104346287, "kl_loss_13": 5098.4, "kl_loss_26": 3961.2, "kl_loss_39": 2686.8, "kl_loss_7": 5806.4, "learning_rate": 0.0008496048743490053, "loss": 8565.1, "step": 2610 }, { "ce_loss_13": 3.806992840766907, "ce_loss_26": 3.282588803768158, "ce_loss_39": 2.7138577342033385, "ce_loss_52": 1.4272374346852303, "ce_loss_7": 4.137687039375305, "epoch": 0.262, "grad_norm": 23.33658890766694, "kl_loss_13": 4889.6, "kl_loss_26": 3786.4, "kl_loss_39": 2568.2, "kl_loss_7": 5575.2, "learning_rate": 0.0008484687843276469, "loss": 8535.4, "step": 2620 }, { "ce_loss_13": 3.8657312452793122, "ce_loss_26": 3.3368504345417023, "ce_loss_39": 2.7600394666194914, "ce_loss_52": 1.4648621320724486, "ce_loss_7": 4.185077089071274, "epoch": 0.263, "grad_norm": 21.52255395290807, "kl_loss_13": 4951.2, "kl_loss_26": 3845.2, "kl_loss_39": 2613.2, "kl_loss_7": 5622.4, "learning_rate": 0.0008473291852294987, "loss": 8580.4, "step": 2630 }, { "ce_loss_13": 3.8671354949474335, "ce_loss_26": 3.334774547815323, "ce_loss_39": 2.757487526535988, "ce_loss_52": 1.4462923228740692, "ce_loss_7": 4.198447376489639, "epoch": 0.264, "grad_norm": 22.57622271607983, "kl_loss_13": 4962.4, "kl_loss_26": 3854.4, "kl_loss_39": 2616.4, "kl_loss_7": 5648.8, "learning_rate": 0.0008461860885303114, "loss": 8492.7, "step": 2640 }, { "ce_loss_13": 3.875018262863159, "ce_loss_26": 3.3343500018119814, "ce_loss_39": 2.731833589076996, "ce_loss_52": 1.4120988547801971, "ce_loss_7": 4.215745764970779, "epoch": 0.265, "grad_norm": 21.329908759369555, "kl_loss_13": 5048.8, "kl_loss_26": 3930.4, "kl_loss_39": 2648.0, "kl_loss_7": 5764.8, "learning_rate": 0.000845039505741056, "loss": 8545.0, "step": 2650 }, { "ce_loss_13": 3.8392697393894197, "ce_loss_26": 3.313974368572235, "ce_loss_39": 2.743110102415085, "ce_loss_52": 1.4838833779096603, "ce_loss_7": 4.162828749418258, "epoch": 0.266, "grad_norm": 22.328558189428094, "kl_loss_13": 4850.4, "kl_loss_26": 3737.2, "kl_loss_39": 2504.0, "kl_loss_7": 5524.8, "learning_rate": 0.0008438894484078086, "loss": 8456.0, "step": 2660 }, { "ce_loss_13": 3.7446135103702547, "ce_loss_26": 3.2187088668346404, "ce_loss_39": 2.6340013802051545, "ce_loss_52": 1.389008679986, "ce_loss_7": 4.068695777654648, "epoch": 0.267, "grad_norm": 22.22147816002266, "kl_loss_13": 4894.4, "kl_loss_26": 3784.0, "kl_loss_39": 2542.0, "kl_loss_7": 5576.0, "learning_rate": 0.0008427359281116334, "loss": 8425.6, "step": 2670 }, { "ce_loss_13": 3.8235996186733248, "ce_loss_26": 3.292762166261673, "ce_loss_39": 2.7183104872703554, "ce_loss_52": 1.4328299894928933, "ce_loss_7": 4.153199070692063, "epoch": 0.268, "grad_norm": 22.48935879447384, "kl_loss_13": 4909.2, "kl_loss_26": 3800.0, "kl_loss_39": 2568.2, "kl_loss_7": 5595.2, "learning_rate": 0.0008415789564684673, "loss": 8422.0, "step": 2680 }, { "ce_loss_13": 3.776876950263977, "ce_loss_26": 3.2493546307086945, "ce_loss_39": 2.682066896557808, "ce_loss_52": 1.426028846204281, "ce_loss_7": 4.102496325969696, "epoch": 0.269, "grad_norm": 23.679133499819734, "kl_loss_13": 4848.8, "kl_loss_26": 3742.4, "kl_loss_39": 2509.6, "kl_loss_7": 5526.4, "learning_rate": 0.0008404185451290017, "loss": 8501.8, "step": 2690 }, { "ce_loss_13": 3.8134057581424714, "ce_loss_26": 3.272122323513031, "ce_loss_39": 2.686330908536911, "ce_loss_52": 1.4213334619998932, "ce_loss_7": 4.143577241897583, "epoch": 0.27, "grad_norm": 22.090911465136045, "kl_loss_13": 4938.4, "kl_loss_26": 3810.8, "kl_loss_39": 2554.8, "kl_loss_7": 5636.8, "learning_rate": 0.0008392547057785661, "loss": 8351.5, "step": 2700 }, { "ce_loss_13": 3.786053466796875, "ce_loss_26": 3.247877132892609, "ce_loss_39": 2.6689940333366393, "ce_loss_52": 1.4167698860168456, "ce_loss_7": 4.1162903845310215, "epoch": 0.271, "grad_norm": 20.5206064618152, "kl_loss_13": 4872.8, "kl_loss_26": 3742.0, "kl_loss_39": 2504.4, "kl_loss_7": 5563.2, "learning_rate": 0.0008380874501370098, "loss": 8427.4, "step": 2710 }, { "ce_loss_13": 3.704389762878418, "ce_loss_26": 3.187553709745407, "ce_loss_39": 2.624448519945145, "ce_loss_52": 1.42959221303463, "ce_loss_7": 4.022905468940735, "epoch": 0.272, "grad_norm": 24.53128855683424, "kl_loss_13": 4716.8, "kl_loss_26": 3630.0, "kl_loss_39": 2409.2, "kl_loss_7": 5390.4, "learning_rate": 0.0008369167899585841, "loss": 8346.0, "step": 2720 }, { "ce_loss_13": 3.7978334367275237, "ce_loss_26": 3.257440310716629, "ce_loss_39": 2.673193109035492, "ce_loss_52": 1.4194035559892655, "ce_loss_7": 4.132071840763092, "epoch": 0.273, "grad_norm": 22.71970164256676, "kl_loss_13": 4896.8, "kl_loss_26": 3759.2, "kl_loss_39": 2513.4, "kl_loss_7": 5595.2, "learning_rate": 0.0008357427370318238, "loss": 8347.6, "step": 2730 }, { "ce_loss_13": 3.7844323456287383, "ce_loss_26": 3.263492447137833, "ce_loss_39": 2.6871775329113006, "ce_loss_52": 1.452097550034523, "ce_loss_7": 4.105139708518982, "epoch": 0.274, "grad_norm": 22.48264422716788, "kl_loss_13": 4807.2, "kl_loss_26": 3710.0, "kl_loss_39": 2469.2, "kl_loss_7": 5490.4, "learning_rate": 0.0008345653031794292, "loss": 8382.9, "step": 2740 }, { "ce_loss_13": 3.8110480844974517, "ce_loss_26": 3.2820364594459535, "ce_loss_39": 2.7046351432800293, "ce_loss_52": 1.4519646763801575, "ce_loss_7": 4.134507310390473, "epoch": 0.275, "grad_norm": 21.943498417005777, "kl_loss_13": 4812.0, "kl_loss_26": 3713.6, "kl_loss_39": 2499.8, "kl_loss_7": 5488.0, "learning_rate": 0.0008333845002581458, "loss": 8287.2, "step": 2750 }, { "ce_loss_13": 3.822121250629425, "ce_loss_26": 3.3007264256477358, "ce_loss_39": 2.73195458650589, "ce_loss_52": 1.4655994832515717, "ce_loss_7": 4.141650629043579, "epoch": 0.276, "grad_norm": 22.59740270522763, "kl_loss_13": 4832.8, "kl_loss_26": 3745.6, "kl_loss_39": 2538.4, "kl_loss_7": 5500.0, "learning_rate": 0.0008322003401586462, "loss": 8283.1, "step": 2760 }, { "ce_loss_13": 3.726576977968216, "ce_loss_26": 3.214203953742981, "ce_loss_39": 2.669701686501503, "ce_loss_52": 1.4409982591867447, "ce_loss_7": 4.043469870090485, "epoch": 0.277, "grad_norm": 21.384417928568727, "kl_loss_13": 4712.0, "kl_loss_26": 3643.6, "kl_loss_39": 2461.0, "kl_loss_7": 5379.2, "learning_rate": 0.0008310128348054094, "loss": 8251.4, "step": 2770 }, { "ce_loss_13": 3.768916404247284, "ce_loss_26": 3.2343318104743957, "ce_loss_39": 2.6558803230524064, "ce_loss_52": 1.4221897169947624, "ce_loss_7": 4.097134619951248, "epoch": 0.278, "grad_norm": 21.8508758307847, "kl_loss_13": 4846.4, "kl_loss_26": 3737.6, "kl_loss_39": 2492.4, "kl_loss_7": 5534.4, "learning_rate": 0.0008298219961566008, "loss": 8264.2, "step": 2780 }, { "ce_loss_13": 3.73385471701622, "ce_loss_26": 3.216676640510559, "ce_loss_39": 2.634915125370026, "ce_loss_52": 1.4022609382867812, "ce_loss_7": 4.067033034563065, "epoch": 0.279, "grad_norm": 22.23449381616188, "kl_loss_13": 4806.0, "kl_loss_26": 3708.8, "kl_loss_39": 2479.2, "kl_loss_7": 5499.2, "learning_rate": 0.0008286278362039527, "loss": 8184.2, "step": 2790 }, { "ce_loss_13": 3.756936568021774, "ce_loss_26": 3.2383838176727293, "ce_loss_39": 2.672528338432312, "ce_loss_52": 1.452534568309784, "ce_loss_7": 4.076909917593002, "epoch": 0.28, "grad_norm": 21.54056853237845, "kl_loss_13": 4743.2, "kl_loss_26": 3661.6, "kl_loss_39": 2456.0, "kl_loss_7": 5414.4, "learning_rate": 0.0008274303669726426, "loss": 8160.7, "step": 2800 }, { "ce_loss_13": 3.8688619792461396, "ce_loss_26": 3.3306061148643495, "ce_loss_39": 2.7420520305633547, "ce_loss_52": 1.4561516880989074, "ce_loss_7": 4.191312706470489, "epoch": 0.281, "grad_norm": 23.01011471220724, "kl_loss_13": 4962.4, "kl_loss_26": 3836.4, "kl_loss_39": 2581.6, "kl_loss_7": 5640.8, "learning_rate": 0.0008262296005211721, "loss": 8239.5, "step": 2810 }, { "ce_loss_13": 3.7579640209674836, "ce_loss_26": 3.2256029903888703, "ce_loss_39": 2.650630474090576, "ce_loss_52": 1.4400919079780579, "ce_loss_7": 4.077768385410309, "epoch": 0.282, "grad_norm": 21.557554897738267, "kl_loss_13": 4784.8, "kl_loss_26": 3661.6, "kl_loss_39": 2435.2, "kl_loss_7": 5454.4, "learning_rate": 0.0008250255489412463, "loss": 8218.5, "step": 2820 }, { "ce_loss_13": 3.7878367722034456, "ce_loss_26": 3.255573272705078, "ce_loss_39": 2.667675232887268, "ce_loss_52": 1.4289155021309852, "ce_loss_7": 4.111210036277771, "epoch": 0.283, "grad_norm": 22.099755132556425, "kl_loss_13": 4851.2, "kl_loss_26": 3733.6, "kl_loss_39": 2480.6, "kl_loss_7": 5527.2, "learning_rate": 0.0008238182243576511, "loss": 8152.9, "step": 2830 }, { "ce_loss_13": 3.7699286341667175, "ce_loss_26": 3.2401221811771395, "ce_loss_39": 2.6614575743675233, "ce_loss_52": 1.4347774118185044, "ce_loss_7": 4.08802090883255, "epoch": 0.284, "grad_norm": 21.441617328301042, "kl_loss_13": 4791.6, "kl_loss_26": 3695.6, "kl_loss_39": 2469.8, "kl_loss_7": 5453.6, "learning_rate": 0.0008226076389281315, "loss": 8141.7, "step": 2840 }, { "ce_loss_13": 3.692534440755844, "ce_loss_26": 3.174397534132004, "ce_loss_39": 2.6222778260707855, "ce_loss_52": 1.4332606226205826, "ce_loss_7": 4.00234357714653, "epoch": 0.285, "grad_norm": 23.306650885126444, "kl_loss_13": 4633.6, "kl_loss_26": 3560.0, "kl_loss_39": 2376.6, "kl_loss_7": 5279.2, "learning_rate": 0.0008213938048432696, "loss": 8068.6, "step": 2850 }, { "ce_loss_13": 3.6946506440639495, "ce_loss_26": 3.1700410664081575, "ce_loss_39": 2.597234898805618, "ce_loss_52": 1.4046493530273438, "ce_loss_7": 4.0274644792079926, "epoch": 0.286, "grad_norm": 21.879949646782595, "kl_loss_13": 4721.6, "kl_loss_26": 3628.8, "kl_loss_39": 2400.4, "kl_loss_7": 5412.8, "learning_rate": 0.0008201767343263612, "loss": 8086.6, "step": 2860 }, { "ce_loss_13": 3.7227329850196837, "ce_loss_26": 3.200405848026276, "ce_loss_39": 2.637904042005539, "ce_loss_52": 1.422918725013733, "ce_loss_7": 4.043233323097229, "epoch": 0.287, "grad_norm": 24.428095636864317, "kl_loss_13": 4732.0, "kl_loss_26": 3643.6, "kl_loss_39": 2433.8, "kl_loss_7": 5399.2, "learning_rate": 0.0008189564396332927, "loss": 8066.0, "step": 2870 }, { "ce_loss_13": 3.721643441915512, "ce_loss_26": 3.185835379362106, "ce_loss_39": 2.6226376593112946, "ce_loss_52": 1.4448419839143753, "ce_loss_7": 4.039817118644715, "epoch": 0.288, "grad_norm": 22.93644669160459, "kl_loss_13": 4683.2, "kl_loss_26": 3561.2, "kl_loss_39": 2345.2, "kl_loss_7": 5352.8, "learning_rate": 0.0008177329330524181, "loss": 8090.5, "step": 2880 }, { "ce_loss_13": 3.732273721694946, "ce_loss_26": 3.217743480205536, "ce_loss_39": 2.656236010789871, "ce_loss_52": 1.4405113011598587, "ce_loss_7": 4.046578335762024, "epoch": 0.289, "grad_norm": 22.27500685105708, "kl_loss_13": 4702.0, "kl_loss_26": 3629.6, "kl_loss_39": 2416.0, "kl_loss_7": 5360.8, "learning_rate": 0.0008165062269044352, "loss": 8083.7, "step": 2890 }, { "ce_loss_13": 3.7308314204216004, "ce_loss_26": 3.2038592040538787, "ce_loss_39": 2.6387904793024064, "ce_loss_52": 1.44214668571949, "ce_loss_7": 4.04458264708519, "epoch": 0.29, "grad_norm": 22.45358727511497, "kl_loss_13": 4700.4, "kl_loss_26": 3605.2, "kl_loss_39": 2398.4, "kl_loss_7": 5358.4, "learning_rate": 0.0008152763335422613, "loss": 8063.0, "step": 2900 }, { "ce_loss_13": 3.6699211478233336, "ce_loss_26": 3.153660440444946, "ce_loss_39": 2.5951134085655214, "ce_loss_52": 1.4221089735627175, "ce_loss_7": 3.978937405347824, "epoch": 0.291, "grad_norm": 23.44237828375525, "kl_loss_13": 4614.0, "kl_loss_26": 3534.4, "kl_loss_39": 2341.4, "kl_loss_7": 5261.6, "learning_rate": 0.0008140432653509088, "loss": 8001.3, "step": 2910 }, { "ce_loss_13": 3.6406539916992187, "ce_loss_26": 3.1216741025447847, "ce_loss_39": 2.5570163398981096, "ce_loss_52": 1.39638482183218, "ce_loss_7": 3.9557625532150267, "epoch": 0.292, "grad_norm": 21.187460458215387, "kl_loss_13": 4592.8, "kl_loss_26": 3516.8, "kl_loss_39": 2326.8, "kl_loss_7": 5254.0, "learning_rate": 0.0008128070347473608, "loss": 7966.5, "step": 2920 }, { "ce_loss_13": 3.665645903348923, "ce_loss_26": 3.149553042650223, "ce_loss_39": 2.58870205283165, "ce_loss_52": 1.4208435118198395, "ce_loss_7": 3.9827320516109466, "epoch": 0.293, "grad_norm": 21.300592787802476, "kl_loss_13": 4619.6, "kl_loss_26": 3530.8, "kl_loss_39": 2336.0, "kl_loss_7": 5284.0, "learning_rate": 0.0008115676541804455, "loss": 7990.7, "step": 2930 }, { "ce_loss_13": 3.6261947989463805, "ce_loss_26": 3.1126498699188234, "ce_loss_39": 2.5498824626207353, "ce_loss_52": 1.3916691318154335, "ce_loss_7": 3.9482949018478393, "epoch": 0.294, "grad_norm": 21.8788417242541, "kl_loss_13": 4598.0, "kl_loss_26": 3513.2, "kl_loss_39": 2317.4, "kl_loss_7": 5269.6, "learning_rate": 0.0008103251361307119, "loss": 7972.2, "step": 2940 }, { "ce_loss_13": 3.6617009818553923, "ce_loss_26": 3.134212648868561, "ce_loss_39": 2.569432234764099, "ce_loss_52": 1.4306001305580138, "ce_loss_7": 3.978475254774094, "epoch": 0.295, "grad_norm": 21.383257731886584, "kl_loss_13": 4584.8, "kl_loss_26": 3486.8, "kl_loss_39": 2289.6, "kl_loss_7": 5252.0, "learning_rate": 0.0008090794931103026, "loss": 7903.9, "step": 2950 }, { "ce_loss_13": 3.674825745820999, "ce_loss_26": 3.1651513874530792, "ce_loss_39": 2.605163484811783, "ce_loss_52": 1.434949815273285, "ce_loss_7": 3.988205587863922, "epoch": 0.296, "grad_norm": 21.87171120261939, "kl_loss_13": 4585.6, "kl_loss_26": 3519.2, "kl_loss_39": 2327.0, "kl_loss_7": 5246.4, "learning_rate": 0.0008078307376628291, "loss": 7903.2, "step": 2960 }, { "ce_loss_13": 3.6504483819007874, "ce_loss_26": 3.1346111416816713, "ce_loss_39": 2.5827776730060577, "ce_loss_52": 1.4189698368310928, "ce_loss_7": 3.9625262200832365, "epoch": 0.297, "grad_norm": 23.048467847326563, "kl_loss_13": 4571.2, "kl_loss_26": 3498.4, "kl_loss_39": 2319.0, "kl_loss_7": 5224.0, "learning_rate": 0.000806578882363245, "loss": 7901.6, "step": 2970 }, { "ce_loss_13": 3.655070722103119, "ce_loss_26": 3.136745995283127, "ce_loss_39": 2.5604557782411574, "ce_loss_52": 1.401164847612381, "ce_loss_7": 3.973217171430588, "epoch": 0.298, "grad_norm": 21.078263907370157, "kl_loss_13": 4614.4, "kl_loss_26": 3538.4, "kl_loss_39": 2311.4, "kl_loss_7": 5287.2, "learning_rate": 0.0008053239398177191, "loss": 7911.8, "step": 2980 }, { "ce_loss_13": 3.6555157959461213, "ce_loss_26": 3.13962464928627, "ce_loss_39": 2.5709628492593763, "ce_loss_52": 1.4242349237203598, "ce_loss_7": 3.9756637513637543, "epoch": 0.299, "grad_norm": 22.608350182138345, "kl_loss_13": 4603.2, "kl_loss_26": 3520.4, "kl_loss_39": 2304.0, "kl_loss_7": 5274.4, "learning_rate": 0.0008040659226635089, "loss": 7892.4, "step": 2990 }, { "ce_loss_13": 3.6532657563686373, "ce_loss_26": 3.124306696653366, "ce_loss_39": 2.557386627793312, "ce_loss_52": 1.402693158388138, "ce_loss_7": 3.9695385217666628, "epoch": 0.3, "grad_norm": 22.376822604204033, "kl_loss_13": 4616.4, "kl_loss_26": 3523.6, "kl_loss_39": 2321.8, "kl_loss_7": 5287.2, "learning_rate": 0.0008028048435688333, "loss": 7820.7, "step": 3000 }, { "ce_loss_13": 3.6811940252780913, "ce_loss_26": 3.1677908301353455, "ce_loss_39": 2.607375094294548, "ce_loss_52": 1.4560914367437363, "ce_loss_7": 3.9915607273578644, "epoch": 0.301, "grad_norm": 21.84188714128543, "kl_loss_13": 4604.8, "kl_loss_26": 3538.4, "kl_loss_39": 2343.2, "kl_loss_7": 5256.0, "learning_rate": 0.0008015407152327448, "loss": 7933.0, "step": 3010 }, { "ce_loss_13": 3.737543153762817, "ce_loss_26": 3.2195757627487183, "ce_loss_39": 2.6466131448745727, "ce_loss_52": 1.4449012607336045, "ce_loss_7": 4.052252840995789, "epoch": 0.302, "grad_norm": 22.34664686545947, "kl_loss_13": 4700.8, "kl_loss_26": 3618.8, "kl_loss_39": 2394.4, "kl_loss_7": 5358.4, "learning_rate": 0.0008002735503850016, "loss": 7844.2, "step": 3020 }, { "ce_loss_13": 3.6698498368263244, "ce_loss_26": 3.155323106050491, "ce_loss_39": 2.579972979426384, "ce_loss_52": 1.4486516952514648, "ce_loss_7": 3.9814261555671693, "epoch": 0.303, "grad_norm": 22.316774640404955, "kl_loss_13": 4563.2, "kl_loss_26": 3486.8, "kl_loss_39": 2285.2, "kl_loss_7": 5213.6, "learning_rate": 0.0007990033617859396, "loss": 7844.3, "step": 3030 }, { "ce_loss_13": 3.661813771724701, "ce_loss_26": 3.1450955271720886, "ce_loss_39": 2.5806987404823305, "ce_loss_52": 1.4304928302764892, "ce_loss_7": 3.9737633407115935, "epoch": 0.304, "grad_norm": 22.094854051528255, "kl_loss_13": 4595.2, "kl_loss_26": 3519.2, "kl_loss_39": 2321.2, "kl_loss_7": 5248.0, "learning_rate": 0.000797730162226344, "loss": 7813.7, "step": 3040 }, { "ce_loss_13": 3.6036822319030763, "ce_loss_26": 3.0875354915857316, "ce_loss_39": 2.5262934505939483, "ce_loss_52": 1.3893155947327613, "ce_loss_7": 3.921951335668564, "epoch": 0.305, "grad_norm": 22.896126437016644, "kl_loss_13": 4538.0, "kl_loss_26": 3453.6, "kl_loss_39": 2252.6, "kl_loss_7": 5210.4, "learning_rate": 0.0007964539645273203, "loss": 7783.3, "step": 3050 }, { "ce_loss_13": 3.690790832042694, "ce_loss_26": 3.1806884586811064, "ce_loss_39": 2.6345931828022002, "ce_loss_52": 1.4827970415353775, "ce_loss_7": 3.996461832523346, "epoch": 0.306, "grad_norm": 22.12157409866164, "kl_loss_13": 4558.4, "kl_loss_26": 3495.6, "kl_loss_39": 2322.2, "kl_loss_7": 5202.4, "learning_rate": 0.000795174781540165, "loss": 7798.9, "step": 3060 }, { "ce_loss_13": 3.6345800876617433, "ce_loss_26": 3.126866352558136, "ce_loss_39": 2.5723444908857345, "ce_loss_52": 1.4505648389458656, "ce_loss_7": 3.938809943199158, "epoch": 0.307, "grad_norm": 21.67276371006888, "kl_loss_13": 4502.8, "kl_loss_26": 3435.6, "kl_loss_39": 2259.2, "kl_loss_7": 5140.8, "learning_rate": 0.0007938926261462366, "loss": 7786.2, "step": 3070 }, { "ce_loss_13": 3.6539651334285734, "ce_loss_26": 3.136826354265213, "ce_loss_39": 2.5686775982379912, "ce_loss_52": 1.4312876760959625, "ce_loss_7": 3.9670185923576353, "epoch": 0.308, "grad_norm": 23.264906723037097, "kl_loss_13": 4571.6, "kl_loss_26": 3495.6, "kl_loss_39": 2289.8, "kl_loss_7": 5231.2, "learning_rate": 0.0007926075112568258, "loss": 7773.0, "step": 3080 }, { "ce_loss_13": 3.6424070239067077, "ce_loss_26": 3.13111692070961, "ce_loss_39": 2.5661837816238404, "ce_loss_52": 1.4395585834980011, "ce_loss_7": 3.9482239544391633, "epoch": 0.309, "grad_norm": 22.051447045868983, "kl_loss_13": 4540.0, "kl_loss_26": 3467.2, "kl_loss_39": 2265.6, "kl_loss_7": 5186.4, "learning_rate": 0.0007913194498130252, "loss": 7730.0, "step": 3090 }, { "ce_loss_13": 3.6187573671340942, "ce_loss_26": 3.110442912578583, "ce_loss_39": 2.553048479557037, "ce_loss_52": 1.4316339492797852, "ce_loss_7": 3.92513769865036, "epoch": 0.31, "grad_norm": 22.041241368180156, "kl_loss_13": 4504.8, "kl_loss_26": 3436.0, "kl_loss_39": 2242.0, "kl_loss_7": 5140.0, "learning_rate": 0.0007900284547855992, "loss": 7742.0, "step": 3100 }, { "ce_loss_13": 3.6639523029327394, "ce_loss_26": 3.1580884575843813, "ce_loss_39": 2.5835469484329225, "ce_loss_52": 1.4442215472459794, "ce_loss_7": 3.976783311367035, "epoch": 0.311, "grad_norm": 20.880619592237565, "kl_loss_13": 4592.0, "kl_loss_26": 3526.4, "kl_loss_39": 2312.4, "kl_loss_7": 5244.8, "learning_rate": 0.0007887345391748532, "loss": 7735.3, "step": 3110 }, { "ce_loss_13": 3.6283124804496767, "ce_loss_26": 3.113315612077713, "ce_loss_39": 2.547743684053421, "ce_loss_52": 1.42053325176239, "ce_loss_7": 3.9331269919872285, "epoch": 0.312, "grad_norm": 22.15121587945531, "kl_loss_13": 4543.2, "kl_loss_26": 3463.2, "kl_loss_39": 2267.8, "kl_loss_7": 5184.8, "learning_rate": 0.0007874377160105036, "loss": 7729.4, "step": 3120 }, { "ce_loss_13": 3.6399169504642486, "ce_loss_26": 3.135877913236618, "ce_loss_39": 2.5725889205932617, "ce_loss_52": 1.4427233994007111, "ce_loss_7": 3.9606189668178557, "epoch": 0.313, "grad_norm": 21.87500401487531, "kl_loss_13": 4563.2, "kl_loss_26": 3490.0, "kl_loss_39": 2276.4, "kl_loss_7": 5228.0, "learning_rate": 0.0007861379983515449, "loss": 7710.9, "step": 3130 }, { "ce_loss_13": 3.634021121263504, "ce_loss_26": 3.111995500326157, "ce_loss_39": 2.5583092838525774, "ce_loss_52": 1.4399698421359062, "ce_loss_7": 3.94167400598526, "epoch": 0.314, "grad_norm": 22.854565496538875, "kl_loss_13": 4504.4, "kl_loss_26": 3415.2, "kl_loss_39": 2230.6, "kl_loss_7": 5153.6, "learning_rate": 0.0007848353992861195, "loss": 7710.3, "step": 3140 }, { "ce_loss_13": 3.6272457361221315, "ce_loss_26": 3.116968184709549, "ce_loss_39": 2.551611191034317, "ce_loss_52": 1.437747061252594, "ce_loss_7": 3.9388325929641725, "epoch": 0.315, "grad_norm": 21.84748688614269, "kl_loss_13": 4498.8, "kl_loss_26": 3427.6, "kl_loss_39": 2231.6, "kl_loss_7": 5142.4, "learning_rate": 0.0007835299319313853, "loss": 7607.0, "step": 3150 }, { "ce_loss_13": 3.613277268409729, "ce_loss_26": 3.0916620969772337, "ce_loss_39": 2.5186130821704866, "ce_loss_52": 1.3888636380434036, "ce_loss_7": 3.935783725976944, "epoch": 0.316, "grad_norm": 21.933561198317395, "kl_loss_13": 4519.2, "kl_loss_26": 3438.8, "kl_loss_39": 2232.0, "kl_loss_7": 5189.6, "learning_rate": 0.0007822216094333848, "loss": 7650.0, "step": 3160 }, { "ce_loss_13": 3.658072179555893, "ce_loss_26": 3.1417903542518615, "ce_loss_39": 2.577219474315643, "ce_loss_52": 1.437073315680027, "ce_loss_7": 3.970378410816193, "epoch": 0.317, "grad_norm": 22.034139537965903, "kl_loss_13": 4566.4, "kl_loss_26": 3493.2, "kl_loss_39": 2301.8, "kl_loss_7": 5224.0, "learning_rate": 0.0007809104449669101, "loss": 7644.7, "step": 3170 }, { "ce_loss_13": 3.593963289260864, "ce_loss_26": 3.080548882484436, "ce_loss_39": 2.5262903541326525, "ce_loss_52": 1.4362893968820571, "ce_loss_7": 3.8962223708629606, "epoch": 0.318, "grad_norm": 22.12833658126749, "kl_loss_13": 4417.6, "kl_loss_26": 3353.2, "kl_loss_39": 2169.6, "kl_loss_7": 5054.4, "learning_rate": 0.0007795964517353734, "loss": 7580.1, "step": 3180 }, { "ce_loss_13": 3.639219433069229, "ce_loss_26": 3.126335847377777, "ce_loss_39": 2.5598012149333953, "ce_loss_52": 1.4479554057121278, "ce_loss_7": 3.955880182981491, "epoch": 0.319, "grad_norm": 21.421584248628356, "kl_loss_13": 4524.8, "kl_loss_26": 3445.2, "kl_loss_39": 2238.4, "kl_loss_7": 5180.8, "learning_rate": 0.000778279642970672, "loss": 7577.4, "step": 3190 }, { "ce_loss_13": 3.593672776222229, "ce_loss_26": 3.076059252023697, "ce_loss_39": 2.5206238448619844, "ce_loss_52": 1.4138888984918594, "ce_loss_7": 3.898124760389328, "epoch": 0.32, "grad_norm": 23.27138036145762, "kl_loss_13": 4477.6, "kl_loss_26": 3400.0, "kl_loss_39": 2214.8, "kl_loss_7": 5123.2, "learning_rate": 0.0007769600319330552, "loss": 7595.6, "step": 3200 }, { "ce_loss_13": 3.6573951125144957, "ce_loss_26": 3.166206729412079, "ce_loss_39": 2.6105258047580717, "ce_loss_52": 1.47857309281826, "ce_loss_7": 3.9568731427192687, "epoch": 0.321, "grad_norm": 21.35600054948774, "kl_loss_13": 4470.4, "kl_loss_26": 3434.4, "kl_loss_39": 2255.6, "kl_loss_7": 5100.8, "learning_rate": 0.0007756376319109917, "loss": 7610.9, "step": 3210 }, { "ce_loss_13": 3.619207721948624, "ce_loss_26": 3.112484961748123, "ce_loss_39": 2.5595098197460175, "ce_loss_52": 1.442267394065857, "ce_loss_7": 3.9296476364135744, "epoch": 0.322, "grad_norm": 21.117056892906756, "kl_loss_13": 4453.6, "kl_loss_26": 3396.0, "kl_loss_39": 2218.0, "kl_loss_7": 5104.8, "learning_rate": 0.0007743124562210351, "loss": 7569.7, "step": 3220 }, { "ce_loss_13": 3.612431305646896, "ce_loss_26": 3.104649418592453, "ce_loss_39": 2.5444509416818617, "ce_loss_52": 1.4610149055719375, "ce_loss_7": 3.9223886907100676, "epoch": 0.323, "grad_norm": 22.510814919939268, "kl_loss_13": 4408.0, "kl_loss_26": 3338.4, "kl_loss_39": 2155.4, "kl_loss_7": 5054.4, "learning_rate": 0.0007729845182076895, "loss": 7565.6, "step": 3230 }, { "ce_loss_13": 3.5650066912174223, "ce_loss_26": 3.060505121946335, "ce_loss_39": 2.5127211630344393, "ce_loss_52": 1.445562407374382, "ce_loss_7": 3.8779995679855346, "epoch": 0.324, "grad_norm": 24.007681143469355, "kl_loss_13": 4388.4, "kl_loss_26": 3323.6, "kl_loss_39": 2150.2, "kl_loss_7": 5044.0, "learning_rate": 0.0007716538312432765, "loss": 7556.0, "step": 3240 }, { "ce_loss_13": 3.5737381398677828, "ce_loss_26": 3.0725920855998994, "ce_loss_39": 2.5134449005126953, "ce_loss_52": 1.4138619631528855, "ce_loss_7": 3.8855117499828338, "epoch": 0.325, "grad_norm": 22.203629206824775, "kl_loss_13": 4430.8, "kl_loss_26": 3379.6, "kl_loss_39": 2197.4, "kl_loss_7": 5081.6, "learning_rate": 0.0007703204087277988, "loss": 7530.7, "step": 3250 }, { "ce_loss_13": 3.5474561214447022, "ce_loss_26": 3.037938302755356, "ce_loss_39": 2.477022570371628, "ce_loss_52": 1.3884405881166457, "ce_loss_7": 3.85917187333107, "epoch": 0.326, "grad_norm": 21.98291246151193, "kl_loss_13": 4437.6, "kl_loss_26": 3371.6, "kl_loss_39": 2176.2, "kl_loss_7": 5089.6, "learning_rate": 0.0007689842640888063, "loss": 7519.3, "step": 3260 }, { "ce_loss_13": 3.6053310513496397, "ce_loss_26": 3.0936122059822084, "ce_loss_39": 2.5414693653583527, "ce_loss_52": 1.4531731829047203, "ce_loss_7": 3.913253253698349, "epoch": 0.327, "grad_norm": 22.418707773974628, "kl_loss_13": 4430.8, "kl_loss_26": 3360.0, "kl_loss_39": 2184.0, "kl_loss_7": 5068.4, "learning_rate": 0.0007676454107812607, "loss": 7473.1, "step": 3270 }, { "ce_loss_13": 3.545606768131256, "ce_loss_26": 3.0480951845645903, "ce_loss_39": 2.499777999520302, "ce_loss_52": 1.4314876705408097, "ce_loss_7": 3.849948841333389, "epoch": 0.328, "grad_norm": 22.389500426390892, "kl_loss_13": 4402.4, "kl_loss_26": 3351.6, "kl_loss_39": 2160.2, "kl_loss_7": 5035.2, "learning_rate": 0.0007663038622873999, "loss": 7510.3, "step": 3280 }, { "ce_loss_13": 3.6383297204971314, "ce_loss_26": 3.127288430929184, "ce_loss_39": 2.561781680583954, "ce_loss_52": 1.4617935866117477, "ce_loss_7": 3.9572836577892305, "epoch": 0.329, "grad_norm": 23.105081611165705, "kl_loss_13": 4472.4, "kl_loss_26": 3396.8, "kl_loss_39": 2202.8, "kl_loss_7": 5139.2, "learning_rate": 0.0007649596321166025, "loss": 7473.8, "step": 3290 }, { "ce_loss_13": 3.5131199419498444, "ce_loss_26": 3.0129006803035736, "ce_loss_39": 2.4699264496564863, "ce_loss_52": 1.4362694859504699, "ce_loss_7": 3.8114282488822937, "epoch": 0.33, "grad_norm": 22.77133190654901, "kl_loss_13": 4268.0, "kl_loss_26": 3223.6, "kl_loss_39": 2069.4, "kl_loss_7": 4882.4, "learning_rate": 0.0007636127338052513, "loss": 7443.1, "step": 3300 }, { "ce_loss_13": 3.5485077798366547, "ce_loss_26": 3.0334243774414062, "ce_loss_39": 2.471779704093933, "ce_loss_52": 1.4008762776851653, "ce_loss_7": 3.860434752702713, "epoch": 0.331, "grad_norm": 22.94544302407564, "kl_loss_13": 4425.6, "kl_loss_26": 3344.4, "kl_loss_39": 2154.8, "kl_loss_7": 5076.8, "learning_rate": 0.0007622631809165971, "loss": 7403.2, "step": 3310 }, { "ce_loss_13": 3.611973536014557, "ce_loss_26": 3.1062149882316588, "ce_loss_39": 2.5540910184383394, "ce_loss_52": 1.4812486261129378, "ce_loss_7": 3.913082367181778, "epoch": 0.332, "grad_norm": 21.844212510164496, "kl_loss_13": 4407.2, "kl_loss_26": 3357.6, "kl_loss_39": 2168.2, "kl_loss_7": 5035.2, "learning_rate": 0.000760910987040623, "loss": 7436.1, "step": 3320 }, { "ce_loss_13": 3.500666618347168, "ce_loss_26": 2.992197906970978, "ce_loss_39": 2.443836176395416, "ce_loss_52": 1.4172912210226059, "ce_loss_7": 3.809192955493927, "epoch": 0.333, "grad_norm": 22.341971135877618, "kl_loss_13": 4287.2, "kl_loss_26": 3229.2, "kl_loss_39": 2050.8, "kl_loss_7": 4934.4, "learning_rate": 0.000759556165793906, "loss": 7354.8, "step": 3330 }, { "ce_loss_13": 3.572561663389206, "ce_loss_26": 3.0666925728321077, "ce_loss_39": 2.5258089125156404, "ce_loss_52": 1.4658059388399125, "ce_loss_7": 3.8703009307384493, "epoch": 0.334, "grad_norm": 20.585398734523825, "kl_loss_13": 4346.0, "kl_loss_26": 3292.8, "kl_loss_39": 2121.6, "kl_loss_7": 4966.4, "learning_rate": 0.000758198730819481, "loss": 7376.9, "step": 3340 }, { "ce_loss_13": 3.5921706318855287, "ce_loss_26": 3.0865486025810243, "ce_loss_39": 2.5250521272420885, "ce_loss_52": 1.4276385620236396, "ce_loss_7": 3.9076746106147766, "epoch": 0.335, "grad_norm": 22.48300338159267, "kl_loss_13": 4447.2, "kl_loss_26": 3387.2, "kl_loss_39": 2194.8, "kl_loss_7": 5095.2, "learning_rate": 0.0007568386957867032, "loss": 7407.2, "step": 3350 }, { "ce_loss_13": 3.5395087361335755, "ce_loss_26": 3.0476285994052885, "ce_loss_39": 2.5039754688739775, "ce_loss_52": 1.4519936561584472, "ce_loss_7": 3.8360753774642946, "epoch": 0.336, "grad_norm": 22.282680621594952, "kl_loss_13": 4315.6, "kl_loss_26": 3284.8, "kl_loss_39": 2115.8, "kl_loss_7": 4935.2, "learning_rate": 0.0007554760743911103, "loss": 7349.9, "step": 3360 }, { "ce_loss_13": 3.5341054499149323, "ce_loss_26": 3.0255552768707275, "ce_loss_39": 2.4795517563819884, "ce_loss_52": 1.43424501568079, "ce_loss_7": 3.8406670331954955, "epoch": 0.337, "grad_norm": 21.82655281544531, "kl_loss_13": 4317.2, "kl_loss_26": 3248.8, "kl_loss_39": 2081.8, "kl_loss_7": 4959.2, "learning_rate": 0.0007541108803542846, "loss": 7352.9, "step": 3370 }, { "ce_loss_13": 3.571430027484894, "ce_loss_26": 3.0674545526504517, "ce_loss_39": 2.5179436981678007, "ce_loss_52": 1.4572079569101333, "ce_loss_7": 3.8665721654891967, "epoch": 0.338, "grad_norm": 20.283540782731166, "kl_loss_13": 4343.6, "kl_loss_26": 3291.2, "kl_loss_39": 2129.4, "kl_loss_7": 4966.4, "learning_rate": 0.0007527431274237149, "loss": 7371.7, "step": 3380 }, { "ce_loss_13": 3.53710196018219, "ce_loss_26": 3.0354479968547823, "ce_loss_39": 2.4885441571474076, "ce_loss_52": 1.4400692582130432, "ce_loss_7": 3.8393473029136658, "epoch": 0.339, "grad_norm": 21.387970982998613, "kl_loss_13": 4311.6, "kl_loss_26": 3256.4, "kl_loss_39": 2099.8, "kl_loss_7": 4936.0, "learning_rate": 0.0007513728293726579, "loss": 7294.4, "step": 3390 }, { "ce_loss_13": 3.523770880699158, "ce_loss_26": 3.0170785784721375, "ce_loss_39": 2.4583947211503983, "ce_loss_52": 1.43256463855505, "ce_loss_7": 3.822116255760193, "epoch": 0.34, "grad_norm": 21.22690089148789, "kl_loss_13": 4311.6, "kl_loss_26": 3260.0, "kl_loss_39": 2066.6, "kl_loss_7": 4940.0, "learning_rate": 0.00075, "loss": 7289.9, "step": 3400 }, { "ce_loss_13": 3.495673859119415, "ce_loss_26": 2.9982276618480683, "ce_loss_39": 2.444023036956787, "ce_loss_52": 1.4111162751913071, "ce_loss_7": 3.805855029821396, "epoch": 0.341, "grad_norm": 20.68401947266934, "kl_loss_13": 4286.0, "kl_loss_26": 3244.8, "kl_loss_39": 2061.6, "kl_loss_7": 4924.4, "learning_rate": 0.0007486246531301177, "loss": 7295.1, "step": 3410 }, { "ce_loss_13": 3.532993698120117, "ce_loss_26": 3.0362183272838594, "ce_loss_39": 2.485447385907173, "ce_loss_52": 1.4526691198349, "ce_loss_7": 3.830386519432068, "epoch": 0.342, "grad_norm": 22.222401081185772, "kl_loss_13": 4299.6, "kl_loss_26": 3249.2, "kl_loss_39": 2080.4, "kl_loss_7": 4928.0, "learning_rate": 0.0007472468026127384, "loss": 7341.7, "step": 3420 }, { "ce_loss_13": 3.463904342055321, "ce_loss_26": 2.9640887469053268, "ce_loss_39": 2.421750417351723, "ce_loss_52": 1.4137367144227029, "ce_loss_7": 3.7643026977777483, "epoch": 0.343, "grad_norm": 21.63494224797145, "kl_loss_13": 4250.8, "kl_loss_26": 3196.2, "kl_loss_39": 2039.9, "kl_loss_7": 4881.6, "learning_rate": 0.000745866462322802, "loss": 7230.95, "step": 3430 }, { "ce_loss_13": 3.5996195137500764, "ce_loss_26": 3.093309980630875, "ce_loss_39": 2.5379314005374907, "ce_loss_52": 1.4943716078996658, "ce_loss_7": 3.899878454208374, "epoch": 0.344, "grad_norm": 23.12022506478991, "kl_loss_13": 4332.0, "kl_loss_26": 3268.0, "kl_loss_39": 2085.8, "kl_loss_7": 4968.0, "learning_rate": 0.0007444836461603195, "loss": 7294.5, "step": 3440 }, { "ce_loss_13": 3.460267198085785, "ce_loss_26": 2.979328769445419, "ce_loss_39": 2.435830682516098, "ce_loss_52": 1.409597858786583, "ce_loss_7": 3.7637628614902496, "epoch": 0.345, "grad_norm": 22.11179077444158, "kl_loss_13": 4240.0, "kl_loss_26": 3220.0, "kl_loss_39": 2052.4, "kl_loss_7": 4872.8, "learning_rate": 0.0007430983680502344, "loss": 7260.3, "step": 3450 }, { "ce_loss_13": 3.484216260910034, "ce_loss_26": 2.986140418052673, "ce_loss_39": 2.4419540107250213, "ce_loss_52": 1.423793789744377, "ce_loss_7": 3.783113992214203, "epoch": 0.346, "grad_norm": 21.48292293909848, "kl_loss_13": 4242.0, "kl_loss_26": 3200.0, "kl_loss_39": 2037.4, "kl_loss_7": 4868.0, "learning_rate": 0.0007417106419422819, "loss": 7210.1, "step": 3460 }, { "ce_loss_13": 3.4797898173332213, "ce_loss_26": 2.9807622492313386, "ce_loss_39": 2.43132506608963, "ce_loss_52": 1.4044816851615907, "ce_loss_7": 3.782477653026581, "epoch": 0.347, "grad_norm": 21.84069780374938, "kl_loss_13": 4286.0, "kl_loss_26": 3248.0, "kl_loss_39": 2073.6, "kl_loss_7": 4917.6, "learning_rate": 0.0007403204818108486, "loss": 7232.3, "step": 3470 }, { "ce_loss_13": 3.461978626251221, "ce_loss_26": 2.971747863292694, "ce_loss_39": 2.4197792381048204, "ce_loss_52": 1.411722904443741, "ce_loss_7": 3.7553564965724946, "epoch": 0.348, "grad_norm": 20.773261789734953, "kl_loss_13": 4206.0, "kl_loss_26": 3177.2, "kl_loss_39": 2006.4, "kl_loss_7": 4823.2, "learning_rate": 0.0007389279016548316, "loss": 7200.0, "step": 3480 }, { "ce_loss_13": 3.412698417901993, "ce_loss_26": 2.910679543018341, "ce_loss_39": 2.3608890056610106, "ce_loss_52": 1.3876498267054558, "ce_loss_7": 3.7148698210716247, "epoch": 0.349, "grad_norm": 21.05607269401212, "kl_loss_13": 4174.0, "kl_loss_26": 3125.6, "kl_loss_39": 1966.0, "kl_loss_7": 4804.0, "learning_rate": 0.0007375329154974975, "loss": 7216.6, "step": 3490 }, { "ce_loss_13": 3.474409651756287, "ce_loss_26": 2.9683692157268524, "ce_loss_39": 2.418835300207138, "ce_loss_52": 1.4043258875608444, "ce_loss_7": 3.774983435869217, "epoch": 0.35, "grad_norm": 20.352131735407625, "kl_loss_13": 4246.8, "kl_loss_26": 3202.8, "kl_loss_39": 2042.2, "kl_loss_7": 4875.2, "learning_rate": 0.0007361355373863414, "loss": 7202.7, "step": 3500 }, { "ce_loss_13": 3.4584279537200926, "ce_loss_26": 2.9674349963665008, "ce_loss_39": 2.422105145454407, "ce_loss_52": 1.4255526602268218, "ce_loss_7": 3.767817974090576, "epoch": 0.351, "grad_norm": 20.416274052366226, "kl_loss_13": 4216.8, "kl_loss_26": 3188.8, "kl_loss_39": 2018.4, "kl_loss_7": 4854.4, "learning_rate": 0.0007347357813929454, "loss": 7180.1, "step": 3510 }, { "ce_loss_13": 3.4778851926326753, "ce_loss_26": 2.979369193315506, "ce_loss_39": 2.4347113519906998, "ce_loss_52": 1.4163423389196397, "ce_loss_7": 3.7732009410858156, "epoch": 0.352, "grad_norm": 24.260880475347793, "kl_loss_13": 4219.6, "kl_loss_26": 3191.6, "kl_loss_39": 2033.6, "kl_loss_7": 4844.0, "learning_rate": 0.0007333336616128369, "loss": 7181.8, "step": 3520 }, { "ce_loss_13": 3.479642480611801, "ce_loss_26": 2.9858607232570646, "ce_loss_39": 2.429011595249176, "ce_loss_52": 1.4224095463752746, "ce_loss_7": 3.779256856441498, "epoch": 0.353, "grad_norm": 20.532035835107088, "kl_loss_13": 4211.6, "kl_loss_26": 3185.2, "kl_loss_39": 2008.4, "kl_loss_7": 4841.6, "learning_rate": 0.0007319291921653463, "loss": 7183.4, "step": 3530 }, { "ce_loss_13": 3.4610216915607452, "ce_loss_26": 2.962309718132019, "ce_loss_39": 2.410178878903389, "ce_loss_52": 1.4115030318498611, "ce_loss_7": 3.757075273990631, "epoch": 0.354, "grad_norm": 23.640729954280236, "kl_loss_13": 4236.8, "kl_loss_26": 3190.0, "kl_loss_39": 2013.0, "kl_loss_7": 4864.8, "learning_rate": 0.0007305223871934656, "loss": 7181.4, "step": 3540 }, { "ce_loss_13": 3.5113906443119047, "ce_loss_26": 3.016271597146988, "ce_loss_39": 2.4817953169345857, "ce_loss_52": 1.4761293560266495, "ce_loss_7": 3.79822900891304, "epoch": 0.355, "grad_norm": 22.47350392427222, "kl_loss_13": 4203.2, "kl_loss_26": 3159.6, "kl_loss_39": 2013.8, "kl_loss_7": 4804.8, "learning_rate": 0.0007291132608637052, "loss": 7117.3, "step": 3550 }, { "ce_loss_13": 3.51672882437706, "ce_loss_26": 3.017621088027954, "ce_loss_39": 2.471283310651779, "ce_loss_52": 1.4747960895299912, "ce_loss_7": 3.8214517176151275, "epoch": 0.356, "grad_norm": 22.39492370466417, "kl_loss_13": 4220.0, "kl_loss_26": 3165.2, "kl_loss_39": 2000.0, "kl_loss_7": 4852.8, "learning_rate": 0.0007277018273659516, "loss": 7133.8, "step": 3560 }, { "ce_loss_13": 3.5502862453460695, "ce_loss_26": 3.05435825586319, "ce_loss_39": 2.5157745271921157, "ce_loss_52": 1.4953802406787873, "ce_loss_7": 3.8426124274730684, "epoch": 0.357, "grad_norm": 22.306092146539875, "kl_loss_13": 4247.6, "kl_loss_26": 3216.4, "kl_loss_39": 2067.2, "kl_loss_7": 4859.2, "learning_rate": 0.0007262881009133242, "loss": 7135.8, "step": 3570 }, { "ce_loss_13": 3.454521042108536, "ce_loss_26": 2.9510986149311065, "ce_loss_39": 2.4084193408489227, "ce_loss_52": 1.4200605943799018, "ce_loss_7": 3.7586602210998534, "epoch": 0.358, "grad_norm": 21.121962853812185, "kl_loss_13": 4202.8, "kl_loss_26": 3144.4, "kl_loss_39": 1986.2, "kl_loss_7": 4836.0, "learning_rate": 0.0007248720957420329, "loss": 7135.9, "step": 3580 }, { "ce_loss_13": 3.4299929022789, "ce_loss_26": 2.9262389481067657, "ce_loss_39": 2.3823306292295454, "ce_loss_52": 1.4015884697437286, "ce_loss_7": 3.7345054388046264, "epoch": 0.359, "grad_norm": 21.87103253394757, "kl_loss_13": 4194.4, "kl_loss_26": 3138.0, "kl_loss_39": 1982.0, "kl_loss_7": 4829.6, "learning_rate": 0.0007234538261112341, "loss": 7056.9, "step": 3590 }, { "ce_loss_13": 3.47941969037056, "ce_loss_26": 2.9830207943916323, "ce_loss_39": 2.429542663693428, "ce_loss_52": 1.4434623152017594, "ce_loss_7": 3.7679139375686646, "epoch": 0.36, "grad_norm": 21.216900982885303, "kl_loss_13": 4199.2, "kl_loss_26": 3151.6, "kl_loss_39": 1989.4, "kl_loss_7": 4814.0, "learning_rate": 0.0007220333063028871, "loss": 7096.6, "step": 3600 }, { "ce_loss_13": 3.3754841923713683, "ce_loss_26": 2.8831639885902405, "ce_loss_39": 2.3438637793064117, "ce_loss_52": 1.3872641950845719, "ce_loss_7": 3.681108373403549, "epoch": 0.361, "grad_norm": 21.80171673212867, "kl_loss_13": 4118.8, "kl_loss_26": 3083.2, "kl_loss_39": 1941.8, "kl_loss_7": 4758.0, "learning_rate": 0.0007206105506216106, "loss": 7029.4, "step": 3610 }, { "ce_loss_13": 3.548617047071457, "ce_loss_26": 3.054288852214813, "ce_loss_39": 2.504137873649597, "ce_loss_52": 1.4843981340527534, "ce_loss_7": 3.850842350721359, "epoch": 0.362, "grad_norm": 21.588123109222046, "kl_loss_13": 4246.4, "kl_loss_26": 3203.6, "kl_loss_39": 2032.4, "kl_loss_7": 4870.4, "learning_rate": 0.0007191855733945387, "loss": 7126.1, "step": 3620 }, { "ce_loss_13": 3.469277936220169, "ce_loss_26": 2.985336202383041, "ce_loss_39": 2.4588325411081313, "ce_loss_52": 1.4790852904319762, "ce_loss_7": 3.764431744813919, "epoch": 0.363, "grad_norm": 22.204733809635066, "kl_loss_13": 4129.2, "kl_loss_26": 3111.6, "kl_loss_39": 1976.2, "kl_loss_7": 4744.4, "learning_rate": 0.0007177583889711762, "loss": 7054.3, "step": 3630 }, { "ce_loss_13": 3.442378747463226, "ce_loss_26": 2.9400178849697114, "ce_loss_39": 2.394621509313583, "ce_loss_52": 1.417646163702011, "ce_loss_7": 3.743503212928772, "epoch": 0.364, "grad_norm": 21.957965313010384, "kl_loss_13": 4163.6, "kl_loss_26": 3123.2, "kl_loss_39": 1959.6, "kl_loss_7": 4796.0, "learning_rate": 0.0007163290117232541, "loss": 7054.5, "step": 3640 }, { "ce_loss_13": 3.4392197132110596, "ce_loss_26": 2.951560914516449, "ce_loss_39": 2.4180801689624785, "ce_loss_52": 1.4459613859653473, "ce_loss_7": 3.7318799614906313, "epoch": 0.365, "grad_norm": 21.451785041659093, "kl_loss_13": 4123.6, "kl_loss_26": 3098.8, "kl_loss_39": 1968.6, "kl_loss_7": 4730.8, "learning_rate": 0.0007148974560445859, "loss": 7029.7, "step": 3650 }, { "ce_loss_13": 3.4576940476894378, "ce_loss_26": 2.964629900455475, "ce_loss_39": 2.4162339717149734, "ce_loss_52": 1.4277015537023545, "ce_loss_7": 3.7549045085906982, "epoch": 0.366, "grad_norm": 22.817794972487214, "kl_loss_13": 4165.6, "kl_loss_26": 3135.6, "kl_loss_39": 1970.0, "kl_loss_7": 4792.8, "learning_rate": 0.0007134637363509209, "loss": 7013.0, "step": 3660 }, { "ce_loss_13": 3.5099750757217407, "ce_loss_26": 3.0163159906864165, "ce_loss_39": 2.4729519367218016, "ce_loss_52": 1.4603912830352783, "ce_loss_7": 3.8057311475276947, "epoch": 0.367, "grad_norm": 21.693779714382707, "kl_loss_13": 4227.6, "kl_loss_26": 3203.2, "kl_loss_39": 2043.8, "kl_loss_7": 4848.0, "learning_rate": 0.0007120278670798009, "loss": 7024.2, "step": 3670 }, { "ce_loss_13": 3.4791980743408204, "ce_loss_26": 2.992640608549118, "ce_loss_39": 2.4573631793260575, "ce_loss_52": 1.461830335855484, "ce_loss_7": 3.7739274382591246, "epoch": 0.368, "grad_norm": 22.105670609070703, "kl_loss_13": 4126.8, "kl_loss_26": 3112.4, "kl_loss_39": 1978.0, "kl_loss_7": 4751.2, "learning_rate": 0.0007105898626904133, "loss": 6924.7, "step": 3680 }, { "ce_loss_13": 3.4017152190208435, "ce_loss_26": 2.910390090942383, "ce_loss_39": 2.374891012907028, "ce_loss_52": 1.41865316927433, "ce_loss_7": 3.6938551664352417, "epoch": 0.369, "grad_norm": 20.08460323542704, "kl_loss_13": 4101.6, "kl_loss_26": 3071.6, "kl_loss_39": 1938.0, "kl_loss_7": 4706.8, "learning_rate": 0.0007091497376634463, "loss": 6952.1, "step": 3690 }, { "ce_loss_13": 3.4051457762718202, "ce_loss_26": 2.9142957627773285, "ce_loss_39": 2.3769975334405897, "ce_loss_52": 1.4461660206317901, "ce_loss_7": 3.7018753468990324, "epoch": 0.37, "grad_norm": 21.75095718081699, "kl_loss_13": 4034.8, "kl_loss_26": 3006.4, "kl_loss_39": 1877.2, "kl_loss_7": 4647.6, "learning_rate": 0.0007077075065009433, "loss": 6973.3, "step": 3700 }, { "ce_loss_13": 3.407693642377853, "ce_loss_26": 2.9163559854030607, "ce_loss_39": 2.3682307243347167, "ce_loss_52": 1.3965442717075347, "ce_loss_7": 3.7011303901672363, "epoch": 0.371, "grad_norm": 21.90346982831121, "kl_loss_13": 4126.8, "kl_loss_26": 3098.4, "kl_loss_39": 1951.8, "kl_loss_7": 4740.4, "learning_rate": 0.0007062631837261557, "loss": 6968.9, "step": 3710 }, { "ce_loss_13": 3.4375191271305083, "ce_loss_26": 2.9387122094631195, "ce_loss_39": 2.4049195408821107, "ce_loss_52": 1.4491820633411407, "ce_loss_7": 3.7319699347019197, "epoch": 0.372, "grad_norm": 22.15813884607567, "kl_loss_13": 4096.8, "kl_loss_26": 3062.8, "kl_loss_39": 1923.8, "kl_loss_7": 4716.0, "learning_rate": 0.0007048167838833977, "loss": 6892.9, "step": 3720 }, { "ce_loss_13": 3.443445736169815, "ce_loss_26": 2.9415276020765306, "ce_loss_39": 2.3946647971868513, "ce_loss_52": 1.4324263527989387, "ce_loss_7": 3.7482150912284853, "epoch": 0.373, "grad_norm": 20.533639539523726, "kl_loss_13": 4160.0, "kl_loss_26": 3107.6, "kl_loss_39": 1932.2, "kl_loss_7": 4794.8, "learning_rate": 0.0007033683215379002, "loss": 6994.9, "step": 3730 }, { "ce_loss_13": 3.440624713897705, "ce_loss_26": 2.932874071598053, "ce_loss_39": 2.384758135676384, "ce_loss_52": 1.4341223761439323, "ce_loss_7": 3.7404758751392366, "epoch": 0.374, "grad_norm": 22.169142032653717, "kl_loss_13": 4178.4, "kl_loss_26": 3120.4, "kl_loss_39": 1940.6, "kl_loss_7": 4809.6, "learning_rate": 0.0007019178112756625, "loss": 6960.1, "step": 3740 }, { "ce_loss_13": 3.479549217224121, "ce_loss_26": 2.9821768522262575, "ce_loss_39": 2.4379994481801988, "ce_loss_52": 1.4455731570720673, "ce_loss_7": 3.779719626903534, "epoch": 0.375, "grad_norm": 22.88722443184811, "kl_loss_13": 4190.8, "kl_loss_26": 3160.8, "kl_loss_39": 2000.2, "kl_loss_7": 4822.4, "learning_rate": 0.0007004652677033068, "loss": 6922.4, "step": 3750 }, { "ce_loss_13": 3.5048573672771455, "ce_loss_26": 2.996897077560425, "ce_loss_39": 2.4514291107654573, "ce_loss_52": 1.4677145808935166, "ce_loss_7": 3.8070162892341615, "epoch": 0.376, "grad_norm": 20.379791798469622, "kl_loss_13": 4200.4, "kl_loss_26": 3149.2, "kl_loss_39": 1988.0, "kl_loss_7": 4828.8, "learning_rate": 0.0006990107054479312, "loss": 6948.5, "step": 3760 }, { "ce_loss_13": 3.3857189416885376, "ce_loss_26": 2.8971070766448976, "ce_loss_39": 2.3661463767290116, "ce_loss_52": 1.4227028042078018, "ce_loss_7": 3.679090714454651, "epoch": 0.377, "grad_norm": 21.179119667844127, "kl_loss_13": 4050.8, "kl_loss_26": 3030.4, "kl_loss_39": 1892.4, "kl_loss_7": 4666.8, "learning_rate": 0.000697554139156961, "loss": 6941.0, "step": 3770 }, { "ce_loss_13": 3.512388813495636, "ce_loss_26": 3.0138610899448395, "ce_loss_39": 2.4606133818626406, "ce_loss_52": 1.4959001630544662, "ce_loss_7": 3.807480573654175, "epoch": 0.378, "grad_norm": 22.362162135534977, "kl_loss_13": 4145.2, "kl_loss_26": 3107.2, "kl_loss_39": 1951.0, "kl_loss_7": 4758.0, "learning_rate": 0.0006960955834980027, "loss": 6874.7, "step": 3780 }, { "ce_loss_13": 3.411291944980621, "ce_loss_26": 2.907497102022171, "ce_loss_39": 2.355439043045044, "ce_loss_52": 1.4057017982006073, "ce_loss_7": 3.7079379081726076, "epoch": 0.379, "grad_norm": 20.519862733845507, "kl_loss_13": 4121.2, "kl_loss_26": 3082.0, "kl_loss_39": 1917.0, "kl_loss_7": 4746.0, "learning_rate": 0.0006946350531586958, "loss": 6891.8, "step": 3790 }, { "ce_loss_13": 3.365473288297653, "ce_loss_26": 2.8626536786556245, "ce_loss_39": 2.3194735169410707, "ce_loss_52": 1.3925662517547608, "ce_loss_7": 3.661379265785217, "epoch": 0.38, "grad_norm": 21.211701089479526, "kl_loss_13": 4084.0, "kl_loss_26": 3048.4, "kl_loss_39": 1891.2, "kl_loss_7": 4702.0, "learning_rate": 0.0006931725628465643, "loss": 6889.0, "step": 3800 }, { "ce_loss_13": 3.375334745645523, "ce_loss_26": 2.891470319032669, "ce_loss_39": 2.3472714513540267, "ce_loss_52": 1.4055952280759811, "ce_loss_7": 3.667448806762695, "epoch": 0.381, "grad_norm": 22.083234786813115, "kl_loss_13": 4056.4, "kl_loss_26": 3038.0, "kl_loss_39": 1887.0, "kl_loss_7": 4669.2, "learning_rate": 0.0006917081272888696, "loss": 6821.1, "step": 3810 }, { "ce_loss_13": 3.413332349061966, "ce_loss_26": 2.918791648745537, "ce_loss_39": 2.3821532160043715, "ce_loss_52": 1.426029135286808, "ce_loss_7": 3.70468533039093, "epoch": 0.382, "grad_norm": 21.70379003852508, "kl_loss_13": 4066.4, "kl_loss_26": 3040.8, "kl_loss_39": 1897.2, "kl_loss_7": 4676.0, "learning_rate": 0.0006902417612324615, "loss": 6817.3, "step": 3820 }, { "ce_loss_13": 3.448424202203751, "ce_loss_26": 2.9532420337200165, "ce_loss_39": 2.389399054646492, "ce_loss_52": 1.4127800971269608, "ce_loss_7": 3.7562515437602997, "epoch": 0.383, "grad_norm": 22.611090782774035, "kl_loss_13": 4219.6, "kl_loss_26": 3187.2, "kl_loss_39": 2000.4, "kl_loss_7": 4858.4, "learning_rate": 0.00068877347944363, "loss": 6892.2, "step": 3830 }, { "ce_loss_13": 3.42295760512352, "ce_loss_26": 2.926942157745361, "ce_loss_39": 2.3873476177453994, "ce_loss_52": 1.439600521326065, "ce_loss_7": 3.719656354188919, "epoch": 0.384, "grad_norm": 20.756696465620674, "kl_loss_13": 4109.6, "kl_loss_26": 3061.2, "kl_loss_39": 1912.2, "kl_loss_7": 4727.6, "learning_rate": 0.0006873032967079561, "loss": 6876.7, "step": 3840 }, { "ce_loss_13": 3.4390079021453857, "ce_loss_26": 2.9532729268074034, "ce_loss_39": 2.4051371097564695, "ce_loss_52": 1.446793320775032, "ce_loss_7": 3.740493839979172, "epoch": 0.385, "grad_norm": 20.683464166323773, "kl_loss_13": 4102.8, "kl_loss_26": 3080.8, "kl_loss_39": 1919.2, "kl_loss_7": 4729.6, "learning_rate": 0.0006858312278301637, "loss": 6878.2, "step": 3850 }, { "ce_loss_13": 3.368044465780258, "ce_loss_26": 2.8783551871776583, "ce_loss_39": 2.35613272190094, "ce_loss_52": 1.4377225756645202, "ce_loss_7": 3.657758867740631, "epoch": 0.386, "grad_norm": 22.01788101919845, "kl_loss_13": 3983.6, "kl_loss_26": 2964.0, "kl_loss_39": 1838.2, "kl_loss_7": 4592.4, "learning_rate": 0.0006843572876339704, "loss": 6809.2, "step": 3860 }, { "ce_loss_13": 3.3198332667350767, "ce_loss_26": 2.8321444630622863, "ce_loss_39": 2.2942576706409454, "ce_loss_52": 1.394131037592888, "ce_loss_7": 3.6043868601322173, "epoch": 0.387, "grad_norm": 23.448962125354107, "kl_loss_13": 3965.6, "kl_loss_26": 2956.0, "kl_loss_39": 1822.8, "kl_loss_7": 4560.0, "learning_rate": 0.0006828814909619373, "loss": 6798.0, "step": 3870 }, { "ce_loss_13": 3.3714381575584413, "ce_loss_26": 2.88772599697113, "ce_loss_39": 2.3584464609622957, "ce_loss_52": 1.4422439962625504, "ce_loss_7": 3.662185198068619, "epoch": 0.388, "grad_norm": 22.031425075321017, "kl_loss_13": 3995.6, "kl_loss_26": 2987.6, "kl_loss_39": 1858.6, "kl_loss_7": 4603.2, "learning_rate": 0.0006814038526753205, "loss": 6790.2, "step": 3880 }, { "ce_loss_13": 3.4292624831199645, "ce_loss_26": 2.9387787103652956, "ce_loss_39": 2.392472392320633, "ce_loss_52": 1.458945381641388, "ce_loss_7": 3.722714525461197, "epoch": 0.389, "grad_norm": 21.623492145702706, "kl_loss_13": 4048.0, "kl_loss_26": 3026.8, "kl_loss_39": 1877.2, "kl_loss_7": 4660.8, "learning_rate": 0.0006799243876539213, "loss": 6774.2, "step": 3890 }, { "ce_loss_13": 3.398139035701752, "ce_loss_26": 2.903027367591858, "ce_loss_39": 2.351736932992935, "ce_loss_52": 1.4211576133966446, "ce_loss_7": 3.6922240018844605, "epoch": 0.39, "grad_norm": 20.862431117162615, "kl_loss_13": 4048.8, "kl_loss_26": 3010.0, "kl_loss_39": 1853.6, "kl_loss_7": 4666.4, "learning_rate": 0.0006784431107959359, "loss": 6774.2, "step": 3900 }, { "ce_loss_13": 3.442742919921875, "ce_loss_26": 2.950921058654785, "ce_loss_39": 2.407829362154007, "ce_loss_52": 1.4671964168548584, "ce_loss_7": 3.7354746580123903, "epoch": 0.391, "grad_norm": 22.203660063445458, "kl_loss_13": 4065.6, "kl_loss_26": 3044.4, "kl_loss_39": 1893.8, "kl_loss_7": 4682.8, "learning_rate": 0.0006769600370178059, "loss": 6751.0, "step": 3910 }, { "ce_loss_13": 3.3321305394172667, "ce_loss_26": 2.8416285693645476, "ce_loss_39": 2.3153179585933685, "ce_loss_52": 1.3940225571393967, "ce_loss_7": 3.628729373216629, "epoch": 0.392, "grad_norm": 20.321578181458975, "kl_loss_13": 3993.6, "kl_loss_26": 2972.0, "kl_loss_39": 1845.6, "kl_loss_7": 4610.8, "learning_rate": 0.0006754751812540679, "loss": 6716.4, "step": 3920 }, { "ce_loss_13": 3.382316732406616, "ce_loss_26": 2.8903696179389953, "ce_loss_39": 2.353957489132881, "ce_loss_52": 1.433479717373848, "ce_loss_7": 3.671325671672821, "epoch": 0.393, "grad_norm": 21.271101889195098, "kl_loss_13": 4022.4, "kl_loss_26": 2998.4, "kl_loss_39": 1861.0, "kl_loss_7": 4629.6, "learning_rate": 0.0006739885584572025, "loss": 6776.3, "step": 3930 }, { "ce_loss_13": 3.300927424430847, "ce_loss_26": 2.820311403274536, "ce_loss_39": 2.2964976727962494, "ce_loss_52": 1.4115092635154725, "ce_loss_7": 3.5904260516166686, "epoch": 0.394, "grad_norm": 20.82909920071906, "kl_loss_13": 3946.8, "kl_loss_26": 2937.6, "kl_loss_39": 1809.0, "kl_loss_7": 4550.0, "learning_rate": 0.0006725001835974853, "loss": 6768.3, "step": 3940 }, { "ce_loss_13": 3.3879170179367066, "ce_loss_26": 2.897449654340744, "ce_loss_39": 2.3513225704431533, "ce_loss_52": 1.423241639137268, "ce_loss_7": 3.6786913871765137, "epoch": 0.395, "grad_norm": 21.851496416724263, "kl_loss_13": 4040.0, "kl_loss_26": 3011.6, "kl_loss_39": 1852.2, "kl_loss_7": 4646.4, "learning_rate": 0.0006710100716628344, "loss": 6704.8, "step": 3950 }, { "ce_loss_13": 3.365324836969376, "ce_loss_26": 2.8627363234758376, "ce_loss_39": 2.30602003633976, "ce_loss_52": 1.3922662898898124, "ce_loss_7": 3.670176440477371, "epoch": 0.396, "grad_norm": 19.95099507420605, "kl_loss_13": 4070.0, "kl_loss_26": 3011.6, "kl_loss_39": 1834.6, "kl_loss_7": 4694.8, "learning_rate": 0.0006695182376586602, "loss": 6737.9, "step": 3960 }, { "ce_loss_13": 3.3035158634185793, "ce_loss_26": 2.820575511455536, "ce_loss_39": 2.2792143374681473, "ce_loss_52": 1.3631332144141197, "ce_loss_7": 3.5984981656074524, "epoch": 0.397, "grad_norm": 21.419651305345628, "kl_loss_13": 4016.4, "kl_loss_26": 2992.4, "kl_loss_39": 1851.6, "kl_loss_7": 4639.2, "learning_rate": 0.000668024696607715, "loss": 6659.1, "step": 3970 }, { "ce_loss_13": 3.2616395235061644, "ce_loss_26": 2.784494936466217, "ce_loss_39": 2.2700316429138185, "ce_loss_52": 1.3974194526672363, "ce_loss_7": 3.548482429981232, "epoch": 0.398, "grad_norm": 20.984610674219567, "kl_loss_13": 3844.8, "kl_loss_26": 2848.4, "kl_loss_39": 1755.2, "kl_loss_7": 4441.6, "learning_rate": 0.0006665294635499404, "loss": 6600.0, "step": 3980 }, { "ce_loss_13": 3.3228425204753878, "ce_loss_26": 2.836167597770691, "ce_loss_39": 2.3086318761110305, "ce_loss_52": 1.431598064303398, "ce_loss_7": 3.617774724960327, "epoch": 0.399, "grad_norm": 20.48469634280121, "kl_loss_13": 3879.6, "kl_loss_26": 2876.0, "kl_loss_39": 1769.2, "kl_loss_7": 4495.2, "learning_rate": 0.0006650325535423167, "loss": 6653.5, "step": 3990 }, { "ce_loss_13": 3.3134547114372253, "ce_loss_26": 2.8307457506656646, "ce_loss_39": 2.29368577003479, "ce_loss_52": 1.3969372153282165, "ce_loss_7": 3.6135133028030397, "epoch": 0.4, "grad_norm": 21.23260680511818, "kl_loss_13": 3962.8, "kl_loss_26": 2951.2, "kl_loss_39": 1800.2, "kl_loss_7": 4588.0, "learning_rate": 0.0006635339816587109, "loss": 6715.2, "step": 4000 }, { "ce_loss_13": 3.478778451681137, "ce_loss_26": 2.984225571155548, "ce_loss_39": 2.432309350371361, "ce_loss_52": 1.4644457131624222, "ce_loss_7": 3.782983124256134, "epoch": 0.401, "grad_norm": 21.3701964180473, "kl_loss_13": 4117.6, "kl_loss_26": 3090.4, "kl_loss_39": 1932.4, "kl_loss_7": 4754.8, "learning_rate": 0.0006620337629897252, "loss": 6698.2, "step": 4010 }, { "ce_loss_13": 3.3284165620803834, "ce_loss_26": 2.8434600114822386, "ce_loss_39": 2.311116448044777, "ce_loss_52": 1.4240004986524581, "ce_loss_7": 3.625540155172348, "epoch": 0.402, "grad_norm": 20.004792328254855, "kl_loss_13": 3939.2, "kl_loss_26": 2918.4, "kl_loss_39": 1780.6, "kl_loss_7": 4553.2, "learning_rate": 0.0006605319126425454, "loss": 6664.7, "step": 4020 }, { "ce_loss_13": 3.366453301906586, "ce_loss_26": 2.86657951772213, "ce_loss_39": 2.3278372526168822, "ce_loss_52": 1.4299451738595963, "ce_loss_7": 3.6654918253421784, "epoch": 0.403, "grad_norm": 20.588190398863947, "kl_loss_13": 4014.8, "kl_loss_26": 2970.0, "kl_loss_39": 1823.4, "kl_loss_7": 4628.8, "learning_rate": 0.0006590284457407876, "loss": 6644.4, "step": 4030 }, { "ce_loss_13": 3.3478448331356048, "ce_loss_26": 2.870484399795532, "ce_loss_39": 2.341677349805832, "ce_loss_52": 1.4598551213741302, "ce_loss_7": 3.644811862707138, "epoch": 0.404, "grad_norm": 20.8221732353937, "kl_loss_13": 3908.8, "kl_loss_26": 2901.6, "kl_loss_39": 1782.0, "kl_loss_7": 4526.4, "learning_rate": 0.0006575233774243465, "loss": 6645.55, "step": 4040 }, { "ce_loss_13": 3.28169704079628, "ce_loss_26": 2.7861692667007447, "ce_loss_39": 2.2428950667381287, "ce_loss_52": 1.374313686788082, "ce_loss_7": 3.5797726988792418, "epoch": 0.405, "grad_norm": 21.110763703238916, "kl_loss_13": 3936.4, "kl_loss_26": 2902.0, "kl_loss_39": 1754.0, "kl_loss_7": 4561.2, "learning_rate": 0.0006560167228492435, "loss": 6646.3, "step": 4050 }, { "ce_loss_13": 3.44179083108902, "ce_loss_26": 2.9405432820320128, "ce_loss_39": 2.3968080401420595, "ce_loss_52": 1.4659699857234956, "ce_loss_7": 3.736131912469864, "epoch": 0.406, "grad_norm": 20.50041516447212, "kl_loss_13": 4061.2, "kl_loss_26": 3015.6, "kl_loss_39": 1863.6, "kl_loss_7": 4672.4, "learning_rate": 0.0006545084971874737, "loss": 6655.9, "step": 4060 }, { "ce_loss_13": 3.360958731174469, "ce_loss_26": 2.870532661676407, "ce_loss_39": 2.321683007478714, "ce_loss_52": 1.4091651737689972, "ce_loss_7": 3.6588110864162444, "epoch": 0.407, "grad_norm": 20.463583112470857, "kl_loss_13": 3997.2, "kl_loss_26": 2979.2, "kl_loss_39": 1826.4, "kl_loss_7": 4618.8, "learning_rate": 0.0006529987156268526, "loss": 6617.7, "step": 4070 }, { "ce_loss_13": 3.2686664044857023, "ce_loss_26": 2.7765056490898132, "ce_loss_39": 2.238715943694115, "ce_loss_52": 1.362110722064972, "ce_loss_7": 3.557782357931137, "epoch": 0.408, "grad_norm": 21.232766427379584, "kl_loss_13": 3924.8, "kl_loss_26": 2908.8, "kl_loss_39": 1777.2, "kl_loss_7": 4538.0, "learning_rate": 0.0006514873933708637, "loss": 6653.7, "step": 4080 }, { "ce_loss_13": 3.275262689590454, "ce_loss_26": 2.7900135934352877, "ce_loss_39": 2.2641283214092254, "ce_loss_52": 1.3853250756859778, "ce_loss_7": 3.584109377861023, "epoch": 0.409, "grad_norm": 21.583109836521306, "kl_loss_13": 3887.6, "kl_loss_26": 2868.8, "kl_loss_39": 1755.0, "kl_loss_7": 4526.4, "learning_rate": 0.0006499745456385053, "loss": 6553.8, "step": 4090 }, { "ce_loss_13": 3.3286924988031386, "ce_loss_26": 2.8467950344085695, "ce_loss_39": 2.312630409002304, "ce_loss_52": 1.424817180633545, "ce_loss_7": 3.6197818219661713, "epoch": 0.41, "grad_norm": 20.90973793223202, "kl_loss_13": 3933.2, "kl_loss_26": 2921.2, "kl_loss_39": 1789.5, "kl_loss_7": 4539.2, "learning_rate": 0.0006484601876641375, "loss": 6620.35, "step": 4100 }, { "ce_loss_13": 3.414019340276718, "ce_loss_26": 2.922485715150833, "ce_loss_39": 2.3825585186481475, "ce_loss_52": 1.4558052003383637, "ce_loss_7": 3.709526652097702, "epoch": 0.411, "grad_norm": 21.026331487000416, "kl_loss_13": 4013.6, "kl_loss_26": 2986.4, "kl_loss_39": 1846.6, "kl_loss_7": 4632.0, "learning_rate": 0.000646944334697328, "loss": 6576.6, "step": 4110 }, { "ce_loss_13": 3.3258216440677644, "ce_loss_26": 2.848787486553192, "ce_loss_39": 2.3223425179719923, "ce_loss_52": 1.4663115084171294, "ce_loss_7": 3.604213911294937, "epoch": 0.412, "grad_norm": 20.97048408186008, "kl_loss_13": 3820.4, "kl_loss_26": 2832.4, "kl_loss_39": 1718.8, "kl_loss_7": 4410.0, "learning_rate": 0.0006454270020026995, "loss": 6611.1, "step": 4120 }, { "ce_loss_13": 3.3510149538517, "ce_loss_26": 2.853396385908127, "ce_loss_39": 2.3119456827640534, "ce_loss_52": 1.430704912543297, "ce_loss_7": 3.642722541093826, "epoch": 0.413, "grad_norm": 21.962054178499802, "kl_loss_13": 3953.6, "kl_loss_26": 2916.4, "kl_loss_39": 1780.4, "kl_loss_7": 4556.4, "learning_rate": 0.0006439082048597755, "loss": 6584.4, "step": 4130 }, { "ce_loss_13": 3.3082414746284483, "ce_loss_26": 2.8318909227848055, "ce_loss_39": 2.309774273633957, "ce_loss_52": 1.4488343179225922, "ce_loss_7": 3.6024456560611724, "epoch": 0.414, "grad_norm": 21.685181722314038, "kl_loss_13": 3830.8, "kl_loss_26": 2848.8, "kl_loss_39": 1746.2, "kl_loss_7": 4442.8, "learning_rate": 0.0006423879585628261, "loss": 6547.1, "step": 4140 }, { "ce_loss_13": 3.3523667633533476, "ce_loss_26": 2.855451303720474, "ce_loss_39": 2.3214808642864226, "ce_loss_52": 1.434419831633568, "ce_loss_7": 3.63877694606781, "epoch": 0.415, "grad_norm": 20.289217196894946, "kl_loss_13": 3971.6, "kl_loss_26": 2938.8, "kl_loss_39": 1799.8, "kl_loss_7": 4575.6, "learning_rate": 0.0006408662784207149, "loss": 6535.7, "step": 4150 }, { "ce_loss_13": 3.351851773262024, "ce_loss_26": 2.864642024040222, "ce_loss_39": 2.3303563445806503, "ce_loss_52": 1.417774812877178, "ce_loss_7": 3.6486425340175628, "epoch": 0.416, "grad_norm": 20.99970502270892, "kl_loss_13": 4014.4, "kl_loss_26": 2990.4, "kl_loss_39": 1838.4, "kl_loss_7": 4632.8, "learning_rate": 0.0006393431797567439, "loss": 6546.0, "step": 4160 }, { "ce_loss_13": 3.3471143901348115, "ce_loss_26": 2.866150665283203, "ce_loss_39": 2.3319087445735933, "ce_loss_52": 1.4416520655155183, "ce_loss_7": 3.6357293486595155, "epoch": 0.417, "grad_norm": 21.192503416792082, "kl_loss_13": 3909.6, "kl_loss_26": 2915.8, "kl_loss_39": 1798.8, "kl_loss_7": 4510.8, "learning_rate": 0.0006378186779084996, "loss": 6527.0, "step": 4170 }, { "ce_loss_13": 3.316433811187744, "ce_loss_26": 2.8411558747291563, "ce_loss_39": 2.3257210671901705, "ce_loss_52": 1.4415073692798615, "ce_loss_7": 3.604754400253296, "epoch": 0.418, "grad_norm": 20.807157262193087, "kl_loss_13": 3869.2, "kl_loss_26": 2884.4, "kl_loss_39": 1787.2, "kl_loss_7": 4470.0, "learning_rate": 0.0006362927882276989, "loss": 6561.5, "step": 4180 }, { "ce_loss_13": 3.339698684215546, "ce_loss_26": 2.85506985783577, "ce_loss_39": 2.311668387055397, "ce_loss_52": 1.4213855370879174, "ce_loss_7": 3.6274226009845734, "epoch": 0.419, "grad_norm": 22.02061084804507, "kl_loss_13": 3937.6, "kl_loss_26": 2928.0, "kl_loss_39": 1790.0, "kl_loss_7": 4535.2, "learning_rate": 0.000634765526080034, "loss": 6534.9, "step": 4190 }, { "ce_loss_13": 3.304267328977585, "ce_loss_26": 2.8152280390262603, "ce_loss_39": 2.277574297785759, "ce_loss_52": 1.3985714688897133, "ce_loss_7": 3.598735523223877, "epoch": 0.42, "grad_norm": 19.98512507622475, "kl_loss_13": 3911.6, "kl_loss_26": 2888.0, "kl_loss_39": 1761.0, "kl_loss_7": 4520.8, "learning_rate": 0.0006332369068450174, "loss": 6522.4, "step": 4200 }, { "ce_loss_13": 3.2766413748264314, "ce_loss_26": 2.793798440694809, "ce_loss_39": 2.2647649705410005, "ce_loss_52": 1.404912966489792, "ce_loss_7": 3.5614658653736115, "epoch": 0.421, "grad_norm": 21.98548722652707, "kl_loss_13": 3862.8, "kl_loss_26": 2858.0, "kl_loss_39": 1744.0, "kl_loss_7": 4457.6, "learning_rate": 0.0006317069459158283, "loss": 6461.5, "step": 4210 }, { "ce_loss_13": 3.303953742980957, "ce_loss_26": 2.817542538046837, "ce_loss_39": 2.2884632468223574, "ce_loss_52": 1.414811021089554, "ce_loss_7": 3.5892197132110595, "epoch": 0.422, "grad_norm": 21.22519434260025, "kl_loss_13": 3878.0, "kl_loss_26": 2873.4, "kl_loss_39": 1750.6, "kl_loss_7": 4478.4, "learning_rate": 0.0006301756586991561, "loss": 6510.1, "step": 4220 }, { "ce_loss_13": 3.3445753276348116, "ce_loss_26": 2.8686348736286162, "ce_loss_39": 2.3429405450820924, "ce_loss_52": 1.4785997077822686, "ce_loss_7": 3.6385815382003783, "epoch": 0.423, "grad_norm": 19.644468709446457, "kl_loss_13": 3856.4, "kl_loss_26": 2870.4, "kl_loss_39": 1760.0, "kl_loss_7": 4462.4, "learning_rate": 0.0006286430606150459, "loss": 6493.9, "step": 4230 }, { "ce_loss_13": 3.3064311265945436, "ce_loss_26": 2.8393130600452423, "ce_loss_39": 2.321084627509117, "ce_loss_52": 1.4542193472385407, "ce_loss_7": 3.591093236207962, "epoch": 0.424, "grad_norm": 19.629096721329073, "kl_loss_13": 3821.6, "kl_loss_26": 2834.0, "kl_loss_39": 1732.2, "kl_loss_7": 4414.0, "learning_rate": 0.0006271091670967436, "loss": 6458.7, "step": 4240 }, { "ce_loss_13": 3.335456043481827, "ce_loss_26": 2.8595637679100037, "ce_loss_39": 2.319040137529373, "ce_loss_52": 1.4537455767393113, "ce_loss_7": 3.6277152955532075, "epoch": 0.425, "grad_norm": 22.23231840847551, "kl_loss_13": 3861.2, "kl_loss_26": 2865.2, "kl_loss_39": 1725.4, "kl_loss_7": 4470.8, "learning_rate": 0.0006255739935905395, "loss": 6438.9, "step": 4250 }, { "ce_loss_13": 3.312756323814392, "ce_loss_26": 2.828430265188217, "ce_loss_39": 2.290999186038971, "ce_loss_52": 1.4147280350327491, "ce_loss_7": 3.606942754983902, "epoch": 0.426, "grad_norm": 22.096816976355754, "kl_loss_13": 3909.6, "kl_loss_26": 2901.6, "kl_loss_39": 1772.8, "kl_loss_7": 4524.4, "learning_rate": 0.0006240375555556145, "loss": 6443.7, "step": 4260 }, { "ce_loss_13": 3.2455935895442964, "ce_loss_26": 2.764835333824158, "ce_loss_39": 2.2400916039943697, "ce_loss_52": 1.3983588561415672, "ce_loss_7": 3.5336900293827056, "epoch": 0.427, "grad_norm": 21.016189257560278, "kl_loss_13": 3811.6, "kl_loss_26": 2807.4, "kl_loss_39": 1694.2, "kl_loss_7": 4414.0, "learning_rate": 0.000622499868463882, "loss": 6395.1, "step": 4270 }, { "ce_loss_13": 3.3086226165294645, "ce_loss_26": 2.823494350910187, "ce_loss_39": 2.301421931385994, "ce_loss_52": 1.4462398916482926, "ce_loss_7": 3.595276767015457, "epoch": 0.428, "grad_norm": 21.54081801528653, "kl_loss_13": 3848.8, "kl_loss_26": 2835.6, "kl_loss_39": 1725.6, "kl_loss_7": 4449.6, "learning_rate": 0.0006209609477998338, "loss": 6429.2, "step": 4280 }, { "ce_loss_13": 3.34853395819664, "ce_loss_26": 2.8662350177764893, "ce_loss_39": 2.340099334716797, "ce_loss_52": 1.455578488111496, "ce_loss_7": 3.6379907071590423, "epoch": 0.429, "grad_norm": 22.431607304032426, "kl_loss_13": 3907.2, "kl_loss_26": 2905.6, "kl_loss_39": 1784.0, "kl_loss_7": 4512.8, "learning_rate": 0.0006194208090603844, "loss": 6469.9, "step": 4290 }, { "ce_loss_13": 3.2378364205360413, "ce_loss_26": 2.771626591682434, "ce_loss_39": 2.2533502638339997, "ce_loss_52": 1.4288378104567527, "ce_loss_7": 3.523706406354904, "epoch": 0.43, "grad_norm": 19.478272699999504, "kl_loss_13": 3771.6, "kl_loss_26": 2794.0, "kl_loss_39": 1687.8, "kl_loss_7": 4368.8, "learning_rate": 0.0006178794677547138, "loss": 6399.1, "step": 4300 }, { "ce_loss_13": 3.3320172011852263, "ce_loss_26": 2.8396951615810395, "ce_loss_39": 2.305786609649658, "ce_loss_52": 1.434322476387024, "ce_loss_7": 3.622057580947876, "epoch": 0.431, "grad_norm": 21.288095261764262, "kl_loss_13": 3913.6, "kl_loss_26": 2898.0, "kl_loss_39": 1761.0, "kl_loss_7": 4522.8, "learning_rate": 0.0006163369394041111, "loss": 6430.5, "step": 4310 }, { "ce_loss_13": 3.2775667309761047, "ce_loss_26": 2.79736185669899, "ce_loss_39": 2.2730204701423644, "ce_loss_52": 1.4288703322410583, "ce_loss_7": 3.562648755311966, "epoch": 0.432, "grad_norm": 22.037654673864616, "kl_loss_13": 3816.0, "kl_loss_26": 2814.0, "kl_loss_39": 1690.8, "kl_loss_7": 4412.8, "learning_rate": 0.0006147932395418205, "loss": 6392.0, "step": 4320 }, { "ce_loss_13": 3.2999020636081697, "ce_loss_26": 2.8127501010894775, "ce_loss_39": 2.2844816505908967, "ce_loss_52": 1.4203194737434388, "ce_loss_7": 3.5926522493362425, "epoch": 0.433, "grad_norm": 23.105450049389578, "kl_loss_13": 3851.2, "kl_loss_26": 2850.0, "kl_loss_39": 1733.2, "kl_loss_7": 4468.8, "learning_rate": 0.0006132483837128823, "loss": 6416.4, "step": 4330 }, { "ce_loss_13": 3.314070051908493, "ce_loss_26": 2.8264743953943254, "ce_loss_39": 2.295166790485382, "ce_loss_52": 1.4510357692837714, "ce_loss_7": 3.5982041239738463, "epoch": 0.434, "grad_norm": 21.907818649309228, "kl_loss_13": 3842.0, "kl_loss_26": 2836.0, "kl_loss_39": 1709.4, "kl_loss_7": 4428.8, "learning_rate": 0.0006117023874739772, "loss": 6437.0, "step": 4340 }, { "ce_loss_13": 3.296399414539337, "ce_loss_26": 2.8059658110141754, "ce_loss_39": 2.271949994564056, "ce_loss_52": 1.4154132261872292, "ce_loss_7": 3.588996112346649, "epoch": 0.435, "grad_norm": 21.889212253545118, "kl_loss_13": 3898.8, "kl_loss_26": 2887.2, "kl_loss_39": 1746.6, "kl_loss_7": 4508.4, "learning_rate": 0.0006101552663932703, "loss": 6431.3, "step": 4350 }, { "ce_loss_13": 3.306167459487915, "ce_loss_26": 2.8240922570228575, "ce_loss_39": 2.2931392163038256, "ce_loss_52": 1.4310514152050018, "ce_loss_7": 3.596520256996155, "epoch": 0.436, "grad_norm": 21.05075754740656, "kl_loss_13": 3860.0, "kl_loss_26": 2856.4, "kl_loss_39": 1737.6, "kl_loss_7": 4459.2, "learning_rate": 0.0006086070360502539, "loss": 6370.7, "step": 4360 }, { "ce_loss_13": 3.2818971514701842, "ce_loss_26": 2.8096925973892213, "ce_loss_39": 2.2902305334806443, "ce_loss_52": 1.457671320438385, "ce_loss_7": 3.567191207408905, "epoch": 0.437, "grad_norm": 19.98373918443626, "kl_loss_13": 3770.8, "kl_loss_26": 2788.4, "kl_loss_39": 1692.0, "kl_loss_7": 4367.2, "learning_rate": 0.0006070577120355903, "loss": 6341.1, "step": 4370 }, { "ce_loss_13": 3.317246896028519, "ce_loss_26": 2.84905891418457, "ce_loss_39": 2.322802722454071, "ce_loss_52": 1.4916655078530312, "ce_loss_7": 3.593036550283432, "epoch": 0.438, "grad_norm": 20.010722762004242, "kl_loss_13": 3781.2, "kl_loss_26": 2799.2, "kl_loss_39": 1686.2, "kl_loss_7": 4360.0, "learning_rate": 0.0006055073099509549, "loss": 6355.5, "step": 4380 }, { "ce_loss_13": 3.2864172756671906, "ce_loss_26": 2.8085566580295565, "ce_loss_39": 2.2875818789005278, "ce_loss_52": 1.4407859086990356, "ce_loss_7": 3.5741762936115267, "epoch": 0.439, "grad_norm": 21.10222653748024, "kl_loss_13": 3824.4, "kl_loss_26": 2822.4, "kl_loss_39": 1713.2, "kl_loss_7": 4426.0, "learning_rate": 0.0006039558454088796, "loss": 6354.5, "step": 4390 }, { "ce_loss_13": 3.2985159277915956, "ce_loss_26": 2.817130261659622, "ce_loss_39": 2.2846481442451476, "ce_loss_52": 1.4308176964521409, "ce_loss_7": 3.5788950502872465, "epoch": 0.44, "grad_norm": 22.10167857860873, "kl_loss_13": 3852.4, "kl_loss_26": 2856.8, "kl_loss_39": 1727.6, "kl_loss_7": 4434.8, "learning_rate": 0.0006024033340325954, "loss": 6381.3, "step": 4400 }, { "ce_loss_13": 3.2772581815719604, "ce_loss_26": 2.7952946066856383, "ce_loss_39": 2.259748488664627, "ce_loss_52": 1.4122898250818252, "ce_loss_7": 3.564402920007706, "epoch": 0.441, "grad_norm": 21.766013784294948, "kl_loss_13": 3854.4, "kl_loss_26": 2847.6, "kl_loss_39": 1727.0, "kl_loss_7": 4457.2, "learning_rate": 0.0006008497914558743, "loss": 6338.3, "step": 4410 }, { "ce_loss_13": 3.310656875371933, "ce_loss_26": 2.825407701730728, "ce_loss_39": 2.3054873913526537, "ce_loss_52": 1.4574851334095, "ce_loss_7": 3.595499175786972, "epoch": 0.442, "grad_norm": 23.195817032303225, "kl_loss_13": 3826.4, "kl_loss_26": 2816.8, "kl_loss_39": 1707.6, "kl_loss_7": 4415.6, "learning_rate": 0.0005992952333228728, "loss": 6415.4, "step": 4420 }, { "ce_loss_13": 3.147319358587265, "ce_loss_26": 2.6695513784885407, "ce_loss_39": 2.153283026814461, "ce_loss_52": 1.362196257710457, "ce_loss_7": 3.4255874812602998, "epoch": 0.443, "grad_norm": 21.365484698145746, "kl_loss_13": 3677.2, "kl_loss_26": 2685.6, "kl_loss_39": 1588.2, "kl_loss_7": 4260.0, "learning_rate": 0.0005977396752879741, "loss": 6284.8, "step": 4430 }, { "ce_loss_13": 3.2730862379074095, "ce_loss_26": 2.790391606092453, "ce_loss_39": 2.257637658715248, "ce_loss_52": 1.4242565602064132, "ce_loss_7": 3.5511350512504576, "epoch": 0.444, "grad_norm": 20.84050157821156, "kl_loss_13": 3810.8, "kl_loss_26": 2811.6, "kl_loss_39": 1686.6, "kl_loss_7": 4399.2, "learning_rate": 0.0005961831330156305, "loss": 6282.4, "step": 4440 }, { "ce_loss_13": 3.303426647186279, "ce_loss_26": 2.8194876074790955, "ce_loss_39": 2.286717027425766, "ce_loss_52": 1.4367665380239487, "ce_loss_7": 3.589170789718628, "epoch": 0.445, "grad_norm": 22.09664086851361, "kl_loss_13": 3832.4, "kl_loss_26": 2828.0, "kl_loss_39": 1702.2, "kl_loss_7": 4435.6, "learning_rate": 0.0005946256221802051, "loss": 6310.7, "step": 4450 }, { "ce_loss_13": 3.2220256984233857, "ce_loss_26": 2.747016179561615, "ce_loss_39": 2.2203843981027602, "ce_loss_52": 1.4144678741693497, "ce_loss_7": 3.5019364655017853, "epoch": 0.446, "grad_norm": 20.80212123158213, "kl_loss_13": 3738.0, "kl_loss_26": 2737.6, "kl_loss_39": 1635.2, "kl_loss_7": 4324.0, "learning_rate": 0.0005930671584658151, "loss": 6275.9, "step": 4460 }, { "ce_loss_13": 3.262040966749191, "ce_loss_26": 2.7848617672920226, "ce_loss_39": 2.2554671108722686, "ce_loss_52": 1.4137398272752761, "ce_loss_7": 3.5524380266666413, "epoch": 0.447, "grad_norm": 21.070671360315192, "kl_loss_13": 3794.4, "kl_loss_26": 2805.6, "kl_loss_39": 1697.0, "kl_loss_7": 4396.4, "learning_rate": 0.0005915077575661722, "loss": 6360.7, "step": 4470 }, { "ce_loss_13": 3.2109140872955324, "ce_loss_26": 2.7298059910535812, "ce_loss_39": 2.2085951179265977, "ce_loss_52": 1.3873766094446183, "ce_loss_7": 3.4926227211952208, "epoch": 0.448, "grad_norm": 21.38920961497166, "kl_loss_13": 3765.2, "kl_loss_26": 2759.4, "kl_loss_39": 1656.0, "kl_loss_7": 4359.6, "learning_rate": 0.000589947435184427, "loss": 6255.15, "step": 4480 }, { "ce_loss_13": 3.2468604743480682, "ce_loss_26": 2.7669826805591584, "ce_loss_39": 2.2381924211978914, "ce_loss_52": 1.4454800367355347, "ce_loss_7": 3.5310903012752535, "epoch": 0.449, "grad_norm": 23.74435148547982, "kl_loss_13": 3708.0, "kl_loss_26": 2716.8, "kl_loss_39": 1595.4, "kl_loss_7": 4307.2, "learning_rate": 0.0005883862070330078, "loss": 6262.9, "step": 4490 }, { "ce_loss_13": 3.2490183234214784, "ce_loss_26": 2.775378829240799, "ce_loss_39": 2.259204548597336, "ce_loss_52": 1.4259025424718856, "ce_loss_7": 3.532491201162338, "epoch": 0.45, "grad_norm": 19.921742072679248, "kl_loss_13": 3736.0, "kl_loss_26": 2748.4, "kl_loss_39": 1665.2, "kl_loss_7": 4324.4, "learning_rate": 0.0005868240888334653, "loss": 6279.4, "step": 4500 }, { "ce_loss_13": 3.2022728264331817, "ce_loss_26": 2.7251765221357345, "ce_loss_39": 2.2201029896736144, "ce_loss_52": 1.4243690267205238, "ce_loss_7": 3.4856902956962585, "epoch": 0.451, "grad_norm": 22.336812688943994, "kl_loss_13": 3701.6, "kl_loss_26": 2716.6, "kl_loss_39": 1627.0, "kl_loss_7": 4291.6, "learning_rate": 0.0005852610963163119, "loss": 6274.9, "step": 4510 }, { "ce_loss_13": 3.2056246638298034, "ce_loss_26": 2.735306566953659, "ce_loss_39": 2.2278982251882553, "ce_loss_52": 1.4315001338720321, "ce_loss_7": 3.4921528518199922, "epoch": 0.452, "grad_norm": 21.324834799180188, "kl_loss_13": 3671.2, "kl_loss_26": 2692.4, "kl_loss_39": 1610.6, "kl_loss_7": 4263.2, "learning_rate": 0.0005836972452208654, "loss": 6241.8, "step": 4520 }, { "ce_loss_13": 3.2711530566215514, "ce_loss_26": 2.794381695985794, "ce_loss_39": 2.2675902634859084, "ce_loss_52": 1.4365027844905853, "ce_loss_7": 3.5576207876205443, "epoch": 0.453, "grad_norm": 21.88775011761487, "kl_loss_13": 3792.8, "kl_loss_26": 2804.8, "kl_loss_39": 1700.8, "kl_loss_7": 4388.0, "learning_rate": 0.0005821325512950885, "loss": 6283.8, "step": 4530 }, { "ce_loss_13": 3.2904800713062285, "ce_loss_26": 2.8134379625320434, "ce_loss_39": 2.2945436596870423, "ce_loss_52": 1.476692470908165, "ce_loss_7": 3.5703530073165894, "epoch": 0.454, "grad_norm": 20.942055908262446, "kl_loss_13": 3751.6, "kl_loss_26": 2758.0, "kl_loss_39": 1655.0, "kl_loss_7": 4336.0, "learning_rate": 0.0005805670302954321, "loss": 6268.6, "step": 4540 }, { "ce_loss_13": 3.2201909184455872, "ce_loss_26": 2.737530159950256, "ce_loss_39": 2.2144886016845704, "ce_loss_52": 1.4106894597411155, "ce_loss_7": 3.5021199345588685, "epoch": 0.455, "grad_norm": 21.81892564093558, "kl_loss_13": 3758.8, "kl_loss_26": 2750.8, "kl_loss_39": 1624.2, "kl_loss_7": 4358.4, "learning_rate": 0.000579000697986675, "loss": 6232.8, "step": 4550 }, { "ce_loss_13": 3.2489894032478333, "ce_loss_26": 2.780492717027664, "ce_loss_39": 2.2484638780355453, "ce_loss_52": 1.4397764205932617, "ce_loss_7": 3.5364687144756317, "epoch": 0.456, "grad_norm": 21.02937623207538, "kl_loss_13": 3754.8, "kl_loss_26": 2776.8, "kl_loss_39": 1651.6, "kl_loss_7": 4345.6, "learning_rate": 0.0005774335701417662, "loss": 6241.6, "step": 4560 }, { "ce_loss_13": 3.2101799607276917, "ce_loss_26": 2.7431035935878754, "ce_loss_39": 2.2130339086055755, "ce_loss_52": 1.4163681983947753, "ce_loss_7": 3.4954857528209686, "epoch": 0.457, "grad_norm": 19.83299768002262, "kl_loss_13": 3705.2, "kl_loss_26": 2723.2, "kl_loss_39": 1618.4, "kl_loss_7": 4301.2, "learning_rate": 0.0005758656625416658, "loss": 6247.2, "step": 4570 }, { "ce_loss_13": 3.2813266932964327, "ce_loss_26": 2.793111354112625, "ce_loss_39": 2.2625322908163072, "ce_loss_52": 1.4499622374773025, "ce_loss_7": 3.5644542396068575, "epoch": 0.458, "grad_norm": 21.53754747250698, "kl_loss_13": 3797.6, "kl_loss_26": 2782.8, "kl_loss_39": 1647.4, "kl_loss_7": 4391.2, "learning_rate": 0.0005742969909751859, "loss": 6266.1, "step": 4580 }, { "ce_loss_13": 3.3478006780147553, "ce_loss_26": 2.8727428793907164, "ce_loss_39": 2.3303479075431826, "ce_loss_52": 1.4829013347625732, "ce_loss_7": 3.6364180862903597, "epoch": 0.459, "grad_norm": 21.34326820051386, "kl_loss_13": 3844.0, "kl_loss_26": 2840.4, "kl_loss_39": 1703.4, "kl_loss_7": 4441.6, "learning_rate": 0.0005727275712388318, "loss": 6209.7, "step": 4590 }, { "ce_loss_13": 3.285200160741806, "ce_loss_26": 2.807385641336441, "ce_loss_39": 2.275821554660797, "ce_loss_52": 1.4415770262479781, "ce_loss_7": 3.563661777973175, "epoch": 0.46, "grad_norm": 20.952340509651894, "kl_loss_13": 3807.6, "kl_loss_26": 2814.0, "kl_loss_39": 1683.4, "kl_loss_7": 4390.0, "learning_rate": 0.0005711574191366427, "loss": 6174.2, "step": 4600 }, { "ce_loss_13": 3.2419202089309693, "ce_loss_26": 2.7672870814800263, "ce_loss_39": 2.2521429657936096, "ce_loss_52": 1.4457757875323296, "ce_loss_7": 3.5175400972366333, "epoch": 0.461, "grad_norm": 20.667083707625775, "kl_loss_13": 3685.6, "kl_loss_26": 2706.8, "kl_loss_39": 1618.2, "kl_loss_7": 4262.4, "learning_rate": 0.0005695865504800327, "loss": 6154.6, "step": 4610 }, { "ce_loss_13": 3.2078490257263184, "ce_loss_26": 2.7373934209346773, "ce_loss_39": 2.2357589691877364, "ce_loss_52": 1.4443277925252915, "ce_loss_7": 3.4876007556915285, "epoch": 0.462, "grad_norm": 21.07091453525893, "kl_loss_13": 3645.6, "kl_loss_26": 2666.8, "kl_loss_39": 1598.2, "kl_loss_7": 4226.0, "learning_rate": 0.0005680149810876322, "loss": 6178.4, "step": 4620 }, { "ce_loss_13": 3.2486107409000398, "ce_loss_26": 2.7601176381111143, "ce_loss_39": 2.227588337659836, "ce_loss_52": 1.4004375696182252, "ce_loss_7": 3.532775843143463, "epoch": 0.463, "grad_norm": 21.569532668695917, "kl_loss_13": 3786.8, "kl_loss_26": 2780.4, "kl_loss_39": 1664.6, "kl_loss_7": 4386.0, "learning_rate": 0.0005664427267851271, "loss": 6215.6, "step": 4630 }, { "ce_loss_13": 3.2331403851509095, "ce_loss_26": 2.7579665184020996, "ce_loss_39": 2.233389773964882, "ce_loss_52": 1.4353074416518212, "ce_loss_7": 3.5171454668045046, "epoch": 0.464, "grad_norm": 21.55234160622014, "kl_loss_13": 3697.6, "kl_loss_26": 2708.8, "kl_loss_39": 1599.2, "kl_loss_7": 4292.8, "learning_rate": 0.0005648698034051009, "loss": 6233.0, "step": 4640 }, { "ce_loss_13": 3.264119005203247, "ce_loss_26": 2.784970927238464, "ce_loss_39": 2.2625187635421753, "ce_loss_52": 1.4551048219203948, "ce_loss_7": 3.536862540245056, "epoch": 0.465, "grad_norm": 21.943732618524454, "kl_loss_13": 3715.6, "kl_loss_26": 2727.6, "kl_loss_39": 1626.8, "kl_loss_7": 4294.8, "learning_rate": 0.0005632962267868747, "loss": 6180.9, "step": 4650 }, { "ce_loss_13": 3.124424380064011, "ce_loss_26": 2.670001748204231, "ce_loss_39": 2.161810302734375, "ce_loss_52": 1.3920277938246728, "ce_loss_7": 3.4072051107883454, "epoch": 0.466, "grad_norm": 19.992372851535457, "kl_loss_13": 3608.0, "kl_loss_26": 2641.4, "kl_loss_39": 1573.8, "kl_loss_7": 4190.0, "learning_rate": 0.0005617220127763474, "loss": 6158.8, "step": 4660 }, { "ce_loss_13": 3.2301677465438843, "ce_loss_26": 2.7488952726125717, "ce_loss_39": 2.2431567162275314, "ce_loss_52": 1.4423212110996246, "ce_loss_7": 3.5069880545139314, "epoch": 0.467, "grad_norm": 20.78261479761198, "kl_loss_13": 3680.4, "kl_loss_26": 2686.0, "kl_loss_39": 1603.8, "kl_loss_7": 4262.0, "learning_rate": 0.0005601471772258368, "loss": 6129.5, "step": 4670 }, { "ce_loss_13": 3.2057377636432647, "ce_loss_26": 2.7412378191947937, "ce_loss_39": 2.2341296702623366, "ce_loss_52": 1.4304189920425414, "ce_loss_7": 3.4817180752754213, "epoch": 0.468, "grad_norm": 20.848135049030358, "kl_loss_13": 3672.8, "kl_loss_26": 2706.4, "kl_loss_39": 1624.6, "kl_loss_7": 4245.6, "learning_rate": 0.0005585717359939192, "loss": 6123.3, "step": 4680 }, { "ce_loss_13": 3.233567637205124, "ce_loss_26": 2.768052551150322, "ce_loss_39": 2.25777924656868, "ce_loss_52": 1.4504828751087189, "ce_loss_7": 3.5125366508960725, "epoch": 0.469, "grad_norm": 20.709517983153315, "kl_loss_13": 3660.4, "kl_loss_26": 2696.0, "kl_loss_39": 1619.6, "kl_loss_7": 4241.2, "learning_rate": 0.0005569957049452703, "loss": 6101.6, "step": 4690 }, { "ce_loss_13": 3.2725342512130737, "ce_loss_26": 2.7866754591464997, "ce_loss_39": 2.2480324536561964, "ce_loss_52": 1.404937854409218, "ce_loss_7": 3.5641084611415863, "epoch": 0.47, "grad_norm": 20.633433200619724, "kl_loss_13": 3863.2, "kl_loss_26": 2851.6, "kl_loss_39": 1720.6, "kl_loss_7": 4468.0, "learning_rate": 0.0005554190999505056, "loss": 6211.0, "step": 4700 }, { "ce_loss_13": 3.2052918612957, "ce_loss_26": 2.732904624938965, "ce_loss_39": 2.2116665810346605, "ce_loss_52": 1.4257652133703231, "ce_loss_7": 3.4880705952644346, "epoch": 0.471, "grad_norm": 20.706622626666558, "kl_loss_13": 3660.0, "kl_loss_26": 2670.8, "kl_loss_39": 1571.8, "kl_loss_7": 4252.4, "learning_rate": 0.0005538419368860196, "loss": 6097.3, "step": 4710 }, { "ce_loss_13": 3.201172482967377, "ce_loss_26": 2.726384937763214, "ce_loss_39": 2.198946151137352, "ce_loss_52": 1.4113327443599701, "ce_loss_7": 3.487533462047577, "epoch": 0.472, "grad_norm": 21.400203482886983, "kl_loss_13": 3675.6, "kl_loss_26": 2693.2, "kl_loss_39": 1591.2, "kl_loss_7": 4272.0, "learning_rate": 0.0005522642316338268, "loss": 6121.3, "step": 4720 }, { "ce_loss_13": 3.224886018037796, "ce_loss_26": 2.7551407277584077, "ce_loss_39": 2.2328554034233092, "ce_loss_52": 1.4563202857971191, "ce_loss_7": 3.5022718131542208, "epoch": 0.473, "grad_norm": 21.66295321363831, "kl_loss_13": 3648.0, "kl_loss_26": 2662.8, "kl_loss_39": 1575.8, "kl_loss_7": 4219.6, "learning_rate": 0.0005506860000814017, "loss": 6051.3, "step": 4730 }, { "ce_loss_13": 3.2025643050670625, "ce_loss_26": 2.7286852061748506, "ce_loss_39": 2.21729561984539, "ce_loss_52": 1.453881350159645, "ce_loss_7": 3.4832063794136046, "epoch": 0.474, "grad_norm": 20.603608171505254, "kl_loss_13": 3623.6, "kl_loss_26": 2642.4, "kl_loss_39": 1549.2, "kl_loss_7": 4212.8, "learning_rate": 0.0005491072581215186, "loss": 6098.5, "step": 4740 }, { "ce_loss_13": 3.212699604034424, "ce_loss_26": 2.732209050655365, "ce_loss_39": 2.2089115262031553, "ce_loss_52": 1.4185278177261353, "ce_loss_7": 3.4970239818096163, "epoch": 0.475, "grad_norm": 20.455172908061705, "kl_loss_13": 3701.2, "kl_loss_26": 2708.4, "kl_loss_39": 1600.0, "kl_loss_7": 4294.0, "learning_rate": 0.0005475280216520913, "loss": 6092.7, "step": 4750 }, { "ce_loss_13": 3.161313956975937, "ce_loss_26": 2.6922851324081423, "ce_loss_39": 2.185682702064514, "ce_loss_52": 1.4132045745849608, "ce_loss_7": 3.4361780524253844, "epoch": 0.476, "grad_norm": 21.048159476746886, "kl_loss_13": 3632.8, "kl_loss_26": 2655.2, "kl_loss_39": 1574.6, "kl_loss_7": 4208.4, "learning_rate": 0.0005459483065760138, "loss": 6159.3, "step": 4760 }, { "ce_loss_13": 3.2172334492206573, "ce_loss_26": 2.739536887407303, "ce_loss_39": 2.2192301630973814, "ce_loss_52": 1.4282803654670715, "ce_loss_7": 3.5064621806144713, "epoch": 0.477, "grad_norm": 20.62269454966768, "kl_loss_13": 3708.4, "kl_loss_26": 2710.0, "kl_loss_39": 1610.2, "kl_loss_7": 4301.6, "learning_rate": 0.0005443681288009991, "loss": 6104.1, "step": 4770 }, { "ce_loss_13": 3.217255789041519, "ce_loss_26": 2.7369415044784544, "ce_loss_39": 2.2001087069511414, "ce_loss_52": 1.401385571062565, "ce_loss_7": 3.5032376050949097, "epoch": 0.478, "grad_norm": 20.40853719962087, "kl_loss_13": 3760.0, "kl_loss_26": 2763.2, "kl_loss_39": 1625.0, "kl_loss_7": 4353.2, "learning_rate": 0.0005427875042394199, "loss": 6064.0, "step": 4780 }, { "ce_loss_13": 3.199622023105621, "ce_loss_26": 2.734530872106552, "ce_loss_39": 2.2225404649972917, "ce_loss_52": 1.4552370458841324, "ce_loss_7": 3.475612831115723, "epoch": 0.479, "grad_norm": 21.111949592982874, "kl_loss_13": 3598.8, "kl_loss_26": 2623.2, "kl_loss_39": 1545.4, "kl_loss_7": 4177.6, "learning_rate": 0.0005412064488081482, "loss": 6074.2, "step": 4790 }, { "ce_loss_13": 3.1485446810722353, "ce_loss_26": 2.6845290422439576, "ce_loss_39": 2.1669752955436707, "ce_loss_52": 1.4116393029689789, "ce_loss_7": 3.423712509870529, "epoch": 0.48, "grad_norm": 20.355440951189763, "kl_loss_13": 3606.4, "kl_loss_26": 2638.4, "kl_loss_39": 1543.2, "kl_loss_7": 4182.4, "learning_rate": 0.0005396249784283942, "loss": 6051.0, "step": 4800 }, { "ce_loss_13": 3.1923361301422117, "ce_loss_26": 2.71637277007103, "ce_loss_39": 2.1978514790534973, "ce_loss_52": 1.4327621147036553, "ce_loss_7": 3.47632372379303, "epoch": 0.481, "grad_norm": 22.229636039543518, "kl_loss_13": 3644.0, "kl_loss_26": 2653.2, "kl_loss_39": 1555.8, "kl_loss_7": 4232.8, "learning_rate": 0.0005380431090255476, "loss": 6143.3, "step": 4810 }, { "ce_loss_13": 3.232038801908493, "ce_loss_26": 2.7648274183273314, "ce_loss_39": 2.2573492497205736, "ce_loss_52": 1.4371329843997955, "ce_loss_7": 3.5092617154121397, "epoch": 0.482, "grad_norm": 21.36587062891148, "kl_loss_13": 3704.8, "kl_loss_26": 2740.8, "kl_loss_39": 1651.2, "kl_loss_7": 4281.6, "learning_rate": 0.0005364608565290155, "loss": 6031.2, "step": 4820 }, { "ce_loss_13": 3.250445681810379, "ce_loss_26": 2.773394727706909, "ce_loss_39": 2.243200385570526, "ce_loss_52": 1.4556994497776032, "ce_loss_7": 3.536140114068985, "epoch": 0.483, "grad_norm": 20.760727607225178, "kl_loss_13": 3706.4, "kl_loss_26": 2711.2, "kl_loss_39": 1595.8, "kl_loss_7": 4300.0, "learning_rate": 0.0005348782368720626, "loss": 6094.3, "step": 4830 }, { "ce_loss_13": 3.2234844088554384, "ce_loss_26": 2.7573211640119553, "ce_loss_39": 2.244653856754303, "ce_loss_52": 1.4427421689033508, "ce_loss_7": 3.497196841239929, "epoch": 0.484, "grad_norm": 20.726340018616938, "kl_loss_13": 3685.6, "kl_loss_26": 2708.0, "kl_loss_39": 1610.0, "kl_loss_7": 4262.4, "learning_rate": 0.000533295265991652, "loss": 6062.9, "step": 4840 }, { "ce_loss_13": 3.152006584405899, "ce_loss_26": 2.678810328245163, "ce_loss_39": 2.166279435157776, "ce_loss_52": 1.3957384467124938, "ce_loss_7": 3.4304138660430907, "epoch": 0.485, "grad_norm": 21.522248137711077, "kl_loss_13": 3630.8, "kl_loss_26": 2639.0, "kl_loss_39": 1547.4, "kl_loss_7": 4218.0, "learning_rate": 0.0005317119598282822, "loss": 6033.9, "step": 4850 }, { "ce_loss_13": 3.2258131086826323, "ce_loss_26": 2.7538253903388976, "ce_loss_39": 2.237151172757149, "ce_loss_52": 1.4640001267194749, "ce_loss_7": 3.493991768360138, "epoch": 0.486, "grad_norm": 19.71152116189248, "kl_loss_13": 3656.8, "kl_loss_26": 2681.2, "kl_loss_39": 1581.0, "kl_loss_7": 4221.6, "learning_rate": 0.0005301283343258293, "loss": 6062.4, "step": 4860 }, { "ce_loss_13": 3.188614493608475, "ce_loss_26": 2.7119751185178758, "ce_loss_39": 2.189143994450569, "ce_loss_52": 1.422459150850773, "ce_loss_7": 3.468545514345169, "epoch": 0.487, "grad_norm": 20.57479406007355, "kl_loss_13": 3638.4, "kl_loss_26": 2642.2, "kl_loss_39": 1539.5, "kl_loss_7": 4222.0, "learning_rate": 0.000528544405431384, "loss": 6047.1, "step": 4870 }, { "ce_loss_13": 3.1630406379699707, "ce_loss_26": 2.6917948126792908, "ce_loss_39": 2.187080183625221, "ce_loss_52": 1.430996198952198, "ce_loss_7": 3.437908464670181, "epoch": 0.488, "grad_norm": 20.266256252328688, "kl_loss_13": 3589.2, "kl_loss_26": 2606.0, "kl_loss_39": 1535.4, "kl_loss_7": 4160.0, "learning_rate": 0.000526960189095093, "loss": 6056.9, "step": 4880 }, { "ce_loss_13": 3.1363641381263734, "ce_loss_26": 2.6843234658241273, "ce_loss_39": 2.1881404638290407, "ce_loss_52": 1.4287778049707414, "ce_loss_7": 3.403191590309143, "epoch": 0.489, "grad_norm": 20.737227217707265, "kl_loss_13": 3534.8, "kl_loss_26": 2597.2, "kl_loss_39": 1535.4, "kl_loss_7": 4094.0, "learning_rate": 0.0005253757012699972, "loss": 6013.8, "step": 4890 }, { "ce_loss_13": 3.2066462457180025, "ce_loss_26": 2.7318670630455015, "ce_loss_39": 2.2165933042764663, "ce_loss_52": 1.4400058209896087, "ce_loss_7": 3.4839820206165313, "epoch": 0.49, "grad_norm": 20.942033187869956, "kl_loss_13": 3651.6, "kl_loss_26": 2660.8, "kl_loss_39": 1565.0, "kl_loss_7": 4233.6, "learning_rate": 0.0005237909579118712, "loss": 5973.0, "step": 4900 }, { "ce_loss_13": 3.2107683062553405, "ce_loss_26": 2.723215198516846, "ce_loss_39": 2.2134319245815277, "ce_loss_52": 1.4413373351097107, "ce_loss_7": 3.498019593954086, "epoch": 0.491, "grad_norm": 19.852386563944762, "kl_loss_13": 3647.2, "kl_loss_26": 2634.4, "kl_loss_39": 1545.0, "kl_loss_7": 4246.4, "learning_rate": 0.0005222059749790631, "loss": 5997.8, "step": 4910 }, { "ce_loss_13": 3.2151939988136293, "ce_loss_26": 2.7481437802314757, "ce_loss_39": 2.2312227368354796, "ce_loss_52": 1.4501317411661148, "ce_loss_7": 3.4992719650268556, "epoch": 0.492, "grad_norm": 21.940050434667352, "kl_loss_13": 3627.6, "kl_loss_26": 2655.2, "kl_loss_39": 1571.4, "kl_loss_7": 4213.6, "learning_rate": 0.0005206207684323337, "loss": 5989.6, "step": 4920 }, { "ce_loss_13": 3.14618239402771, "ce_loss_26": 2.6644466161727904, "ce_loss_39": 2.1522305369377137, "ce_loss_52": 1.4040746569633484, "ce_loss_7": 3.430801051855087, "epoch": 0.493, "grad_norm": 21.866819048126402, "kl_loss_13": 3609.6, "kl_loss_26": 2614.8, "kl_loss_39": 1523.4, "kl_loss_7": 4205.6, "learning_rate": 0.000519035354234695, "loss": 5971.3, "step": 4930 }, { "ce_loss_13": 3.2634301006793978, "ce_loss_26": 2.7843244314193725, "ce_loss_39": 2.264421299099922, "ce_loss_52": 1.4652716666460037, "ce_loss_7": 3.5409990727901457, "epoch": 0.494, "grad_norm": 22.02758833423459, "kl_loss_13": 3711.6, "kl_loss_26": 2710.6, "kl_loss_39": 1607.3, "kl_loss_7": 4290.4, "learning_rate": 0.0005174497483512506, "loss": 6017.3, "step": 4940 }, { "ce_loss_13": 3.2103551268577575, "ce_loss_26": 2.7440166890621187, "ce_loss_39": 2.2352791130542755, "ce_loss_52": 1.4523784220218658, "ce_loss_7": 3.486135560274124, "epoch": 0.495, "grad_norm": 23.749732875931574, "kl_loss_13": 3618.0, "kl_loss_26": 2656.6, "kl_loss_39": 1573.0, "kl_loss_7": 4201.2, "learning_rate": 0.0005158639667490339, "loss": 5989.9, "step": 4950 }, { "ce_loss_13": 3.118060350418091, "ce_loss_26": 2.6498723566532134, "ce_loss_39": 2.1354818284511565, "ce_loss_52": 1.3788613289594651, "ce_loss_7": 3.403479200601578, "epoch": 0.496, "grad_norm": 20.524190680845354, "kl_loss_13": 3599.2, "kl_loss_26": 2620.4, "kl_loss_39": 1541.0, "kl_loss_7": 4189.6, "learning_rate": 0.0005142780253968481, "loss": 5973.2, "step": 4960 }, { "ce_loss_13": 3.1573639094829558, "ce_loss_26": 2.6952777743339538, "ce_loss_39": 2.184722366929054, "ce_loss_52": 1.4360924899578094, "ce_loss_7": 3.4324650526046754, "epoch": 0.497, "grad_norm": 21.848899371522094, "kl_loss_13": 3594.0, "kl_loss_26": 2620.8, "kl_loss_39": 1532.4, "kl_loss_7": 4163.6, "learning_rate": 0.0005126919402651053, "loss": 5950.9, "step": 4970 }, { "ce_loss_13": 3.148969703912735, "ce_loss_26": 2.6792631447315216, "ce_loss_39": 2.168180876970291, "ce_loss_52": 1.4135416984558105, "ce_loss_7": 3.4292350709438324, "epoch": 0.498, "grad_norm": 21.082558392676134, "kl_loss_13": 3576.4, "kl_loss_26": 2608.8, "kl_loss_39": 1527.0, "kl_loss_7": 4166.4, "learning_rate": 0.0005111057273256647, "loss": 5924.7, "step": 4980 }, { "ce_loss_13": 3.1738288044929504, "ce_loss_26": 2.7059105813503264, "ce_loss_39": 2.194283801317215, "ce_loss_52": 1.4388620942831039, "ce_loss_7": 3.461875486373901, "epoch": 0.499, "grad_norm": 21.062049454907296, "kl_loss_13": 3574.8, "kl_loss_26": 2594.8, "kl_loss_39": 1521.0, "kl_loss_7": 4165.2, "learning_rate": 0.0005095194025516733, "loss": 5935.8, "step": 4990 }, { "ce_loss_13": 3.2131927073001862, "ce_loss_26": 2.7427126079797746, "ce_loss_39": 2.2358015894889833, "ce_loss_52": 1.4697326198220253, "ce_loss_7": 3.48975727558136, "epoch": 0.5, "grad_norm": 19.951336775236356, "kl_loss_13": 3620.8, "kl_loss_26": 2633.2, "kl_loss_39": 1549.0, "kl_loss_7": 4195.6, "learning_rate": 0.000507932981917404, "loss": 5955.8, "step": 5000 }, { "ce_loss_13": 3.064238077402115, "ce_loss_26": 2.605297487974167, "ce_loss_39": 2.106147512793541, "ce_loss_52": 1.3699454009532928, "ce_loss_7": 3.3354556441307066, "epoch": 0.501, "grad_norm": 22.434294806872032, "kl_loss_13": 3513.6, "kl_loss_26": 2556.0, "kl_loss_39": 1493.0, "kl_loss_7": 4079.2, "learning_rate": 0.0005063464813980949, "loss": 5921.7, "step": 5010 }, { "ce_loss_13": 3.120131802558899, "ce_loss_26": 2.6457916140556335, "ce_loss_39": 2.136381095647812, "ce_loss_52": 1.3973354250192642, "ce_loss_7": 3.3927165508270263, "epoch": 0.502, "grad_norm": 20.534145152408744, "kl_loss_13": 3558.4, "kl_loss_26": 2578.4, "kl_loss_39": 1497.8, "kl_loss_7": 4132.8, "learning_rate": 0.0005047599169697884, "loss": 5945.8, "step": 5020 }, { "ce_loss_13": 3.155858016014099, "ce_loss_26": 2.683425110578537, "ce_loss_39": 2.1650480359792708, "ce_loss_52": 1.426735344529152, "ce_loss_7": 3.4369399666786196, "epoch": 0.503, "grad_norm": 20.583275474881205, "kl_loss_13": 3593.2, "kl_loss_26": 2606.4, "kl_loss_39": 1509.8, "kl_loss_7": 4176.0, "learning_rate": 0.000503173304609171, "loss": 5949.4, "step": 5030 }, { "ce_loss_13": 3.224624240398407, "ce_loss_26": 2.742176574468613, "ce_loss_39": 2.2156084358692167, "ce_loss_52": 1.4474807173013686, "ce_loss_7": 3.508391612768173, "epoch": 0.504, "grad_norm": 20.860727938912092, "kl_loss_13": 3678.4, "kl_loss_26": 2680.8, "kl_loss_39": 1568.2, "kl_loss_7": 4265.2, "learning_rate": 0.0005015866602934111, "loss": 5957.4, "step": 5040 }, { "ce_loss_13": 3.1227844834327696, "ce_loss_26": 2.661714029312134, "ce_loss_39": 2.1653817743062973, "ce_loss_52": 1.4367017298936844, "ce_loss_7": 3.3902225315570833, "epoch": 0.505, "grad_norm": 19.797756106985307, "kl_loss_13": 3490.8, "kl_loss_26": 2530.0, "kl_loss_39": 1472.6, "kl_loss_7": 4055.6, "learning_rate": 0.0005, "loss": 5927.3, "step": 5050 }, { "ce_loss_13": 3.177449029684067, "ce_loss_26": 2.716005155444145, "ce_loss_39": 2.205168914794922, "ce_loss_52": 1.442250807583332, "ce_loss_7": 3.453594130277634, "epoch": 0.506, "grad_norm": 20.433104016560257, "kl_loss_13": 3588.0, "kl_loss_26": 2622.2, "kl_loss_39": 1546.2, "kl_loss_7": 4161.6, "learning_rate": 0.0004984133397065889, "loss": 5913.9, "step": 5060 }, { "ce_loss_13": 3.1423951983451843, "ce_loss_26": 2.674048882722855, "ce_loss_39": 2.1621575862169267, "ce_loss_52": 1.4322402387857438, "ce_loss_7": 3.416734743118286, "epoch": 0.507, "grad_norm": 20.444836977919294, "kl_loss_13": 3545.2, "kl_loss_26": 2564.8, "kl_loss_39": 1486.0, "kl_loss_7": 4119.6, "learning_rate": 0.0004968266953908291, "loss": 5880.6, "step": 5070 }, { "ce_loss_13": 3.070253336429596, "ce_loss_26": 2.6069840848445893, "ce_loss_39": 2.1048853427171705, "ce_loss_52": 1.3886964708566665, "ce_loss_7": 3.350748908519745, "epoch": 0.508, "grad_norm": 21.18309883625556, "kl_loss_13": 3487.2, "kl_loss_26": 2520.8, "kl_loss_39": 1460.8, "kl_loss_7": 4069.6, "learning_rate": 0.0004952400830302117, "loss": 5885.3, "step": 5080 }, { "ce_loss_13": 3.077636110782623, "ce_loss_26": 2.6177482545375823, "ce_loss_39": 2.1197337061166763, "ce_loss_52": 1.3917517423629762, "ce_loss_7": 3.3575133979320526, "epoch": 0.509, "grad_norm": 19.77626859818484, "kl_loss_13": 3496.0, "kl_loss_26": 2536.8, "kl_loss_39": 1477.0, "kl_loss_7": 4073.6, "learning_rate": 0.0004936535186019053, "loss": 5872.1, "step": 5090 }, { "ce_loss_13": 3.178175300359726, "ce_loss_26": 2.701016789674759, "ce_loss_39": 2.1893070548772813, "ce_loss_52": 1.4174780696630478, "ce_loss_7": 3.4626995623111725, "epoch": 0.51, "grad_norm": 19.62760678285289, "kl_loss_13": 3640.4, "kl_loss_26": 2642.0, "kl_loss_39": 1551.2, "kl_loss_7": 4232.4, "learning_rate": 0.000492067018082596, "loss": 5937.7, "step": 5100 }, { "ce_loss_13": 3.169191563129425, "ce_loss_26": 2.708436530828476, "ce_loss_39": 2.1886946499347686, "ce_loss_52": 1.4288851469755173, "ce_loss_7": 3.4514395534992217, "epoch": 0.511, "grad_norm": 20.552371425986603, "kl_loss_13": 3588.0, "kl_loss_26": 2634.8, "kl_loss_39": 1546.0, "kl_loss_7": 4178.4, "learning_rate": 0.0004904805974483267, "loss": 5867.4, "step": 5110 }, { "ce_loss_13": 3.152217388153076, "ce_loss_26": 2.6922530949115755, "ce_loss_39": 2.181437623500824, "ce_loss_52": 1.4460914835333825, "ce_loss_7": 3.429844158887863, "epoch": 0.512, "grad_norm": 20.36189036420726, "kl_loss_13": 3504.0, "kl_loss_26": 2543.2, "kl_loss_39": 1481.0, "kl_loss_7": 4075.2, "learning_rate": 0.0004888942726743353, "loss": 5848.7, "step": 5120 }, { "ce_loss_13": 3.1243839859962463, "ce_loss_26": 2.6554999887943267, "ce_loss_39": 2.1515370845794677, "ce_loss_52": 1.4089675784111022, "ce_loss_7": 3.4115704774856566, "epoch": 0.513, "grad_norm": 20.218379503515084, "kl_loss_13": 3534.0, "kl_loss_26": 2557.4, "kl_loss_39": 1487.0, "kl_loss_7": 4121.6, "learning_rate": 0.0004873080597348947, "loss": 5856.5, "step": 5130 }, { "ce_loss_13": 3.223481798171997, "ce_loss_26": 2.754591333866119, "ce_loss_39": 2.236158034205437, "ce_loss_52": 1.4544740557670592, "ce_loss_7": 3.5026029109954835, "epoch": 0.514, "grad_norm": 20.810099829480396, "kl_loss_13": 3656.8, "kl_loss_26": 2670.8, "kl_loss_39": 1575.8, "kl_loss_7": 4237.2, "learning_rate": 0.0004857219746031519, "loss": 5882.8, "step": 5140 }, { "ce_loss_13": 3.146134835481644, "ce_loss_26": 2.680527698993683, "ce_loss_39": 2.168422257900238, "ce_loss_52": 1.4367385059595108, "ce_loss_7": 3.423633599281311, "epoch": 0.515, "grad_norm": 20.424421047481275, "kl_loss_13": 3516.8, "kl_loss_26": 2539.6, "kl_loss_39": 1472.0, "kl_loss_7": 4094.8, "learning_rate": 0.0004841360332509663, "loss": 5895.45, "step": 5150 }, { "ce_loss_13": 3.1724873065948485, "ce_loss_26": 2.7010417520999908, "ce_loss_39": 2.188371130824089, "ce_loss_52": 1.4464277178049088, "ce_loss_7": 3.4471147775650026, "epoch": 0.516, "grad_norm": 20.221560032680266, "kl_loss_13": 3555.2, "kl_loss_26": 2575.2, "kl_loss_39": 1495.4, "kl_loss_7": 4124.8, "learning_rate": 0.0004825502516487497, "loss": 5883.8, "step": 5160 }, { "ce_loss_13": 3.1568028390407563, "ce_loss_26": 2.695826065540314, "ce_loss_39": 2.1907377928495406, "ce_loss_52": 1.467595374584198, "ce_loss_7": 3.4282791554927825, "epoch": 0.517, "grad_norm": 21.375776905692355, "kl_loss_13": 3490.8, "kl_loss_26": 2528.8, "kl_loss_39": 1463.6, "kl_loss_7": 4049.2, "learning_rate": 0.00048096464576530507, "loss": 5813.7, "step": 5170 }, { "ce_loss_13": 3.0959118604660034, "ce_loss_26": 2.6389028072357177, "ce_loss_39": 2.1422775775194167, "ce_loss_52": 1.4158973768353462, "ce_loss_7": 3.3716647744178774, "epoch": 0.518, "grad_norm": 20.796367284367612, "kl_loss_13": 3490.0, "kl_loss_26": 2527.0, "kl_loss_39": 1467.8, "kl_loss_7": 4068.4, "learning_rate": 0.00047937923156766646, "loss": 5832.2, "step": 5180 }, { "ce_loss_13": 3.181716579198837, "ce_loss_26": 2.7091287076473236, "ce_loss_39": 2.205903950333595, "ce_loss_52": 1.4591009467840195, "ce_loss_7": 3.4553593516349794, "epoch": 0.519, "grad_norm": 21.655670358447043, "kl_loss_13": 3570.4, "kl_loss_26": 2594.8, "kl_loss_39": 1522.8, "kl_loss_7": 4142.4, "learning_rate": 0.00047779402502093696, "loss": 5844.1, "step": 5190 }, { "ce_loss_13": 3.114813321828842, "ce_loss_26": 2.6511843532323836, "ce_loss_39": 2.1350393682718276, "ce_loss_52": 1.4030846193432809, "ce_loss_7": 3.3881891489028932, "epoch": 0.52, "grad_norm": 21.261317204954832, "kl_loss_13": 3537.2, "kl_loss_26": 2569.4, "kl_loss_39": 1490.2, "kl_loss_7": 4112.8, "learning_rate": 0.0004762090420881289, "loss": 5904.5, "step": 5200 }, { "ce_loss_13": 3.178787976503372, "ce_loss_26": 2.716679725050926, "ce_loss_39": 2.214344197511673, "ce_loss_52": 1.4766788110136986, "ce_loss_7": 3.445727747678757, "epoch": 0.521, "grad_norm": 21.5237930218517, "kl_loss_13": 3533.2, "kl_loss_26": 2579.6, "kl_loss_39": 1511.4, "kl_loss_7": 4104.0, "learning_rate": 0.00047462429873000296, "loss": 5807.7, "step": 5210 }, { "ce_loss_13": 3.1838007628917695, "ce_loss_26": 2.7048393905162813, "ce_loss_39": 2.19879055917263, "ce_loss_52": 1.4492772698402405, "ce_loss_7": 3.460488295555115, "epoch": 0.522, "grad_norm": 23.509628530732027, "kl_loss_13": 3558.8, "kl_loss_26": 2571.2, "kl_loss_39": 1503.6, "kl_loss_7": 4141.6, "learning_rate": 0.0004730398109049071, "loss": 5850.6, "step": 5220 }, { "ce_loss_13": 3.1778542578220366, "ce_loss_26": 2.723250871896744, "ce_loss_39": 2.2311396062374116, "ce_loss_52": 1.4870843648910523, "ce_loss_7": 3.4500561714172364, "epoch": 0.523, "grad_norm": 20.540010606623795, "kl_loss_13": 3511.6, "kl_loss_26": 2561.4, "kl_loss_39": 1495.8, "kl_loss_7": 4082.4, "learning_rate": 0.000471455594568616, "loss": 5864.9, "step": 5230 }, { "ce_loss_13": 3.184338331222534, "ce_loss_26": 2.719081574678421, "ce_loss_39": 2.1989813148975372, "ce_loss_52": 1.4533298462629318, "ce_loss_7": 3.465100187063217, "epoch": 0.524, "grad_norm": 19.66913851254436, "kl_loss_13": 3588.4, "kl_loss_26": 2616.0, "kl_loss_39": 1518.0, "kl_loss_7": 4170.0, "learning_rate": 0.00046987166567417086, "loss": 5881.2, "step": 5240 }, { "ce_loss_13": 3.097227877378464, "ce_loss_26": 2.6349131643772123, "ce_loss_39": 2.1339926183223725, "ce_loss_52": 1.3901747956871986, "ce_loss_7": 3.3714997708797454, "epoch": 0.525, "grad_norm": 21.094189724694242, "kl_loss_13": 3503.6, "kl_loss_26": 2546.6, "kl_loss_39": 1491.4, "kl_loss_7": 4079.2, "learning_rate": 0.00046828804017171776, "loss": 5869.8, "step": 5250 }, { "ce_loss_13": 3.1320539236068727, "ce_loss_26": 2.667567166686058, "ce_loss_39": 2.17643720805645, "ce_loss_52": 1.450567391514778, "ce_loss_7": 3.4080025017261506, "epoch": 0.526, "grad_norm": 20.61619717149242, "kl_loss_13": 3507.6, "kl_loss_26": 2536.4, "kl_loss_39": 1483.6, "kl_loss_7": 4082.4, "learning_rate": 0.00046670473400834805, "loss": 5811.0, "step": 5260 }, { "ce_loss_13": 3.123244607448578, "ce_loss_26": 2.6622200667858125, "ce_loss_39": 2.1446547359228134, "ce_loss_52": 1.4058131739497184, "ce_loss_7": 3.393282580375671, "epoch": 0.527, "grad_norm": 20.13067058218258, "kl_loss_13": 3536.4, "kl_loss_26": 2574.0, "kl_loss_39": 1499.8, "kl_loss_7": 4103.6, "learning_rate": 0.00046512176312793734, "loss": 5799.9, "step": 5270 }, { "ce_loss_13": 3.0799066185951234, "ce_loss_26": 2.620718148350716, "ce_loss_39": 2.1284131199121474, "ce_loss_52": 1.4004437893629074, "ce_loss_7": 3.357691395282745, "epoch": 0.528, "grad_norm": 20.729785702601426, "kl_loss_13": 3458.8, "kl_loss_26": 2500.6, "kl_loss_39": 1456.6, "kl_loss_7": 4039.2, "learning_rate": 0.00046353914347098467, "loss": 5784.9, "step": 5280 }, { "ce_loss_13": 3.100508135557175, "ce_loss_26": 2.645443448424339, "ce_loss_39": 2.140786075592041, "ce_loss_52": 1.4068747192621232, "ce_loss_7": 3.3814845025539397, "epoch": 0.529, "grad_norm": 20.87154357218591, "kl_loss_13": 3506.0, "kl_loss_26": 2553.6, "kl_loss_39": 1489.5, "kl_loss_7": 4091.2, "learning_rate": 0.0004619568909744524, "loss": 5772.2, "step": 5290 }, { "ce_loss_13": 3.127873086929321, "ce_loss_26": 2.6634323090314864, "ce_loss_39": 2.157263731956482, "ce_loss_52": 1.4283392548561096, "ce_loss_7": 3.3975966036319734, "epoch": 0.53, "grad_norm": 20.20322543048614, "kl_loss_13": 3490.8, "kl_loss_26": 2524.4, "kl_loss_39": 1474.6, "kl_loss_7": 4055.2, "learning_rate": 0.00046037502157160573, "loss": 5795.9, "step": 5300 }, { "ce_loss_13": 3.076627087593079, "ce_loss_26": 2.616869166493416, "ce_loss_39": 2.117774197459221, "ce_loss_52": 1.4120985105633737, "ce_loss_7": 3.3510021567344666, "epoch": 0.531, "grad_norm": 20.811499078870163, "kl_loss_13": 3444.0, "kl_loss_26": 2482.4, "kl_loss_39": 1422.8, "kl_loss_7": 4016.0, "learning_rate": 0.00045879355119185207, "loss": 5749.7, "step": 5310 }, { "ce_loss_13": 3.1226239800453186, "ce_loss_26": 2.662443572282791, "ce_loss_39": 2.156531369686127, "ce_loss_52": 1.4430431425571442, "ce_loss_7": 3.396123135089874, "epoch": 0.532, "grad_norm": 18.59996382548897, "kl_loss_13": 3496.0, "kl_loss_26": 2528.8, "kl_loss_39": 1445.2, "kl_loss_7": 4069.2, "learning_rate": 0.0004572124957605803, "loss": 5776.5, "step": 5320 }, { "ce_loss_13": 3.1100788176059724, "ce_loss_26": 2.647669917345047, "ce_loss_39": 2.140716627240181, "ce_loss_52": 1.4189704924821853, "ce_loss_7": 3.387048715353012, "epoch": 0.533, "grad_norm": 20.905976368739214, "kl_loss_13": 3497.6, "kl_loss_26": 2533.0, "kl_loss_39": 1466.2, "kl_loss_7": 4079.6, "learning_rate": 0.00045563187119900103, "loss": 5752.6, "step": 5330 }, { "ce_loss_13": 3.06461244225502, "ce_loss_26": 2.6009975552558897, "ce_loss_39": 2.1015468716621397, "ce_loss_52": 1.3909726276993752, "ce_loss_7": 3.3485201001167297, "epoch": 0.534, "grad_norm": 19.831333266576646, "kl_loss_13": 3462.4, "kl_loss_26": 2487.0, "kl_loss_39": 1427.1, "kl_loss_7": 4053.2, "learning_rate": 0.00045405169342398633, "loss": 5804.75, "step": 5340 }, { "ce_loss_13": 3.1465537667274477, "ce_loss_26": 2.6862030625343323, "ce_loss_39": 2.1878136694431305, "ce_loss_52": 1.4572405338287353, "ce_loss_7": 3.4159956216812133, "epoch": 0.535, "grad_norm": 20.130648070413336, "kl_loss_13": 3486.8, "kl_loss_26": 2529.2, "kl_loss_39": 1463.2, "kl_loss_7": 4050.4, "learning_rate": 0.0004524719783479088, "loss": 5785.8, "step": 5350 }, { "ce_loss_13": 3.1608322679996492, "ce_loss_26": 2.6899396955966948, "ce_loss_39": 2.192571198940277, "ce_loss_52": 1.466444182395935, "ce_loss_7": 3.4340961396694185, "epoch": 0.536, "grad_norm": 21.175578440541553, "kl_loss_13": 3511.2, "kl_loss_26": 2537.2, "kl_loss_39": 1482.0, "kl_loss_7": 4079.6, "learning_rate": 0.00045089274187848144, "loss": 5831.5, "step": 5360 }, { "ce_loss_13": 3.122362142801285, "ce_loss_26": 2.6539461642503737, "ce_loss_39": 2.1548154592514037, "ce_loss_52": 1.4296748742461205, "ce_loss_7": 3.403215527534485, "epoch": 0.537, "grad_norm": 21.426385415702033, "kl_loss_13": 3491.6, "kl_loss_26": 2524.6, "kl_loss_39": 1464.2, "kl_loss_7": 4068.8, "learning_rate": 0.00044931399991859835, "loss": 5775.7, "step": 5370 }, { "ce_loss_13": 3.0924407064914705, "ce_loss_26": 2.6285818815231323, "ce_loss_39": 2.12394041121006, "ce_loss_52": 1.4268731981515885, "ce_loss_7": 3.365303188562393, "epoch": 0.538, "grad_norm": 21.06287805296543, "kl_loss_13": 3469.6, "kl_loss_26": 2495.8, "kl_loss_39": 1421.6, "kl_loss_7": 4039.2, "learning_rate": 0.00044773576836617336, "loss": 5748.1, "step": 5380 }, { "ce_loss_13": 3.0936496675014498, "ce_loss_26": 2.6234502464532854, "ce_loss_39": 2.112909361720085, "ce_loss_52": 1.3892342567443847, "ce_loss_7": 3.3770604133605957, "epoch": 0.539, "grad_norm": 20.257845085428166, "kl_loss_13": 3512.0, "kl_loss_26": 2533.6, "kl_loss_39": 1454.2, "kl_loss_7": 4101.2, "learning_rate": 0.00044615806311398056, "loss": 5750.7, "step": 5390 }, { "ce_loss_13": 3.1293481528759, "ce_loss_26": 2.661091110110283, "ce_loss_39": 2.1563747018575667, "ce_loss_52": 1.4326880395412445, "ce_loss_7": 3.4015913248062133, "epoch": 0.54, "grad_norm": 19.884995716311128, "kl_loss_13": 3510.0, "kl_loss_26": 2528.6, "kl_loss_39": 1461.8, "kl_loss_7": 4087.2, "learning_rate": 0.00044458090004949454, "loss": 5775.2, "step": 5400 }, { "ce_loss_13": 3.1498380303382874, "ce_loss_26": 2.6860203623771666, "ce_loss_39": 2.184649482369423, "ce_loss_52": 1.4619063645601273, "ce_loss_7": 3.425897455215454, "epoch": 0.541, "grad_norm": 21.787073133904556, "kl_loss_13": 3476.0, "kl_loss_26": 2516.4, "kl_loss_39": 1456.6, "kl_loss_7": 4048.0, "learning_rate": 0.0004430042950547297, "loss": 5755.9, "step": 5410 }, { "ce_loss_13": 3.1863462030887604, "ce_loss_26": 2.726431465148926, "ce_loss_39": 2.2207835763692856, "ce_loss_52": 1.5021536648273468, "ce_loss_7": 3.454101949930191, "epoch": 0.542, "grad_norm": 20.7568922984159, "kl_loss_13": 3510.0, "kl_loss_26": 2532.8, "kl_loss_39": 1455.8, "kl_loss_7": 4068.8, "learning_rate": 0.0004414282640060809, "loss": 5749.1, "step": 5420 }, { "ce_loss_13": 3.1338580727577208, "ce_loss_26": 2.6763171195983886, "ce_loss_39": 2.181287834048271, "ce_loss_52": 1.4664452508091927, "ce_loss_7": 3.4055596351623536, "epoch": 0.543, "grad_norm": 19.620114685157933, "kl_loss_13": 3421.6, "kl_loss_26": 2476.8, "kl_loss_39": 1437.8, "kl_loss_7": 3988.8, "learning_rate": 0.0004398528227741633, "loss": 5704.8, "step": 5430 }, { "ce_loss_13": 3.104156017303467, "ce_loss_26": 2.639963132143021, "ce_loss_39": 2.127584692835808, "ce_loss_52": 1.4150341883301736, "ce_loss_7": 3.390649217367172, "epoch": 0.544, "grad_norm": 20.487750099372004, "kl_loss_13": 3479.6, "kl_loss_26": 2511.2, "kl_loss_39": 1437.2, "kl_loss_7": 4085.6, "learning_rate": 0.00043827798722365264, "loss": 5688.3, "step": 5440 }, { "ce_loss_13": 3.0290561497211455, "ce_loss_26": 2.5664966076612474, "ce_loss_39": 2.065689593553543, "ce_loss_52": 1.3703163504600524, "ce_loss_7": 3.3105548918247223, "epoch": 0.545, "grad_norm": 20.36704365885761, "kl_loss_13": 3438.0, "kl_loss_26": 2471.0, "kl_loss_39": 1405.0, "kl_loss_7": 4010.4, "learning_rate": 0.00043670377321312535, "loss": 5715.9, "step": 5450 }, { "ce_loss_13": 3.1363044023513793, "ce_loss_26": 2.674453055858612, "ce_loss_39": 2.169647827744484, "ce_loss_52": 1.440093258023262, "ce_loss_7": 3.409278839826584, "epoch": 0.546, "grad_norm": 20.32480480357714, "kl_loss_13": 3496.8, "kl_loss_26": 2524.8, "kl_loss_39": 1466.2, "kl_loss_7": 4078.0, "learning_rate": 0.0004351301965948991, "loss": 5722.2, "step": 5460 }, { "ce_loss_13": 3.152593141794205, "ce_loss_26": 2.6999820828437806, "ce_loss_39": 2.1983327239751818, "ce_loss_52": 1.4628954619169234, "ce_loss_7": 3.4299716293811797, "epoch": 0.547, "grad_norm": 19.856114793460193, "kl_loss_13": 3499.2, "kl_loss_26": 2548.2, "kl_loss_39": 1495.2, "kl_loss_7": 4066.8, "learning_rate": 0.000433557273214873, "loss": 5741.7, "step": 5470 }, { "ce_loss_13": 3.064018839597702, "ce_loss_26": 2.6090955317020414, "ce_loss_39": 2.1058148056268693, "ce_loss_52": 1.4195792496204376, "ce_loss_7": 3.344401216506958, "epoch": 0.548, "grad_norm": 20.616048950091404, "kl_loss_13": 3411.2, "kl_loss_26": 2459.2, "kl_loss_39": 1389.6, "kl_loss_7": 3992.4, "learning_rate": 0.000431985018912368, "loss": 5718.6, "step": 5480 }, { "ce_loss_13": 3.0607047379016876, "ce_loss_26": 2.597879120707512, "ce_loss_39": 2.108409595489502, "ce_loss_52": 1.4123182266950607, "ce_loss_7": 3.336356836557388, "epoch": 0.549, "grad_norm": 20.036021472042393, "kl_loss_13": 3404.4, "kl_loss_26": 2447.2, "kl_loss_39": 1413.8, "kl_loss_7": 3983.2, "learning_rate": 0.0004304134495199674, "loss": 5692.5, "step": 5490 }, { "ce_loss_13": 3.052510768175125, "ce_loss_26": 2.6108580827713013, "ce_loss_39": 2.1228879362344744, "ce_loss_52": 1.4244433492422104, "ce_loss_7": 3.3247893154621124, "epoch": 0.55, "grad_norm": 20.158536855725483, "kl_loss_13": 3391.6, "kl_loss_26": 2469.0, "kl_loss_39": 1430.9, "kl_loss_7": 3959.6, "learning_rate": 0.0004288425808633575, "loss": 5660.8, "step": 5500 }, { "ce_loss_13": 3.164420074224472, "ce_loss_26": 2.6986677646636963, "ce_loss_39": 2.192535865306854, "ce_loss_52": 1.4702347338199615, "ce_loss_7": 3.4305537164211275, "epoch": 0.551, "grad_norm": 21.34549178191646, "kl_loss_13": 3508.0, "kl_loss_26": 2541.6, "kl_loss_39": 1463.2, "kl_loss_7": 4075.6, "learning_rate": 0.0004272724287611684, "loss": 5697.3, "step": 5510 }, { "ce_loss_13": 3.1066312968730925, "ce_loss_26": 2.6526737749576568, "ce_loss_39": 2.1508010149002077, "ce_loss_52": 1.4371357694268228, "ce_loss_7": 3.381526863574982, "epoch": 0.552, "grad_norm": 20.017257048745616, "kl_loss_13": 3449.6, "kl_loss_26": 2497.2, "kl_loss_39": 1440.4, "kl_loss_7": 4027.2, "learning_rate": 0.00042570300902481425, "loss": 5661.1, "step": 5520 }, { "ce_loss_13": 3.0768387794494627, "ce_loss_26": 2.6105665415525436, "ce_loss_39": 2.1041111290454864, "ce_loss_52": 1.3769602328538895, "ce_loss_7": 3.3504232287406923, "epoch": 0.553, "grad_norm": 20.71287480099847, "kl_loss_13": 3497.6, "kl_loss_26": 2529.0, "kl_loss_39": 1464.6, "kl_loss_7": 4067.6, "learning_rate": 0.00042413433745833423, "loss": 5675.2, "step": 5530 }, { "ce_loss_13": 3.078362447023392, "ce_loss_26": 2.6186123132705688, "ce_loss_39": 2.1204461604356766, "ce_loss_52": 1.42312493622303, "ce_loss_7": 3.3565984547138212, "epoch": 0.554, "grad_norm": 20.243506516231662, "kl_loss_13": 3424.0, "kl_loss_26": 2463.4, "kl_loss_39": 1411.4, "kl_loss_7": 4003.6, "learning_rate": 0.0004225664298582339, "loss": 5650.4, "step": 5540 }, { "ce_loss_13": 3.104430967569351, "ce_loss_26": 2.6466626435518266, "ce_loss_39": 2.149568349123001, "ce_loss_52": 1.4317003712058067, "ce_loss_7": 3.380819743871689, "epoch": 0.555, "grad_norm": 20.086456724534386, "kl_loss_13": 3478.8, "kl_loss_26": 2528.0, "kl_loss_39": 1462.6, "kl_loss_7": 4051.6, "learning_rate": 0.000420999302013325, "loss": 5686.3, "step": 5550 }, { "ce_loss_13": 3.09715017080307, "ce_loss_26": 2.6249477684497835, "ce_loss_39": 2.112127733230591, "ce_loss_52": 1.3998657062649726, "ce_loss_7": 3.3848826706409456, "epoch": 0.556, "grad_norm": 19.84836283457165, "kl_loss_13": 3500.0, "kl_loss_26": 2520.0, "kl_loss_39": 1432.4, "kl_loss_7": 4098.4, "learning_rate": 0.000419432969704568, "loss": 5721.05, "step": 5560 }, { "ce_loss_13": 3.1611645042896273, "ce_loss_26": 2.7037321001291277, "ce_loss_39": 2.2008607923984527, "ce_loss_52": 1.4731642618775367, "ce_loss_7": 3.4352382242679598, "epoch": 0.557, "grad_norm": 19.486736620492533, "kl_loss_13": 3494.8, "kl_loss_26": 2542.2, "kl_loss_39": 1471.9, "kl_loss_7": 4071.6, "learning_rate": 0.00041786744870491154, "loss": 5664.5, "step": 5570 }, { "ce_loss_13": 3.1723786175251005, "ce_loss_26": 2.712140661478043, "ce_loss_39": 2.2021081149578094, "ce_loss_52": 1.4713574051856995, "ce_loss_7": 3.4434271275997164, "epoch": 0.558, "grad_norm": 21.812924852409434, "kl_loss_13": 3518.4, "kl_loss_26": 2552.0, "kl_loss_39": 1476.8, "kl_loss_7": 4082.0, "learning_rate": 0.0004163027547791347, "loss": 5667.5, "step": 5580 }, { "ce_loss_13": 3.0999899983406065, "ce_loss_26": 2.6524887919425963, "ce_loss_39": 2.159899726510048, "ce_loss_52": 1.457381361722946, "ce_loss_7": 3.376901388168335, "epoch": 0.559, "grad_norm": 20.855137229386383, "kl_loss_13": 3402.8, "kl_loss_26": 2469.2, "kl_loss_39": 1417.0, "kl_loss_7": 3983.6, "learning_rate": 0.0004147389036836881, "loss": 5623.2, "step": 5590 }, { "ce_loss_13": 3.048000919818878, "ce_loss_26": 2.6028879463672636, "ce_loss_39": 2.110449159145355, "ce_loss_52": 1.4273749262094497, "ce_loss_7": 3.3209989249706267, "epoch": 0.56, "grad_norm": 21.293221784840117, "kl_loss_13": 3370.8, "kl_loss_26": 2436.6, "kl_loss_39": 1393.6, "kl_loss_7": 3935.6, "learning_rate": 0.00041317591116653486, "loss": 5661.4, "step": 5600 }, { "ce_loss_13": 3.117449927330017, "ce_loss_26": 2.6561968684196473, "ce_loss_39": 2.1393496483564376, "ce_loss_52": 1.4263568341732025, "ce_loss_7": 3.3912305176258086, "epoch": 0.561, "grad_norm": 19.806130661521266, "kl_loss_13": 3510.8, "kl_loss_26": 2543.6, "kl_loss_39": 1446.0, "kl_loss_7": 4074.8, "learning_rate": 0.0004116137929669921, "loss": 5646.7, "step": 5610 }, { "ce_loss_13": 3.032761037349701, "ce_loss_26": 2.5814107984304426, "ce_loss_39": 2.092069110274315, "ce_loss_52": 1.4175067842006683, "ce_loss_7": 3.3051693975925445, "epoch": 0.562, "grad_norm": 21.265424868671836, "kl_loss_13": 3342.0, "kl_loss_26": 2397.0, "kl_loss_39": 1360.0, "kl_loss_7": 3902.8, "learning_rate": 0.00041005256481557305, "loss": 5649.3, "step": 5620 }, { "ce_loss_13": 3.1011641681194306, "ce_loss_26": 2.6472670078277587, "ce_loss_39": 2.1640954107046126, "ce_loss_52": 1.4572645276784897, "ce_loss_7": 3.3653019249439238, "epoch": 0.563, "grad_norm": 19.60469852329931, "kl_loss_13": 3397.2, "kl_loss_26": 2452.6, "kl_loss_39": 1426.2, "kl_loss_7": 3957.6, "learning_rate": 0.00040849224243382767, "loss": 5635.6, "step": 5630 }, { "ce_loss_13": 3.0702085912227632, "ce_loss_26": 2.6029874324798583, "ce_loss_39": 2.1070942997932436, "ce_loss_52": 1.4102862730622292, "ce_loss_7": 3.345056527853012, "epoch": 0.564, "grad_norm": 19.66351329562965, "kl_loss_13": 3444.8, "kl_loss_26": 2478.8, "kl_loss_39": 1416.0, "kl_loss_7": 4010.8, "learning_rate": 0.000406932841534185, "loss": 5678.4, "step": 5640 }, { "ce_loss_13": 3.1035412073135378, "ce_loss_26": 2.6404273927211763, "ce_loss_39": 2.148519089818001, "ce_loss_52": 1.4659966766834258, "ce_loss_7": 3.3797259271144866, "epoch": 0.565, "grad_norm": 20.61367124543832, "kl_loss_13": 3418.4, "kl_loss_26": 2452.6, "kl_loss_39": 1399.6, "kl_loss_7": 3989.2, "learning_rate": 0.0004053743778197951, "loss": 5619.4, "step": 5650 }, { "ce_loss_13": 3.074153536558151, "ce_loss_26": 2.6227691769599915, "ce_loss_39": 2.12532425224781, "ce_loss_52": 1.4269753962755203, "ce_loss_7": 3.3447964787483215, "epoch": 0.566, "grad_norm": 20.08816441246758, "kl_loss_13": 3398.8, "kl_loss_26": 2455.2, "kl_loss_39": 1406.8, "kl_loss_7": 3968.8, "learning_rate": 0.0004038168669843697, "loss": 5607.4, "step": 5660 }, { "ce_loss_13": 3.117138236761093, "ce_loss_26": 2.6556679248809814, "ce_loss_39": 2.1434233844280244, "ce_loss_52": 1.4418231889605522, "ce_loss_7": 3.3935614347457888, "epoch": 0.567, "grad_norm": 19.629663422829108, "kl_loss_13": 3460.4, "kl_loss_26": 2499.6, "kl_loss_39": 1423.0, "kl_loss_7": 4038.4, "learning_rate": 0.000402260324712026, "loss": 5653.85, "step": 5670 }, { "ce_loss_13": 3.0241773188114167, "ce_loss_26": 2.5764558911323547, "ce_loss_39": 2.0826069891452788, "ce_loss_52": 1.4262234181165696, "ce_loss_7": 3.292912298440933, "epoch": 0.568, "grad_norm": 19.863333175376273, "kl_loss_13": 3317.2, "kl_loss_26": 2380.6, "kl_loss_39": 1341.9, "kl_loss_7": 3879.6, "learning_rate": 0.00040070476667712743, "loss": 5602.0, "step": 5680 }, { "ce_loss_13": 3.1207240760326385, "ce_loss_26": 2.667350098490715, "ce_loss_39": 2.1637227922677993, "ce_loss_52": 1.4507419973611833, "ce_loss_7": 3.3947570443153383, "epoch": 0.569, "grad_norm": 20.18457161613193, "kl_loss_13": 3470.8, "kl_loss_26": 2511.8, "kl_loss_39": 1453.2, "kl_loss_7": 4038.0, "learning_rate": 0.0003991502085441259, "loss": 5637.8, "step": 5690 }, { "ce_loss_13": 3.052716261148453, "ce_loss_26": 2.5999310851097106, "ce_loss_39": 2.103394716978073, "ce_loss_52": 1.4248915880918502, "ce_loss_7": 3.3296144127845766, "epoch": 0.57, "grad_norm": 21.32933636042312, "kl_loss_13": 3384.4, "kl_loss_26": 2434.4, "kl_loss_39": 1385.3, "kl_loss_7": 3961.6, "learning_rate": 0.0003975966659674047, "loss": 5572.65, "step": 5700 }, { "ce_loss_13": 3.0565085709095, "ce_loss_26": 2.612689271569252, "ce_loss_39": 2.133056679368019, "ce_loss_52": 1.456464058160782, "ce_loss_7": 3.3283673584461213, "epoch": 0.571, "grad_norm": 20.795332274261725, "kl_loss_13": 3328.4, "kl_loss_26": 2402.4, "kl_loss_39": 1384.6, "kl_loss_7": 3893.6, "learning_rate": 0.0003960441545911204, "loss": 5637.0, "step": 5710 }, { "ce_loss_13": 3.100340133905411, "ce_loss_26": 2.6507264733314515, "ce_loss_39": 2.1515814483165743, "ce_loss_52": 1.4689666152000427, "ce_loss_7": 3.36507533788681, "epoch": 0.572, "grad_norm": 19.479562193670343, "kl_loss_13": 3367.6, "kl_loss_26": 2430.4, "kl_loss_39": 1381.6, "kl_loss_7": 3924.4, "learning_rate": 0.0003944926900490452, "loss": 5586.7, "step": 5720 }, { "ce_loss_13": 3.035090607404709, "ce_loss_26": 2.5765923827886583, "ce_loss_39": 2.0693789660930633, "ce_loss_52": 1.394070391356945, "ce_loss_7": 3.3126678228378297, "epoch": 0.573, "grad_norm": 20.769836690311337, "kl_loss_13": 3418.0, "kl_loss_26": 2455.8, "kl_loss_39": 1385.6, "kl_loss_7": 3996.8, "learning_rate": 0.0003929422879644099, "loss": 5607.2, "step": 5730 }, { "ce_loss_13": 3.0840226650238036, "ce_loss_26": 2.6376162350177763, "ce_loss_39": 2.1402535855770113, "ce_loss_52": 1.4594999521970748, "ce_loss_7": 3.3528446674346926, "epoch": 0.574, "grad_norm": 19.55271310482914, "kl_loss_13": 3396.0, "kl_loss_26": 2453.0, "kl_loss_39": 1389.4, "kl_loss_7": 3971.6, "learning_rate": 0.0003913929639497462, "loss": 5561.5, "step": 5740 }, { "ce_loss_13": 3.0994504928588866, "ce_loss_26": 2.645833945274353, "ce_loss_39": 2.151085004210472, "ce_loss_52": 1.453307920694351, "ce_loss_7": 3.375282955169678, "epoch": 0.575, "grad_norm": 20.394651449213793, "kl_loss_13": 3410.8, "kl_loss_26": 2459.0, "kl_loss_39": 1418.8, "kl_loss_7": 3977.2, "learning_rate": 0.00038984473360672965, "loss": 5587.6, "step": 5750 }, { "ce_loss_13": 3.063554251194, "ce_loss_26": 2.602804532647133, "ce_loss_39": 2.112149253487587, "ce_loss_52": 1.4378221184015274, "ce_loss_7": 3.3263466000556945, "epoch": 0.576, "grad_norm": 20.732107583485522, "kl_loss_13": 3335.2, "kl_loss_26": 2397.0, "kl_loss_39": 1362.5, "kl_loss_7": 3885.6, "learning_rate": 0.0003882976125260229, "loss": 5618.5, "step": 5760 }, { "ce_loss_13": 3.011310315132141, "ce_loss_26": 2.5538589358329773, "ce_loss_39": 2.0608473628759385, "ce_loss_52": 1.3871852427721023, "ce_loss_7": 3.2823293566703797, "epoch": 0.577, "grad_norm": 20.222489437839133, "kl_loss_13": 3356.0, "kl_loss_26": 2403.4, "kl_loss_39": 1363.6, "kl_loss_7": 3921.2, "learning_rate": 0.00038675161628711776, "loss": 5583.6, "step": 5770 }, { "ce_loss_13": 3.060704934597015, "ce_loss_26": 2.6082344591617583, "ce_loss_39": 2.1105374455451966, "ce_loss_52": 1.4133148401975633, "ce_loss_7": 3.337174046039581, "epoch": 0.578, "grad_norm": 20.5258419136392, "kl_loss_13": 3407.2, "kl_loss_26": 2468.6, "kl_loss_39": 1416.8, "kl_loss_7": 3978.0, "learning_rate": 0.0003852067604581794, "loss": 5550.9, "step": 5780 }, { "ce_loss_13": 3.0313555896282196, "ce_loss_26": 2.5763757705688475, "ce_loss_39": 2.090648338198662, "ce_loss_52": 1.4237483531236648, "ce_loss_7": 3.303388088941574, "epoch": 0.579, "grad_norm": 19.375379659731948, "kl_loss_13": 3338.4, "kl_loss_26": 2400.6, "kl_loss_39": 1371.4, "kl_loss_7": 3902.8, "learning_rate": 0.0003836630605958888, "loss": 5548.1, "step": 5790 }, { "ce_loss_13": 3.0871520400047303, "ce_loss_26": 2.628482538461685, "ce_loss_39": 2.1345098853111266, "ce_loss_52": 1.439907690882683, "ce_loss_7": 3.360053616762161, "epoch": 0.58, "grad_norm": 20.217419963891068, "kl_loss_13": 3384.8, "kl_loss_26": 2429.6, "kl_loss_39": 1387.0, "kl_loss_7": 3952.0, "learning_rate": 0.0003821205322452863, "loss": 5608.2, "step": 5800 }, { "ce_loss_13": 3.098143881559372, "ce_loss_26": 2.648523300886154, "ce_loss_39": 2.1544575184583663, "ce_loss_52": 1.4684545397758484, "ce_loss_7": 3.369327354431152, "epoch": 0.581, "grad_norm": 20.306772003501575, "kl_loss_13": 3376.0, "kl_loss_26": 2423.6, "kl_loss_39": 1374.4, "kl_loss_7": 3943.2, "learning_rate": 0.0003805791909396155, "loss": 5609.7, "step": 5810 }, { "ce_loss_13": 3.03250247836113, "ce_loss_26": 2.5763048112392424, "ce_loss_39": 2.0811164885759355, "ce_loss_52": 1.4048690304160119, "ce_loss_7": 3.2992306888103484, "epoch": 0.582, "grad_norm": 19.674023919479502, "kl_loss_13": 3376.8, "kl_loss_26": 2428.6, "kl_loss_39": 1384.2, "kl_loss_7": 3936.8, "learning_rate": 0.0003790390522001662, "loss": 5494.5, "step": 5820 }, { "ce_loss_13": 3.0025113344192507, "ce_loss_26": 2.55022137761116, "ce_loss_39": 2.0565065026283262, "ce_loss_52": 1.377510306239128, "ce_loss_7": 3.2718186736106873, "epoch": 0.583, "grad_norm": 20.728894667471106, "kl_loss_13": 3354.8, "kl_loss_26": 2406.2, "kl_loss_39": 1367.9, "kl_loss_7": 3917.6, "learning_rate": 0.0003775001315361183, "loss": 5559.7, "step": 5830 }, { "ce_loss_13": 3.0971179008483887, "ce_loss_26": 2.6451155722141264, "ce_loss_39": 2.1508567333221436, "ce_loss_52": 1.4824247717857362, "ce_loss_7": 3.37031666636467, "epoch": 0.584, "grad_norm": 20.600074401568932, "kl_loss_13": 3337.2, "kl_loss_26": 2403.8, "kl_loss_39": 1359.2, "kl_loss_7": 3907.2, "learning_rate": 0.0003759624444443858, "loss": 5519.4, "step": 5840 }, { "ce_loss_13": 3.0961884200572967, "ce_loss_26": 2.626956915855408, "ce_loss_39": 2.1310097485780717, "ce_loss_52": 1.4370131075382233, "ce_loss_7": 3.371414542198181, "epoch": 0.585, "grad_norm": 21.119848921717185, "kl_loss_13": 3437.6, "kl_loss_26": 2463.6, "kl_loss_39": 1412.8, "kl_loss_7": 4010.4, "learning_rate": 0.00037442600640946044, "loss": 5564.2, "step": 5850 }, { "ce_loss_13": 3.022693085670471, "ce_loss_26": 2.556842041015625, "ce_loss_39": 2.054610991477966, "ce_loss_52": 1.3797120869159698, "ce_loss_7": 3.2991441190242767, "epoch": 0.586, "grad_norm": 20.829952423189983, "kl_loss_13": 3400.0, "kl_loss_26": 2436.4, "kl_loss_39": 1378.6, "kl_loss_7": 3976.4, "learning_rate": 0.00037289083290325663, "loss": 5555.3, "step": 5860 }, { "ce_loss_13": 3.048868530988693, "ce_loss_26": 2.597111147642136, "ce_loss_39": 2.1075122743844985, "ce_loss_52": 1.4477093726396562, "ce_loss_7": 3.3170620620250704, "epoch": 0.587, "grad_norm": 19.743999290482414, "kl_loss_13": 3295.2, "kl_loss_26": 2349.8, "kl_loss_39": 1314.6, "kl_loss_7": 3862.0, "learning_rate": 0.0003713569393849543, "loss": 5582.2, "step": 5870 }, { "ce_loss_13": 3.0275582671165466, "ce_loss_26": 2.5641341865062715, "ce_loss_39": 2.075681546330452, "ce_loss_52": 1.413079009950161, "ce_loss_7": 3.30134374499321, "epoch": 0.588, "grad_norm": 19.60417578779171, "kl_loss_13": 3342.8, "kl_loss_26": 2385.2, "kl_loss_39": 1348.4, "kl_loss_7": 3911.6, "learning_rate": 0.00036982434130084397, "loss": 5547.2, "step": 5880 }, { "ce_loss_13": 3.0114518344402312, "ce_loss_26": 2.5489202946424485, "ce_loss_39": 2.051037350296974, "ce_loss_52": 1.3947512209415436, "ce_loss_7": 3.2770422041416167, "epoch": 0.589, "grad_norm": 19.943871445513693, "kl_loss_13": 3324.8, "kl_loss_26": 2358.6, "kl_loss_39": 1318.4, "kl_loss_7": 3886.4, "learning_rate": 0.00036829305408417166, "loss": 5519.0, "step": 5890 }, { "ce_loss_13": 2.9831090033054353, "ce_loss_26": 2.5218098521232606, "ce_loss_39": 2.0276191979646683, "ce_loss_52": 1.3660520613193512, "ce_loss_7": 3.262023079395294, "epoch": 0.59, "grad_norm": 20.136107811495098, "kl_loss_13": 3337.2, "kl_loss_26": 2377.0, "kl_loss_39": 1333.6, "kl_loss_7": 3918.4, "learning_rate": 0.0003667630931549826, "loss": 5547.1, "step": 5900 }, { "ce_loss_13": 3.1185491621494292, "ce_loss_26": 2.6609552204608917, "ce_loss_39": 2.154519048333168, "ce_loss_52": 1.4638203248381614, "ce_loss_7": 3.3888748466968535, "epoch": 0.591, "grad_norm": 21.04398596009474, "kl_loss_13": 3422.0, "kl_loss_26": 2461.2, "kl_loss_39": 1398.6, "kl_loss_7": 3996.8, "learning_rate": 0.00036523447391996613, "loss": 5529.25, "step": 5910 }, { "ce_loss_13": 3.01438627243042, "ce_loss_26": 2.5740538477897643, "ce_loss_39": 2.0915403455495833, "ce_loss_52": 1.431156474351883, "ce_loss_7": 3.2843648850917817, "epoch": 0.592, "grad_norm": 21.11836249464285, "kl_loss_13": 3292.4, "kl_loss_26": 2358.6, "kl_loss_39": 1331.3, "kl_loss_7": 3853.2, "learning_rate": 0.00036370721177230114, "loss": 5565.45, "step": 5920 }, { "ce_loss_13": 3.069197976589203, "ce_loss_26": 2.619757717847824, "ce_loss_39": 2.1231694549322127, "ce_loss_52": 1.4499645620584487, "ce_loss_7": 3.339762735366821, "epoch": 0.593, "grad_norm": 19.81588610039522, "kl_loss_13": 3318.4, "kl_loss_26": 2373.0, "kl_loss_39": 1346.8, "kl_loss_7": 3881.2, "learning_rate": 0.00036218132209150044, "loss": 5483.7, "step": 5930 }, { "ce_loss_13": 3.0346123337745667, "ce_loss_26": 2.5827606499195097, "ce_loss_39": 2.090969371795654, "ce_loss_52": 1.4306629955768586, "ce_loss_7": 3.3081043541431425, "epoch": 0.594, "grad_norm": 20.579777117155064, "kl_loss_13": 3338.4, "kl_loss_26": 2404.8, "kl_loss_39": 1361.4, "kl_loss_7": 3897.6, "learning_rate": 0.0003606568202432562, "loss": 5508.2, "step": 5940 }, { "ce_loss_13": 2.9628920376300814, "ce_loss_26": 2.5103672593832016, "ce_loss_39": 2.028717193007469, "ce_loss_52": 1.3798889175057412, "ce_loss_7": 3.238497519493103, "epoch": 0.595, "grad_norm": 19.336805098702218, "kl_loss_13": 3271.2, "kl_loss_26": 2333.8, "kl_loss_39": 1324.3, "kl_loss_7": 3845.6, "learning_rate": 0.0003591337215792851, "loss": 5512.3, "step": 5950 }, { "ce_loss_13": 3.0788431644439695, "ce_loss_26": 2.608420321345329, "ce_loss_39": 2.1091116696596144, "ce_loss_52": 1.4199562400579453, "ce_loss_7": 3.359624499082565, "epoch": 0.596, "grad_norm": 20.533566806186286, "kl_loss_13": 3426.4, "kl_loss_26": 2447.8, "kl_loss_39": 1399.7, "kl_loss_7": 4004.4, "learning_rate": 0.00035761204143717383, "loss": 5551.7, "step": 5960 }, { "ce_loss_13": 3.03274342417717, "ce_loss_26": 2.585944026708603, "ce_loss_39": 2.096950164437294, "ce_loss_52": 1.416708105802536, "ce_loss_7": 3.3088025748729706, "epoch": 0.597, "grad_norm": 19.695670321869642, "kl_loss_13": 3339.2, "kl_loss_26": 2400.2, "kl_loss_39": 1368.0, "kl_loss_7": 3908.0, "learning_rate": 0.0003560917951402245, "loss": 5483.9, "step": 5970 }, { "ce_loss_13": 3.01766916513443, "ce_loss_26": 2.561279395222664, "ce_loss_39": 2.06943539083004, "ce_loss_52": 1.418130737543106, "ce_loss_7": 3.2899491429328918, "epoch": 0.598, "grad_norm": 21.072429989583483, "kl_loss_13": 3344.4, "kl_loss_26": 2382.6, "kl_loss_39": 1331.0, "kl_loss_7": 3913.6, "learning_rate": 0.00035457299799730046, "loss": 5551.3, "step": 5980 }, { "ce_loss_13": 3.0659261345863342, "ce_loss_26": 2.600133925676346, "ce_loss_39": 2.110896447300911, "ce_loss_52": 1.437103909254074, "ce_loss_7": 3.3336906135082245, "epoch": 0.599, "grad_norm": 24.325101698795564, "kl_loss_13": 3380.0, "kl_loss_26": 2416.4, "kl_loss_39": 1376.0, "kl_loss_7": 3938.8, "learning_rate": 0.0003530556653026721, "loss": 5495.3, "step": 5990 }, { "ce_loss_13": 3.0716091096401215, "ce_loss_26": 2.614444798231125, "ce_loss_39": 2.1203446626663207, "ce_loss_52": 1.4388466402888298, "ce_loss_7": 3.3488605439662935, "epoch": 0.6, "grad_norm": 20.521363612152747, "kl_loss_13": 3384.4, "kl_loss_26": 2432.4, "kl_loss_39": 1382.6, "kl_loss_7": 3959.2, "learning_rate": 0.00035153981233586274, "loss": 5545.9, "step": 6000 }, { "ce_loss_13": 3.106086379289627, "ce_loss_26": 2.6430875420570374, "ce_loss_39": 2.1337583631277086, "ce_loss_52": 1.4439469754695893, "ce_loss_7": 3.3797987580299376, "epoch": 0.601, "grad_norm": 20.80497849503253, "kl_loss_13": 3453.6, "kl_loss_26": 2484.8, "kl_loss_39": 1398.0, "kl_loss_7": 4028.4, "learning_rate": 0.00035002545436149473, "loss": 5491.4, "step": 6010 }, { "ce_loss_13": 3.000385183095932, "ce_loss_26": 2.5445862293243406, "ce_loss_39": 2.045133265852928, "ce_loss_52": 1.387436880171299, "ce_loss_7": 3.271744018793106, "epoch": 0.602, "grad_norm": 32.49284937945306, "kl_loss_13": 3335.6, "kl_loss_26": 2380.8, "kl_loss_39": 1330.6, "kl_loss_7": 3907.2, "learning_rate": 0.0003485126066291364, "loss": 5483.7, "step": 6020 }, { "ce_loss_13": 3.0256562530994415, "ce_loss_26": 2.577129301428795, "ce_loss_39": 2.0889712274074554, "ce_loss_52": 1.4449194520711899, "ce_loss_7": 3.2831216752529144, "epoch": 0.603, "grad_norm": 20.23621815480592, "kl_loss_13": 3272.0, "kl_loss_26": 2339.4, "kl_loss_39": 1312.8, "kl_loss_7": 3817.2, "learning_rate": 0.0003470012843731476, "loss": 5461.5, "step": 6030 }, { "ce_loss_13": 3.0063592195510864, "ce_loss_26": 2.5461477816104887, "ce_loss_39": 2.047996437549591, "ce_loss_52": 1.4004468455910684, "ce_loss_7": 3.2747744262218474, "epoch": 0.604, "grad_norm": 20.161854907584466, "kl_loss_13": 3331.2, "kl_loss_26": 2365.2, "kl_loss_39": 1318.3, "kl_loss_7": 3894.4, "learning_rate": 0.00034549150281252633, "loss": 5446.25, "step": 6040 }, { "ce_loss_13": 3.0043693661689757, "ce_loss_26": 2.5464318215847017, "ce_loss_39": 2.054910770058632, "ce_loss_52": 1.402955588698387, "ce_loss_7": 3.277425891160965, "epoch": 0.605, "grad_norm": 19.39479695015959, "kl_loss_13": 3303.6, "kl_loss_26": 2362.4, "kl_loss_39": 1315.5, "kl_loss_7": 3867.2, "learning_rate": 0.0003439832771507565, "loss": 5513.7, "step": 6050 }, { "ce_loss_13": 3.0073243379592896, "ce_loss_26": 2.552249348163605, "ce_loss_39": 2.0556343287229537, "ce_loss_52": 1.4056400418281556, "ce_loss_7": 3.276465517282486, "epoch": 0.606, "grad_norm": 20.632328630604196, "kl_loss_13": 3315.2, "kl_loss_26": 2367.2, "kl_loss_39": 1320.7, "kl_loss_7": 3890.0, "learning_rate": 0.0003424766225756537, "loss": 5437.2, "step": 6060 }, { "ce_loss_13": 3.0342160046100615, "ce_loss_26": 2.5702687412500382, "ce_loss_39": 2.0748631983995436, "ce_loss_52": 1.4053118824958801, "ce_loss_7": 3.30224769115448, "epoch": 0.607, "grad_norm": 20.381462378536757, "kl_loss_13": 3375.6, "kl_loss_26": 2412.6, "kl_loss_39": 1368.5, "kl_loss_7": 3938.0, "learning_rate": 0.00034097155425921255, "loss": 5453.95, "step": 6070 }, { "ce_loss_13": 2.979208827018738, "ce_loss_26": 2.5391657948493958, "ce_loss_39": 2.0675252109766005, "ce_loss_52": 1.4293861359357833, "ce_loss_7": 3.243548810482025, "epoch": 0.608, "grad_norm": 20.059894251099692, "kl_loss_13": 3201.6, "kl_loss_26": 2277.2, "kl_loss_39": 1276.3, "kl_loss_7": 3759.2, "learning_rate": 0.0003394680873574546, "loss": 5463.6, "step": 6080 }, { "ce_loss_13": 3.0087065517902376, "ce_loss_26": 2.5543720543384554, "ce_loss_39": 2.0706711769104005, "ce_loss_52": 1.4312650561332703, "ce_loss_7": 3.274740469455719, "epoch": 0.609, "grad_norm": 19.682020137215275, "kl_loss_13": 3278.0, "kl_loss_26": 2334.8, "kl_loss_39": 1311.6, "kl_loss_7": 3829.2, "learning_rate": 0.0003379662370102747, "loss": 5485.1, "step": 6090 }, { "ce_loss_13": 2.9708913624286652, "ce_loss_26": 2.526490569114685, "ce_loss_39": 2.045279270410538, "ce_loss_52": 1.406798492372036, "ce_loss_7": 3.2348900735378265, "epoch": 0.61, "grad_norm": 20.816822332244506, "kl_loss_13": 3220.0, "kl_loss_26": 2302.2, "kl_loss_39": 1290.7, "kl_loss_7": 3776.8, "learning_rate": 0.0003364660183412892, "loss": 5434.0, "step": 6100 }, { "ce_loss_13": 3.034948408603668, "ce_loss_26": 2.5749203741550444, "ce_loss_39": 2.083053132891655, "ce_loss_52": 1.4239173114299775, "ce_loss_7": 3.308141976594925, "epoch": 0.611, "grad_norm": 19.1619951590758, "kl_loss_13": 3322.8, "kl_loss_26": 2375.8, "kl_loss_39": 1335.6, "kl_loss_7": 3891.2, "learning_rate": 0.0003349674464576834, "loss": 5422.4, "step": 6110 }, { "ce_loss_13": 3.004230409860611, "ce_loss_26": 2.559123533964157, "ce_loss_39": 2.0680998235940935, "ce_loss_52": 1.416846913099289, "ce_loss_7": 3.268228167295456, "epoch": 0.612, "grad_norm": 19.814048572070753, "kl_loss_13": 3277.2, "kl_loss_26": 2352.2, "kl_loss_39": 1329.4, "kl_loss_7": 3827.6, "learning_rate": 0.00033347053645005966, "loss": 5408.25, "step": 6120 }, { "ce_loss_13": 3.043248528242111, "ce_loss_26": 2.5893827825784683, "ce_loss_39": 2.106617513298988, "ce_loss_52": 1.4531341344118118, "ce_loss_7": 3.3129510939121247, "epoch": 0.613, "grad_norm": 20.76807127463998, "kl_loss_13": 3286.4, "kl_loss_26": 2356.0, "kl_loss_39": 1332.4, "kl_loss_7": 3850.8, "learning_rate": 0.00033197530339228485, "loss": 5370.55, "step": 6130 }, { "ce_loss_13": 3.00436030626297, "ce_loss_26": 2.551770430803299, "ce_loss_39": 2.0617963910102843, "ce_loss_52": 1.4093145355582237, "ce_loss_7": 3.287700629234314, "epoch": 0.614, "grad_norm": 20.13277338184701, "kl_loss_13": 3319.2, "kl_loss_26": 2374.0, "kl_loss_39": 1331.1, "kl_loss_7": 3899.6, "learning_rate": 0.00033048176234133967, "loss": 5468.4, "step": 6140 }, { "ce_loss_13": 3.0555618822574617, "ce_loss_26": 2.6027767241001127, "ce_loss_39": 2.1256050765514374, "ce_loss_52": 1.4570606127381325, "ce_loss_7": 3.3252673983573913, "epoch": 0.615, "grad_norm": 20.812161363431123, "kl_loss_13": 3324.4, "kl_loss_26": 2376.0, "kl_loss_39": 1350.9, "kl_loss_7": 3884.4, "learning_rate": 0.0003289899283371657, "loss": 5469.45, "step": 6150 }, { "ce_loss_13": 2.9699956268072127, "ce_loss_26": 2.5214530378580093, "ce_loss_39": 2.047743359208107, "ce_loss_52": 1.404917973279953, "ce_loss_7": 3.242190235853195, "epoch": 0.616, "grad_norm": 21.707368727671295, "kl_loss_13": 3253.6, "kl_loss_26": 2330.0, "kl_loss_39": 1321.7, "kl_loss_7": 3830.4, "learning_rate": 0.0003274998164025148, "loss": 5448.5, "step": 6160 }, { "ce_loss_13": 3.1136968553066255, "ce_loss_26": 2.6612686932086946, "ce_loss_39": 2.1661961168050765, "ce_loss_52": 1.4886637568473815, "ce_loss_7": 3.390312284231186, "epoch": 0.617, "grad_norm": 20.098473719799063, "kl_loss_13": 3356.0, "kl_loss_26": 2410.6, "kl_loss_39": 1369.8, "kl_loss_7": 3926.4, "learning_rate": 0.0003260114415427975, "loss": 5420.0, "step": 6170 }, { "ce_loss_13": 3.0537400960922243, "ce_loss_26": 2.6002777397632597, "ce_loss_39": 2.0997320264577866, "ce_loss_52": 1.425080481171608, "ce_loss_7": 3.3301878392696382, "epoch": 0.618, "grad_norm": 20.959268913691595, "kl_loss_13": 3376.8, "kl_loss_26": 2431.0, "kl_loss_39": 1376.8, "kl_loss_7": 3956.0, "learning_rate": 0.0003245248187459323, "loss": 5467.5, "step": 6180 }, { "ce_loss_13": 3.0195475459098815, "ce_loss_26": 2.5703806400299074, "ce_loss_39": 2.0828246504068373, "ce_loss_52": 1.4389306217432023, "ce_loss_7": 3.2870283126831055, "epoch": 0.619, "grad_norm": 19.77508773242922, "kl_loss_13": 3278.4, "kl_loss_26": 2325.6, "kl_loss_39": 1301.5, "kl_loss_7": 3832.0, "learning_rate": 0.00032303996298219416, "loss": 5436.5, "step": 6190 }, { "ce_loss_13": 3.080602079629898, "ce_loss_26": 2.6279105126857756, "ce_loss_39": 2.1348745226860046, "ce_loss_52": 1.4668798118829727, "ce_loss_7": 3.349226105213165, "epoch": 0.62, "grad_norm": 20.105835424543073, "kl_loss_13": 3342.4, "kl_loss_26": 2402.4, "kl_loss_39": 1358.2, "kl_loss_7": 3903.6, "learning_rate": 0.00032155688920406414, "loss": 5427.1, "step": 6200 }, { "ce_loss_13": 2.980887794494629, "ce_loss_26": 2.5403966814279557, "ce_loss_39": 2.06268994808197, "ce_loss_52": 1.4154802724719047, "ce_loss_7": 3.252926254272461, "epoch": 0.621, "grad_norm": 19.282334771133197, "kl_loss_13": 3252.8, "kl_loss_26": 2327.6, "kl_loss_39": 1309.6, "kl_loss_7": 3818.4, "learning_rate": 0.0003200756123460788, "loss": 5482.55, "step": 6210 }, { "ce_loss_13": 2.9776609361171724, "ce_loss_26": 2.5325854122638702, "ce_loss_39": 2.0461337983608248, "ce_loss_52": 1.410691213607788, "ce_loss_7": 3.24890678524971, "epoch": 0.622, "grad_norm": 20.106994551943693, "kl_loss_13": 3252.0, "kl_loss_26": 2326.2, "kl_loss_39": 1296.8, "kl_loss_7": 3812.4, "learning_rate": 0.00031859614732467957, "loss": 5416.4, "step": 6220 }, { "ce_loss_13": 3.032334786653519, "ce_loss_26": 2.579844218492508, "ce_loss_39": 2.1053399711847307, "ce_loss_52": 1.4573373839259147, "ce_loss_7": 3.2939690172672274, "epoch": 0.623, "grad_norm": 19.37466323417316, "kl_loss_13": 3254.0, "kl_loss_26": 2315.6, "kl_loss_39": 1307.4, "kl_loss_7": 3800.0, "learning_rate": 0.00031711850903806275, "loss": 5384.0, "step": 6230 }, { "ce_loss_13": 3.0033844828605654, "ce_loss_26": 2.554124391078949, "ce_loss_39": 2.072583147883415, "ce_loss_52": 1.405614359676838, "ce_loss_7": 3.272351396083832, "epoch": 0.624, "grad_norm": 19.68798944815722, "kl_loss_13": 3318.8, "kl_loss_26": 2383.4, "kl_loss_39": 1353.1, "kl_loss_7": 3878.4, "learning_rate": 0.0003156427123660297, "loss": 5409.1, "step": 6240 }, { "ce_loss_13": 3.044550156593323, "ce_loss_26": 2.5933004200458525, "ce_loss_39": 2.0951038181781767, "ce_loss_52": 1.432724517583847, "ce_loss_7": 3.31173922419548, "epoch": 0.625, "grad_norm": 19.588532833188335, "kl_loss_13": 3324.0, "kl_loss_26": 2390.2, "kl_loss_39": 1351.9, "kl_loss_7": 3884.8, "learning_rate": 0.0003141687721698363, "loss": 5410.2, "step": 6250 }, { "ce_loss_13": 3.0133075952529906, "ce_loss_26": 2.562904554605484, "ce_loss_39": 2.084453445672989, "ce_loss_52": 1.4354033678770066, "ce_loss_7": 3.2846658766269683, "epoch": 0.626, "grad_norm": 20.175418509715428, "kl_loss_13": 3256.4, "kl_loss_26": 2322.2, "kl_loss_39": 1302.4, "kl_loss_7": 3820.0, "learning_rate": 0.00031269670329204396, "loss": 5413.4, "step": 6260 }, { "ce_loss_13": 3.020740455389023, "ce_loss_26": 2.576273998618126, "ce_loss_39": 2.094499522447586, "ce_loss_52": 1.46863095164299, "ce_loss_7": 3.2838824689388275, "epoch": 0.627, "grad_norm": 19.159006125141875, "kl_loss_13": 3217.6, "kl_loss_26": 2295.2, "kl_loss_39": 1276.9, "kl_loss_7": 3769.2, "learning_rate": 0.00031122652055637015, "loss": 5419.65, "step": 6270 }, { "ce_loss_13": 2.9717007994651796, "ce_loss_26": 2.5309597969055178, "ce_loss_39": 2.045916485786438, "ce_loss_52": 1.4202698469161987, "ce_loss_7": 3.244756191968918, "epoch": 0.628, "grad_norm": 20.34275756584657, "kl_loss_13": 3237.2, "kl_loss_26": 2312.2, "kl_loss_39": 1284.9, "kl_loss_7": 3804.0, "learning_rate": 0.0003097582387675385, "loss": 5361.6, "step": 6280 }, { "ce_loss_13": 2.959231287240982, "ce_loss_26": 2.5172866880893707, "ce_loss_39": 2.043312183022499, "ce_loss_52": 1.4159017190337182, "ce_loss_7": 3.220345306396484, "epoch": 0.629, "grad_norm": 20.150529720020295, "kl_loss_13": 3215.2, "kl_loss_26": 2296.4, "kl_loss_39": 1276.2, "kl_loss_7": 3763.2, "learning_rate": 0.00030829187271113034, "loss": 5363.8, "step": 6290 }, { "ce_loss_13": 2.9990767776966094, "ce_loss_26": 2.5505200415849685, "ce_loss_39": 2.060671201348305, "ce_loss_52": 1.395907147228718, "ce_loss_7": 3.2725743770599367, "epoch": 0.63, "grad_norm": 19.518127150050482, "kl_loss_13": 3309.2, "kl_loss_26": 2380.2, "kl_loss_39": 1347.4, "kl_loss_7": 3878.4, "learning_rate": 0.00030682743715343565, "loss": 5435.15, "step": 6300 }, { "ce_loss_13": 3.080632323026657, "ce_loss_26": 2.6302684545516968, "ce_loss_39": 2.135207986831665, "ce_loss_52": 1.4774031162261962, "ce_loss_7": 3.3441965878009796, "epoch": 0.631, "grad_norm": 21.06860944259249, "kl_loss_13": 3302.8, "kl_loss_26": 2366.6, "kl_loss_39": 1332.6, "kl_loss_7": 3862.8, "learning_rate": 0.0003053649468413043, "loss": 5425.25, "step": 6310 }, { "ce_loss_13": 3.0259739339351652, "ce_loss_26": 2.570155072212219, "ce_loss_39": 2.078566926717758, "ce_loss_52": 1.4384133130311967, "ce_loss_7": 3.288107806444168, "epoch": 0.632, "grad_norm": 20.78357258068192, "kl_loss_13": 3282.0, "kl_loss_26": 2335.8, "kl_loss_39": 1293.6, "kl_loss_7": 3833.2, "learning_rate": 0.00030390441650199725, "loss": 5412.2, "step": 6320 }, { "ce_loss_13": 2.9379296779632567, "ce_loss_26": 2.49056881070137, "ce_loss_39": 2.0068995296955108, "ce_loss_52": 1.3864078581333161, "ce_loss_7": 3.203891623020172, "epoch": 0.633, "grad_norm": 20.174137035703254, "kl_loss_13": 3206.8, "kl_loss_26": 2268.4, "kl_loss_39": 1251.1, "kl_loss_7": 3762.4, "learning_rate": 0.00030244586084303903, "loss": 5352.9, "step": 6330 }, { "ce_loss_13": 2.9555823683738707, "ce_loss_26": 2.501230263710022, "ce_loss_39": 2.0173334002494814, "ce_loss_52": 1.3891687452793122, "ce_loss_7": 3.2268748760223387, "epoch": 0.634, "grad_norm": 20.047186620123167, "kl_loss_13": 3264.8, "kl_loss_26": 2316.6, "kl_loss_39": 1274.1, "kl_loss_7": 3843.2, "learning_rate": 0.00030098929455206903, "loss": 5365.4, "step": 6340 }, { "ce_loss_13": 2.9806570291519163, "ce_loss_26": 2.533248084783554, "ce_loss_39": 2.0557660490274428, "ce_loss_52": 1.4183998316526414, "ce_loss_7": 3.248631852865219, "epoch": 0.635, "grad_norm": 19.42792815463783, "kl_loss_13": 3226.0, "kl_loss_26": 2301.0, "kl_loss_39": 1289.2, "kl_loss_7": 3784.4, "learning_rate": 0.00029953473229669324, "loss": 5429.0, "step": 6350 }, { "ce_loss_13": 3.008418655395508, "ce_loss_26": 2.5531763255596163, "ce_loss_39": 2.069964846968651, "ce_loss_52": 1.4419535219669342, "ce_loss_7": 3.274876070022583, "epoch": 0.636, "grad_norm": 20.040461482265158, "kl_loss_13": 3248.4, "kl_loss_26": 2315.4, "kl_loss_39": 1287.5, "kl_loss_7": 3802.8, "learning_rate": 0.00029808218872433767, "loss": 5390.5, "step": 6360 }, { "ce_loss_13": 2.9624147057533263, "ce_loss_26": 2.509669789671898, "ce_loss_39": 2.017057329416275, "ce_loss_52": 1.392769531905651, "ce_loss_7": 3.2405923306941986, "epoch": 0.637, "grad_norm": 19.969518630784094, "kl_loss_13": 3281.6, "kl_loss_26": 2321.8, "kl_loss_39": 1284.8, "kl_loss_7": 3857.6, "learning_rate": 0.0002966316784621, "loss": 5344.4, "step": 6370 }, { "ce_loss_13": 2.9690242230892183, "ce_loss_26": 2.5119642555713653, "ce_loss_39": 2.022391200065613, "ce_loss_52": 1.3881384432315826, "ce_loss_7": 3.2436072409152983, "epoch": 0.638, "grad_norm": 19.443805471212187, "kl_loss_13": 3260.4, "kl_loss_26": 2314.8, "kl_loss_39": 1281.4, "kl_loss_7": 3835.6, "learning_rate": 0.0002951832161166024, "loss": 5333.0, "step": 6380 }, { "ce_loss_13": 3.019889771938324, "ce_loss_26": 2.574093183875084, "ce_loss_39": 2.085198149085045, "ce_loss_52": 1.4531068801879883, "ce_loss_7": 3.286811703443527, "epoch": 0.639, "grad_norm": 19.50308932074232, "kl_loss_13": 3246.0, "kl_loss_26": 2313.6, "kl_loss_39": 1281.1, "kl_loss_7": 3800.8, "learning_rate": 0.0002937368162738445, "loss": 5358.7, "step": 6390 }, { "ce_loss_13": 2.979416298866272, "ce_loss_26": 2.5201680839061735, "ce_loss_39": 2.0343465119600297, "ce_loss_52": 1.4125050336122513, "ce_loss_7": 3.2494026124477386, "epoch": 0.64, "grad_norm": 19.887826117374196, "kl_loss_13": 3261.2, "kl_loss_26": 2302.8, "kl_loss_39": 1277.2, "kl_loss_7": 3827.6, "learning_rate": 0.0002922924934990568, "loss": 5361.0, "step": 6400 }, { "ce_loss_13": 2.966394138336182, "ce_loss_26": 2.5159747898578644, "ce_loss_39": 2.0343314677476885, "ce_loss_52": 1.3983295410871506, "ce_loss_7": 3.2374337732791902, "epoch": 0.641, "grad_norm": 21.205292313379594, "kl_loss_13": 3264.4, "kl_loss_26": 2317.8, "kl_loss_39": 1301.9, "kl_loss_7": 3828.8, "learning_rate": 0.0002908502623365536, "loss": 5348.95, "step": 6410 }, { "ce_loss_13": 3.0186184704303742, "ce_loss_26": 2.5620053589344023, "ce_loss_39": 2.079856187105179, "ce_loss_52": 1.437650018930435, "ce_loss_7": 3.2890258550643923, "epoch": 0.642, "grad_norm": 20.335253629932936, "kl_loss_13": 3264.4, "kl_loss_26": 2325.4, "kl_loss_39": 1302.1, "kl_loss_7": 3830.0, "learning_rate": 0.0002894101373095867, "loss": 5303.0, "step": 6420 }, { "ce_loss_13": 3.0677368700504304, "ce_loss_26": 2.6209602475166323, "ce_loss_39": 2.1306197196245193, "ce_loss_52": 1.4929826736450196, "ce_loss_7": 3.3314808785915373, "epoch": 0.643, "grad_norm": 20.265575824381624, "kl_loss_13": 3289.6, "kl_loss_26": 2355.4, "kl_loss_39": 1318.2, "kl_loss_7": 3835.2, "learning_rate": 0.00028797213292019926, "loss": 5380.85, "step": 6430 }, { "ce_loss_13": 2.966922277212143, "ce_loss_26": 2.51933411359787, "ce_loss_39": 2.040864047408104, "ce_loss_52": 1.4254619121551513, "ce_loss_7": 3.232788211107254, "epoch": 0.644, "grad_norm": 19.660532004484757, "kl_loss_13": 3209.2, "kl_loss_26": 2273.0, "kl_loss_39": 1260.8, "kl_loss_7": 3770.0, "learning_rate": 0.0002865362636490791, "loss": 5309.3, "step": 6440 }, { "ce_loss_13": 3.0011440992355345, "ce_loss_26": 2.5491324365139008, "ce_loss_39": 2.057476672530174, "ce_loss_52": 1.4156641319394112, "ce_loss_7": 3.2698193073272703, "epoch": 0.645, "grad_norm": 20.528114278014176, "kl_loss_13": 3284.8, "kl_loss_26": 2344.4, "kl_loss_39": 1307.4, "kl_loss_7": 3853.6, "learning_rate": 0.0002851025439554142, "loss": 5329.3, "step": 6450 }, { "ce_loss_13": 3.0585977435112, "ce_loss_26": 2.593171867728233, "ce_loss_39": 2.1089387238025665, "ce_loss_52": 1.4590320155024528, "ce_loss_7": 3.3243721425533295, "epoch": 0.646, "grad_norm": 19.177895864651298, "kl_loss_13": 3308.0, "kl_loss_26": 2342.6, "kl_loss_39": 1306.8, "kl_loss_7": 3858.4, "learning_rate": 0.00028367098827674573, "loss": 5399.4, "step": 6460 }, { "ce_loss_13": 3.0068358182907104, "ce_loss_26": 2.559490966796875, "ce_loss_39": 2.0818791508674623, "ce_loss_52": 1.4510639190673829, "ce_loss_7": 3.2726912021636965, "epoch": 0.647, "grad_norm": 20.001351203231668, "kl_loss_13": 3231.2, "kl_loss_26": 2299.0, "kl_loss_39": 1281.6, "kl_loss_7": 3785.2, "learning_rate": 0.00028224161102882397, "loss": 5353.5, "step": 6470 }, { "ce_loss_13": 3.0042510509490965, "ce_loss_26": 2.545697581768036, "ce_loss_39": 2.0517130315303804, "ce_loss_52": 1.4082761898636817, "ce_loss_7": 3.2736884713172913, "epoch": 0.648, "grad_norm": 19.176592480543317, "kl_loss_13": 3304.8, "kl_loss_26": 2344.6, "kl_loss_39": 1308.6, "kl_loss_7": 3870.8, "learning_rate": 0.00028081442660546124, "loss": 5357.45, "step": 6480 }, { "ce_loss_13": 2.9604024648666383, "ce_loss_26": 2.5067259430885316, "ce_loss_39": 2.0217309921979902, "ce_loss_52": 1.396033638715744, "ce_loss_7": 3.229094612598419, "epoch": 0.649, "grad_norm": 20.132685826085943, "kl_loss_13": 3230.8, "kl_loss_26": 2292.8, "kl_loss_39": 1272.4, "kl_loss_7": 3788.8, "learning_rate": 0.0002793894493783892, "loss": 5337.2, "step": 6490 }, { "ce_loss_13": 3.038835954666138, "ce_loss_26": 2.5828086912631987, "ce_loss_39": 2.094580352306366, "ce_loss_52": 1.4504825562238692, "ce_loss_7": 3.3101982474327087, "epoch": 0.65, "grad_norm": 20.26204607317675, "kl_loss_13": 3292.0, "kl_loss_26": 2347.2, "kl_loss_39": 1312.5, "kl_loss_7": 3857.2, "learning_rate": 0.0002779666936971129, "loss": 5341.9, "step": 6500 }, { "ce_loss_13": 2.981122875213623, "ce_loss_26": 2.5436301648616793, "ce_loss_39": 2.06348480284214, "ce_loss_52": 1.4388723462820052, "ce_loss_7": 3.2536712110042574, "epoch": 0.651, "grad_norm": 19.527282619134503, "kl_loss_13": 3203.2, "kl_loss_26": 2279.0, "kl_loss_39": 1260.5, "kl_loss_7": 3763.6, "learning_rate": 0.00027654617388876614, "loss": 5303.55, "step": 6510 }, { "ce_loss_13": 2.985329604148865, "ce_loss_26": 2.5429716140031813, "ce_loss_39": 2.0698666363954543, "ce_loss_52": 1.43270433396101, "ce_loss_7": 3.2484419345855713, "epoch": 0.652, "grad_norm": 19.361430489337188, "kl_loss_13": 3187.2, "kl_loss_26": 2277.2, "kl_loss_39": 1278.4, "kl_loss_7": 3732.4, "learning_rate": 0.0002751279042579672, "loss": 5316.3, "step": 6520 }, { "ce_loss_13": 2.9774204194545746, "ce_loss_26": 2.5208486020565033, "ce_loss_39": 2.0273024052381516, "ce_loss_52": 1.402597150206566, "ce_loss_7": 3.249239844083786, "epoch": 0.653, "grad_norm": 19.527542591573294, "kl_loss_13": 3262.4, "kl_loss_26": 2313.8, "kl_loss_39": 1281.3, "kl_loss_7": 3834.0, "learning_rate": 0.00027371189908667604, "loss": 5336.1, "step": 6530 }, { "ce_loss_13": 3.003592276573181, "ce_loss_26": 2.5556287467479706, "ce_loss_39": 2.075370451807976, "ce_loss_52": 1.4352567225694657, "ce_loss_7": 3.271352219581604, "epoch": 0.654, "grad_norm": 19.924679150402795, "kl_loss_13": 3252.0, "kl_loss_26": 2323.8, "kl_loss_39": 1299.3, "kl_loss_7": 3811.2, "learning_rate": 0.00027229817263404863, "loss": 5288.8, "step": 6540 }, { "ce_loss_13": 2.995857471227646, "ce_loss_26": 2.5282726138830185, "ce_loss_39": 2.0456599622964857, "ce_loss_52": 1.4150889962911606, "ce_loss_7": 3.259833812713623, "epoch": 0.655, "grad_norm": 19.56917393810092, "kl_loss_13": 3242.4, "kl_loss_26": 2291.2, "kl_loss_39": 1266.0, "kl_loss_7": 3806.4, "learning_rate": 0.0002708867391362948, "loss": 5328.1, "step": 6550 }, { "ce_loss_13": 3.004334282875061, "ce_loss_26": 2.553117799758911, "ce_loss_39": 2.0656849920749663, "ce_loss_52": 1.4380108654499053, "ce_loss_7": 3.275963246822357, "epoch": 0.656, "grad_norm": 19.95397617147254, "kl_loss_13": 3232.8, "kl_loss_26": 2292.0, "kl_loss_39": 1262.4, "kl_loss_7": 3798.0, "learning_rate": 0.0002694776128065345, "loss": 5289.15, "step": 6560 }, { "ce_loss_13": 3.0339253902435304, "ce_loss_26": 2.5763088524341584, "ce_loss_39": 2.0788813173770904, "ce_loss_52": 1.4449120432138443, "ce_loss_7": 3.3040917217731476, "epoch": 0.657, "grad_norm": 20.212973328584127, "kl_loss_13": 3268.4, "kl_loss_26": 2314.6, "kl_loss_39": 1279.8, "kl_loss_7": 3840.8, "learning_rate": 0.00026807080783465374, "loss": 5293.2, "step": 6570 }, { "ce_loss_13": 3.032737511396408, "ce_loss_26": 2.5693029284477236, "ce_loss_39": 2.081709760427475, "ce_loss_52": 1.4346143543720244, "ce_loss_7": 3.308923304080963, "epoch": 0.658, "grad_norm": 19.880750606313658, "kl_loss_13": 3330.4, "kl_loss_26": 2365.8, "kl_loss_39": 1316.5, "kl_loss_7": 3904.0, "learning_rate": 0.00026666633838716316, "loss": 5330.1, "step": 6580 }, { "ce_loss_13": 3.0193063259124755, "ce_loss_26": 2.576409709453583, "ce_loss_39": 2.0992994725704195, "ce_loss_52": 1.4646779403090477, "ce_loss_7": 3.285114985704422, "epoch": 0.659, "grad_norm": 20.363457370030773, "kl_loss_13": 3237.2, "kl_loss_26": 2313.0, "kl_loss_39": 1303.8, "kl_loss_7": 3788.4, "learning_rate": 0.00026526421860705474, "loss": 5307.4, "step": 6590 }, { "ce_loss_13": 2.9948873639106752, "ce_loss_26": 2.563684010505676, "ce_loss_39": 2.0847960352897643, "ce_loss_52": 1.4657811507582665, "ce_loss_7": 3.2635042905807494, "epoch": 0.66, "grad_norm": 20.73724151705583, "kl_loss_13": 3176.8, "kl_loss_26": 2267.2, "kl_loss_39": 1259.0, "kl_loss_7": 3732.8, "learning_rate": 0.0002638644626136587, "loss": 5326.5, "step": 6600 }, { "ce_loss_13": 3.007009822130203, "ce_loss_26": 2.5652425408363344, "ce_loss_39": 2.085008403658867, "ce_loss_52": 1.4418020695447922, "ce_loss_7": 3.2768814861774445, "epoch": 0.661, "grad_norm": 19.59865222649701, "kl_loss_13": 3192.4, "kl_loss_26": 2272.6, "kl_loss_39": 1265.8, "kl_loss_7": 3758.8, "learning_rate": 0.00026246708450250255, "loss": 5252.1, "step": 6610 }, { "ce_loss_13": 3.0007415533065798, "ce_loss_26": 2.57885719537735, "ce_loss_39": 2.1149094998836517, "ce_loss_52": 1.4889407217502595, "ce_loss_7": 3.26429398059845, "epoch": 0.662, "grad_norm": 19.791378915341742, "kl_loss_13": 3154.4, "kl_loss_26": 2261.8, "kl_loss_39": 1265.9, "kl_loss_7": 3702.8, "learning_rate": 0.00026107209834516854, "loss": 5253.75, "step": 6620 }, { "ce_loss_13": 3.023489362001419, "ce_loss_26": 2.565022760629654, "ce_loss_39": 2.0684966832399367, "ce_loss_52": 1.4181891351938247, "ce_loss_7": 3.295238083600998, "epoch": 0.663, "grad_norm": 18.83951798457028, "kl_loss_13": 3308.8, "kl_loss_26": 2356.8, "kl_loss_39": 1317.4, "kl_loss_7": 3874.4, "learning_rate": 0.0002596795181891514, "loss": 5303.2, "step": 6630 }, { "ce_loss_13": 2.948693299293518, "ce_loss_26": 2.511568069458008, "ce_loss_39": 2.0302646070718766, "ce_loss_52": 1.40511611700058, "ce_loss_7": 3.215245670080185, "epoch": 0.664, "grad_norm": 19.938510106571005, "kl_loss_13": 3190.8, "kl_loss_26": 2276.8, "kl_loss_39": 1269.2, "kl_loss_7": 3747.6, "learning_rate": 0.000258289358057718, "loss": 5355.55, "step": 6640 }, { "ce_loss_13": 2.964204251766205, "ce_loss_26": 2.5196115612983703, "ce_loss_39": 2.032116264104843, "ce_loss_52": 1.397587490081787, "ce_loss_7": 3.234144788980484, "epoch": 0.665, "grad_norm": 19.688819745922057, "kl_loss_13": 3241.6, "kl_loss_26": 2310.8, "kl_loss_39": 1285.8, "kl_loss_7": 3812.0, "learning_rate": 0.0002569016319497657, "loss": 5275.7, "step": 6650 }, { "ce_loss_13": 3.0219891548156737, "ce_loss_26": 2.568303269147873, "ce_loss_39": 2.0779166162014007, "ce_loss_52": 1.4420379608869554, "ce_loss_7": 3.2859797060489653, "epoch": 0.666, "grad_norm": 19.909960186263373, "kl_loss_13": 3259.2, "kl_loss_26": 2308.2, "kl_loss_39": 1277.3, "kl_loss_7": 3822.8, "learning_rate": 0.00025551635383968066, "loss": 5336.5, "step": 6660 }, { "ce_loss_13": 2.994647592306137, "ce_loss_26": 2.5398232668638228, "ce_loss_39": 2.0492498099803926, "ce_loss_52": 1.434773786365986, "ce_loss_7": 3.264130574464798, "epoch": 0.667, "grad_norm": 20.004167920061033, "kl_loss_13": 3214.0, "kl_loss_26": 2272.2, "kl_loss_39": 1256.6, "kl_loss_7": 3778.0, "learning_rate": 0.00025413353767719804, "loss": 5257.5, "step": 6670 }, { "ce_loss_13": 2.96993693113327, "ce_loss_26": 2.533370888233185, "ce_loss_39": 2.055029663443565, "ce_loss_52": 1.4544675678014756, "ce_loss_7": 3.2342797338962557, "epoch": 0.668, "grad_norm": 20.025849444564383, "kl_loss_13": 3142.4, "kl_loss_26": 2230.8, "kl_loss_39": 1222.0, "kl_loss_7": 3698.4, "learning_rate": 0.0002527531973872617, "loss": 5248.5, "step": 6680 }, { "ce_loss_13": 2.9402814984321592, "ce_loss_26": 2.4974717676639555, "ce_loss_39": 2.0207353264093397, "ce_loss_52": 1.4053010821342469, "ce_loss_7": 3.211796945333481, "epoch": 0.669, "grad_norm": 20.57900906104847, "kl_loss_13": 3184.0, "kl_loss_26": 2262.6, "kl_loss_39": 1252.7, "kl_loss_7": 3753.6, "learning_rate": 0.0002513753468698826, "loss": 5296.7, "step": 6690 }, { "ce_loss_13": 3.049540191888809, "ce_loss_26": 2.5854183793067933, "ce_loss_39": 2.092196524143219, "ce_loss_52": 1.4520713061094284, "ce_loss_7": 3.3153574585914614, "epoch": 0.67, "grad_norm": 20.064255813843516, "kl_loss_13": 3277.6, "kl_loss_26": 2320.0, "kl_loss_39": 1288.6, "kl_loss_7": 3837.2, "learning_rate": 0.0002500000000000001, "loss": 5320.3, "step": 6700 }, { "ce_loss_13": 2.944129317998886, "ce_loss_26": 2.503596860170364, "ce_loss_39": 2.0286095440387726, "ce_loss_52": 1.4282706409692765, "ce_loss_7": 3.2095106482505797, "epoch": 0.671, "grad_norm": 20.12603370941262, "kl_loss_13": 3178.8, "kl_loss_26": 2257.6, "kl_loss_39": 1240.0, "kl_loss_7": 3734.8, "learning_rate": 0.0002486271706273421, "loss": 5232.2, "step": 6710 }, { "ce_loss_13": 2.9713922500610352, "ce_loss_26": 2.5247735172510146, "ce_loss_39": 2.050862190127373, "ce_loss_52": 1.449659252166748, "ce_loss_7": 3.2344084203243257, "epoch": 0.672, "grad_norm": 20.72667693004533, "kl_loss_13": 3149.6, "kl_loss_26": 2225.2, "kl_loss_39": 1217.0, "kl_loss_7": 3703.2, "learning_rate": 0.0002472568725762853, "loss": 5273.45, "step": 6720 }, { "ce_loss_13": 2.9712363362312315, "ce_loss_26": 2.5258888751268387, "ce_loss_39": 2.038512706756592, "ce_loss_52": 1.4098822742700576, "ce_loss_7": 3.238377648591995, "epoch": 0.673, "grad_norm": 19.364942978516346, "kl_loss_13": 3241.6, "kl_loss_26": 2313.0, "kl_loss_39": 1283.6, "kl_loss_7": 3804.0, "learning_rate": 0.00024588911964571554, "loss": 5264.25, "step": 6730 }, { "ce_loss_13": 3.0029661655426025, "ce_loss_26": 2.5618400514125823, "ce_loss_39": 2.0779304295778274, "ce_loss_52": 1.4597969472408294, "ce_loss_7": 3.2699401795864107, "epoch": 0.674, "grad_norm": 19.386861595432553, "kl_loss_13": 3201.2, "kl_loss_26": 2283.2, "kl_loss_39": 1260.7, "kl_loss_7": 3760.0, "learning_rate": 0.00024452392560888974, "loss": 5256.1, "step": 6740 }, { "ce_loss_13": 2.9799251735210417, "ce_loss_26": 2.5340505450963975, "ce_loss_39": 2.049144822359085, "ce_loss_52": 1.4137116000056267, "ce_loss_7": 3.257823657989502, "epoch": 0.675, "grad_norm": 19.909504320065746, "kl_loss_13": 3246.4, "kl_loss_26": 2320.6, "kl_loss_39": 1295.2, "kl_loss_7": 3816.4, "learning_rate": 0.00024316130421329695, "loss": 5221.1, "step": 6750 }, { "ce_loss_13": 2.9629843533039093, "ce_loss_26": 2.524860253930092, "ce_loss_39": 2.042747235298157, "ce_loss_52": 1.4334406018257142, "ce_loss_7": 3.2345532715320586, "epoch": 0.676, "grad_norm": 20.369497806638545, "kl_loss_13": 3186.8, "kl_loss_26": 2271.6, "kl_loss_39": 1245.2, "kl_loss_7": 3754.4, "learning_rate": 0.00024180126918051909, "loss": 5236.3, "step": 6760 }, { "ce_loss_13": 2.9732554376125337, "ce_loss_26": 2.5324235647916793, "ce_loss_39": 2.0435447841882706, "ce_loss_52": 1.4181665301322937, "ce_loss_7": 3.2470718741416933, "epoch": 0.677, "grad_norm": 20.49515152965971, "kl_loss_13": 3219.2, "kl_loss_26": 2299.4, "kl_loss_39": 1278.7, "kl_loss_7": 3786.8, "learning_rate": 0.00024044383420609406, "loss": 5319.65, "step": 6770 }, { "ce_loss_13": 2.9884051978588104, "ce_loss_26": 2.552768051624298, "ce_loss_39": 2.0788974314928055, "ce_loss_52": 1.460537651181221, "ce_loss_7": 3.251654601097107, "epoch": 0.678, "grad_norm": 19.11355169384201, "kl_loss_13": 3167.6, "kl_loss_26": 2248.8, "kl_loss_39": 1249.3, "kl_loss_7": 3718.4, "learning_rate": 0.00023908901295937712, "loss": 5270.2, "step": 6780 }, { "ce_loss_13": 2.974563705921173, "ce_loss_26": 2.536205679178238, "ce_loss_39": 2.059639421105385, "ce_loss_52": 1.4519853800535203, "ce_loss_7": 3.2307840466499327, "epoch": 0.679, "grad_norm": 19.49943649381752, "kl_loss_13": 3131.2, "kl_loss_26": 2224.6, "kl_loss_39": 1227.3, "kl_loss_7": 3672.8, "learning_rate": 0.00023773681908340283, "loss": 5293.35, "step": 6790 }, { "ce_loss_13": 2.961294001340866, "ce_loss_26": 2.5129422783851623, "ce_loss_39": 2.0338284403085707, "ce_loss_52": 1.409618005156517, "ce_loss_7": 3.226576966047287, "epoch": 0.68, "grad_norm": 19.714496760455777, "kl_loss_13": 3218.4, "kl_loss_26": 2291.6, "kl_loss_39": 1267.1, "kl_loss_7": 3768.8, "learning_rate": 0.00023638726619474876, "loss": 5250.5, "step": 6800 }, { "ce_loss_13": 3.0715033173561097, "ce_loss_26": 2.626942425966263, "ce_loss_39": 2.1481954157352448, "ce_loss_52": 1.5187551528215408, "ce_loss_7": 3.336813968420029, "epoch": 0.681, "grad_norm": 19.805927066338636, "kl_loss_13": 3238.0, "kl_loss_26": 2300.0, "kl_loss_39": 1278.2, "kl_loss_7": 3790.0, "learning_rate": 0.0002350403678833976, "loss": 5234.9, "step": 6810 }, { "ce_loss_13": 2.957927519083023, "ce_loss_26": 2.5146523237228395, "ce_loss_39": 2.0373351722955704, "ce_loss_52": 1.42108353972435, "ce_loss_7": 3.223799991607666, "epoch": 0.682, "grad_norm": 20.479900710283268, "kl_loss_13": 3202.4, "kl_loss_26": 2281.6, "kl_loss_39": 1265.3, "kl_loss_7": 3751.2, "learning_rate": 0.00023369613771260007, "loss": 5258.8, "step": 6820 }, { "ce_loss_13": 2.9837976515293123, "ce_loss_26": 2.5472346246242523, "ce_loss_39": 2.0789157301187515, "ce_loss_52": 1.4713156789541244, "ce_loss_7": 3.2464165806770326, "epoch": 0.683, "grad_norm": 19.479700971519588, "kl_loss_13": 3160.0, "kl_loss_26": 2251.4, "kl_loss_39": 1251.2, "kl_loss_7": 3708.8, "learning_rate": 0.00023235458921873925, "loss": 5205.3, "step": 6830 }, { "ce_loss_13": 2.9887463808059693, "ce_loss_26": 2.54727523624897, "ce_loss_39": 2.0671548724174498, "ce_loss_52": 1.429240283370018, "ce_loss_7": 3.2527148902416227, "epoch": 0.684, "grad_norm": 19.517242754730322, "kl_loss_13": 3196.4, "kl_loss_26": 2285.2, "kl_loss_39": 1272.4, "kl_loss_7": 3748.4, "learning_rate": 0.0002310157359111938, "loss": 5234.8, "step": 6840 }, { "ce_loss_13": 2.916054058074951, "ce_loss_26": 2.4669763922691343, "ce_loss_39": 1.992121958732605, "ce_loss_52": 1.390305233001709, "ce_loss_7": 3.179300290346146, "epoch": 0.685, "grad_norm": 20.2160173629293, "kl_loss_13": 3168.4, "kl_loss_26": 2234.6, "kl_loss_39": 1230.6, "kl_loss_7": 3719.2, "learning_rate": 0.0002296795912722014, "loss": 5214.55, "step": 6850 }, { "ce_loss_13": 2.9123338878154756, "ce_loss_26": 2.4650843650102616, "ce_loss_39": 1.9903331339359283, "ce_loss_52": 1.3830609425902367, "ce_loss_7": 3.174854850769043, "epoch": 0.686, "grad_norm": 19.3573154940235, "kl_loss_13": 3154.0, "kl_loss_26": 2228.4, "kl_loss_39": 1226.6, "kl_loss_7": 3712.0, "learning_rate": 0.0002283461687567236, "loss": 5186.2, "step": 6860 }, { "ce_loss_13": 2.9503078758716583, "ce_loss_26": 2.5050206154584886, "ce_loss_39": 2.0353992134332657, "ce_loss_52": 1.4260726869106293, "ce_loss_7": 3.2181301593780516, "epoch": 0.687, "grad_norm": 19.436405773017547, "kl_loss_13": 3173.6, "kl_loss_26": 2249.4, "kl_loss_39": 1239.4, "kl_loss_7": 3725.2, "learning_rate": 0.00022701548179231045, "loss": 5180.9, "step": 6870 }, { "ce_loss_13": 2.989210718870163, "ce_loss_26": 2.5456956744194033, "ce_loss_39": 2.07299542427063, "ce_loss_52": 1.45269995033741, "ce_loss_7": 3.25973704457283, "epoch": 0.688, "grad_norm": 19.21415326438658, "kl_loss_13": 3172.0, "kl_loss_26": 2245.6, "kl_loss_39": 1252.7, "kl_loss_7": 3735.6, "learning_rate": 0.00022568754377896516, "loss": 5258.6, "step": 6880 }, { "ce_loss_13": 2.9914295256137846, "ce_loss_26": 2.5424347430467606, "ce_loss_39": 2.0552540928125382, "ce_loss_52": 1.4221117675304413, "ce_loss_7": 3.2662317156791687, "epoch": 0.689, "grad_norm": 19.29554445155232, "kl_loss_13": 3243.2, "kl_loss_26": 2310.8, "kl_loss_39": 1282.1, "kl_loss_7": 3819.2, "learning_rate": 0.00022436236808900844, "loss": 5241.3, "step": 6890 }, { "ce_loss_13": 2.9910283386707306, "ce_loss_26": 2.550189185142517, "ce_loss_39": 2.07351476252079, "ce_loss_52": 1.4624590903520585, "ce_loss_7": 3.2594147861003875, "epoch": 0.69, "grad_norm": 19.896412241424787, "kl_loss_13": 3197.2, "kl_loss_26": 2269.0, "kl_loss_39": 1261.5, "kl_loss_7": 3754.8, "learning_rate": 0.00022303996806694487, "loss": 5245.0, "step": 6900 }, { "ce_loss_13": 2.9984730899333956, "ce_loss_26": 2.563878893852234, "ce_loss_39": 2.073988217115402, "ce_loss_52": 1.4545040100812912, "ce_loss_7": 3.266710376739502, "epoch": 0.691, "grad_norm": 18.309749884904267, "kl_loss_13": 3220.8, "kl_loss_26": 2301.4, "kl_loss_39": 1269.6, "kl_loss_7": 3776.4, "learning_rate": 0.00022172035702932823, "loss": 5221.5, "step": 6910 }, { "ce_loss_13": 2.9589293122291567, "ce_loss_26": 2.5164781630039217, "ce_loss_39": 2.045464962720871, "ce_loss_52": 1.4442868947982788, "ce_loss_7": 3.2192385673522947, "epoch": 0.692, "grad_norm": 19.310059013922874, "kl_loss_13": 3138.4, "kl_loss_26": 2227.4, "kl_loss_39": 1231.9, "kl_loss_7": 3680.8, "learning_rate": 0.00022040354826462666, "loss": 5190.7, "step": 6920 }, { "ce_loss_13": 2.947145390510559, "ce_loss_26": 2.5085155785083773, "ce_loss_39": 2.0424805164337156, "ce_loss_52": 1.443164300918579, "ce_loss_7": 3.206177592277527, "epoch": 0.693, "grad_norm": 20.439949582745886, "kl_loss_13": 3135.6, "kl_loss_26": 2219.4, "kl_loss_39": 1222.2, "kl_loss_7": 3681.2, "learning_rate": 0.0002190895550330899, "loss": 5252.8, "step": 6930 }, { "ce_loss_13": 2.951523560285568, "ce_loss_26": 2.4968682497739794, "ce_loss_39": 2.021780180931091, "ce_loss_52": 1.4140155717730523, "ce_loss_7": 3.2223109781742094, "epoch": 0.694, "grad_norm": 19.632700567273133, "kl_loss_13": 3171.6, "kl_loss_26": 2224.6, "kl_loss_39": 1221.6, "kl_loss_7": 3730.8, "learning_rate": 0.00021777839056661552, "loss": 5204.95, "step": 6940 }, { "ce_loss_13": 2.9996495246887207, "ce_loss_26": 2.545914036035538, "ce_loss_39": 2.0629312634468078, "ce_loss_52": 1.4569539099931716, "ce_loss_7": 3.260616344213486, "epoch": 0.695, "grad_norm": 19.802798519542876, "kl_loss_13": 3188.0, "kl_loss_26": 2252.8, "kl_loss_39": 1238.7, "kl_loss_7": 3738.8, "learning_rate": 0.0002164700680686147, "loss": 5219.0, "step": 6950 }, { "ce_loss_13": 2.965209072828293, "ce_loss_26": 2.523294594883919, "ce_loss_39": 2.0499251425266265, "ce_loss_52": 1.450638398528099, "ce_loss_7": 3.224820476770401, "epoch": 0.696, "grad_norm": 19.91434345309615, "kl_loss_13": 3134.0, "kl_loss_26": 2218.0, "kl_loss_39": 1215.6, "kl_loss_7": 3669.6, "learning_rate": 0.0002151646007138806, "loss": 5247.2, "step": 6960 }, { "ce_loss_13": 2.989179176092148, "ce_loss_26": 2.5318516552448274, "ce_loss_39": 2.049258217215538, "ce_loss_52": 1.4338771492242812, "ce_loss_7": 3.2527658343315125, "epoch": 0.697, "grad_norm": 19.22493467335688, "kl_loss_13": 3224.0, "kl_loss_26": 2281.6, "kl_loss_39": 1260.3, "kl_loss_7": 3768.8, "learning_rate": 0.00021386200164845526, "loss": 5208.2, "step": 6970 }, { "ce_loss_13": 2.9675691723823547, "ce_loss_26": 2.524250292778015, "ce_loss_39": 2.0449122846126557, "ce_loss_52": 1.4266346216201782, "ce_loss_7": 3.238176566362381, "epoch": 0.698, "grad_norm": 19.28582875590594, "kl_loss_13": 3208.4, "kl_loss_26": 2279.6, "kl_loss_39": 1258.2, "kl_loss_7": 3762.8, "learning_rate": 0.0002125622839894964, "loss": 5196.95, "step": 6980 }, { "ce_loss_13": 3.08748916387558, "ce_loss_26": 2.6341689109802244, "ce_loss_39": 2.131711891293526, "ce_loss_52": 1.4756682693958283, "ce_loss_7": 3.3573212742805483, "epoch": 0.699, "grad_norm": 19.570816095455605, "kl_loss_13": 3319.2, "kl_loss_26": 2375.0, "kl_loss_39": 1332.9, "kl_loss_7": 3878.0, "learning_rate": 0.00021126546082514663, "loss": 5264.6, "step": 6990 }, { "ce_loss_13": 2.9623800575733186, "ce_loss_26": 2.528759664297104, "ce_loss_39": 2.0456511676311493, "ce_loss_52": 1.4381566911935806, "ce_loss_7": 3.2283441185951234, "epoch": 0.7, "grad_norm": 20.0544191043279, "kl_loss_13": 3144.4, "kl_loss_26": 2245.4, "kl_loss_39": 1237.7, "kl_loss_7": 3701.6, "learning_rate": 0.00020997154521440098, "loss": 5184.75, "step": 7000 }, { "ce_loss_13": 2.9258078813552855, "ce_loss_26": 2.5000339925289152, "ce_loss_39": 2.030132883787155, "ce_loss_52": 1.432218487560749, "ce_loss_7": 3.1859234631061555, "epoch": 0.701, "grad_norm": 20.032367176949514, "kl_loss_13": 3112.4, "kl_loss_26": 2211.2, "kl_loss_39": 1212.0, "kl_loss_7": 3653.6, "learning_rate": 0.0002086805501869749, "loss": 5163.7, "step": 7010 }, { "ce_loss_13": 2.9877611219882967, "ce_loss_26": 2.5467711210250856, "ce_loss_39": 2.0729553580284117, "ce_loss_52": 1.4694935828447342, "ce_loss_7": 3.2482242822647094, "epoch": 0.702, "grad_norm": 19.48021495423088, "kl_loss_13": 3139.2, "kl_loss_26": 2221.2, "kl_loss_39": 1226.6, "kl_loss_7": 3688.4, "learning_rate": 0.0002073924887431744, "loss": 5172.1, "step": 7020 }, { "ce_loss_13": 2.908235615491867, "ce_loss_26": 2.477229207754135, "ce_loss_39": 2.0130053520202638, "ce_loss_52": 1.4191944628953934, "ce_loss_7": 3.1711674451828005, "epoch": 0.703, "grad_norm": 19.67889818109448, "kl_loss_13": 3069.6, "kl_loss_26": 2175.0, "kl_loss_39": 1200.3, "kl_loss_7": 3607.2, "learning_rate": 0.00020610737385376348, "loss": 5178.0, "step": 7030 }, { "ce_loss_13": 2.925868648290634, "ce_loss_26": 2.4821185052394865, "ce_loss_39": 2.013263535499573, "ce_loss_52": 1.407904815673828, "ce_loss_7": 3.184937173128128, "epoch": 0.704, "grad_norm": 19.315710978547724, "kl_loss_13": 3152.4, "kl_loss_26": 2238.2, "kl_loss_39": 1233.7, "kl_loss_7": 3694.8, "learning_rate": 0.00020482521845983521, "loss": 5182.5, "step": 7040 }, { "ce_loss_13": 2.978561645746231, "ce_loss_26": 2.537975686788559, "ce_loss_39": 2.068689134716988, "ce_loss_52": 1.449235063791275, "ce_loss_7": 3.239302319288254, "epoch": 0.705, "grad_norm": 20.022921411442997, "kl_loss_13": 3163.2, "kl_loss_26": 2247.8, "kl_loss_39": 1249.9, "kl_loss_7": 3709.6, "learning_rate": 0.00020354603547267987, "loss": 5191.65, "step": 7050 }, { "ce_loss_13": 2.926995551586151, "ce_loss_26": 2.4728329688310624, "ce_loss_39": 2.000428321957588, "ce_loss_52": 1.4053510591387748, "ce_loss_7": 3.1873682618141173, "epoch": 0.706, "grad_norm": 20.203929178538456, "kl_loss_13": 3170.4, "kl_loss_26": 2222.2, "kl_loss_39": 1210.8, "kl_loss_7": 3714.0, "learning_rate": 0.00020226983777365604, "loss": 5154.3, "step": 7060 }, { "ce_loss_13": 2.9693270325660706, "ce_loss_26": 2.520361030101776, "ce_loss_39": 2.040296342968941, "ce_loss_52": 1.4353806316852569, "ce_loss_7": 3.2412941575050356, "epoch": 0.707, "grad_norm": 19.721352799273095, "kl_loss_13": 3196.4, "kl_loss_26": 2259.8, "kl_loss_39": 1250.8, "kl_loss_7": 3758.8, "learning_rate": 0.00020099663821406056, "loss": 5217.7, "step": 7070 }, { "ce_loss_13": 2.9820376515388487, "ce_loss_26": 2.5476091861724854, "ce_loss_39": 2.072761395573616, "ce_loss_52": 1.4543047964572906, "ce_loss_7": 3.2506080687046053, "epoch": 0.708, "grad_norm": 20.324232804485565, "kl_loss_13": 3159.6, "kl_loss_26": 2249.8, "kl_loss_39": 1247.5, "kl_loss_7": 3714.4, "learning_rate": 0.00019972644961499853, "loss": 5197.1, "step": 7080 }, { "ce_loss_13": 2.9332118809223173, "ce_loss_26": 2.488256406784058, "ce_loss_39": 2.0110603511333465, "ce_loss_52": 1.4073437690734862, "ce_loss_7": 3.202910542488098, "epoch": 0.709, "grad_norm": 19.810367107777207, "kl_loss_13": 3178.8, "kl_loss_26": 2255.4, "kl_loss_39": 1244.9, "kl_loss_7": 3745.2, "learning_rate": 0.00019845928476725522, "loss": 5159.15, "step": 7090 }, { "ce_loss_13": 2.963654935359955, "ce_loss_26": 2.530976951122284, "ce_loss_39": 2.0635117918252943, "ce_loss_52": 1.466596108675003, "ce_loss_7": 3.219101697206497, "epoch": 0.71, "grad_norm": 20.1039471333501, "kl_loss_13": 3130.4, "kl_loss_26": 2221.8, "kl_loss_39": 1222.3, "kl_loss_7": 3668.4, "learning_rate": 0.00019719515643116677, "loss": 5138.7, "step": 7100 }, { "ce_loss_13": 2.9432359755039217, "ce_loss_26": 2.497118225693703, "ce_loss_39": 2.0123680919408797, "ce_loss_52": 1.3938605546951295, "ce_loss_7": 3.214134621620178, "epoch": 0.711, "grad_norm": 20.875475364068677, "kl_loss_13": 3169.6, "kl_loss_26": 2236.6, "kl_loss_39": 1232.8, "kl_loss_7": 3728.8, "learning_rate": 0.0001959340773364911, "loss": 5177.25, "step": 7110 }, { "ce_loss_13": 2.937902510166168, "ce_loss_26": 2.5005611896514894, "ce_loss_39": 2.025138959288597, "ce_loss_52": 1.4121045261621474, "ce_loss_7": 3.21354022026062, "epoch": 0.712, "grad_norm": 19.300987998871754, "kl_loss_13": 3168.4, "kl_loss_26": 2254.8, "kl_loss_39": 1247.8, "kl_loss_7": 3737.2, "learning_rate": 0.0001946760601822809, "loss": 5183.35, "step": 7120 }, { "ce_loss_13": 2.9532769322395325, "ce_loss_26": 2.5094838380813598, "ce_loss_39": 2.042011481523514, "ce_loss_52": 1.4359831362962723, "ce_loss_7": 3.2172477781772613, "epoch": 0.713, "grad_norm": 19.806306622567096, "kl_loss_13": 3130.8, "kl_loss_26": 2213.4, "kl_loss_39": 1210.7, "kl_loss_7": 3690.4, "learning_rate": 0.00019342111763675512, "loss": 5121.55, "step": 7130 }, { "ce_loss_13": 2.9316843450069427, "ce_loss_26": 2.4771865159273148, "ce_loss_39": 1.9945536375045776, "ce_loss_52": 1.3953458324074746, "ce_loss_7": 3.1933025121688843, "epoch": 0.714, "grad_norm": 19.89107625803987, "kl_loss_13": 3143.2, "kl_loss_26": 2214.4, "kl_loss_39": 1207.5, "kl_loss_7": 3687.6, "learning_rate": 0.00019216926233717085, "loss": 5175.1, "step": 7140 }, { "ce_loss_13": 2.9416719019412993, "ce_loss_26": 2.4982976377010346, "ce_loss_39": 2.018396332859993, "ce_loss_52": 1.4147447228431702, "ce_loss_7": 3.206580412387848, "epoch": 0.715, "grad_norm": 19.473553749134638, "kl_loss_13": 3170.0, "kl_loss_26": 2250.0, "kl_loss_39": 1236.5, "kl_loss_7": 3716.0, "learning_rate": 0.00019092050688969737, "loss": 5168.85, "step": 7150 }, { "ce_loss_13": 2.93907487988472, "ce_loss_26": 2.5087509632110594, "ce_loss_39": 2.036549669504166, "ce_loss_52": 1.4339062184095384, "ce_loss_7": 3.2007872402668, "epoch": 0.716, "grad_norm": 18.582597661219776, "kl_loss_13": 3097.2, "kl_loss_26": 2211.2, "kl_loss_39": 1215.8, "kl_loss_7": 3638.4, "learning_rate": 0.00018967486386928817, "loss": 5158.35, "step": 7160 }, { "ce_loss_13": 2.945317584276199, "ce_loss_26": 2.4993871986865996, "ce_loss_39": 2.0176479905843734, "ce_loss_52": 1.4336241394281388, "ce_loss_7": 3.20940922498703, "epoch": 0.717, "grad_norm": 20.943645323187056, "kl_loss_13": 3133.6, "kl_loss_26": 2211.4, "kl_loss_39": 1198.8, "kl_loss_7": 3684.0, "learning_rate": 0.00018843234581955443, "loss": 5165.1, "step": 7170 }, { "ce_loss_13": 2.9533539593219755, "ce_loss_26": 2.5154259234666823, "ce_loss_39": 2.0371447414159776, "ce_loss_52": 1.4300806164741515, "ce_loss_7": 3.2220456659793855, "epoch": 0.718, "grad_norm": 20.352991453837664, "kl_loss_13": 3138.0, "kl_loss_26": 2233.8, "kl_loss_39": 1229.9, "kl_loss_7": 3696.4, "learning_rate": 0.00018719296525263924, "loss": 5165.1, "step": 7180 }, { "ce_loss_13": 2.8928813517093657, "ce_loss_26": 2.4584825813770292, "ce_loss_39": 1.9816097348928452, "ce_loss_52": 1.4100207000970841, "ce_loss_7": 3.1512379109859467, "epoch": 0.719, "grad_norm": 19.73288927838942, "kl_loss_13": 3091.2, "kl_loss_26": 2168.2, "kl_loss_39": 1162.9, "kl_loss_7": 3634.8, "learning_rate": 0.0001859567346490913, "loss": 5125.45, "step": 7190 }, { "ce_loss_13": 3.013565558195114, "ce_loss_26": 2.5717740774154665, "ce_loss_39": 2.0986361503601074, "ce_loss_52": 1.4729616045951843, "ce_loss_7": 3.270446163415909, "epoch": 0.72, "grad_norm": 19.301343533343292, "kl_loss_13": 3201.2, "kl_loss_26": 2278.0, "kl_loss_39": 1269.9, "kl_loss_7": 3744.0, "learning_rate": 0.0001847236664577389, "loss": 5151.05, "step": 7200 }, { "ce_loss_13": 2.8901243984699247, "ce_loss_26": 2.447618916630745, "ce_loss_39": 1.9805465787649155, "ce_loss_52": 1.393562839925289, "ce_loss_7": 3.152049034833908, "epoch": 0.721, "grad_norm": 19.80481816325832, "kl_loss_13": 3108.8, "kl_loss_26": 2184.6, "kl_loss_39": 1197.1, "kl_loss_7": 3651.2, "learning_rate": 0.00018349377309556487, "loss": 5147.6, "step": 7210 }, { "ce_loss_13": 2.935315173864365, "ce_loss_26": 2.4844827204942703, "ce_loss_39": 2.014389392733574, "ce_loss_52": 1.4218608409166336, "ce_loss_7": 3.202117031812668, "epoch": 0.722, "grad_norm": 21.29811195159757, "kl_loss_13": 3145.6, "kl_loss_26": 2210.6, "kl_loss_39": 1204.5, "kl_loss_7": 3703.6, "learning_rate": 0.00018226706694758193, "loss": 5128.1, "step": 7220 }, { "ce_loss_13": 2.9786236941814423, "ce_loss_26": 2.536505568027496, "ce_loss_39": 2.0584143906831742, "ce_loss_52": 1.4622384160757065, "ce_loss_7": 3.243917632102966, "epoch": 0.723, "grad_norm": 19.6163049246679, "kl_loss_13": 3157.6, "kl_loss_26": 2230.8, "kl_loss_39": 1218.2, "kl_loss_7": 3710.4, "learning_rate": 0.0001810435603667075, "loss": 5135.45, "step": 7230 }, { "ce_loss_13": 2.9429832458496095, "ce_loss_26": 2.4910512387752535, "ce_loss_39": 2.0214726239442826, "ce_loss_52": 1.4324376732110977, "ce_loss_7": 3.201977092027664, "epoch": 0.724, "grad_norm": 19.933959617154258, "kl_loss_13": 3130.8, "kl_loss_26": 2197.0, "kl_loss_39": 1197.6, "kl_loss_7": 3676.0, "learning_rate": 0.0001798232656736389, "loss": 5101.6, "step": 7240 }, { "ce_loss_13": 3.017507255077362, "ce_loss_26": 2.5613476634025574, "ce_loss_39": 2.065951904654503, "ce_loss_52": 1.4529381558299064, "ce_loss_7": 3.284585565328598, "epoch": 0.725, "grad_norm": 19.69163026795737, "kl_loss_13": 3244.0, "kl_loss_26": 2294.8, "kl_loss_39": 1252.0, "kl_loss_7": 3804.4, "learning_rate": 0.0001786061951567303, "loss": 5145.8, "step": 7250 }, { "ce_loss_13": 2.8927790343761446, "ce_loss_26": 2.4552118331193924, "ce_loss_39": 1.9874976933002473, "ce_loss_52": 1.3983306601643561, "ce_loss_7": 3.160378706455231, "epoch": 0.726, "grad_norm": 19.959972483591027, "kl_loss_13": 3097.2, "kl_loss_26": 2189.6, "kl_loss_39": 1199.8, "kl_loss_7": 3653.2, "learning_rate": 0.00017739236107186857, "loss": 5152.65, "step": 7260 }, { "ce_loss_13": 2.938109403848648, "ce_loss_26": 2.4923312455415725, "ce_loss_39": 2.0182687640190125, "ce_loss_52": 1.4262833833694457, "ce_loss_7": 3.2080613017082213, "epoch": 0.727, "grad_norm": 19.42416009314478, "kl_loss_13": 3165.6, "kl_loss_26": 2229.8, "kl_loss_39": 1226.4, "kl_loss_7": 3724.0, "learning_rate": 0.00017618177564234904, "loss": 5132.0, "step": 7270 }, { "ce_loss_13": 2.932406869530678, "ce_loss_26": 2.481098806858063, "ce_loss_39": 1.9998224407434464, "ce_loss_52": 1.4021466106176377, "ce_loss_7": 3.192963147163391, "epoch": 0.728, "grad_norm": 19.542374311814292, "kl_loss_13": 3155.2, "kl_loss_26": 2223.4, "kl_loss_39": 1216.9, "kl_loss_7": 3704.0, "learning_rate": 0.00017497445105875377, "loss": 5186.8, "step": 7280 }, { "ce_loss_13": 2.9192141771316527, "ce_loss_26": 2.490318274497986, "ce_loss_39": 2.027893853187561, "ce_loss_52": 1.43733262270689, "ce_loss_7": 3.177316850423813, "epoch": 0.729, "grad_norm": 20.20715160517786, "kl_loss_13": 3084.0, "kl_loss_26": 2177.6, "kl_loss_39": 1196.3, "kl_loss_7": 3620.0, "learning_rate": 0.000173770399478828, "loss": 5076.85, "step": 7290 }, { "ce_loss_13": 2.9109710931777952, "ce_loss_26": 2.4745964229106905, "ce_loss_39": 2.007509797811508, "ce_loss_52": 1.4228856399655343, "ce_loss_7": 3.179339534044266, "epoch": 0.73, "grad_norm": 19.31391716230683, "kl_loss_13": 3123.2, "kl_loss_26": 2212.0, "kl_loss_39": 1198.3, "kl_loss_7": 3681.6, "learning_rate": 0.0001725696330273575, "loss": 5123.9, "step": 7300 }, { "ce_loss_13": 2.9543818056583406, "ce_loss_26": 2.5187111288309096, "ce_loss_39": 2.046798062324524, "ce_loss_52": 1.4316737815737723, "ce_loss_7": 3.2196085810661317, "epoch": 0.731, "grad_norm": 19.333490210710867, "kl_loss_13": 3136.4, "kl_loss_26": 2223.6, "kl_loss_39": 1228.2, "kl_loss_7": 3683.6, "learning_rate": 0.00017137216379604724, "loss": 5093.05, "step": 7310 }, { "ce_loss_13": 2.991909348964691, "ce_loss_26": 2.533888804912567, "ce_loss_39": 2.048043805360794, "ce_loss_52": 1.426993179321289, "ce_loss_7": 3.2596666753292083, "epoch": 0.732, "grad_norm": 18.669638559883268, "kl_loss_13": 3225.6, "kl_loss_26": 2287.6, "kl_loss_39": 1264.8, "kl_loss_7": 3784.0, "learning_rate": 0.00017017800384339925, "loss": 5127.2, "step": 7320 }, { "ce_loss_13": 2.919608438014984, "ce_loss_26": 2.475746387243271, "ce_loss_39": 2.0119441866874697, "ce_loss_52": 1.416624790430069, "ce_loss_7": 3.1857059836387633, "epoch": 0.733, "grad_norm": 19.375624695020313, "kl_loss_13": 3109.2, "kl_loss_26": 2190.6, "kl_loss_39": 1207.2, "kl_loss_7": 3667.2, "learning_rate": 0.00016898716519459073, "loss": 5204.9, "step": 7330 }, { "ce_loss_13": 2.978672456741333, "ce_loss_26": 2.5233275532722472, "ce_loss_39": 2.0352649986743927, "ce_loss_52": 1.396337878704071, "ce_loss_7": 3.2537453293800356, "epoch": 0.734, "grad_norm": 19.487833479563225, "kl_loss_13": 3269.6, "kl_loss_26": 2319.4, "kl_loss_39": 1294.6, "kl_loss_7": 3847.2, "learning_rate": 0.00016779965984135375, "loss": 5141.65, "step": 7340 }, { "ce_loss_13": 2.946110498905182, "ce_loss_26": 2.507070618867874, "ce_loss_39": 2.0425552487373353, "ce_loss_52": 1.4492309480905532, "ce_loss_7": 3.2022292137146, "epoch": 0.735, "grad_norm": 19.428920750438415, "kl_loss_13": 3113.2, "kl_loss_26": 2205.0, "kl_loss_39": 1210.6, "kl_loss_7": 3652.4, "learning_rate": 0.00016661549974185424, "loss": 5094.6, "step": 7350 }, { "ce_loss_13": 2.9823583602905273, "ce_loss_26": 2.539780905842781, "ce_loss_39": 2.0619786471128463, "ce_loss_52": 1.45176909416914, "ce_loss_7": 3.249054718017578, "epoch": 0.736, "grad_norm": 19.839368194621894, "kl_loss_13": 3204.0, "kl_loss_26": 2280.6, "kl_loss_39": 1256.4, "kl_loss_7": 3753.6, "learning_rate": 0.00016543469682057105, "loss": 5196.95, "step": 7360 }, { "ce_loss_13": 2.958816784620285, "ce_loss_26": 2.526939642429352, "ce_loss_39": 2.0634298622608185, "ce_loss_52": 1.4754424065351486, "ce_loss_7": 3.223200261592865, "epoch": 0.737, "grad_norm": 19.922895259671222, "kl_loss_13": 3096.0, "kl_loss_26": 2185.8, "kl_loss_39": 1192.1, "kl_loss_7": 3641.6, "learning_rate": 0.00016425726296817632, "loss": 5155.3, "step": 7370 }, { "ce_loss_13": 2.9623453855514525, "ce_loss_26": 2.5124567419290544, "ce_loss_39": 2.032202622294426, "ce_loss_52": 1.4377738699316978, "ce_loss_7": 3.22493896484375, "epoch": 0.738, "grad_norm": 19.913040297451975, "kl_loss_13": 3150.8, "kl_loss_26": 2213.6, "kl_loss_39": 1202.5, "kl_loss_7": 3703.2, "learning_rate": 0.00016308321004141607, "loss": 5152.6, "step": 7380 }, { "ce_loss_13": 2.9184991478919984, "ce_loss_26": 2.4769508123397825, "ce_loss_39": 2.0019295692443846, "ce_loss_52": 1.423092892765999, "ce_loss_7": 3.185350716114044, "epoch": 0.739, "grad_norm": 19.322834578437774, "kl_loss_13": 3095.6, "kl_loss_26": 2174.4, "kl_loss_39": 1176.6, "kl_loss_7": 3652.0, "learning_rate": 0.00016191254986299043, "loss": 5134.25, "step": 7390 }, { "ce_loss_13": 2.8498477935791016, "ce_loss_26": 2.4134464621543885, "ce_loss_39": 1.952101919054985, "ce_loss_52": 1.3852853626012802, "ce_loss_7": 3.117431342601776, "epoch": 0.74, "grad_norm": 20.243774871674358, "kl_loss_13": 3055.2, "kl_loss_26": 2147.0, "kl_loss_39": 1163.3, "kl_loss_7": 3606.8, "learning_rate": 0.00016074529422143398, "loss": 5086.95, "step": 7400 }, { "ce_loss_13": 2.999220699071884, "ce_loss_26": 2.550427186489105, "ce_loss_39": 2.0656515032052996, "ce_loss_52": 1.4574634283781052, "ce_loss_7": 3.2632993936538695, "epoch": 0.741, "grad_norm": 20.693157593916027, "kl_loss_13": 3181.6, "kl_loss_26": 2251.2, "kl_loss_39": 1241.3, "kl_loss_7": 3731.6, "learning_rate": 0.0001595814548709983, "loss": 5127.4, "step": 7410 }, { "ce_loss_13": 2.9262797057628633, "ce_loss_26": 2.4955521285533906, "ce_loss_39": 2.043664366006851, "ce_loss_52": 1.4607123613357544, "ce_loss_7": 3.178546887636185, "epoch": 0.742, "grad_norm": 19.251560643481486, "kl_loss_13": 3060.8, "kl_loss_26": 2159.0, "kl_loss_39": 1190.2, "kl_loss_7": 3588.8, "learning_rate": 0.00015842104353153285, "loss": 5092.2, "step": 7420 }, { "ce_loss_13": 3.0160707533359528, "ce_loss_26": 2.5663387060165403, "ce_loss_39": 2.0802172899246214, "ce_loss_52": 1.4715656280517577, "ce_loss_7": 3.2800197422504427, "epoch": 0.743, "grad_norm": 19.587217025482605, "kl_loss_13": 3190.4, "kl_loss_26": 2255.8, "kl_loss_39": 1233.7, "kl_loss_7": 3735.6, "learning_rate": 0.0001572640718883667, "loss": 5115.1, "step": 7430 }, { "ce_loss_13": 2.9469042241573336, "ce_loss_26": 2.504935991764069, "ce_loss_39": 2.0331138372421265, "ce_loss_52": 1.4291576787829399, "ce_loss_7": 3.2098668992519377, "epoch": 0.744, "grad_norm": 18.92686745034597, "kl_loss_13": 3124.8, "kl_loss_26": 2209.8, "kl_loss_39": 1212.4, "kl_loss_7": 3675.2, "learning_rate": 0.0001561105515921915, "loss": 5076.55, "step": 7440 }, { "ce_loss_13": 2.924536573886871, "ce_loss_26": 2.487199380993843, "ce_loss_39": 2.0280190229415895, "ce_loss_52": 1.4308366000652313, "ce_loss_7": 3.184017467498779, "epoch": 0.745, "grad_norm": 20.464433868383605, "kl_loss_13": 3079.6, "kl_loss_26": 2173.0, "kl_loss_39": 1203.3, "kl_loss_7": 3634.0, "learning_rate": 0.0001549604942589441, "loss": 5072.9, "step": 7450 }, { "ce_loss_13": 2.929040068387985, "ce_loss_26": 2.4782379269599915, "ce_loss_39": 1.997541171312332, "ce_loss_52": 1.4025927037000656, "ce_loss_7": 3.1986856281757357, "epoch": 0.746, "grad_norm": 19.749937853484084, "kl_loss_13": 3136.4, "kl_loss_26": 2195.8, "kl_loss_39": 1197.5, "kl_loss_7": 3701.6, "learning_rate": 0.00015381391146968864, "loss": 5119.05, "step": 7460 }, { "ce_loss_13": 2.9494260370731356, "ce_loss_26": 2.5060392141342165, "ce_loss_39": 2.0337665289640428, "ce_loss_52": 1.437817743420601, "ce_loss_7": 3.2058426082134246, "epoch": 0.747, "grad_norm": 20.32794285148468, "kl_loss_13": 3134.4, "kl_loss_26": 2216.0, "kl_loss_39": 1213.4, "kl_loss_7": 3678.0, "learning_rate": 0.00015267081477050133, "loss": 5102.65, "step": 7470 }, { "ce_loss_13": 2.921141803264618, "ce_loss_26": 2.4805154383182524, "ce_loss_39": 2.0182774633169176, "ce_loss_52": 1.4364572942256928, "ce_loss_7": 3.1770897448062896, "epoch": 0.748, "grad_norm": 19.06438842103908, "kl_loss_13": 3083.4, "kl_loss_26": 2171.6, "kl_loss_39": 1187.2, "kl_loss_7": 3614.4, "learning_rate": 0.00015153121567235335, "loss": 5127.55, "step": 7480 }, { "ce_loss_13": 2.913339024782181, "ce_loss_26": 2.4667980909347533, "ce_loss_39": 1.9971046984195708, "ce_loss_52": 1.4167528375983238, "ce_loss_7": 3.1835066616535186, "epoch": 0.749, "grad_norm": 19.868010576940023, "kl_loss_13": 3104.0, "kl_loss_26": 2178.6, "kl_loss_39": 1188.7, "kl_loss_7": 3662.4, "learning_rate": 0.00015039512565099468, "loss": 5094.65, "step": 7490 }, { "ce_loss_13": 2.914568355679512, "ce_loss_26": 2.469626322388649, "ce_loss_39": 2.0020422458648683, "ce_loss_52": 1.4171572998166084, "ce_loss_7": 3.1783276200294495, "epoch": 0.75, "grad_norm": 19.266335144116216, "kl_loss_13": 3099.6, "kl_loss_26": 2178.4, "kl_loss_39": 1189.1, "kl_loss_7": 3658.8, "learning_rate": 0.00014926255614683932, "loss": 5132.95, "step": 7500 }, { "ce_loss_13": 2.9255091905593873, "ce_loss_26": 2.4897490620613096, "ce_loss_39": 2.020438665151596, "ce_loss_52": 1.43346728682518, "ce_loss_7": 3.188784825801849, "epoch": 0.751, "grad_norm": 18.946748872518604, "kl_loss_13": 3108.8, "kl_loss_26": 2190.0, "kl_loss_39": 1197.3, "kl_loss_7": 3652.8, "learning_rate": 0.0001481335185648498, "loss": 5140.95, "step": 7510 }, { "ce_loss_13": 2.9898211777210237, "ce_loss_26": 2.541801372170448, "ce_loss_39": 2.068241673707962, "ce_loss_52": 1.474491646885872, "ce_loss_7": 3.2494523525238037, "epoch": 0.752, "grad_norm": 19.297834732191596, "kl_loss_13": 3108.0, "kl_loss_26": 2187.0, "kl_loss_39": 1193.8, "kl_loss_7": 3655.2, "learning_rate": 0.0001470080242744218, "loss": 5080.45, "step": 7520 }, { "ce_loss_13": 2.989736980199814, "ce_loss_26": 2.5422766327857973, "ce_loss_39": 2.0724924355745316, "ce_loss_52": 1.4764755725860597, "ce_loss_7": 3.2583333015441895, "epoch": 0.753, "grad_norm": 19.759302349149518, "kl_loss_13": 3127.2, "kl_loss_26": 2207.8, "kl_loss_39": 1205.1, "kl_loss_7": 3686.4, "learning_rate": 0.0001458860846092705, "loss": 5089.25, "step": 7530 }, { "ce_loss_13": 2.9518058955669404, "ce_loss_26": 2.5123462677001953, "ce_loss_39": 2.033254536986351, "ce_loss_52": 1.430069674551487, "ce_loss_7": 3.216610902547836, "epoch": 0.754, "grad_norm": 19.101743494796594, "kl_loss_13": 3130.4, "kl_loss_26": 2214.4, "kl_loss_39": 1206.1, "kl_loss_7": 3688.8, "learning_rate": 0.00014476771086731566, "loss": 5132.95, "step": 7540 }, { "ce_loss_13": 2.9499751746654512, "ce_loss_26": 2.5148087441921234, "ce_loss_39": 2.0469634413719175, "ce_loss_52": 1.4645337551832198, "ce_loss_7": 3.2114050924777984, "epoch": 0.755, "grad_norm": 18.95244057854832, "kl_loss_13": 3084.4, "kl_loss_26": 2166.2, "kl_loss_39": 1178.4, "kl_loss_7": 3625.2, "learning_rate": 0.00014365291431056872, "loss": 5111.9, "step": 7550 }, { "ce_loss_13": 2.915192812681198, "ce_loss_26": 2.470223453640938, "ce_loss_39": 1.9942311495542526, "ce_loss_52": 1.4130516573786736, "ce_loss_7": 3.186279386281967, "epoch": 0.756, "grad_norm": 19.87294750540145, "kl_loss_13": 3138.4, "kl_loss_26": 2214.2, "kl_loss_39": 1201.4, "kl_loss_7": 3700.0, "learning_rate": 0.00014254170616501827, "loss": 5096.4, "step": 7560 }, { "ce_loss_13": 2.954612511396408, "ce_loss_26": 2.5071854114532472, "ce_loss_39": 2.035946971178055, "ce_loss_52": 1.439925280213356, "ce_loss_7": 3.2222863495349885, "epoch": 0.757, "grad_norm": 20.90881340934633, "kl_loss_13": 3122.0, "kl_loss_26": 2194.2, "kl_loss_39": 1192.5, "kl_loss_7": 3677.6, "learning_rate": 0.0001414340976205183, "loss": 5060.45, "step": 7570 }, { "ce_loss_13": 2.9101392149925234, "ce_loss_26": 2.471102824807167, "ce_loss_39": 2.0064873933792113, "ce_loss_52": 1.4280226349830627, "ce_loss_7": 3.1714185059070585, "epoch": 0.758, "grad_norm": 19.49522818126467, "kl_loss_13": 3076.0, "kl_loss_26": 2159.8, "kl_loss_39": 1176.2, "kl_loss_7": 3620.4, "learning_rate": 0.00014033009983067452, "loss": 5108.35, "step": 7580 }, { "ce_loss_13": 2.966978985071182, "ce_loss_26": 2.5087331235408783, "ce_loss_39": 2.0336913764476776, "ce_loss_52": 1.4274784743785858, "ce_loss_7": 3.2379914104938505, "epoch": 0.759, "grad_norm": 18.693022063405696, "kl_loss_13": 3203.2, "kl_loss_26": 2255.0, "kl_loss_39": 1243.9, "kl_loss_7": 3762.0, "learning_rate": 0.00013922972391273224, "loss": 5094.65, "step": 7590 }, { "ce_loss_13": 2.9872674524784086, "ce_loss_26": 2.5407335460186005, "ce_loss_39": 2.061782196164131, "ce_loss_52": 1.4462621062994003, "ce_loss_7": 3.253601038455963, "epoch": 0.76, "grad_norm": 19.707863398571995, "kl_loss_13": 3172.0, "kl_loss_26": 2252.8, "kl_loss_39": 1251.4, "kl_loss_7": 3726.4, "learning_rate": 0.0001381329809474649, "loss": 5098.7, "step": 7600 }, { "ce_loss_13": 2.8771551668643953, "ce_loss_26": 2.4450180411338804, "ce_loss_39": 1.9818484753370285, "ce_loss_52": 1.4145073384046554, "ce_loss_7": 3.1438543021678926, "epoch": 0.761, "grad_norm": 18.46540807586579, "kl_loss_13": 3029.2, "kl_loss_26": 2126.8, "kl_loss_39": 1154.6, "kl_loss_7": 3583.2, "learning_rate": 0.0001370398819790621, "loss": 5084.25, "step": 7610 }, { "ce_loss_13": 2.960028713941574, "ce_loss_26": 2.525826930999756, "ce_loss_39": 2.0549875289201736, "ce_loss_52": 1.4615961879491806, "ce_loss_7": 3.2287797749042513, "epoch": 0.762, "grad_norm": 19.79273649753162, "kl_loss_13": 3108.0, "kl_loss_26": 2202.8, "kl_loss_39": 1209.5, "kl_loss_7": 3652.8, "learning_rate": 0.00013595043801501794, "loss": 5052.75, "step": 7620 }, { "ce_loss_13": 2.9331182718276976, "ce_loss_26": 2.4871296346187592, "ce_loss_39": 2.019395884871483, "ce_loss_52": 1.4329772531986236, "ce_loss_7": 3.19427090883255, "epoch": 0.763, "grad_norm": 20.265530214115834, "kl_loss_13": 3124.0, "kl_loss_26": 2195.4, "kl_loss_39": 1199.7, "kl_loss_7": 3669.6, "learning_rate": 0.00013486466002602133, "loss": 5092.15, "step": 7630 }, { "ce_loss_13": 2.86139075756073, "ce_loss_26": 2.4176136374473574, "ce_loss_39": 1.941940224170685, "ce_loss_52": 1.3821519583463668, "ce_loss_7": 3.1196699738502502, "epoch": 0.764, "grad_norm": 19.727213627271716, "kl_loss_13": 3072.8, "kl_loss_26": 2145.6, "kl_loss_39": 1144.7, "kl_loss_7": 3617.2, "learning_rate": 0.00013378255894584462, "loss": 5002.6, "step": 7640 }, { "ce_loss_13": 2.9353716015815734, "ce_loss_26": 2.490555015206337, "ce_loss_39": 2.019370597600937, "ce_loss_52": 1.4347517609596252, "ce_loss_7": 3.196541225910187, "epoch": 0.765, "grad_norm": 20.424012523901062, "kl_loss_13": 3127.6, "kl_loss_26": 2205.8, "kl_loss_39": 1202.7, "kl_loss_7": 3670.0, "learning_rate": 0.0001327041456712334, "loss": 5085.1, "step": 7650 }, { "ce_loss_13": 2.994156318902969, "ce_loss_26": 2.56048826277256, "ce_loss_39": 2.084024053812027, "ce_loss_52": 1.487925711274147, "ce_loss_7": 3.2512714982032778, "epoch": 0.766, "grad_norm": 19.650498118539073, "kl_loss_13": 3125.6, "kl_loss_26": 2221.6, "kl_loss_39": 1220.7, "kl_loss_7": 3665.2, "learning_rate": 0.00013162943106179747, "loss": 5105.9, "step": 7660 }, { "ce_loss_13": 2.9769886791706086, "ce_loss_26": 2.5290128916501997, "ce_loss_39": 2.0444375783205033, "ce_loss_52": 1.4427102521061896, "ce_loss_7": 3.2454589188098906, "epoch": 0.767, "grad_norm": 19.69057867231776, "kl_loss_13": 3203.2, "kl_loss_26": 2269.2, "kl_loss_39": 1243.2, "kl_loss_7": 3758.4, "learning_rate": 0.00013055842593990132, "loss": 5067.35, "step": 7670 }, { "ce_loss_13": 2.9738565921783446, "ce_loss_26": 2.5335902631282807, "ce_loss_39": 2.06500606238842, "ce_loss_52": 1.4531759321689606, "ce_loss_7": 3.2406944632530212, "epoch": 0.768, "grad_norm": 19.854778454902203, "kl_loss_13": 3164.0, "kl_loss_26": 2244.2, "kl_loss_39": 1253.0, "kl_loss_7": 3714.8, "learning_rate": 0.00012949114109055414, "loss": 5080.45, "step": 7680 }, { "ce_loss_13": 2.8718224823474885, "ce_loss_26": 2.434892734885216, "ce_loss_39": 1.9763070404529572, "ce_loss_52": 1.4119910702109337, "ce_loss_7": 3.1341715812683106, "epoch": 0.769, "grad_norm": 19.065166102671203, "kl_loss_13": 3037.6, "kl_loss_26": 2134.2, "kl_loss_39": 1156.1, "kl_loss_7": 3588.0, "learning_rate": 0.00012842758726130281, "loss": 5110.75, "step": 7690 }, { "ce_loss_13": 2.9209058582782745, "ce_loss_26": 2.4878934979438783, "ce_loss_39": 2.0205056190490724, "ce_loss_52": 1.4421792283654213, "ce_loss_7": 3.182013803720474, "epoch": 0.77, "grad_norm": 19.547331822021178, "kl_loss_13": 3063.6, "kl_loss_26": 2150.4, "kl_loss_39": 1168.8, "kl_loss_7": 3608.0, "learning_rate": 0.00012736777516212267, "loss": 5073.5, "step": 7700 }, { "ce_loss_13": 2.9334555983543398, "ce_loss_26": 2.493560019135475, "ce_loss_39": 2.0158845692873, "ce_loss_52": 1.4151214450597762, "ce_loss_7": 3.2045696437358857, "epoch": 0.771, "grad_norm": 18.662664559432073, "kl_loss_13": 3145.2, "kl_loss_26": 2230.4, "kl_loss_39": 1228.4, "kl_loss_7": 3706.0, "learning_rate": 0.00012631171546530968, "loss": 5058.75, "step": 7710 }, { "ce_loss_13": 2.944778233766556, "ce_loss_26": 2.5031532883644103, "ce_loss_39": 2.0205067574977873, "ce_loss_52": 1.414848119020462, "ce_loss_7": 3.2128884732723235, "epoch": 0.772, "grad_norm": 19.409023945233233, "kl_loss_13": 3144.0, "kl_loss_26": 2229.4, "kl_loss_39": 1221.0, "kl_loss_7": 3700.4, "learning_rate": 0.00012525941880537307, "loss": 5071.55, "step": 7720 }, { "ce_loss_13": 2.937892961502075, "ce_loss_26": 2.492938667535782, "ce_loss_39": 2.020972582697868, "ce_loss_52": 1.4354220196604728, "ce_loss_7": 3.2008834302425386, "epoch": 0.773, "grad_norm": 19.46905923212589, "kl_loss_13": 3121.2, "kl_loss_26": 2201.4, "kl_loss_39": 1204.2, "kl_loss_7": 3662.4, "learning_rate": 0.00012421089577892869, "loss": 5040.15, "step": 7730 }, { "ce_loss_13": 2.949704957008362, "ce_loss_26": 2.5045041382312774, "ce_loss_39": 2.016797697544098, "ce_loss_52": 1.4014781221747399, "ce_loss_7": 3.212642914056778, "epoch": 0.774, "grad_norm": 19.613908494270376, "kl_loss_13": 3187.2, "kl_loss_26": 2266.0, "kl_loss_39": 1252.6, "kl_loss_7": 3732.0, "learning_rate": 0.0001231661569445919, "loss": 5076.45, "step": 7740 }, { "ce_loss_13": 2.955250400304794, "ce_loss_26": 2.5274777173995973, "ce_loss_39": 2.067678835988045, "ce_loss_52": 1.4838283985853196, "ce_loss_7": 3.2114856481552123, "epoch": 0.775, "grad_norm": 19.844798674533834, "kl_loss_13": 3084.8, "kl_loss_26": 2190.6, "kl_loss_39": 1199.8, "kl_loss_7": 3616.4, "learning_rate": 0.00012212521282287093, "loss": 5060.4, "step": 7750 }, { "ce_loss_13": 2.9763452112674713, "ce_loss_26": 2.5314755111932756, "ce_loss_39": 2.0540303111076357, "ce_loss_52": 1.450203076004982, "ce_loss_7": 3.2422658264636994, "epoch": 0.776, "grad_norm": 20.432260851236787, "kl_loss_13": 3181.2, "kl_loss_26": 2258.4, "kl_loss_39": 1248.7, "kl_loss_7": 3726.0, "learning_rate": 0.00012108807389606158, "loss": 5084.95, "step": 7760 }, { "ce_loss_13": 2.920662760734558, "ce_loss_26": 2.475513318181038, "ce_loss_39": 2.0011812478303908, "ce_loss_52": 1.4144959792494773, "ce_loss_7": 3.1814299404621122, "epoch": 0.777, "grad_norm": 19.88207983374875, "kl_loss_13": 3106.4, "kl_loss_26": 2178.8, "kl_loss_39": 1189.6, "kl_loss_7": 3652.0, "learning_rate": 0.00012005475060814159, "loss": 5075.25, "step": 7770 }, { "ce_loss_13": 2.982200914621353, "ce_loss_26": 2.5349232286214827, "ce_loss_39": 2.069617584347725, "ce_loss_52": 1.4744601517915725, "ce_loss_7": 3.2433891892433167, "epoch": 0.778, "grad_norm": 19.290497638146196, "kl_loss_13": 3128.4, "kl_loss_26": 2198.0, "kl_loss_39": 1199.9, "kl_loss_7": 3677.2, "learning_rate": 0.00011902525336466464, "loss": 5053.05, "step": 7780 }, { "ce_loss_13": 2.9154593467712404, "ce_loss_26": 2.477250945568085, "ce_loss_39": 2.0098533272743224, "ce_loss_52": 1.4167337000370026, "ce_loss_7": 3.1773332476615908, "epoch": 0.779, "grad_norm": 19.252580297991088, "kl_loss_13": 3096.4, "kl_loss_26": 2190.8, "kl_loss_39": 1202.1, "kl_loss_7": 3643.6, "learning_rate": 0.00011799959253265668, "loss": 5067.65, "step": 7790 }, { "ce_loss_13": 2.9013588786125184, "ce_loss_26": 2.468735784292221, "ce_loss_39": 2.0137620836496355, "ce_loss_52": 1.4429293110966683, "ce_loss_7": 3.1603996396064757, "epoch": 0.78, "grad_norm": 18.773016641793355, "kl_loss_13": 3040.0, "kl_loss_26": 2138.8, "kl_loss_39": 1157.9, "kl_loss_7": 3579.6, "learning_rate": 0.00011697777844051105, "loss": 5056.85, "step": 7800 }, { "ce_loss_13": 2.99123472571373, "ce_loss_26": 2.5481519401073456, "ce_loss_39": 2.0644855082035063, "ce_loss_52": 1.4605911195278167, "ce_loss_7": 3.2550659775733948, "epoch": 0.781, "grad_norm": 19.108089078575876, "kl_loss_13": 3156.0, "kl_loss_26": 2232.8, "kl_loss_39": 1225.5, "kl_loss_7": 3705.6, "learning_rate": 0.00011595982137788402, "loss": 5045.05, "step": 7810 }, { "ce_loss_13": 2.979940289258957, "ce_loss_26": 2.5384896367788317, "ce_loss_39": 2.066192331910133, "ce_loss_52": 1.4709408730268478, "ce_loss_7": 3.2419922232627867, "epoch": 0.782, "grad_norm": 19.283597029818793, "kl_loss_13": 3151.6, "kl_loss_26": 2222.2, "kl_loss_39": 1217.6, "kl_loss_7": 3697.2, "learning_rate": 0.00011494573159559212, "loss": 5088.85, "step": 7820 }, { "ce_loss_13": 2.9142948031425475, "ce_loss_26": 2.4835946947336196, "ce_loss_39": 2.015666288137436, "ce_loss_52": 1.433475723862648, "ce_loss_7": 3.1793313324451447, "epoch": 0.783, "grad_norm": 18.805294673266925, "kl_loss_13": 3065.2, "kl_loss_26": 2169.8, "kl_loss_39": 1195.1, "kl_loss_7": 3620.0, "learning_rate": 0.00011393551930550828, "loss": 5021.8, "step": 7830 }, { "ce_loss_13": 2.942464643716812, "ce_loss_26": 2.4966187834739686, "ce_loss_39": 2.018853786587715, "ce_loss_52": 1.4241285115480422, "ce_loss_7": 3.209713137149811, "epoch": 0.784, "grad_norm": 18.802417227998035, "kl_loss_13": 3153.6, "kl_loss_26": 2218.8, "kl_loss_39": 1214.7, "kl_loss_7": 3709.6, "learning_rate": 0.00011292919468045875, "loss": 5056.2, "step": 7840 }, { "ce_loss_13": 2.9324662506580355, "ce_loss_26": 2.490780544281006, "ce_loss_39": 2.014774057269096, "ce_loss_52": 1.4378075569868087, "ce_loss_7": 3.2002897918224336, "epoch": 0.785, "grad_norm": 18.31675592106317, "kl_loss_13": 3120.4, "kl_loss_26": 2191.6, "kl_loss_39": 1189.3, "kl_loss_7": 3674.4, "learning_rate": 0.00011192676785412154, "loss": 5025.65, "step": 7850 }, { "ce_loss_13": 2.9004018545150756, "ce_loss_26": 2.4679500609636307, "ce_loss_39": 2.0106911092996596, "ce_loss_52": 1.454608330130577, "ce_loss_7": 3.1605159759521486, "epoch": 0.786, "grad_norm": 21.238240444429067, "kl_loss_13": 2987.2, "kl_loss_26": 2093.8, "kl_loss_39": 1136.1, "kl_loss_7": 3522.4, "learning_rate": 0.00011092824892092374, "loss": 4990.8, "step": 7860 }, { "ce_loss_13": 3.0144290030002594, "ce_loss_26": 2.5742201924324037, "ce_loss_39": 2.095407247543335, "ce_loss_52": 1.4918664544820786, "ce_loss_7": 3.2818655967712402, "epoch": 0.787, "grad_norm": 20.48506344270259, "kl_loss_13": 3166.0, "kl_loss_26": 2239.6, "kl_loss_39": 1229.4, "kl_loss_7": 3722.0, "learning_rate": 0.0001099336479359398, "loss": 5084.85, "step": 7870 }, { "ce_loss_13": 2.9180380165576936, "ce_loss_26": 2.4719971120357513, "ce_loss_39": 1.9984097123146056, "ce_loss_52": 1.4270031958818437, "ce_loss_7": 3.1842149913311006, "epoch": 0.788, "grad_norm": 19.617074455019072, "kl_loss_13": 3109.6, "kl_loss_26": 2182.8, "kl_loss_39": 1178.3, "kl_loss_7": 3662.0, "learning_rate": 0.00010894297491479043, "loss": 5092.2, "step": 7880 }, { "ce_loss_13": 2.8819404006004334, "ce_loss_26": 2.449340745806694, "ce_loss_39": 1.9862482339143752, "ce_loss_52": 1.413575354218483, "ce_loss_7": 3.143266361951828, "epoch": 0.789, "grad_norm": 19.73420487885875, "kl_loss_13": 3037.2, "kl_loss_26": 2140.0, "kl_loss_39": 1164.1, "kl_loss_7": 3572.0, "learning_rate": 0.00010795623983354214, "loss": 5012.15, "step": 7890 }, { "ce_loss_13": 2.936946928501129, "ce_loss_26": 2.50938241481781, "ce_loss_39": 2.0399589776992797, "ce_loss_52": 1.4433409079909325, "ce_loss_7": 3.1964890122413636, "epoch": 0.79, "grad_norm": 20.456776413214342, "kl_loss_13": 3094.8, "kl_loss_26": 2194.4, "kl_loss_39": 1203.3, "kl_loss_7": 3650.0, "learning_rate": 0.00010697345262860636, "loss": 5033.95, "step": 7900 }, { "ce_loss_13": 2.9234113335609435, "ce_loss_26": 2.486256945133209, "ce_loss_39": 2.014375075697899, "ce_loss_52": 1.449743601679802, "ce_loss_7": 3.18265563249588, "epoch": 0.791, "grad_norm": 19.900735615836812, "kl_loss_13": 3074.8, "kl_loss_26": 2163.0, "kl_loss_39": 1161.1, "kl_loss_7": 3612.8, "learning_rate": 0.00010599462319663906, "loss": 5038.65, "step": 7910 }, { "ce_loss_13": 2.9553000926971436, "ce_loss_26": 2.5179340064525606, "ce_loss_39": 2.0430867671966553, "ce_loss_52": 1.444689854979515, "ce_loss_7": 3.21486856341362, "epoch": 0.792, "grad_norm": 19.250533773319045, "kl_loss_13": 3122.0, "kl_loss_26": 2206.2, "kl_loss_39": 1213.4, "kl_loss_7": 3668.8, "learning_rate": 0.00010501976139444191, "loss": 5048.55, "step": 7920 }, { "ce_loss_13": 2.946609389781952, "ce_loss_26": 2.5028085887432097, "ce_loss_39": 2.0339547246694565, "ce_loss_52": 1.4416055083274841, "ce_loss_7": 3.2140405058860777, "epoch": 0.793, "grad_norm": 20.61899934712358, "kl_loss_13": 3133.6, "kl_loss_26": 2209.4, "kl_loss_39": 1208.3, "kl_loss_7": 3694.0, "learning_rate": 0.0001040488770388625, "loss": 5057.15, "step": 7930 }, { "ce_loss_13": 2.866725343465805, "ce_loss_26": 2.424889090657234, "ce_loss_39": 1.9532368332147598, "ce_loss_52": 1.37542584836483, "ce_loss_7": 3.1246279418468474, "epoch": 0.794, "grad_norm": 19.267832695141472, "kl_loss_13": 3083.6, "kl_loss_26": 2164.8, "kl_loss_39": 1176.3, "kl_loss_7": 3624.4, "learning_rate": 0.00010308197990669538, "loss": 5026.95, "step": 7940 }, { "ce_loss_13": 2.901643234491348, "ce_loss_26": 2.4638597697019575, "ce_loss_39": 1.9964057832956315, "ce_loss_52": 1.4079250425100327, "ce_loss_7": 3.1619919717311857, "epoch": 0.795, "grad_norm": 19.24818590589668, "kl_loss_13": 3123.2, "kl_loss_26": 2199.2, "kl_loss_39": 1202.4, "kl_loss_7": 3666.4, "learning_rate": 0.0001021190797345839, "loss": 5013.5, "step": 7950 }, { "ce_loss_13": 2.967918246984482, "ce_loss_26": 2.533867511153221, "ce_loss_39": 2.069682791829109, "ce_loss_52": 1.4818589851260184, "ce_loss_7": 3.222521889209747, "epoch": 0.796, "grad_norm": 19.75622423793478, "kl_loss_13": 3058.8, "kl_loss_26": 2155.2, "kl_loss_39": 1175.7, "kl_loss_7": 3592.0, "learning_rate": 0.00010116018621892236, "loss": 5009.6, "step": 7960 }, { "ce_loss_13": 2.8812991797924044, "ce_loss_26": 2.4400356858968735, "ce_loss_39": 1.9724171191453934, "ce_loss_52": 1.4085147365927697, "ce_loss_7": 3.1415765941143037, "epoch": 0.797, "grad_norm": 19.13497167869677, "kl_loss_13": 3057.6, "kl_loss_26": 2142.8, "kl_loss_39": 1159.2, "kl_loss_7": 3602.8, "learning_rate": 0.00010020530901575753, "loss": 5020.4, "step": 7970 }, { "ce_loss_13": 2.904066652059555, "ce_loss_26": 2.4755689650774, "ce_loss_39": 2.0152323603630067, "ce_loss_52": 1.4373595044016838, "ce_loss_7": 3.1696866393089294, "epoch": 0.798, "grad_norm": 20.01532447536976, "kl_loss_13": 3044.0, "kl_loss_26": 2150.0, "kl_loss_39": 1170.5, "kl_loss_7": 3601.6, "learning_rate": 9.925445774069231e-05, "loss": 5018.45, "step": 7980 }, { "ce_loss_13": 2.9288057029247283, "ce_loss_26": 2.4822009325027468, "ce_loss_39": 2.011521789431572, "ce_loss_52": 1.4153838574886322, "ce_loss_7": 3.1911131501197816, "epoch": 0.799, "grad_norm": 19.205851361122782, "kl_loss_13": 3126.0, "kl_loss_26": 2203.4, "kl_loss_39": 1210.7, "kl_loss_7": 3672.4, "learning_rate": 9.830764196878872e-05, "loss": 5069.4, "step": 7990 }, { "ce_loss_13": 2.9970316886901855, "ce_loss_26": 2.5632322430610657, "ce_loss_39": 2.089646649360657, "ce_loss_52": 1.4764455169439317, "ce_loss_7": 3.2621945440769196, "epoch": 0.8, "grad_norm": 18.817128068716947, "kl_loss_13": 3164.4, "kl_loss_26": 2259.8, "kl_loss_39": 1248.2, "kl_loss_7": 3716.4, "learning_rate": 9.736487123447069e-05, "loss": 5026.75, "step": 8000 }, { "ce_loss_13": 2.955092731118202, "ce_loss_26": 2.5211679935455322, "ce_loss_39": 2.061183473467827, "ce_loss_52": 1.4790756076574325, "ce_loss_7": 3.211963188648224, "epoch": 0.801, "grad_norm": 19.13731613570068, "kl_loss_13": 3058.4, "kl_loss_26": 2157.0, "kl_loss_39": 1183.6, "kl_loss_7": 3589.2, "learning_rate": 9.642615503142926e-05, "loss": 5013.8, "step": 8010 }, { "ce_loss_13": 2.8814174115657805, "ce_loss_26": 2.451471582055092, "ce_loss_39": 1.9925315648317337, "ce_loss_52": 1.4264169082045555, "ce_loss_7": 3.150370055437088, "epoch": 0.802, "grad_norm": 19.95644855086655, "kl_loss_13": 3015.2, "kl_loss_26": 2114.4, "kl_loss_39": 1144.7, "kl_loss_7": 3572.8, "learning_rate": 9.549150281252633e-05, "loss": 5066.0, "step": 8020 }, { "ce_loss_13": 2.9213546574115754, "ce_loss_26": 2.488295114040375, "ce_loss_39": 2.0217175424098968, "ce_loss_52": 1.448304545879364, "ce_loss_7": 3.177370023727417, "epoch": 0.803, "grad_norm": 19.167189329215354, "kl_loss_13": 3046.0, "kl_loss_26": 2140.4, "kl_loss_39": 1161.6, "kl_loss_7": 3582.8, "learning_rate": 9.4560923989699e-05, "loss": 5040.7, "step": 8030 }, { "ce_loss_13": 2.8780395865440367, "ce_loss_26": 2.43271960914135, "ce_loss_39": 1.966169360280037, "ce_loss_52": 1.3888942331075669, "ce_loss_7": 3.1494656085968016, "epoch": 0.804, "grad_norm": 18.853764898775175, "kl_loss_13": 3103.6, "kl_loss_26": 2173.0, "kl_loss_39": 1171.5, "kl_loss_7": 3672.8, "learning_rate": 9.363442793386607e-05, "loss": 5021.25, "step": 8040 }, { "ce_loss_13": 2.9130735754966737, "ce_loss_26": 2.480276498198509, "ce_loss_39": 2.0196522653102873, "ce_loss_52": 1.4448804795742034, "ce_loss_7": 3.1761886417865752, "epoch": 0.805, "grad_norm": 18.76619687601556, "kl_loss_13": 3048.8, "kl_loss_26": 2147.8, "kl_loss_39": 1165.5, "kl_loss_7": 3590.8, "learning_rate": 9.271202397483213e-05, "loss": 4983.8, "step": 8050 }, { "ce_loss_13": 2.960085618495941, "ce_loss_26": 2.516904118657112, "ce_loss_39": 2.0490771383047104, "ce_loss_52": 1.4702347993850708, "ce_loss_7": 3.2195077538490295, "epoch": 0.806, "grad_norm": 20.341613718883853, "kl_loss_13": 3080.0, "kl_loss_26": 2163.0, "kl_loss_39": 1179.4, "kl_loss_7": 3617.6, "learning_rate": 9.179372140119524e-05, "loss": 5044.25, "step": 8060 }, { "ce_loss_13": 2.895679956674576, "ce_loss_26": 2.447220724821091, "ce_loss_39": 1.978733304142952, "ce_loss_52": 1.4091844826936721, "ce_loss_7": 3.1559928357601166, "epoch": 0.807, "grad_norm": 19.648457937725187, "kl_loss_13": 3088.0, "kl_loss_26": 2160.6, "kl_loss_39": 1172.1, "kl_loss_7": 3631.6, "learning_rate": 9.087952946025175e-05, "loss": 5019.35, "step": 8070 }, { "ce_loss_13": 2.8867613554000853, "ce_loss_26": 2.4542810022830963, "ce_loss_39": 1.997492003440857, "ce_loss_52": 1.4237906068563462, "ce_loss_7": 3.1490099489688874, "epoch": 0.808, "grad_norm": 19.108440716050485, "kl_loss_13": 3031.2, "kl_loss_26": 2132.0, "kl_loss_39": 1161.2, "kl_loss_7": 3573.2, "learning_rate": 8.996945735790446e-05, "loss": 5081.55, "step": 8080 }, { "ce_loss_13": 2.903727024793625, "ce_loss_26": 2.4667694687843325, "ce_loss_39": 2.0024666130542754, "ce_loss_52": 1.4252464413642882, "ce_loss_7": 3.1627338111400602, "epoch": 0.809, "grad_norm": 19.45516980399187, "kl_loss_13": 3065.2, "kl_loss_26": 2156.8, "kl_loss_39": 1173.6, "kl_loss_7": 3607.6, "learning_rate": 8.906351425856951e-05, "loss": 5032.55, "step": 8090 }, { "ce_loss_13": 2.9971861839294434, "ce_loss_26": 2.558201992511749, "ce_loss_39": 2.0943466246128084, "ce_loss_52": 1.5071399331092834, "ce_loss_7": 3.2508726358413695, "epoch": 0.81, "grad_norm": 18.578032258449234, "kl_loss_13": 3080.0, "kl_loss_26": 2167.8, "kl_loss_39": 1188.5, "kl_loss_7": 3612.8, "learning_rate": 8.816170928508365e-05, "loss": 5060.5, "step": 8100 }, { "ce_loss_13": 2.9514600038528442, "ce_loss_26": 2.511053240299225, "ce_loss_39": 2.036428925395012, "ce_loss_52": 1.4632433116436006, "ce_loss_7": 3.2164508640766143, "epoch": 0.811, "grad_norm": 19.46007345669389, "kl_loss_13": 3077.6, "kl_loss_26": 2160.6, "kl_loss_39": 1164.9, "kl_loss_7": 3629.6, "learning_rate": 8.7264051518613e-05, "loss": 5036.75, "step": 8110 }, { "ce_loss_13": 2.8350981414318084, "ce_loss_26": 2.3989670783281327, "ce_loss_39": 1.9394948929548264, "ce_loss_52": 1.385464173555374, "ce_loss_7": 3.0943558514118195, "epoch": 0.812, "grad_norm": 20.429118003347835, "kl_loss_13": 3019.6, "kl_loss_26": 2118.2, "kl_loss_39": 1139.6, "kl_loss_7": 3559.6, "learning_rate": 8.637054999856148e-05, "loss": 5033.2, "step": 8120 }, { "ce_loss_13": 2.956312870979309, "ce_loss_26": 2.510516768693924, "ce_loss_39": 2.0456418454647065, "ce_loss_52": 1.454368954896927, "ce_loss_7": 3.2136961221694946, "epoch": 0.813, "grad_norm": 19.290479115918554, "kl_loss_13": 3110.4, "kl_loss_26": 2193.4, "kl_loss_39": 1206.0, "kl_loss_7": 3651.6, "learning_rate": 8.548121372247918e-05, "loss": 5042.35, "step": 8130 }, { "ce_loss_13": 2.8997408151626587, "ce_loss_26": 2.4578535914421082, "ce_loss_39": 1.9877970427274705, "ce_loss_52": 1.4171391934156419, "ce_loss_7": 3.1653897404670714, "epoch": 0.814, "grad_norm": 19.231104242907662, "kl_loss_13": 3062.4, "kl_loss_26": 2154.2, "kl_loss_39": 1161.8, "kl_loss_7": 3620.0, "learning_rate": 8.459605164597267e-05, "loss": 4990.3, "step": 8140 }, { "ce_loss_13": 2.8967735528945924, "ce_loss_26": 2.462614360451698, "ce_loss_39": 1.9984548151493073, "ce_loss_52": 1.438458850979805, "ce_loss_7": 3.1581345558166505, "epoch": 0.815, "grad_norm": 19.45006377816174, "kl_loss_13": 3056.8, "kl_loss_26": 2153.0, "kl_loss_39": 1160.6, "kl_loss_7": 3602.0, "learning_rate": 8.371507268261436e-05, "loss": 4980.2, "step": 8150 }, { "ce_loss_13": 2.936253345012665, "ce_loss_26": 2.5057778120040894, "ce_loss_39": 2.038064029812813, "ce_loss_52": 1.4622955560684203, "ce_loss_7": 3.1979693949222563, "epoch": 0.816, "grad_norm": 18.91111939228637, "kl_loss_13": 3060.8, "kl_loss_26": 2153.2, "kl_loss_39": 1175.6, "kl_loss_7": 3612.0, "learning_rate": 8.283828570385238e-05, "loss": 5006.35, "step": 8160 }, { "ce_loss_13": 2.9529894649982453, "ce_loss_26": 2.5044034361839294, "ce_loss_39": 2.042719992995262, "ce_loss_52": 1.4732781440019607, "ce_loss_7": 3.2085989713668823, "epoch": 0.817, "grad_norm": 18.881507142327827, "kl_loss_13": 3068.4, "kl_loss_26": 2134.6, "kl_loss_39": 1154.6, "kl_loss_7": 3608.0, "learning_rate": 8.196569953892202e-05, "loss": 5023.2, "step": 8170 }, { "ce_loss_13": 2.901773339509964, "ce_loss_26": 2.46816024184227, "ce_loss_39": 2.0061379730701447, "ce_loss_52": 1.4472751855850219, "ce_loss_7": 3.162723332643509, "epoch": 0.818, "grad_norm": 19.58512082927694, "kl_loss_13": 3021.6, "kl_loss_26": 2116.2, "kl_loss_39": 1138.3, "kl_loss_7": 3567.6, "learning_rate": 8.109732297475635e-05, "loss": 5011.1, "step": 8180 }, { "ce_loss_13": 2.9283832788467405, "ce_loss_26": 2.495754861831665, "ce_loss_39": 2.0315854638814925, "ce_loss_52": 1.4544063314795495, "ce_loss_7": 3.184633868932724, "epoch": 0.819, "grad_norm": 20.338540288781616, "kl_loss_13": 3056.0, "kl_loss_26": 2152.0, "kl_loss_39": 1164.7, "kl_loss_7": 3604.8, "learning_rate": 8.023316475589754e-05, "loss": 4985.65, "step": 8190 }, { "ce_loss_13": 2.8736020922660828, "ce_loss_26": 2.4326390773057938, "ce_loss_39": 1.9687805682420731, "ce_loss_52": 1.4114506781101226, "ce_loss_7": 3.1362832963466643, "epoch": 0.82, "grad_norm": 19.348124098647066, "kl_loss_13": 3039.6, "kl_loss_26": 2118.4, "kl_loss_39": 1131.5, "kl_loss_7": 3592.4, "learning_rate": 7.937323358440934e-05, "loss": 4999.05, "step": 8200 }, { "ce_loss_13": 2.9405028223991394, "ce_loss_26": 2.5053980708122254, "ce_loss_39": 2.031425711512566, "ce_loss_52": 1.4602935075759889, "ce_loss_7": 3.1905817687511444, "epoch": 0.821, "grad_norm": 19.308929525960306, "kl_loss_13": 3053.6, "kl_loss_26": 2142.8, "kl_loss_39": 1164.5, "kl_loss_7": 3580.4, "learning_rate": 7.851753811978923e-05, "loss": 5013.6, "step": 8210 }, { "ce_loss_13": 2.8261436820030212, "ce_loss_26": 2.396569001674652, "ce_loss_39": 1.9376911997795105, "ce_loss_52": 1.3843101486563683, "ce_loss_7": 3.08916922211647, "epoch": 0.822, "grad_norm": 18.760581120890944, "kl_loss_13": 2979.6, "kl_loss_26": 2084.0, "kl_loss_39": 1118.1, "kl_loss_7": 3526.4, "learning_rate": 7.766608697888095e-05, "loss": 4996.05, "step": 8220 }, { "ce_loss_13": 2.895614618062973, "ce_loss_26": 2.4538383156061174, "ce_loss_39": 1.9913074195384979, "ce_loss_52": 1.4129166051745414, "ce_loss_7": 3.156418579816818, "epoch": 0.823, "grad_norm": 19.448137234461008, "kl_loss_13": 3094.8, "kl_loss_26": 2171.8, "kl_loss_39": 1179.4, "kl_loss_7": 3644.0, "learning_rate": 7.681888873578785e-05, "loss": 5010.15, "step": 8230 }, { "ce_loss_13": 2.8753804206848144, "ce_loss_26": 2.448589825630188, "ce_loss_39": 1.9881916165351867, "ce_loss_52": 1.433535772562027, "ce_loss_7": 3.133152514696121, "epoch": 0.824, "grad_norm": 19.780732761004185, "kl_loss_13": 2998.8, "kl_loss_26": 2105.2, "kl_loss_39": 1137.2, "kl_loss_7": 3529.6, "learning_rate": 7.597595192178702e-05, "loss": 4951.6, "step": 8240 }, { "ce_loss_13": 2.874552935361862, "ce_loss_26": 2.426975393295288, "ce_loss_39": 1.969171154499054, "ce_loss_52": 1.4067743465304374, "ce_loss_7": 3.1344926774501802, "epoch": 0.825, "grad_norm": 19.081018400834456, "kl_loss_13": 3050.0, "kl_loss_26": 2129.2, "kl_loss_39": 1153.6, "kl_loss_7": 3590.8, "learning_rate": 7.513728502524286e-05, "loss": 4924.6, "step": 8250 }, { "ce_loss_13": 2.8894054651260377, "ce_loss_26": 2.4550040304660796, "ce_loss_39": 1.9873575389385223, "ce_loss_52": 1.417596697807312, "ce_loss_7": 3.152091747522354, "epoch": 0.826, "grad_norm": 19.8137268983455, "kl_loss_13": 3043.6, "kl_loss_26": 2138.6, "kl_loss_39": 1152.9, "kl_loss_7": 3588.0, "learning_rate": 7.430289649152156e-05, "loss": 5032.7, "step": 8260 }, { "ce_loss_13": 2.9286361813545225, "ce_loss_26": 2.500110092759132, "ce_loss_39": 2.0343170315027237, "ce_loss_52": 1.4783238634467124, "ce_loss_7": 3.186429667472839, "epoch": 0.827, "grad_norm": 19.162288533319998, "kl_loss_13": 3004.0, "kl_loss_26": 2105.8, "kl_loss_39": 1135.2, "kl_loss_7": 3537.2, "learning_rate": 7.347279472290646e-05, "loss": 4997.95, "step": 8270 }, { "ce_loss_13": 2.8661180198192597, "ce_loss_26": 2.4267468631267546, "ce_loss_39": 1.9584647029638291, "ce_loss_52": 1.3965127140283584, "ce_loss_7": 3.1264285147190094, "epoch": 0.828, "grad_norm": 18.88865775629702, "kl_loss_13": 3042.4, "kl_loss_26": 2132.2, "kl_loss_39": 1143.9, "kl_loss_7": 3592.0, "learning_rate": 7.264698807851328e-05, "loss": 4945.25, "step": 8280 }, { "ce_loss_13": 2.963280272483826, "ce_loss_26": 2.5102931052446364, "ce_loss_39": 2.034750634431839, "ce_loss_52": 1.4600775420665741, "ce_loss_7": 3.2234760582447053, "epoch": 0.829, "grad_norm": 18.537963317549014, "kl_loss_13": 3144.4, "kl_loss_26": 2206.8, "kl_loss_39": 1191.9, "kl_loss_7": 3687.6, "learning_rate": 7.182548487420554e-05, "loss": 5033.4, "step": 8290 }, { "ce_loss_13": 2.9992467045783995, "ce_loss_26": 2.55620219707489, "ce_loss_39": 2.0874054759740828, "ce_loss_52": 1.4909266605973244, "ce_loss_7": 3.263571548461914, "epoch": 0.83, "grad_norm": 18.891053458181357, "kl_loss_13": 3132.0, "kl_loss_26": 2207.0, "kl_loss_39": 1207.0, "kl_loss_7": 3681.2, "learning_rate": 7.100829338251146e-05, "loss": 5053.05, "step": 8300 }, { "ce_loss_13": 2.9434437096118926, "ce_loss_26": 2.5084387719631196, "ce_loss_39": 2.0438412368297576, "ce_loss_52": 1.455627703666687, "ce_loss_7": 3.203293949365616, "epoch": 0.831, "grad_norm": 18.63506341583337, "kl_loss_13": 3080.0, "kl_loss_26": 2173.6, "kl_loss_39": 1196.0, "kl_loss_7": 3632.8, "learning_rate": 7.019542183254046e-05, "loss": 5020.25, "step": 8310 }, { "ce_loss_13": 2.9150700867176056, "ce_loss_26": 2.4821571350097655, "ce_loss_39": 2.011009243130684, "ce_loss_52": 1.4295171827077866, "ce_loss_7": 3.1743992388248445, "epoch": 0.832, "grad_norm": 19.224604457055225, "kl_loss_13": 3061.6, "kl_loss_26": 2156.4, "kl_loss_39": 1178.0, "kl_loss_7": 3603.6, "learning_rate": 6.938687840989971e-05, "loss": 4998.8, "step": 8320 }, { "ce_loss_13": 2.9248284220695497, "ce_loss_26": 2.4842200338840486, "ce_loss_39": 2.016203221678734, "ce_loss_52": 1.4306745767593383, "ce_loss_7": 3.1828024327754973, "epoch": 0.833, "grad_norm": 21.39912114954264, "kl_loss_13": 3084.0, "kl_loss_26": 2177.0, "kl_loss_39": 1194.5, "kl_loss_7": 3630.0, "learning_rate": 6.858267125661271e-05, "loss": 5022.85, "step": 8330 }, { "ce_loss_13": 2.8756704360246657, "ce_loss_26": 2.447014120221138, "ce_loss_39": 1.9950514793395997, "ce_loss_52": 1.4190126568078996, "ce_loss_7": 3.135387209057808, "epoch": 0.834, "grad_norm": 19.16865484866817, "kl_loss_13": 3030.8, "kl_loss_26": 2137.8, "kl_loss_39": 1174.0, "kl_loss_7": 3584.4, "learning_rate": 6.778280847103668e-05, "loss": 5009.55, "step": 8340 }, { "ce_loss_13": 2.8521511554718018, "ce_loss_26": 2.415205565094948, "ce_loss_39": 1.9484322667121887, "ce_loss_52": 1.3919154837727548, "ce_loss_7": 3.11352881193161, "epoch": 0.835, "grad_norm": 19.565513612829772, "kl_loss_13": 3035.6, "kl_loss_26": 2128.8, "kl_loss_39": 1129.1, "kl_loss_7": 3584.4, "learning_rate": 6.698729810778065e-05, "loss": 4997.8, "step": 8350 }, { "ce_loss_13": 2.9550336360931397, "ce_loss_26": 2.516478735208511, "ce_loss_39": 2.0486765056848526, "ce_loss_52": 1.4607697233557702, "ce_loss_7": 3.2147393763065337, "epoch": 0.836, "grad_norm": 19.2117514895061, "kl_loss_13": 3091.2, "kl_loss_26": 2181.2, "kl_loss_39": 1192.7, "kl_loss_7": 3642.8, "learning_rate": 6.619614817762538e-05, "loss": 4980.45, "step": 8360 }, { "ce_loss_13": 2.8862642347812653, "ce_loss_26": 2.453189605474472, "ce_loss_39": 1.9996996372938156, "ce_loss_52": 1.42118998169899, "ce_loss_7": 3.1397067666053773, "epoch": 0.837, "grad_norm": 19.74025196121118, "kl_loss_13": 3027.2, "kl_loss_26": 2130.4, "kl_loss_39": 1163.7, "kl_loss_7": 3555.6, "learning_rate": 6.540936664744196e-05, "loss": 5003.55, "step": 8370 }, { "ce_loss_13": 2.8884230494499206, "ce_loss_26": 2.461115485429764, "ce_loss_39": 1.996484610438347, "ce_loss_52": 1.431185284256935, "ce_loss_7": 3.1595689237117766, "epoch": 0.838, "grad_norm": 18.91687230970295, "kl_loss_13": 3028.0, "kl_loss_26": 2134.2, "kl_loss_39": 1162.1, "kl_loss_7": 3585.2, "learning_rate": 6.462696144011149e-05, "loss": 4978.95, "step": 8380 }, { "ce_loss_13": 2.9092856884002685, "ce_loss_26": 2.477367341518402, "ce_loss_39": 2.0064304888248445, "ce_loss_52": 1.4392234981060028, "ce_loss_7": 3.166864866018295, "epoch": 0.839, "grad_norm": 18.987849334165656, "kl_loss_13": 3037.2, "kl_loss_26": 2130.0, "kl_loss_39": 1148.3, "kl_loss_7": 3579.6, "learning_rate": 6.384894043444567e-05, "loss": 4978.1, "step": 8390 }, { "ce_loss_13": 2.926213449239731, "ce_loss_26": 2.4814498484134675, "ce_loss_39": 2.011722648143768, "ce_loss_52": 1.4253545701503754, "ce_loss_7": 3.1928284585475923, "epoch": 0.84, "grad_norm": 18.67190019337169, "kl_loss_13": 3118.4, "kl_loss_26": 2194.4, "kl_loss_39": 1203.2, "kl_loss_7": 3673.2, "learning_rate": 6.307531146510753e-05, "loss": 4975.3, "step": 8400 }, { "ce_loss_13": 2.9474796772003176, "ce_loss_26": 2.513693606853485, "ce_loss_39": 2.039980337023735, "ce_loss_52": 1.4588691473007203, "ce_loss_7": 3.208155167102814, "epoch": 0.841, "grad_norm": 19.23898472020216, "kl_loss_13": 3091.2, "kl_loss_26": 2176.0, "kl_loss_39": 1186.3, "kl_loss_7": 3638.8, "learning_rate": 6.230608232253226e-05, "loss": 4972.8, "step": 8410 }, { "ce_loss_13": 2.98299777507782, "ce_loss_26": 2.546931451559067, "ce_loss_39": 2.066228356957436, "ce_loss_52": 1.4528164565563202, "ce_loss_7": 3.2408275246620177, "epoch": 0.842, "grad_norm": 19.590505544042042, "kl_loss_13": 3177.6, "kl_loss_26": 2260.2, "kl_loss_39": 1246.2, "kl_loss_7": 3722.8, "learning_rate": 6.154126075284855e-05, "loss": 5025.9, "step": 8420 }, { "ce_loss_13": 2.8128190338611603, "ce_loss_26": 2.3779567658901213, "ce_loss_39": 1.922488284111023, "ce_loss_52": 1.3714163228869438, "ce_loss_7": 3.0708466947078703, "epoch": 0.843, "grad_norm": 19.244313422130332, "kl_loss_13": 3014.4, "kl_loss_26": 2105.4, "kl_loss_39": 1130.8, "kl_loss_7": 3552.8, "learning_rate": 6.078085445780129e-05, "loss": 5004.75, "step": 8430 }, { "ce_loss_13": 2.938475805521011, "ce_loss_26": 2.497309777140617, "ce_loss_39": 2.027024504542351, "ce_loss_52": 1.4460827559232712, "ce_loss_7": 3.194866645336151, "epoch": 0.844, "grad_norm": 18.715084502177618, "kl_loss_13": 3086.4, "kl_loss_26": 2168.0, "kl_loss_39": 1180.4, "kl_loss_7": 3630.0, "learning_rate": 6.002487109467347e-05, "loss": 5005.05, "step": 8440 }, { "ce_loss_13": 2.940459841489792, "ce_loss_26": 2.5169106662273406, "ce_loss_39": 2.0435358375310897, "ce_loss_52": 1.4713785827159882, "ce_loss_7": 3.199807566404343, "epoch": 0.845, "grad_norm": 20.662636581778713, "kl_loss_13": 3050.8, "kl_loss_26": 2152.2, "kl_loss_39": 1166.1, "kl_loss_7": 3588.8, "learning_rate": 5.927331827620902e-05, "loss": 5015.45, "step": 8450 }, { "ce_loss_13": 2.8635286152362824, "ce_loss_26": 2.426770511269569, "ce_loss_39": 1.9639566600322724, "ce_loss_52": 1.4056006461381911, "ce_loss_7": 3.126782363653183, "epoch": 0.846, "grad_norm": 19.43955116705752, "kl_loss_13": 3026.8, "kl_loss_26": 2123.4, "kl_loss_39": 1139.5, "kl_loss_7": 3581.2, "learning_rate": 5.852620357053651e-05, "loss": 4930.5, "step": 8460 }, { "ce_loss_13": 2.967438644170761, "ce_loss_26": 2.5302784025669096, "ce_loss_39": 2.0548608988523482, "ce_loss_52": 1.456875516474247, "ce_loss_7": 3.232648569345474, "epoch": 0.847, "grad_norm": 18.917933547815785, "kl_loss_13": 3141.6, "kl_loss_26": 2229.0, "kl_loss_39": 1221.3, "kl_loss_7": 3698.0, "learning_rate": 5.778353450109286e-05, "loss": 5049.2, "step": 8470 }, { "ce_loss_13": 2.8490840077400206, "ce_loss_26": 2.4299704492092133, "ce_loss_39": 1.9720161318778993, "ce_loss_52": 1.428696632385254, "ce_loss_7": 3.1050261557102203, "epoch": 0.848, "grad_norm": 19.028513845565772, "kl_loss_13": 2946.4, "kl_loss_26": 2077.0, "kl_loss_39": 1109.1, "kl_loss_7": 3480.0, "learning_rate": 5.7045318546547206e-05, "loss": 4964.25, "step": 8480 }, { "ce_loss_13": 2.91025772690773, "ce_loss_26": 2.470404103398323, "ce_loss_39": 1.9989687472581863, "ce_loss_52": 1.4330752968788147, "ce_loss_7": 3.1751048266887665, "epoch": 0.849, "grad_norm": 18.63333546431516, "kl_loss_13": 3053.6, "kl_loss_26": 2137.8, "kl_loss_39": 1151.7, "kl_loss_7": 3603.6, "learning_rate": 5.631156314072605e-05, "loss": 4997.6, "step": 8490 }, { "ce_loss_13": 2.969731491804123, "ce_loss_26": 2.5224834442138673, "ce_loss_39": 2.043315088748932, "ce_loss_52": 1.4606564939022064, "ce_loss_7": 3.235535615682602, "epoch": 0.85, "grad_norm": 19.13874159874534, "kl_loss_13": 3116.0, "kl_loss_26": 2192.8, "kl_loss_39": 1187.7, "kl_loss_7": 3671.6, "learning_rate": 5.5582275672538315e-05, "loss": 4973.7, "step": 8500 }, { "ce_loss_13": 2.9457097470760347, "ce_loss_26": 2.5094266653060915, "ce_loss_39": 2.037208506464958, "ce_loss_52": 1.4461605846881866, "ce_loss_7": 3.2044992685317992, "epoch": 0.851, "grad_norm": 19.084085332754032, "kl_loss_13": 3088.0, "kl_loss_26": 2174.4, "kl_loss_39": 1186.9, "kl_loss_7": 3629.2, "learning_rate": 5.4857463485900484e-05, "loss": 4979.7, "step": 8510 }, { "ce_loss_13": 2.932442033290863, "ce_loss_26": 2.4945150196552275, "ce_loss_39": 2.024565789103508, "ce_loss_52": 1.4535668522119523, "ce_loss_7": 3.1925411999225615, "epoch": 0.852, "grad_norm": 18.598717511449827, "kl_loss_13": 3049.2, "kl_loss_26": 2138.2, "kl_loss_39": 1163.5, "kl_loss_7": 3596.0, "learning_rate": 5.413713387966329e-05, "loss": 4976.35, "step": 8520 }, { "ce_loss_13": 2.850284093618393, "ce_loss_26": 2.4304546415805817, "ce_loss_39": 1.9769367069005965, "ce_loss_52": 1.4221994251012802, "ce_loss_7": 3.106715601682663, "epoch": 0.853, "grad_norm": 19.800943553162387, "kl_loss_13": 2971.2, "kl_loss_26": 2092.8, "kl_loss_39": 1135.0, "kl_loss_7": 3509.2, "learning_rate": 5.34212941075381e-05, "loss": 4969.3, "step": 8530 }, { "ce_loss_13": 2.8934908270835877, "ce_loss_26": 2.4596606254577638, "ce_loss_39": 2.00773600935936, "ce_loss_52": 1.4613569289445878, "ce_loss_7": 3.150895756483078, "epoch": 0.854, "grad_norm": 18.83825612066199, "kl_loss_13": 3003.6, "kl_loss_26": 2097.4, "kl_loss_39": 1124.7, "kl_loss_7": 3544.0, "learning_rate": 5.270995137802315e-05, "loss": 4942.95, "step": 8540 }, { "ce_loss_13": 2.9037817120552063, "ce_loss_26": 2.462430712580681, "ce_loss_39": 1.9962077885866165, "ce_loss_52": 1.425583516061306, "ce_loss_7": 3.173860079050064, "epoch": 0.855, "grad_norm": 18.951683103960118, "kl_loss_13": 3081.6, "kl_loss_26": 2158.2, "kl_loss_39": 1162.8, "kl_loss_7": 3640.8, "learning_rate": 5.2003112854332125e-05, "loss": 4931.9, "step": 8550 }, { "ce_loss_13": 2.945174980163574, "ce_loss_26": 2.504639369249344, "ce_loss_39": 2.038501372933388, "ce_loss_52": 1.4634816706180573, "ce_loss_7": 3.206812459230423, "epoch": 0.856, "grad_norm": 19.71064196024924, "kl_loss_13": 3095.2, "kl_loss_26": 2169.8, "kl_loss_39": 1182.5, "kl_loss_7": 3640.4, "learning_rate": 5.130078565432089e-05, "loss": 5022.2, "step": 8560 }, { "ce_loss_13": 2.916484522819519, "ce_loss_26": 2.4740715622901917, "ce_loss_39": 2.0083792597055434, "ce_loss_52": 1.4411062002182007, "ce_loss_7": 3.1799224853515624, "epoch": 0.857, "grad_norm": 18.476735822226924, "kl_loss_13": 3088.4, "kl_loss_26": 2169.8, "kl_loss_39": 1171.0, "kl_loss_7": 3638.0, "learning_rate": 5.060297685041659e-05, "loss": 4959.95, "step": 8570 }, { "ce_loss_13": 2.9657407224178316, "ce_loss_26": 2.5159328460693358, "ce_loss_39": 2.0297839671373366, "ce_loss_52": 1.4374482572078704, "ce_loss_7": 3.2368306040763857, "epoch": 0.858, "grad_norm": 18.666614846165707, "kl_loss_13": 3174.0, "kl_loss_26": 2243.8, "kl_loss_39": 1214.4, "kl_loss_7": 3739.2, "learning_rate": 4.99096934695461e-05, "loss": 4973.6, "step": 8580 }, { "ce_loss_13": 2.9122063517570496, "ce_loss_26": 2.4818194091320036, "ce_loss_39": 2.0184349328279496, "ce_loss_52": 1.4382916703820228, "ce_loss_7": 3.17412588596344, "epoch": 0.859, "grad_norm": 19.2367820447764, "kl_loss_13": 3051.6, "kl_loss_26": 2150.2, "kl_loss_39": 1170.3, "kl_loss_7": 3600.8, "learning_rate": 4.922094249306558e-05, "loss": 4986.8, "step": 8590 }, { "ce_loss_13": 2.856007623672485, "ce_loss_26": 2.4249674677848816, "ce_loss_39": 1.9603979021310807, "ce_loss_52": 1.3998382538557053, "ce_loss_7": 3.120637094974518, "epoch": 0.86, "grad_norm": 19.45927110525327, "kl_loss_13": 3034.0, "kl_loss_26": 2133.8, "kl_loss_39": 1154.5, "kl_loss_7": 3588.0, "learning_rate": 4.853673085668947e-05, "loss": 5020.0, "step": 8600 }, { "ce_loss_13": 2.8736523926258086, "ce_loss_26": 2.4339311927556992, "ce_loss_39": 1.9728148251771926, "ce_loss_52": 1.4134869635105134, "ce_loss_7": 3.132260227203369, "epoch": 0.861, "grad_norm": 18.566757154178084, "kl_loss_13": 3035.6, "kl_loss_26": 2121.8, "kl_loss_39": 1143.1, "kl_loss_7": 3595.6, "learning_rate": 4.78570654504214e-05, "loss": 5002.4, "step": 8610 }, { "ce_loss_13": 2.918911075592041, "ce_loss_26": 2.4760118186473847, "ce_loss_39": 2.0035496681928633, "ce_loss_52": 1.4298115074634552, "ce_loss_7": 3.1867982387542724, "epoch": 0.862, "grad_norm": 19.07407083640675, "kl_loss_13": 3104.8, "kl_loss_26": 2178.0, "kl_loss_39": 1171.7, "kl_loss_7": 3661.6, "learning_rate": 4.7181953118484556e-05, "loss": 4962.1, "step": 8620 }, { "ce_loss_13": 2.90929571390152, "ce_loss_26": 2.4703837007284166, "ce_loss_39": 2.004773771762848, "ce_loss_52": 1.4389754503965377, "ce_loss_7": 3.1703392446041105, "epoch": 0.863, "grad_norm": 19.469181472497027, "kl_loss_13": 3027.6, "kl_loss_26": 2134.8, "kl_loss_39": 1156.0, "kl_loss_7": 3579.2, "learning_rate": 4.651140065925269e-05, "loss": 4937.2, "step": 8630 }, { "ce_loss_13": 2.9975267946720123, "ce_loss_26": 2.5536694526672363, "ce_loss_39": 2.07455490231514, "ce_loss_52": 1.4786568373441695, "ce_loss_7": 3.265315741300583, "epoch": 0.864, "grad_norm": 19.328625062361652, "kl_loss_13": 3149.6, "kl_loss_26": 2221.4, "kl_loss_39": 1210.4, "kl_loss_7": 3701.6, "learning_rate": 4.58454148251814e-05, "loss": 4974.2, "step": 8640 }, { "ce_loss_13": 2.914253044128418, "ce_loss_26": 2.4742087960243224, "ce_loss_39": 2.0090451925992965, "ce_loss_52": 1.4428718268871308, "ce_loss_7": 3.174784082174301, "epoch": 0.865, "grad_norm": 18.948427807011946, "kl_loss_13": 3048.4, "kl_loss_26": 2129.2, "kl_loss_39": 1142.4, "kl_loss_7": 3588.0, "learning_rate": 4.518400232274078e-05, "loss": 4950.8, "step": 8650 }, { "ce_loss_13": 2.895064812898636, "ce_loss_26": 2.452637565135956, "ce_loss_39": 1.9877680152654649, "ce_loss_52": 1.419823595881462, "ce_loss_7": 3.151270192861557, "epoch": 0.866, "grad_norm": 18.81553972523, "kl_loss_13": 3058.0, "kl_loss_26": 2145.0, "kl_loss_39": 1169.4, "kl_loss_7": 3595.6, "learning_rate": 4.452716981234745e-05, "loss": 5007.2, "step": 8660 }, { "ce_loss_13": 2.929095983505249, "ce_loss_26": 2.480497121810913, "ce_loss_39": 2.006428611278534, "ce_loss_52": 1.4315154731273652, "ce_loss_7": 3.2007214546203615, "epoch": 0.867, "grad_norm": 18.95890107682501, "kl_loss_13": 3086.0, "kl_loss_26": 2159.8, "kl_loss_39": 1163.2, "kl_loss_7": 3645.6, "learning_rate": 4.3874923908297335e-05, "loss": 4988.0, "step": 8670 }, { "ce_loss_13": 2.917868083715439, "ce_loss_26": 2.466423386335373, "ce_loss_39": 1.9825115293264388, "ce_loss_52": 1.4014427214860916, "ce_loss_7": 3.187401866912842, "epoch": 0.868, "grad_norm": 18.57272436866935, "kl_loss_13": 3128.0, "kl_loss_26": 2194.4, "kl_loss_39": 1181.8, "kl_loss_7": 3698.4, "learning_rate": 4.322727117869951e-05, "loss": 4966.1, "step": 8680 }, { "ce_loss_13": 2.864998000860214, "ce_loss_26": 2.4295243114233016, "ce_loss_39": 1.9672368943691254, "ce_loss_52": 1.4193324148654938, "ce_loss_7": 3.128977674245834, "epoch": 0.869, "grad_norm": 18.701108919653294, "kl_loss_13": 3002.4, "kl_loss_26": 2088.8, "kl_loss_39": 1117.8, "kl_loss_7": 3558.4, "learning_rate": 4.2584218145409916e-05, "loss": 4955.6, "step": 8690 }, { "ce_loss_13": 2.866414725780487, "ce_loss_26": 2.4338418275117872, "ce_loss_39": 1.961911031603813, "ce_loss_52": 1.3979329317808151, "ce_loss_7": 3.1249643862247467, "epoch": 0.87, "grad_norm": 19.695244279115233, "kl_loss_13": 3026.4, "kl_loss_26": 2123.4, "kl_loss_39": 1140.7, "kl_loss_7": 3568.4, "learning_rate": 4.194577128396521e-05, "loss": 4954.85, "step": 8700 }, { "ce_loss_13": 2.9441056907176972, "ce_loss_26": 2.5146015286445618, "ce_loss_39": 2.0488742887973785, "ce_loss_52": 1.4835967749357224, "ce_loss_7": 3.202489811182022, "epoch": 0.871, "grad_norm": 18.533833127334756, "kl_loss_13": 3031.2, "kl_loss_26": 2129.4, "kl_loss_39": 1154.3, "kl_loss_7": 3572.4, "learning_rate": 4.1311937023518264e-05, "loss": 4983.55, "step": 8710 }, { "ce_loss_13": 2.927008146047592, "ce_loss_26": 2.4801330626010896, "ce_loss_39": 2.0113667100667953, "ce_loss_52": 1.4392758041620255, "ce_loss_7": 3.1931580364704133, "epoch": 0.872, "grad_norm": 19.604731242944656, "kl_loss_13": 3073.2, "kl_loss_26": 2148.4, "kl_loss_39": 1161.3, "kl_loss_7": 3631.6, "learning_rate": 4.0682721746773344e-05, "loss": 4966.3, "step": 8720 }, { "ce_loss_13": 2.875244301557541, "ce_loss_26": 2.4395941644906998, "ce_loss_39": 1.9802403211593629, "ce_loss_52": 1.425346952676773, "ce_loss_7": 3.1323861300945284, "epoch": 0.873, "grad_norm": 19.084321030186736, "kl_loss_13": 3020.0, "kl_loss_26": 2114.6, "kl_loss_39": 1135.6, "kl_loss_7": 3558.4, "learning_rate": 4.0058131789920904e-05, "loss": 4966.9, "step": 8730 }, { "ce_loss_13": 2.91582133769989, "ce_loss_26": 2.469817638397217, "ce_loss_39": 1.9961349010467528, "ce_loss_52": 1.4161934450268745, "ce_loss_7": 3.1843641221523287, "epoch": 0.874, "grad_norm": 19.701093771188038, "kl_loss_13": 3130.0, "kl_loss_26": 2205.0, "kl_loss_39": 1187.7, "kl_loss_7": 3684.8, "learning_rate": 3.9438173442575e-05, "loss": 4920.3, "step": 8740 }, { "ce_loss_13": 2.931247502565384, "ce_loss_26": 2.499329847097397, "ce_loss_39": 2.038593566417694, "ce_loss_52": 1.4660022050142287, "ce_loss_7": 3.1989371538162232, "epoch": 0.875, "grad_norm": 19.499459693858824, "kl_loss_13": 3039.6, "kl_loss_26": 2133.8, "kl_loss_39": 1152.2, "kl_loss_7": 3588.0, "learning_rate": 3.882285294770937e-05, "loss": 4984.7, "step": 8750 }, { "ce_loss_13": 2.903587061166763, "ce_loss_26": 2.4606124222278596, "ce_loss_39": 1.9871950060129167, "ce_loss_52": 1.3980468481779098, "ce_loss_7": 3.17354930639267, "epoch": 0.876, "grad_norm": 19.253090654259744, "kl_loss_13": 3078.4, "kl_loss_26": 2163.8, "kl_loss_39": 1181.0, "kl_loss_7": 3644.8, "learning_rate": 3.821217650159453e-05, "loss": 4982.75, "step": 8760 }, { "ce_loss_13": 2.820340207219124, "ce_loss_26": 2.3951667070388796, "ce_loss_39": 1.9451639890670775, "ce_loss_52": 1.4168721199035645, "ce_loss_7": 3.074656307697296, "epoch": 0.877, "grad_norm": 19.190959579311908, "kl_loss_13": 2910.8, "kl_loss_26": 2031.2, "kl_loss_39": 1086.2, "kl_loss_7": 3448.8, "learning_rate": 3.760615025373543e-05, "loss": 4941.25, "step": 8770 }, { "ce_loss_13": 2.9391641199588774, "ce_loss_26": 2.5044194877147676, "ce_loss_39": 2.0340330809354783, "ce_loss_52": 1.4550057530403138, "ce_loss_7": 3.200497591495514, "epoch": 0.878, "grad_norm": 19.218011314135488, "kl_loss_13": 3063.6, "kl_loss_26": 2155.8, "kl_loss_39": 1164.4, "kl_loss_7": 3604.8, "learning_rate": 3.700478030680987e-05, "loss": 4989.0, "step": 8780 }, { "ce_loss_13": 2.9181353628635405, "ce_loss_26": 2.484974616765976, "ce_loss_39": 2.0213694095611574, "ce_loss_52": 1.4473600834608078, "ce_loss_7": 3.1764037668704987, "epoch": 0.879, "grad_norm": 18.881217884773644, "kl_loss_13": 3048.4, "kl_loss_26": 2145.6, "kl_loss_39": 1162.1, "kl_loss_7": 3594.8, "learning_rate": 3.6408072716606344e-05, "loss": 4996.95, "step": 8790 }, { "ce_loss_13": 2.878078305721283, "ce_loss_26": 2.445318901538849, "ce_loss_39": 1.9787369549274445, "ce_loss_52": 1.4154168665409088, "ce_loss_7": 3.13910374045372, "epoch": 0.88, "grad_norm": 19.232153237296814, "kl_loss_13": 3023.6, "kl_loss_26": 2120.2, "kl_loss_39": 1138.8, "kl_loss_7": 3570.8, "learning_rate": 3.5816033491963716e-05, "loss": 4957.2, "step": 8800 }, { "ce_loss_13": 2.8982558727264403, "ce_loss_26": 2.4716543793678283, "ce_loss_39": 2.0096321552991867, "ce_loss_52": 1.4367865800857544, "ce_loss_7": 3.162083399295807, "epoch": 0.881, "grad_norm": 19.92275213516334, "kl_loss_13": 3003.6, "kl_loss_26": 2113.0, "kl_loss_39": 1151.0, "kl_loss_7": 3555.2, "learning_rate": 3.522866859471047e-05, "loss": 4925.7, "step": 8810 }, { "ce_loss_13": 2.9355869591236115, "ce_loss_26": 2.495421326160431, "ce_loss_39": 2.0324677735567094, "ce_loss_52": 1.4521033734083175, "ce_loss_7": 3.20084969997406, "epoch": 0.882, "grad_norm": 18.63804720981334, "kl_loss_13": 3093.2, "kl_loss_26": 2164.2, "kl_loss_39": 1179.6, "kl_loss_7": 3645.6, "learning_rate": 3.46459839396045e-05, "loss": 5007.2, "step": 8820 }, { "ce_loss_13": 2.9325197875499724, "ce_loss_26": 2.4879075407981874, "ce_loss_39": 2.0197067618370057, "ce_loss_52": 1.4270877152681352, "ce_loss_7": 3.198145306110382, "epoch": 0.883, "grad_norm": 18.241829484742485, "kl_loss_13": 3112.0, "kl_loss_26": 2188.6, "kl_loss_39": 1188.1, "kl_loss_7": 3676.4, "learning_rate": 3.406798539427386e-05, "loss": 4970.75, "step": 8830 }, { "ce_loss_13": 2.9144038438796995, "ce_loss_26": 2.490026795864105, "ce_loss_39": 2.0250850170850754, "ce_loss_52": 1.4695867449045181, "ce_loss_7": 3.171018958091736, "epoch": 0.884, "grad_norm": 19.295732101275807, "kl_loss_13": 3016.4, "kl_loss_26": 2109.6, "kl_loss_39": 1133.6, "kl_loss_7": 3556.0, "learning_rate": 3.349467877915746e-05, "loss": 4929.15, "step": 8840 }, { "ce_loss_13": 2.9367240130901338, "ce_loss_26": 2.5029721915721894, "ce_loss_39": 2.038401874899864, "ce_loss_52": 1.4602129399776458, "ce_loss_7": 3.1973277926445007, "epoch": 0.885, "grad_norm": 18.483553110198095, "kl_loss_13": 3071.2, "kl_loss_26": 2165.0, "kl_loss_39": 1179.3, "kl_loss_7": 3608.0, "learning_rate": 3.292606986744667e-05, "loss": 4997.15, "step": 8850 }, { "ce_loss_13": 2.962205785512924, "ce_loss_26": 2.5182169795036318, "ce_loss_39": 2.0426361471414567, "ce_loss_52": 1.4736472845077515, "ce_loss_7": 3.2304830133914946, "epoch": 0.886, "grad_norm": 19.501438521011444, "kl_loss_13": 3089.6, "kl_loss_26": 2164.0, "kl_loss_39": 1168.6, "kl_loss_7": 3642.4, "learning_rate": 3.23621643850267e-05, "loss": 4957.35, "step": 8860 }, { "ce_loss_13": 2.8543384969234467, "ce_loss_26": 2.4200983941555023, "ce_loss_39": 1.9575607985258103, "ce_loss_52": 1.3949070930480958, "ce_loss_7": 3.1160971879959107, "epoch": 0.887, "grad_norm": 19.320586944244614, "kl_loss_13": 3009.6, "kl_loss_26": 2108.8, "kl_loss_39": 1139.1, "kl_loss_7": 3548.0, "learning_rate": 3.180296801041971e-05, "loss": 4940.1, "step": 8870 }, { "ce_loss_13": 2.8845052778720857, "ce_loss_26": 2.456773716211319, "ce_loss_39": 1.9930354177951812, "ce_loss_52": 1.4307963967323303, "ce_loss_7": 3.144515538215637, "epoch": 0.888, "grad_norm": 19.659033917356428, "kl_loss_13": 2998.8, "kl_loss_26": 2105.2, "kl_loss_39": 1137.6, "kl_loss_7": 3543.2, "learning_rate": 3.124848637472688e-05, "loss": 4952.5, "step": 8880 }, { "ce_loss_13": 2.8944738626480104, "ce_loss_26": 2.4629203975200653, "ce_loss_39": 1.9913650721311569, "ce_loss_52": 1.4339269563555717, "ce_loss_7": 3.1572672605514525, "epoch": 0.889, "grad_norm": 18.88843635803768, "kl_loss_13": 3032.4, "kl_loss_26": 2127.8, "kl_loss_39": 1135.0, "kl_loss_7": 3579.2, "learning_rate": 3.069872506157212e-05, "loss": 4974.35, "step": 8890 }, { "ce_loss_13": 2.8339054346084596, "ce_loss_26": 2.394126781821251, "ce_loss_39": 1.9339392215013504, "ce_loss_52": 1.3971292108297348, "ce_loss_7": 3.0929621160030365, "epoch": 0.89, "grad_norm": 18.978687158228045, "kl_loss_13": 2982.0, "kl_loss_26": 2066.8, "kl_loss_39": 1100.8, "kl_loss_7": 3526.4, "learning_rate": 3.0153689607045842e-05, "loss": 4941.5, "step": 8900 }, { "ce_loss_13": 2.8811903417110445, "ce_loss_26": 2.440036287903786, "ce_loss_39": 1.975409933924675, "ce_loss_52": 1.416017021238804, "ce_loss_7": 3.1414348661899565, "epoch": 0.891, "grad_norm": 19.560639422793763, "kl_loss_13": 3040.4, "kl_loss_26": 2123.2, "kl_loss_39": 1133.6, "kl_loss_7": 3586.4, "learning_rate": 2.9613385499648926e-05, "loss": 4965.6, "step": 8910 }, { "ce_loss_13": 2.8596278965473174, "ce_loss_26": 2.4295035183429716, "ce_loss_39": 1.9728092432022095, "ce_loss_52": 1.4283979684114456, "ce_loss_7": 3.1134236633777617, "epoch": 0.892, "grad_norm": 18.908746612036932, "kl_loss_13": 2974.8, "kl_loss_26": 2082.0, "kl_loss_39": 1115.7, "kl_loss_7": 3505.2, "learning_rate": 2.9077818180237692e-05, "loss": 5007.95, "step": 8920 }, { "ce_loss_13": 2.8744696974754333, "ce_loss_26": 2.4505746215581894, "ce_loss_39": 1.9992403596639634, "ce_loss_52": 1.4506706580519677, "ce_loss_7": 3.13488364815712, "epoch": 0.893, "grad_norm": 18.957478200982788, "kl_loss_13": 2985.6, "kl_loss_26": 2094.6, "kl_loss_39": 1129.4, "kl_loss_7": 3523.6, "learning_rate": 2.8546993041969172e-05, "loss": 4940.15, "step": 8930 }, { "ce_loss_13": 2.8975345969200133, "ce_loss_26": 2.4653249740600587, "ce_loss_39": 2.0007053166627884, "ce_loss_52": 1.4314887911081313, "ce_loss_7": 3.155570811033249, "epoch": 0.894, "grad_norm": 18.739265695771255, "kl_loss_13": 3012.4, "kl_loss_26": 2117.2, "kl_loss_39": 1151.3, "kl_loss_7": 3551.6, "learning_rate": 2.802091543024671e-05, "loss": 4940.05, "step": 8940 }, { "ce_loss_13": 2.9072438359260557, "ce_loss_26": 2.471187961101532, "ce_loss_39": 2.0144962787628176, "ce_loss_52": 1.42624132335186, "ce_loss_7": 3.1694943487644194, "epoch": 0.895, "grad_norm": 19.21731826372691, "kl_loss_13": 3062.0, "kl_loss_26": 2168.4, "kl_loss_39": 1196.7, "kl_loss_7": 3618.8, "learning_rate": 2.7499590642665774e-05, "loss": 4979.0, "step": 8950 }, { "ce_loss_13": 2.893015044927597, "ce_loss_26": 2.4596860975027086, "ce_loss_39": 2.0018325716257097, "ce_loss_52": 1.4285719782114028, "ce_loss_7": 3.154858148097992, "epoch": 0.896, "grad_norm": 18.801842424478984, "kl_loss_13": 3027.6, "kl_loss_26": 2117.0, "kl_loss_39": 1153.1, "kl_loss_7": 3566.0, "learning_rate": 2.6983023928961405e-05, "loss": 4959.6, "step": 8960 }, { "ce_loss_13": 2.8546026587486266, "ce_loss_26": 2.4183767944574357, "ce_loss_39": 1.956614688038826, "ce_loss_52": 1.3939528629183768, "ce_loss_7": 3.1231652200222015, "epoch": 0.897, "grad_norm": 19.808391264092982, "kl_loss_13": 3029.2, "kl_loss_26": 2118.8, "kl_loss_39": 1139.0, "kl_loss_7": 3584.4, "learning_rate": 2.6471220490954628e-05, "loss": 4973.5, "step": 8970 }, { "ce_loss_13": 2.9009016394615172, "ce_loss_26": 2.467138040065765, "ce_loss_39": 2.0069568186998366, "ce_loss_52": 1.4654083251953125, "ce_loss_7": 3.155901938676834, "epoch": 0.898, "grad_norm": 19.041913132666583, "kl_loss_13": 2967.2, "kl_loss_26": 2069.0, "kl_loss_39": 1105.9, "kl_loss_7": 3506.0, "learning_rate": 2.596418548250029e-05, "loss": 4886.7, "step": 8980 }, { "ce_loss_13": 2.8516535699367522, "ce_loss_26": 2.426672577857971, "ce_loss_39": 1.9635347902774811, "ce_loss_52": 1.419031423330307, "ce_loss_7": 3.112126684188843, "epoch": 0.899, "grad_norm": 19.026364280537617, "kl_loss_13": 2990.8, "kl_loss_26": 2103.8, "kl_loss_39": 1123.3, "kl_loss_7": 3530.0, "learning_rate": 2.5461924009435368e-05, "loss": 4885.15, "step": 8990 }, { "ce_loss_13": 2.870776003599167, "ce_loss_26": 2.4326407968997956, "ce_loss_39": 1.9669345051050187, "ce_loss_52": 1.420183390378952, "ce_loss_7": 3.127993369102478, "epoch": 0.9, "grad_norm": 18.943171008960356, "kl_loss_13": 3002.4, "kl_loss_26": 2090.0, "kl_loss_39": 1108.9, "kl_loss_7": 3537.6, "learning_rate": 2.4964441129527336e-05, "loss": 4949.55, "step": 9000 }, { "ce_loss_13": 2.91824157834053, "ce_loss_26": 2.4817179054021836, "ce_loss_39": 2.015528929233551, "ce_loss_52": 1.434774386882782, "ce_loss_7": 3.181524306535721, "epoch": 0.901, "grad_norm": 19.717113587964203, "kl_loss_13": 3071.6, "kl_loss_26": 2176.4, "kl_loss_39": 1186.6, "kl_loss_7": 3618.4, "learning_rate": 2.4471741852423235e-05, "loss": 4970.45, "step": 9010 }, { "ce_loss_13": 2.8203544914722443, "ce_loss_26": 2.394831323623657, "ce_loss_39": 1.9447649121284485, "ce_loss_52": 1.3895054385066032, "ce_loss_7": 3.0815266370773315, "epoch": 0.902, "grad_norm": 19.201709482987514, "kl_loss_13": 2963.6, "kl_loss_26": 2077.6, "kl_loss_39": 1127.3, "kl_loss_7": 3508.4, "learning_rate": 2.3983831139599287e-05, "loss": 4939.65, "step": 9020 }, { "ce_loss_13": 2.8914650082588196, "ce_loss_26": 2.46552118062973, "ce_loss_39": 1.9927147597074508, "ce_loss_52": 1.4259023681282996, "ce_loss_7": 3.1594585537910462, "epoch": 0.903, "grad_norm": 18.327777842458563, "kl_loss_13": 3054.4, "kl_loss_26": 2152.4, "kl_loss_39": 1157.6, "kl_loss_7": 3607.6, "learning_rate": 2.3500713904311022e-05, "loss": 4963.8, "step": 9030 }, { "ce_loss_13": 2.8807626605033874, "ce_loss_26": 2.432952329516411, "ce_loss_39": 1.9567799299955368, "ce_loss_52": 1.4073475629091263, "ce_loss_7": 3.1469713926315306, "epoch": 0.904, "grad_norm": 20.180916839551323, "kl_loss_13": 3056.0, "kl_loss_26": 2131.4, "kl_loss_39": 1132.8, "kl_loss_7": 3607.6, "learning_rate": 2.3022395011543685e-05, "loss": 4930.8, "step": 9040 }, { "ce_loss_13": 2.895359253883362, "ce_loss_26": 2.4620601534843445, "ce_loss_39": 1.9888764083385468, "ce_loss_52": 1.4224872916936875, "ce_loss_7": 3.1553323328495027, "epoch": 0.905, "grad_norm": 19.515330390455173, "kl_loss_13": 3061.2, "kl_loss_26": 2157.8, "kl_loss_39": 1165.6, "kl_loss_7": 3605.6, "learning_rate": 2.2548879277963063e-05, "loss": 4965.65, "step": 9050 }, { "ce_loss_13": 2.913999766111374, "ce_loss_26": 2.4826299071311952, "ce_loss_39": 2.025396314263344, "ce_loss_52": 1.4647169053554534, "ce_loss_7": 3.1696211397647858, "epoch": 0.906, "grad_norm": 19.958942164442686, "kl_loss_13": 2998.4, "kl_loss_26": 2104.6, "kl_loss_39": 1143.0, "kl_loss_7": 3532.8, "learning_rate": 2.208017147186736e-05, "loss": 4953.35, "step": 9060 }, { "ce_loss_13": 2.931579887866974, "ce_loss_26": 2.497947371006012, "ce_loss_39": 2.0416529774665833, "ce_loss_52": 1.4668794304132462, "ce_loss_7": 3.1927560210227965, "epoch": 0.907, "grad_norm": 18.915099363205265, "kl_loss_13": 3050.8, "kl_loss_26": 2150.2, "kl_loss_39": 1170.4, "kl_loss_7": 3596.4, "learning_rate": 2.1616276313139227e-05, "loss": 4967.35, "step": 9070 }, { "ce_loss_13": 2.8379551649093626, "ce_loss_26": 2.4058648884296416, "ce_loss_39": 1.9522909700870514, "ce_loss_52": 1.401316450536251, "ce_loss_7": 3.096262776851654, "epoch": 0.908, "grad_norm": 17.939476679877323, "kl_loss_13": 2996.4, "kl_loss_26": 2103.4, "kl_loss_39": 1130.6, "kl_loss_7": 3545.6, "learning_rate": 2.1157198473197415e-05, "loss": 4983.65, "step": 9080 }, { "ce_loss_13": 2.9229146242141724, "ce_loss_26": 2.4857192397117616, "ce_loss_39": 2.014718788862228, "ce_loss_52": 1.4371613681316375, "ce_loss_7": 3.192232495546341, "epoch": 0.909, "grad_norm": 19.195813577565207, "kl_loss_13": 3088.8, "kl_loss_26": 2172.6, "kl_loss_39": 1177.9, "kl_loss_7": 3649.2, "learning_rate": 2.0702942574950812e-05, "loss": 4961.5, "step": 9090 }, { "ce_loss_13": 2.9034866452217103, "ce_loss_26": 2.4769316017627716, "ce_loss_39": 2.006492680311203, "ce_loss_52": 1.4311101764440537, "ce_loss_7": 3.164641487598419, "epoch": 0.91, "grad_norm": 18.881356963654913, "kl_loss_13": 3059.6, "kl_loss_26": 2172.6, "kl_loss_39": 1174.0, "kl_loss_7": 3598.4, "learning_rate": 2.025351319275137e-05, "loss": 4952.9, "step": 9100 }, { "ce_loss_13": 2.915982037782669, "ce_loss_26": 2.4739193379879, "ce_loss_39": 2.002902591228485, "ce_loss_52": 1.4319583177566528, "ce_loss_7": 3.17461501955986, "epoch": 0.911, "grad_norm": 18.844750958897702, "kl_loss_13": 3071.6, "kl_loss_26": 2158.2, "kl_loss_39": 1169.6, "kl_loss_7": 3613.6, "learning_rate": 1.9808914852347816e-05, "loss": 4969.1, "step": 9110 }, { "ce_loss_13": 2.9571076393127442, "ce_loss_26": 2.5076956033706663, "ce_loss_39": 2.0347861379384993, "ce_loss_52": 1.4524286478757857, "ce_loss_7": 3.2245921969413756, "epoch": 0.912, "grad_norm": 18.516316183492112, "kl_loss_13": 3096.4, "kl_loss_26": 2170.0, "kl_loss_39": 1183.2, "kl_loss_7": 3656.0, "learning_rate": 1.9369152030840554e-05, "loss": 4969.7, "step": 9120 }, { "ce_loss_13": 2.8493692874908447, "ce_loss_26": 2.4184423595666886, "ce_loss_39": 1.9636689513921737, "ce_loss_52": 1.4161925345659256, "ce_loss_7": 3.1067621290683745, "epoch": 0.913, "grad_norm": 19.557541417790066, "kl_loss_13": 2969.6, "kl_loss_26": 2073.0, "kl_loss_39": 1110.7, "kl_loss_7": 3508.8, "learning_rate": 1.893422915663645e-05, "loss": 4967.05, "step": 9130 }, { "ce_loss_13": 2.950014758110046, "ce_loss_26": 2.5281428694725037, "ce_loss_39": 2.0640300661325455, "ce_loss_52": 1.4911428451538087, "ce_loss_7": 3.20717169046402, "epoch": 0.914, "grad_norm": 19.126297962103095, "kl_loss_13": 3062.4, "kl_loss_26": 2166.6, "kl_loss_39": 1184.2, "kl_loss_7": 3597.2, "learning_rate": 1.850415060940386e-05, "loss": 4910.65, "step": 9140 }, { "ce_loss_13": 2.9050391018390656, "ce_loss_26": 2.4736091554164887, "ce_loss_39": 2.017327818274498, "ce_loss_52": 1.4512428998947144, "ce_loss_7": 3.162110447883606, "epoch": 0.915, "grad_norm": 18.914613896159423, "kl_loss_13": 3018.0, "kl_loss_26": 2122.6, "kl_loss_39": 1153.5, "kl_loss_7": 3547.6, "learning_rate": 1.8078920720028978e-05, "loss": 4898.6, "step": 9150 }, { "ce_loss_13": 2.8679368257522584, "ce_loss_26": 2.4424335032701494, "ce_loss_39": 1.9891292452812195, "ce_loss_52": 1.4451990023255348, "ce_loss_7": 3.127324694395065, "epoch": 0.916, "grad_norm": 20.019541856401887, "kl_loss_13": 2959.2, "kl_loss_26": 2064.4, "kl_loss_39": 1098.8, "kl_loss_7": 3496.8, "learning_rate": 1.765854377057219e-05, "loss": 4940.15, "step": 9160 }, { "ce_loss_13": 2.873015010356903, "ce_loss_26": 2.433028203248978, "ce_loss_39": 1.9746823519468308, "ce_loss_52": 1.407148177921772, "ce_loss_7": 3.1320842862129212, "epoch": 0.917, "grad_norm": 18.323389911529343, "kl_loss_13": 3046.8, "kl_loss_26": 2143.6, "kl_loss_39": 1161.3, "kl_loss_7": 3586.0, "learning_rate": 1.724302399422456e-05, "loss": 4937.75, "step": 9170 }, { "ce_loss_13": 2.864003378152847, "ce_loss_26": 2.4368703365325928, "ce_loss_39": 1.978201287984848, "ce_loss_52": 1.4284173011779786, "ce_loss_7": 3.117653822898865, "epoch": 0.918, "grad_norm": 19.851903614612837, "kl_loss_13": 2960.4, "kl_loss_26": 2069.6, "kl_loss_39": 1110.5, "kl_loss_7": 3494.8, "learning_rate": 1.683236557526574e-05, "loss": 4948.85, "step": 9180 }, { "ce_loss_13": 2.841529107093811, "ce_loss_26": 2.4092674642801284, "ce_loss_39": 1.9591031044721603, "ce_loss_52": 1.4043558463454247, "ce_loss_7": 3.099020904302597, "epoch": 0.919, "grad_norm": 18.98858399937095, "kl_loss_13": 2971.2, "kl_loss_26": 2076.4, "kl_loss_39": 1122.0, "kl_loss_7": 3508.0, "learning_rate": 1.6426572649021475e-05, "loss": 4944.1, "step": 9190 }, { "ce_loss_13": 2.902883565425873, "ce_loss_26": 2.4624376207590104, "ce_loss_39": 2.0033867925405504, "ce_loss_52": 1.4482421904802323, "ce_loss_7": 3.1540717780590057, "epoch": 0.92, "grad_norm": 19.524164167367193, "kl_loss_13": 3016.4, "kl_loss_26": 2114.8, "kl_loss_39": 1136.7, "kl_loss_7": 3542.4, "learning_rate": 1.6025649301821876e-05, "loss": 4936.95, "step": 9200 }, { "ce_loss_13": 2.962231194972992, "ce_loss_26": 2.5141273856163027, "ce_loss_39": 2.038061347603798, "ce_loss_52": 1.4573458433151245, "ce_loss_7": 3.22137930393219, "epoch": 0.921, "grad_norm": 19.07781770056793, "kl_loss_13": 3091.6, "kl_loss_26": 2174.2, "kl_loss_39": 1179.6, "kl_loss_7": 3638.0, "learning_rate": 1.5629599570960716e-05, "loss": 4931.05, "step": 9210 }, { "ce_loss_13": 2.828860414028168, "ce_loss_26": 2.394576147198677, "ce_loss_39": 1.940834417939186, "ce_loss_52": 1.396960550546646, "ce_loss_7": 3.0943815410137177, "epoch": 0.922, "grad_norm": 18.68562598066032, "kl_loss_13": 2986.4, "kl_loss_26": 2085.0, "kl_loss_39": 1113.6, "kl_loss_7": 3535.2, "learning_rate": 1.5238427444654367e-05, "loss": 4919.35, "step": 9220 }, { "ce_loss_13": 2.854993385076523, "ce_loss_26": 2.4067456245422365, "ce_loss_39": 1.9481880724430085, "ce_loss_52": 1.392129084467888, "ce_loss_7": 3.119863528013229, "epoch": 0.923, "grad_norm": 19.56628173058375, "kl_loss_13": 3048.8, "kl_loss_26": 2126.2, "kl_loss_39": 1141.3, "kl_loss_7": 3609.6, "learning_rate": 1.4852136862001764e-05, "loss": 4956.25, "step": 9230 }, { "ce_loss_13": 2.8672266066074372, "ce_loss_26": 2.428171756863594, "ce_loss_39": 1.967655423283577, "ce_loss_52": 1.4228445023298264, "ce_loss_7": 3.12925271987915, "epoch": 0.924, "grad_norm": 18.655136558750065, "kl_loss_13": 3020.8, "kl_loss_26": 2114.0, "kl_loss_39": 1134.6, "kl_loss_7": 3562.4, "learning_rate": 1.4470731712944884e-05, "loss": 4914.5, "step": 9240 }, { "ce_loss_13": 2.967056131362915, "ce_loss_26": 2.527100908756256, "ce_loss_39": 2.0592952966690063, "ce_loss_52": 1.466832235455513, "ce_loss_7": 3.2317879140377044, "epoch": 0.925, "grad_norm": 18.755830587214348, "kl_loss_13": 3074.0, "kl_loss_26": 2172.6, "kl_loss_39": 1194.2, "kl_loss_7": 3633.2, "learning_rate": 1.4094215838229174e-05, "loss": 4941.0, "step": 9250 }, { "ce_loss_13": 2.8956347942352294, "ce_loss_26": 2.4609649628400803, "ce_loss_39": 1.998116421699524, "ce_loss_52": 1.4327284812927246, "ce_loss_7": 3.1544252693653108, "epoch": 0.926, "grad_norm": 19.440875104184542, "kl_loss_13": 3037.6, "kl_loss_26": 2133.0, "kl_loss_39": 1149.6, "kl_loss_7": 3582.4, "learning_rate": 1.372259302936546e-05, "loss": 4929.25, "step": 9260 }, { "ce_loss_13": 2.818482467532158, "ce_loss_26": 2.3888671875, "ce_loss_39": 1.9417572438716888, "ce_loss_52": 1.3873827829957008, "ce_loss_7": 3.0732292413711546, "epoch": 0.927, "grad_norm": 19.09848340283336, "kl_loss_13": 2988.4, "kl_loss_26": 2096.8, "kl_loss_39": 1136.6, "kl_loss_7": 3519.2, "learning_rate": 1.3355867028591206e-05, "loss": 4917.85, "step": 9270 }, { "ce_loss_13": 2.8812867999076843, "ce_loss_26": 2.445907565951347, "ce_loss_39": 1.9824917227029801, "ce_loss_52": 1.4204061418771743, "ce_loss_7": 3.1463906168937683, "epoch": 0.928, "grad_norm": 19.73371377973639, "kl_loss_13": 3015.6, "kl_loss_26": 2109.8, "kl_loss_39": 1132.3, "kl_loss_7": 3565.2, "learning_rate": 1.2994041528833267e-05, "loss": 4914.15, "step": 9280 }, { "ce_loss_13": 2.989528793096542, "ce_loss_26": 2.545223152637482, "ce_loss_39": 2.0668440997600555, "ce_loss_52": 1.4640702456235886, "ce_loss_7": 3.2542518198490145, "epoch": 0.929, "grad_norm": 18.497071159749588, "kl_loss_13": 3146.0, "kl_loss_26": 2234.4, "kl_loss_39": 1221.2, "kl_loss_7": 3704.8, "learning_rate": 1.2637120173670358e-05, "loss": 4971.25, "step": 9290 }, { "ce_loss_13": 2.9433493435382845, "ce_loss_26": 2.503718575835228, "ce_loss_39": 2.029115191102028, "ce_loss_52": 1.4293665170669556, "ce_loss_7": 3.21596360206604, "epoch": 0.93, "grad_norm": 19.233646690177977, "kl_loss_13": 3119.2, "kl_loss_26": 2210.0, "kl_loss_39": 1209.9, "kl_loss_7": 3688.8, "learning_rate": 1.2285106557296478e-05, "loss": 4970.8, "step": 9300 }, { "ce_loss_13": 2.8525869846343994, "ce_loss_26": 2.4185830265283585, "ce_loss_39": 1.9529170453548432, "ce_loss_52": 1.4022331610321999, "ce_loss_7": 3.112831687927246, "epoch": 0.931, "grad_norm": 19.01919083076588, "kl_loss_13": 3012.8, "kl_loss_26": 2100.6, "kl_loss_39": 1120.5, "kl_loss_7": 3542.0, "learning_rate": 1.1938004224484989e-05, "loss": 4934.7, "step": 9310 }, { "ce_loss_13": 2.9074361979961396, "ce_loss_26": 2.477645492553711, "ce_loss_39": 2.010756382346153, "ce_loss_52": 1.4430534109473228, "ce_loss_7": 3.1718304812908173, "epoch": 0.932, "grad_norm": 18.572431056907458, "kl_loss_13": 3020.0, "kl_loss_26": 2116.0, "kl_loss_39": 1144.2, "kl_loss_7": 3575.6, "learning_rate": 1.1595816670552429e-05, "loss": 4913.95, "step": 9320 }, { "ce_loss_13": 2.8636857986450197, "ce_loss_26": 2.424144572019577, "ce_loss_39": 1.9600117355585098, "ce_loss_52": 1.402983972430229, "ce_loss_7": 3.1251452922821046, "epoch": 0.933, "grad_norm": 18.288942605726792, "kl_loss_13": 3044.0, "kl_loss_26": 2129.6, "kl_loss_39": 1139.2, "kl_loss_7": 3581.6, "learning_rate": 1.1258547341323699e-05, "loss": 4937.25, "step": 9330 }, { "ce_loss_13": 2.893140608072281, "ce_loss_26": 2.457077306509018, "ce_loss_39": 2.003238731622696, "ce_loss_52": 1.4437968581914902, "ce_loss_7": 3.1531366109848022, "epoch": 0.934, "grad_norm": 18.739319955640763, "kl_loss_13": 3019.6, "kl_loss_26": 2115.0, "kl_loss_39": 1131.3, "kl_loss_7": 3558.4, "learning_rate": 1.0926199633097156e-05, "loss": 4899.9, "step": 9340 }, { "ce_loss_13": 2.9001421511173247, "ce_loss_26": 2.4687224984169007, "ce_loss_39": 2.0006180971860887, "ce_loss_52": 1.4220335900783538, "ce_loss_7": 3.1691121637821196, "epoch": 0.935, "grad_norm": 19.392869535691936, "kl_loss_13": 3054.0, "kl_loss_26": 2149.8, "kl_loss_39": 1176.1, "kl_loss_7": 3619.2, "learning_rate": 1.0598776892610684e-05, "loss": 4922.25, "step": 9350 }, { "ce_loss_13": 2.953709363937378, "ce_loss_26": 2.5250791788101195, "ce_loss_39": 2.0616777926683425, "ce_loss_52": 1.5004188895225525, "ce_loss_7": 3.2059156119823458, "epoch": 0.936, "grad_norm": 18.98607482508187, "kl_loss_13": 3007.2, "kl_loss_26": 2107.0, "kl_loss_39": 1138.6, "kl_loss_7": 3536.0, "learning_rate": 1.0276282417007399e-05, "loss": 4935.75, "step": 9360 }, { "ce_loss_13": 2.902603155374527, "ce_loss_26": 2.464896833896637, "ce_loss_39": 2.0031634330749513, "ce_loss_52": 1.4478828191757203, "ce_loss_7": 3.166324245929718, "epoch": 0.937, "grad_norm": 18.72231921789515, "kl_loss_13": 3025.6, "kl_loss_26": 2120.4, "kl_loss_39": 1137.7, "kl_loss_7": 3574.4, "learning_rate": 9.958719453803277e-06, "loss": 4933.2, "step": 9370 }, { "ce_loss_13": 2.878055286407471, "ce_loss_26": 2.4367445170879365, "ce_loss_39": 1.9698922991752625, "ce_loss_52": 1.40206458568573, "ce_loss_7": 3.1407361745834352, "epoch": 0.938, "grad_norm": 19.520797823561637, "kl_loss_13": 3045.6, "kl_loss_26": 2130.6, "kl_loss_39": 1145.7, "kl_loss_7": 3591.6, "learning_rate": 9.646091200853802e-06, "loss": 4932.45, "step": 9380 }, { "ce_loss_13": 2.8573631644248962, "ce_loss_26": 2.429997554421425, "ce_loss_39": 1.9779304087162017, "ce_loss_52": 1.4321624323725701, "ce_loss_7": 3.119151920080185, "epoch": 0.939, "grad_norm": 18.61104788500602, "kl_loss_13": 2968.4, "kl_loss_26": 2075.6, "kl_loss_39": 1113.7, "kl_loss_7": 3509.6, "learning_rate": 9.338400806321978e-06, "loss": 4899.9, "step": 9390 }, { "ce_loss_13": 2.8828431129455567, "ce_loss_26": 2.4453956365585325, "ce_loss_39": 1.986677783727646, "ce_loss_52": 1.4324709355831147, "ce_loss_7": 3.1462887287139893, "epoch": 0.94, "grad_norm": 18.660409146960177, "kl_loss_13": 3006.4, "kl_loss_26": 2102.2, "kl_loss_39": 1130.9, "kl_loss_7": 3551.2, "learning_rate": 9.035651368646646e-06, "loss": 4963.1, "step": 9400 }, { "ce_loss_13": 2.856483778357506, "ce_loss_26": 2.426860272884369, "ce_loss_39": 1.9708759590983391, "ce_loss_52": 1.4115710154175758, "ce_loss_7": 3.114782178401947, "epoch": 0.941, "grad_norm": 19.55117077640538, "kl_loss_13": 2986.0, "kl_loss_26": 2096.2, "kl_loss_39": 1131.2, "kl_loss_7": 3526.4, "learning_rate": 8.737845936511335e-06, "loss": 4960.75, "step": 9410 }, { "ce_loss_13": 2.894274836778641, "ce_loss_26": 2.454681032896042, "ce_loss_39": 1.9826824754476546, "ce_loss_52": 1.4298861980438233, "ce_loss_7": 3.15040722489357, "epoch": 0.942, "grad_norm": 19.039583654377346, "kl_loss_13": 3067.6, "kl_loss_26": 2152.8, "kl_loss_39": 1152.4, "kl_loss_7": 3608.4, "learning_rate": 8.444987508813451e-06, "loss": 4899.6, "step": 9420 }, { "ce_loss_13": 2.9001412212848665, "ce_loss_26": 2.4617854237556456, "ce_loss_39": 1.999165838956833, "ce_loss_52": 1.4294554442167282, "ce_loss_7": 3.165439170598984, "epoch": 0.943, "grad_norm": 18.564983933864266, "kl_loss_13": 3046.0, "kl_loss_26": 2136.6, "kl_loss_39": 1159.1, "kl_loss_7": 3592.8, "learning_rate": 8.157079034633974e-06, "loss": 4920.3, "step": 9430 }, { "ce_loss_13": 2.863955610990524, "ce_loss_26": 2.435519364476204, "ce_loss_39": 1.9886516004800796, "ce_loss_52": 1.4344154298305511, "ce_loss_7": 3.1265052914619447, "epoch": 0.944, "grad_norm": 17.82647647549486, "kl_loss_13": 2962.4, "kl_loss_26": 2073.6, "kl_loss_39": 1123.1, "kl_loss_7": 3508.4, "learning_rate": 7.874123413208145e-06, "loss": 4921.7, "step": 9440 }, { "ce_loss_13": 2.8527204990386963, "ce_loss_26": 2.418528434634209, "ce_loss_39": 1.960913023352623, "ce_loss_52": 1.4082367643713951, "ce_loss_7": 3.118280106782913, "epoch": 0.945, "grad_norm": 17.642678200140654, "kl_loss_13": 3000.8, "kl_loss_26": 2093.6, "kl_loss_39": 1127.9, "kl_loss_7": 3547.2, "learning_rate": 7.59612349389599e-06, "loss": 4941.9, "step": 9450 }, { "ce_loss_13": 2.8983235955238342, "ce_loss_26": 2.4708085656166077, "ce_loss_39": 2.01363542675972, "ce_loss_52": 1.4459212511777877, "ce_loss_7": 3.1573162257671354, "epoch": 0.946, "grad_norm": 18.21137845155402, "kl_loss_13": 3012.8, "kl_loss_26": 2129.0, "kl_loss_39": 1159.6, "kl_loss_7": 3550.0, "learning_rate": 7.323082076153509e-06, "loss": 4932.45, "step": 9460 }, { "ce_loss_13": 2.8793884813785553, "ce_loss_26": 2.444310560822487, "ce_loss_39": 1.9878242909908295, "ce_loss_52": 1.4219153225421906, "ce_loss_7": 3.1359946370124816, "epoch": 0.947, "grad_norm": 19.11147526952516, "kl_loss_13": 3000.4, "kl_loss_26": 2106.2, "kl_loss_39": 1141.4, "kl_loss_7": 3539.6, "learning_rate": 7.055001909504755e-06, "loss": 4932.95, "step": 9470 }, { "ce_loss_13": 2.8483738005161285, "ce_loss_26": 2.4169380724430085, "ce_loss_39": 1.9552814781665802, "ce_loss_52": 1.4036450207233429, "ce_loss_7": 3.1039236187934875, "epoch": 0.948, "grad_norm": 19.227610169601164, "kl_loss_13": 3000.4, "kl_loss_26": 2102.0, "kl_loss_39": 1125.6, "kl_loss_7": 3530.0, "learning_rate": 6.791885693514133e-06, "loss": 4941.55, "step": 9480 }, { "ce_loss_13": 2.8693545699119567, "ce_loss_26": 2.4362709283828736, "ce_loss_39": 1.9619301795959472, "ce_loss_52": 1.400461108982563, "ce_loss_7": 3.133023035526276, "epoch": 0.949, "grad_norm": 19.323995399615697, "kl_loss_13": 3058.8, "kl_loss_26": 2146.6, "kl_loss_39": 1149.1, "kl_loss_7": 3608.8, "learning_rate": 6.533736077758867e-06, "loss": 4986.35, "step": 9490 }, { "ce_loss_13": 2.8667274117469788, "ce_loss_26": 2.4240807622671126, "ce_loss_39": 1.9586560875177383, "ce_loss_52": 1.3980020493268968, "ce_loss_7": 3.127706527709961, "epoch": 0.95, "grad_norm": 18.253118734633716, "kl_loss_13": 3033.6, "kl_loss_26": 2126.0, "kl_loss_39": 1140.7, "kl_loss_7": 3581.6, "learning_rate": 6.2805556618028556e-06, "loss": 4971.65, "step": 9500 }, { "ce_loss_13": 2.9265355467796326, "ce_loss_26": 2.4994624704122543, "ce_loss_39": 2.03882916867733, "ce_loss_52": 1.4773303151130677, "ce_loss_7": 3.1838342785835265, "epoch": 0.951, "grad_norm": 19.482478782354722, "kl_loss_13": 2998.4, "kl_loss_26": 2103.6, "kl_loss_39": 1144.8, "kl_loss_7": 3540.4, "learning_rate": 6.032346995169968e-06, "loss": 4951.7, "step": 9510 }, { "ce_loss_13": 2.9545272469520567, "ce_loss_26": 2.5311076641082764, "ce_loss_39": 2.068070963025093, "ce_loss_52": 1.4843237161636353, "ce_loss_7": 3.2110206544399262, "epoch": 0.952, "grad_norm": 19.225083219290383, "kl_loss_13": 3055.2, "kl_loss_26": 2164.8, "kl_loss_39": 1188.4, "kl_loss_7": 3590.4, "learning_rate": 5.789112577318789e-06, "loss": 4961.65, "step": 9520 }, { "ce_loss_13": 2.852985817193985, "ce_loss_26": 2.4146564304828644, "ce_loss_39": 1.961009207367897, "ce_loss_52": 1.3947103202342988, "ce_loss_7": 3.1259153723716735, "epoch": 0.953, "grad_norm": 18.155555980380427, "kl_loss_13": 3021.6, "kl_loss_26": 2118.2, "kl_loss_39": 1157.4, "kl_loss_7": 3575.6, "learning_rate": 5.550854857617194e-06, "loss": 4909.2, "step": 9530 }, { "ce_loss_13": 2.8418005287647246, "ce_loss_26": 2.4128061681985855, "ce_loss_39": 1.9482584029436112, "ce_loss_52": 1.3925445035099984, "ce_loss_7": 3.1012724101543427, "epoch": 0.954, "grad_norm": 18.797933923537936, "kl_loss_13": 3018.8, "kl_loss_26": 2116.8, "kl_loss_39": 1135.1, "kl_loss_7": 3566.4, "learning_rate": 5.317576235317756e-06, "loss": 4951.35, "step": 9540 }, { "ce_loss_13": 2.9137533485889433, "ce_loss_26": 2.479102221131325, "ce_loss_39": 2.011527943611145, "ce_loss_52": 1.4640387833118438, "ce_loss_7": 3.16554337143898, "epoch": 0.955, "grad_norm": 18.308431062302134, "kl_loss_13": 3004.0, "kl_loss_26": 2104.0, "kl_loss_39": 1124.5, "kl_loss_7": 3534.4, "learning_rate": 5.089279059533658e-06, "loss": 4893.9, "step": 9550 }, { "ce_loss_13": 2.9561933636665345, "ce_loss_26": 2.520014223456383, "ce_loss_39": 2.044199249148369, "ce_loss_52": 1.4634439080953598, "ce_loss_7": 3.2146646201610567, "epoch": 0.956, "grad_norm": 19.011558845630333, "kl_loss_13": 3107.6, "kl_loss_26": 2205.4, "kl_loss_39": 1192.1, "kl_loss_7": 3652.4, "learning_rate": 4.865965629214819e-06, "loss": 4928.6, "step": 9560 }, { "ce_loss_13": 2.9264722049236296, "ce_loss_26": 2.500628116726875, "ce_loss_39": 2.0424467980861665, "ce_loss_52": 1.4656882539391518, "ce_loss_7": 3.1882854044437408, "epoch": 0.957, "grad_norm": 19.491005801371585, "kl_loss_13": 3028.0, "kl_loss_26": 2138.6, "kl_loss_39": 1174.2, "kl_loss_7": 3566.0, "learning_rate": 4.6476381931251366e-06, "loss": 4947.75, "step": 9570 }, { "ce_loss_13": 2.8677931249141695, "ce_loss_26": 2.432820278406143, "ce_loss_39": 1.970087245106697, "ce_loss_52": 1.4129542678594589, "ce_loss_7": 3.129103422164917, "epoch": 0.958, "grad_norm": 18.903925554756427, "kl_loss_13": 2994.0, "kl_loss_26": 2096.0, "kl_loss_39": 1120.2, "kl_loss_7": 3546.4, "learning_rate": 4.434298949819449e-06, "loss": 4918.2, "step": 9580 }, { "ce_loss_13": 2.894206315279007, "ce_loss_26": 2.4608440458774568, "ce_loss_39": 2.00639765560627, "ce_loss_52": 1.4453970074653626, "ce_loss_7": 3.151961898803711, "epoch": 0.959, "grad_norm": 17.742534881377313, "kl_loss_13": 2993.6, "kl_loss_26": 2091.6, "kl_loss_39": 1135.9, "kl_loss_7": 3542.0, "learning_rate": 4.2259500476214406e-06, "loss": 4904.1, "step": 9590 }, { "ce_loss_13": 2.907763344049454, "ce_loss_26": 2.4721481442451476, "ce_loss_39": 2.013452297449112, "ce_loss_52": 1.4463645279407502, "ce_loss_7": 3.1700760960578918, "epoch": 0.96, "grad_norm": 18.762064601939382, "kl_loss_13": 3033.2, "kl_loss_26": 2127.6, "kl_loss_39": 1154.1, "kl_loss_7": 3576.8, "learning_rate": 4.02259358460233e-06, "loss": 4944.15, "step": 9600 }, { "ce_loss_13": 2.9404530614614486, "ce_loss_26": 2.5118053376674654, "ce_loss_39": 2.045960560441017, "ce_loss_52": 1.4736278399825096, "ce_loss_7": 3.199742293357849, "epoch": 0.961, "grad_norm": 19.091693827270714, "kl_loss_13": 3049.6, "kl_loss_26": 2150.4, "kl_loss_39": 1165.5, "kl_loss_7": 3590.8, "learning_rate": 3.8242316085594916e-06, "loss": 4931.75, "step": 9610 }, { "ce_loss_13": 2.8952401757240294, "ce_loss_26": 2.4520116090774535, "ce_loss_39": 1.9771205306053161, "ce_loss_52": 1.3961644172668457, "ce_loss_7": 3.1528802454471587, "epoch": 0.962, "grad_norm": 18.822596918413492, "kl_loss_13": 3097.6, "kl_loss_26": 2171.4, "kl_loss_39": 1176.6, "kl_loss_7": 3631.6, "learning_rate": 3.630866116995757e-06, "loss": 4991.65, "step": 9620 }, { "ce_loss_13": 2.848733913898468, "ce_loss_26": 2.4173508852720262, "ce_loss_39": 1.9629988223314285, "ce_loss_52": 1.4227147445082664, "ce_loss_7": 3.105393874645233, "epoch": 0.963, "grad_norm": 18.772124078001035, "kl_loss_13": 2951.2, "kl_loss_26": 2061.0, "kl_loss_39": 1101.5, "kl_loss_7": 3491.2, "learning_rate": 3.4424990570994797e-06, "loss": 4903.15, "step": 9630 }, { "ce_loss_13": 2.9066348552703856, "ce_loss_26": 2.469021773338318, "ce_loss_39": 2.010333400964737, "ce_loss_52": 1.4492767244577407, "ce_loss_7": 3.1746467888355254, "epoch": 0.964, "grad_norm": 19.32775364197132, "kl_loss_13": 3013.6, "kl_loss_26": 2110.8, "kl_loss_39": 1135.7, "kl_loss_7": 3570.0, "learning_rate": 3.2591323257248896e-06, "loss": 4939.25, "step": 9640 }, { "ce_loss_13": 2.9041188657283783, "ce_loss_26": 2.471466612815857, "ce_loss_39": 2.0076118439435957, "ce_loss_52": 1.4575997084379195, "ce_loss_7": 3.156245505809784, "epoch": 0.965, "grad_norm": 18.772007331370325, "kl_loss_13": 3016.0, "kl_loss_26": 2111.8, "kl_loss_39": 1138.9, "kl_loss_7": 3551.6, "learning_rate": 3.0807677693729385e-06, "loss": 4953.0, "step": 9650 }, { "ce_loss_13": 2.9185379564762117, "ce_loss_26": 2.4852662444114686, "ce_loss_39": 2.0213693618774413, "ce_loss_52": 1.4567248612642287, "ce_loss_7": 3.1788457691669465, "epoch": 0.966, "grad_norm": 19.301754151350856, "kl_loss_13": 3049.2, "kl_loss_26": 2142.4, "kl_loss_39": 1165.5, "kl_loss_7": 3589.2, "learning_rate": 2.9074071841727055e-06, "loss": 4966.3, "step": 9660 }, { "ce_loss_13": 2.856596076488495, "ce_loss_26": 2.4228154510259627, "ce_loss_39": 1.966923463344574, "ce_loss_52": 1.3987573131918907, "ce_loss_7": 3.112848150730133, "epoch": 0.967, "grad_norm": 18.636169987835014, "kl_loss_13": 3015.2, "kl_loss_26": 2124.8, "kl_loss_39": 1150.1, "kl_loss_7": 3558.0, "learning_rate": 2.739052315863355e-06, "loss": 4944.85, "step": 9670 }, { "ce_loss_13": 2.946157419681549, "ce_loss_26": 2.506874307990074, "ce_loss_39": 2.0385408878326414, "ce_loss_52": 1.4472137212753295, "ce_loss_7": 3.2151435017585754, "epoch": 0.968, "grad_norm": 19.361220180181423, "kl_loss_13": 3105.6, "kl_loss_26": 2193.0, "kl_loss_39": 1200.9, "kl_loss_7": 3669.2, "learning_rate": 2.5757048597765396e-06, "loss": 4938.1, "step": 9680 }, { "ce_loss_13": 2.838688534498215, "ce_loss_26": 2.408904367685318, "ce_loss_39": 1.9514323592185974, "ce_loss_52": 1.4050966590642928, "ce_loss_7": 3.102074921131134, "epoch": 0.969, "grad_norm": 18.982791691406838, "kl_loss_13": 2984.8, "kl_loss_26": 2086.6, "kl_loss_39": 1106.6, "kl_loss_7": 3533.2, "learning_rate": 2.417366460819359e-06, "loss": 4918.15, "step": 9690 }, { "ce_loss_13": 2.880851173400879, "ce_loss_26": 2.447329577803612, "ce_loss_39": 1.989661106467247, "ce_loss_52": 1.4333824023604393, "ce_loss_7": 3.137517309188843, "epoch": 0.97, "grad_norm": 19.196819142959395, "kl_loss_13": 2988.0, "kl_loss_26": 2092.4, "kl_loss_39": 1128.9, "kl_loss_7": 3524.4, "learning_rate": 2.2640387134577057e-06, "loss": 4938.15, "step": 9700 }, { "ce_loss_13": 2.8625703275203707, "ce_loss_26": 2.4340526342391966, "ce_loss_39": 1.968525806069374, "ce_loss_52": 1.4236899584531784, "ce_loss_7": 3.112631046772003, "epoch": 0.971, "grad_norm": 19.493522870524444, "kl_loss_13": 2981.2, "kl_loss_26": 2091.6, "kl_loss_39": 1109.1, "kl_loss_7": 3508.4, "learning_rate": 2.115723161700278e-06, "loss": 4978.3, "step": 9710 }, { "ce_loss_13": 2.930490869283676, "ce_loss_26": 2.495130881667137, "ce_loss_39": 2.030032703280449, "ce_loss_52": 1.4442616790533065, "ce_loss_7": 3.190455746650696, "epoch": 0.972, "grad_norm": 18.231386237261873, "kl_loss_13": 3083.2, "kl_loss_26": 2182.4, "kl_loss_39": 1200.2, "kl_loss_7": 3630.4, "learning_rate": 1.9724212990830937e-06, "loss": 4917.25, "step": 9720 }, { "ce_loss_13": 2.8745281517505648, "ce_loss_26": 2.450575265288353, "ce_loss_39": 1.9936909019947051, "ce_loss_52": 1.4346210777759552, "ce_loss_7": 3.1411093890666963, "epoch": 0.973, "grad_norm": 17.9155258115958, "kl_loss_13": 2990.4, "kl_loss_26": 2104.0, "kl_loss_39": 1135.2, "kl_loss_7": 3546.0, "learning_rate": 1.8341345686543331e-06, "loss": 4907.2, "step": 9730 }, { "ce_loss_13": 2.950432300567627, "ce_loss_26": 2.5173233568668367, "ce_loss_39": 2.05553839802742, "ce_loss_52": 1.5059631228446961, "ce_loss_7": 3.203335565328598, "epoch": 0.974, "grad_norm": 18.692267522295538, "kl_loss_13": 2994.0, "kl_loss_26": 2099.8, "kl_loss_39": 1118.3, "kl_loss_7": 3526.0, "learning_rate": 1.7008643629596864e-06, "loss": 4975.3, "step": 9740 }, { "ce_loss_13": 2.938932454586029, "ce_loss_26": 2.4943090945482256, "ce_loss_39": 2.030773627758026, "ce_loss_52": 1.4614870190620421, "ce_loss_7": 3.203068423271179, "epoch": 0.975, "grad_norm": 19.21025690602488, "kl_loss_13": 3068.4, "kl_loss_26": 2141.2, "kl_loss_39": 1151.8, "kl_loss_7": 3617.6, "learning_rate": 1.5726120240288633e-06, "loss": 4916.8, "step": 9750 }, { "ce_loss_13": 2.9820500314235687, "ce_loss_26": 2.5365146696567535, "ce_loss_39": 2.0661711603403092, "ce_loss_52": 1.471569898724556, "ce_loss_7": 3.2498775362968444, "epoch": 0.976, "grad_norm": 18.6163174066225, "kl_loss_13": 3112.0, "kl_loss_26": 2196.6, "kl_loss_39": 1202.4, "kl_loss_7": 3666.4, "learning_rate": 1.4493788433612708e-06, "loss": 4925.3, "step": 9760 }, { "ce_loss_13": 2.8684714436531067, "ce_loss_26": 2.433712217211723, "ce_loss_39": 1.979764473438263, "ce_loss_52": 1.4259307652711868, "ce_loss_7": 3.1243775844573975, "epoch": 0.977, "grad_norm": 18.645711029415455, "kl_loss_13": 2991.6, "kl_loss_26": 2090.8, "kl_loss_39": 1122.5, "kl_loss_7": 3526.8, "learning_rate": 1.3311660619138578e-06, "loss": 4899.9, "step": 9770 }, { "ce_loss_13": 2.875075614452362, "ce_loss_26": 2.4274426341056823, "ce_loss_39": 1.9597632795572282, "ce_loss_52": 1.3981771111488341, "ce_loss_7": 3.132591074705124, "epoch": 0.978, "grad_norm": 19.101397556379275, "kl_loss_13": 3053.6, "kl_loss_26": 2133.6, "kl_loss_39": 1148.0, "kl_loss_7": 3594.4, "learning_rate": 1.2179748700879012e-06, "loss": 4922.55, "step": 9780 }, { "ce_loss_13": 2.8309387296438215, "ce_loss_26": 2.4053177654743196, "ce_loss_39": 1.9509627014398574, "ce_loss_52": 1.397429385781288, "ce_loss_7": 3.098381590843201, "epoch": 0.979, "grad_norm": 18.730022448398557, "kl_loss_13": 2994.8, "kl_loss_26": 2100.6, "kl_loss_39": 1129.3, "kl_loss_7": 3549.6, "learning_rate": 1.1098064077174619e-06, "loss": 4943.05, "step": 9790 }, { "ce_loss_13": 2.939675289392471, "ce_loss_26": 2.5042629301548005, "ce_loss_39": 2.036014449596405, "ce_loss_52": 1.4478511959314346, "ce_loss_7": 3.2001714766025544, "epoch": 0.98, "grad_norm": 18.76259776094823, "kl_loss_13": 3078.4, "kl_loss_26": 2174.8, "kl_loss_39": 1199.8, "kl_loss_7": 3615.6, "learning_rate": 1.006661764057837e-06, "loss": 4908.35, "step": 9800 }, { "ce_loss_13": 2.871473455429077, "ce_loss_26": 2.4314837962388993, "ce_loss_39": 1.95380699634552, "ce_loss_52": 1.3844006016850472, "ce_loss_7": 3.1342472076416015, "epoch": 0.981, "grad_norm": 19.274903724206773, "kl_loss_13": 3093.6, "kl_loss_26": 2172.4, "kl_loss_39": 1165.3, "kl_loss_7": 3637.6, "learning_rate": 9.085419777743465e-07, "loss": 4984.5, "step": 9810 }, { "ce_loss_13": 2.895174187421799, "ce_loss_26": 2.465178096294403, "ce_loss_39": 2.006316193938255, "ce_loss_52": 1.441744513809681, "ce_loss_7": 3.1597203612327576, "epoch": 0.982, "grad_norm": 18.123510539706626, "kl_loss_13": 3039.2, "kl_loss_26": 2136.8, "kl_loss_39": 1165.1, "kl_loss_7": 3592.0, "learning_rate": 8.15448036932176e-07, "loss": 4978.7, "step": 9820 }, { "ce_loss_13": 2.9061976075172424, "ce_loss_26": 2.477484393119812, "ce_loss_39": 2.0169315338134766, "ce_loss_52": 1.4464313685894012, "ce_loss_7": 3.1681883454322817, "epoch": 0.983, "grad_norm": 18.434840579065046, "kl_loss_13": 3067.2, "kl_loss_26": 2157.8, "kl_loss_39": 1175.9, "kl_loss_7": 3616.8, "learning_rate": 7.273808789862724e-07, "loss": 4921.0, "step": 9830 }, { "ce_loss_13": 2.91153547167778, "ce_loss_26": 2.473715308308601, "ce_loss_39": 2.007182112336159, "ce_loss_52": 1.4433553382754325, "ce_loss_7": 3.170134776830673, "epoch": 0.984, "grad_norm": 19.402855155704938, "kl_loss_13": 3056.0, "kl_loss_26": 2142.6, "kl_loss_39": 1154.2, "kl_loss_7": 3589.6, "learning_rate": 6.443413907720186e-07, "loss": 4900.3, "step": 9840 }, { "ce_loss_13": 2.812727469205856, "ce_loss_26": 2.3864874839782715, "ce_loss_39": 1.9436532348394393, "ce_loss_52": 1.3948013991117478, "ce_loss_7": 3.0747777581214906, "epoch": 0.985, "grad_norm": 18.7509272372939, "kl_loss_13": 2956.4, "kl_loss_26": 2066.6, "kl_loss_39": 1115.4, "kl_loss_7": 3495.2, "learning_rate": 5.663304084960185e-07, "loss": 4941.5, "step": 9850 }, { "ce_loss_13": 2.8474230617284775, "ce_loss_26": 2.4196896702051163, "ce_loss_39": 1.958586323261261, "ce_loss_52": 1.4132703453302384, "ce_loss_7": 3.112215679883957, "epoch": 0.986, "grad_norm": 19.13033176723296, "kl_loss_13": 2947.6, "kl_loss_26": 2054.6, "kl_loss_39": 1096.6, "kl_loss_7": 3496.0, "learning_rate": 4.933487177280482e-07, "loss": 4900.7, "step": 9860 }, { "ce_loss_13": 2.914989507198334, "ce_loss_26": 2.4912798583507536, "ce_loss_39": 2.020476207137108, "ce_loss_52": 1.45462586581707, "ce_loss_7": 3.169612795114517, "epoch": 0.987, "grad_norm": 18.808140949859265, "kl_loss_13": 3018.8, "kl_loss_26": 2134.0, "kl_loss_39": 1159.4, "kl_loss_7": 3551.6, "learning_rate": 4.2539705339295075e-07, "loss": 4908.55, "step": 9870 }, { "ce_loss_13": 2.8734777927398683, "ce_loss_26": 2.437858074903488, "ce_loss_39": 1.9720120638608933, "ce_loss_52": 1.4267651215195656, "ce_loss_7": 3.13208429813385, "epoch": 0.988, "grad_norm": 18.962161399510684, "kl_loss_13": 2984.8, "kl_loss_26": 2080.8, "kl_loss_39": 1110.4, "kl_loss_7": 3520.0, "learning_rate": 3.6247609976319816e-07, "loss": 4944.0, "step": 9880 }, { "ce_loss_13": 2.941332721710205, "ce_loss_26": 2.5040529906749724, "ce_loss_39": 2.0378583818674088, "ce_loss_52": 1.468481183052063, "ce_loss_7": 3.198000502586365, "epoch": 0.989, "grad_norm": 18.392580217010416, "kl_loss_13": 3027.6, "kl_loss_26": 2127.6, "kl_loss_39": 1152.6, "kl_loss_7": 3568.8, "learning_rate": 3.0458649045211895e-07, "loss": 4940.25, "step": 9890 }, { "ce_loss_13": 2.7879110276699066, "ce_loss_26": 2.361061328649521, "ce_loss_39": 1.9103228181600571, "ce_loss_52": 1.3702009424567223, "ce_loss_7": 3.0456897139549257, "epoch": 0.99, "grad_norm": 18.728359427979246, "kl_loss_13": 2962.0, "kl_loss_26": 2068.0, "kl_loss_39": 1101.6, "kl_loss_7": 3497.6, "learning_rate": 2.517288084074587e-07, "loss": 4930.1, "step": 9900 }, { "ce_loss_13": 2.8971258997917175, "ce_loss_26": 2.5054025918245317, "ce_loss_39": 2.0191519230604174, "ce_loss_52": 1.4615912348031999, "ce_loss_7": 3.1491506710648536, "epoch": 0.991, "grad_norm": 18.31923607379438, "kl_loss_13": 3021.4, "kl_loss_26": 2147.8, "kl_loss_39": 1158.2, "kl_loss_7": 3569.6, "learning_rate": 2.0390358590538505e-07, "loss": 4961.35, "step": 9910 }, { "ce_loss_13": 2.8903492599725724, "ce_loss_26": 2.463494861125946, "ce_loss_39": 1.9990645915269851, "ce_loss_52": 1.4176891192793846, "ce_loss_7": 3.156185895204544, "epoch": 0.992, "grad_norm": 18.63207057019639, "kl_loss_13": 3059.2, "kl_loss_26": 2160.8, "kl_loss_39": 1181.7, "kl_loss_7": 3612.8, "learning_rate": 1.61111304545436e-07, "loss": 4924.65, "step": 9920 }, { "ce_loss_13": 2.9144785940647124, "ce_loss_26": 2.4774809032678604, "ce_loss_39": 2.0119458585977554, "ce_loss_52": 1.4417672097682952, "ce_loss_7": 3.1794375479221344, "epoch": 0.993, "grad_norm": 19.670611142271607, "kl_loss_13": 3065.6, "kl_loss_26": 2158.2, "kl_loss_39": 1171.1, "kl_loss_7": 3620.0, "learning_rate": 1.2335239524541298e-07, "loss": 4934.7, "step": 9930 }, { "ce_loss_13": 2.8820480942726134, "ce_loss_26": 2.4476457953453066, "ce_loss_39": 1.9870627135038377, "ce_loss_52": 1.418778820335865, "ce_loss_7": 3.140765738487244, "epoch": 0.994, "grad_norm": 18.855641981755237, "kl_loss_13": 3036.0, "kl_loss_26": 2135.8, "kl_loss_39": 1164.2, "kl_loss_7": 3583.2, "learning_rate": 9.06272382371065e-08, "loss": 4938.8, "step": 9940 }, { "ce_loss_13": 2.8076956808567046, "ce_loss_26": 2.3849500566720963, "ce_loss_39": 1.944134348630905, "ce_loss_52": 1.4005259275436401, "ce_loss_7": 3.060946136713028, "epoch": 0.995, "grad_norm": 18.321398390501436, "kl_loss_13": 2907.2, "kl_loss_26": 2027.0, "kl_loss_39": 1087.6, "kl_loss_7": 3442.4, "learning_rate": 6.293616306246586e-08, "loss": 4950.05, "step": 9950 }, { "ce_loss_13": 2.879979431629181, "ce_loss_26": 2.4471361935138702, "ce_loss_39": 1.9933580070734025, "ce_loss_52": 1.4384188532829285, "ce_loss_7": 3.137728548049927, "epoch": 0.996, "grad_norm": 18.715841313903894, "kl_loss_13": 3004.0, "kl_loss_26": 2101.4, "kl_loss_39": 1139.8, "kl_loss_7": 3537.6, "learning_rate": 4.027944857032395e-08, "loss": 4943.4, "step": 9960 }, { "ce_loss_13": 2.873315241932869, "ce_loss_26": 2.4401324480772018, "ce_loss_39": 1.9843392819166183, "ce_loss_52": 1.4195681273937226, "ce_loss_7": 3.131341791152954, "epoch": 0.997, "grad_norm": 18.684401038028668, "kl_loss_13": 3010.0, "kl_loss_26": 2109.2, "kl_loss_39": 1143.6, "kl_loss_7": 3546.8, "learning_rate": 2.265732291356626e-08, "loss": 4916.9, "step": 9970 }, { "ce_loss_13": 2.806668055057526, "ce_loss_26": 2.379783111810684, "ce_loss_39": 1.9266512155532838, "ce_loss_52": 1.397612212598324, "ce_loss_7": 3.066510772705078, "epoch": 0.998, "grad_norm": 18.616508974623724, "kl_loss_13": 2953.6, "kl_loss_26": 2053.8, "kl_loss_39": 1093.4, "kl_loss_7": 3497.6, "learning_rate": 1.0069963546743833e-08, "loss": 4905.25, "step": 9980 }, { "ce_loss_13": 2.8469128251075744, "ce_loss_26": 2.4174416065216064, "ce_loss_39": 1.9600837975740433, "ce_loss_52": 1.4190092101693152, "ce_loss_7": 3.1044517934322355, "epoch": 0.999, "grad_norm": 18.871308722121288, "kl_loss_13": 2960.0, "kl_loss_26": 2064.8, "kl_loss_39": 1104.4, "kl_loss_7": 3488.8, "learning_rate": 2.517497224463483e-09, "loss": 4901.2, "step": 9990 }, { "ce_loss_13": 2.895157891511917, "ce_loss_26": 2.452625501155853, "ce_loss_39": 1.9889036536216735, "ce_loss_52": 1.4127988710999488, "ce_loss_7": 3.167273908853531, "epoch": 1.0, "grad_norm": 19.02740690165538, "kl_loss_13": 3067.6, "kl_loss_26": 2157.2, "kl_loss_39": 1167.3, "kl_loss_7": 3628.8, "learning_rate": 0.0, "loss": 4933.3, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0167830278176768e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }