Balcony-Model20 / trainer_state.json
adpretko's picture
Upload folder using huggingface_hub
6d77ec3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"ce_loss_13": 11.519832849502563,
"ce_loss_26": 11.473536491394043,
"ce_loss_39": 11.263565063476562,
"ce_loss_52": 1.3852829337120056,
"ce_loss_7": 11.56409740447998,
"epoch": 0.0001,
"grad_norm": 22.293954988093517,
"kl_loss_13": 20864.0,
"kl_loss_26": 20736.0,
"kl_loss_39": 20320.0,
"kl_loss_7": 20992.0,
"learning_rate": 1e-05,
"loss": 41440.0,
"step": 1
},
{
"ce_loss_13": 11.513921552234226,
"ce_loss_26": 11.469077931510078,
"ce_loss_39": 11.246019893222385,
"ce_loss_52": 1.4558950497044458,
"ce_loss_7": 11.55848307079739,
"epoch": 0.001,
"grad_norm": 23.161174410395507,
"kl_loss_13": 20800.0,
"kl_loss_26": 20696.88888888889,
"kl_loss_39": 20227.555555555555,
"kl_loss_7": 20881.777777777777,
"learning_rate": 0.0001,
"loss": 41384.0,
"step": 10
},
{
"ce_loss_13": 11.426759386062622,
"ce_loss_26": 11.41293363571167,
"ce_loss_39": 11.229010510444642,
"ce_loss_52": 1.4324860751628876,
"ce_loss_7": 11.462353825569153,
"epoch": 0.002,
"grad_norm": 38.50857397395853,
"kl_loss_13": 20668.8,
"kl_loss_26": 20640.0,
"kl_loss_39": 20256.0,
"kl_loss_7": 20745.6,
"learning_rate": 0.0002,
"loss": 41179.2,
"step": 20
},
{
"ce_loss_13": 10.954976797103882,
"ce_loss_26": 11.087984251976014,
"ce_loss_39": 11.10265805721283,
"ce_loss_52": 1.4276391446590424,
"ce_loss_7": 10.940680837631225,
"epoch": 0.003,
"grad_norm": 58.08626567131467,
"kl_loss_13": 19702.4,
"kl_loss_26": 19977.6,
"kl_loss_39": 20028.8,
"kl_loss_7": 19680.0,
"learning_rate": 0.0003,
"loss": 39668.0,
"step": 30
},
{
"ce_loss_13": 10.308717799186706,
"ce_loss_26": 10.375125074386597,
"ce_loss_39": 10.562542247772218,
"ce_loss_52": 1.455844309926033,
"ce_loss_7": 10.312761902809143,
"epoch": 0.004,
"grad_norm": 30.451988937114738,
"kl_loss_13": 18307.2,
"kl_loss_26": 18438.4,
"kl_loss_39": 18832.0,
"kl_loss_7": 18313.6,
"learning_rate": 0.0004,
"loss": 36999.2,
"step": 40
},
{
"ce_loss_13": 10.173566651344299,
"ce_loss_26": 10.188505339622498,
"ce_loss_39": 10.173172044754029,
"ce_loss_52": 1.4577810317277908,
"ce_loss_7": 10.182775902748109,
"epoch": 0.005,
"grad_norm": 37.851028798241174,
"kl_loss_13": 18006.4,
"kl_loss_26": 18028.8,
"kl_loss_39": 18012.8,
"kl_loss_7": 18022.4,
"learning_rate": 0.0005,
"loss": 36191.2,
"step": 50
},
{
"ce_loss_13": 10.071360087394714,
"ce_loss_26": 10.092596936225892,
"ce_loss_39": 10.072972583770753,
"ce_loss_52": 1.428243064880371,
"ce_loss_7": 10.105072927474975,
"epoch": 0.006,
"grad_norm": 45.98715921867029,
"kl_loss_13": 17872.0,
"kl_loss_26": 17907.2,
"kl_loss_39": 17878.4,
"kl_loss_7": 17936.0,
"learning_rate": 0.0006,
"loss": 35728.0,
"step": 60
},
{
"ce_loss_13": 9.994266033172607,
"ce_loss_26": 10.000447010993957,
"ce_loss_39": 9.958984637260437,
"ce_loss_52": 1.392430166900158,
"ce_loss_7": 10.041595196723938,
"epoch": 0.007,
"grad_norm": 53.56384816487026,
"kl_loss_13": 17750.4,
"kl_loss_26": 17763.2,
"kl_loss_39": 17667.2,
"kl_loss_7": 17846.4,
"learning_rate": 0.0007,
"loss": 35411.2,
"step": 70
},
{
"ce_loss_13": 9.870160865783692,
"ce_loss_26": 9.870406699180602,
"ce_loss_39": 9.815650677680969,
"ce_loss_52": 1.4188331544399262,
"ce_loss_7": 9.92598659992218,
"epoch": 0.008,
"grad_norm": 58.363906192597035,
"kl_loss_13": 17475.2,
"kl_loss_26": 17484.8,
"kl_loss_39": 17366.4,
"kl_loss_7": 17587.2,
"learning_rate": 0.0008,
"loss": 35010.4,
"step": 80
},
{
"ce_loss_13": 9.786613607406617,
"ce_loss_26": 9.77514407634735,
"ce_loss_39": 9.697125172615051,
"ce_loss_52": 1.4261163920164108,
"ce_loss_7": 9.84011538028717,
"epoch": 0.009,
"grad_norm": 57.597510936184484,
"kl_loss_13": 17267.2,
"kl_loss_26": 17232.0,
"kl_loss_39": 17065.6,
"kl_loss_7": 17376.0,
"learning_rate": 0.0009000000000000001,
"loss": 34545.6,
"step": 90
},
{
"ce_loss_13": 9.70496118068695,
"ce_loss_26": 9.680623888969421,
"ce_loss_39": 9.57672963142395,
"ce_loss_52": 1.4332450866699218,
"ce_loss_7": 9.75693221092224,
"epoch": 0.01,
"grad_norm": 56.92102830978135,
"kl_loss_13": 17075.2,
"kl_loss_26": 17030.4,
"kl_loss_39": 16814.4,
"kl_loss_7": 17187.2,
"learning_rate": 0.001,
"loss": 34141.6,
"step": 100
},
{
"ce_loss_13": 9.62608094215393,
"ce_loss_26": 9.584086346626282,
"ce_loss_39": 9.469242024421693,
"ce_loss_52": 1.4119121626019477,
"ce_loss_7": 9.681734418869018,
"epoch": 0.011,
"grad_norm": 55.54751251170693,
"kl_loss_13": 16956.8,
"kl_loss_26": 16856.0,
"kl_loss_39": 16632.0,
"kl_loss_7": 17075.2,
"learning_rate": 0.0009999974825027757,
"loss": 33673.2,
"step": 110
},
{
"ce_loss_13": 9.557737636566163,
"ce_loss_26": 9.502408647537232,
"ce_loss_39": 9.373179388046264,
"ce_loss_52": 1.420964427292347,
"ce_loss_7": 9.613711476325989,
"epoch": 0.012,
"grad_norm": 55.463113229450904,
"kl_loss_13": 16777.6,
"kl_loss_26": 16667.2,
"kl_loss_39": 16393.6,
"kl_loss_7": 16905.6,
"learning_rate": 0.0009999899300364532,
"loss": 33335.2,
"step": 120
},
{
"ce_loss_13": 9.474759387969971,
"ce_loss_26": 9.408789944648742,
"ce_loss_39": 9.266374969482422,
"ce_loss_52": 1.4124270409345627,
"ce_loss_7": 9.538655042648315,
"epoch": 0.013,
"grad_norm": 54.241543470395335,
"kl_loss_13": 16628.8,
"kl_loss_26": 16492.8,
"kl_loss_39": 16193.6,
"kl_loss_7": 16766.4,
"learning_rate": 0.0009999773426770863,
"loss": 32999.6,
"step": 130
},
{
"ce_loss_13": 9.420424246788025,
"ce_loss_26": 9.348571801185608,
"ce_loss_39": 9.1972074508667,
"ce_loss_52": 1.4392782002687454,
"ce_loss_7": 9.492741465568542,
"epoch": 0.014,
"grad_norm": 54.10933362933205,
"kl_loss_13": 16476.8,
"kl_loss_26": 16324.8,
"kl_loss_39": 16014.4,
"kl_loss_7": 16638.4,
"learning_rate": 0.0009999597205514296,
"loss": 32751.6,
"step": 140
},
{
"ce_loss_13": 9.388373732566833,
"ce_loss_26": 9.308008575439453,
"ce_loss_39": 9.153336524963379,
"ce_loss_52": 1.4420859813690186,
"ce_loss_7": 9.46445541381836,
"epoch": 0.015,
"grad_norm": 55.15236350542743,
"kl_loss_13": 16382.4,
"kl_loss_26": 16219.2,
"kl_loss_39": 15888.0,
"kl_loss_7": 16542.4,
"learning_rate": 0.0009999370638369377,
"loss": 32525.2,
"step": 150
},
{
"ce_loss_13": 9.301919007301331,
"ce_loss_26": 9.212661600112915,
"ce_loss_39": 9.050238633155823,
"ce_loss_52": 1.4233157366514206,
"ce_loss_7": 9.383031058311463,
"epoch": 0.016,
"grad_norm": 55.03653973388566,
"kl_loss_13": 16278.4,
"kl_loss_26": 16092.8,
"kl_loss_39": 15755.2,
"kl_loss_7": 16440.0,
"learning_rate": 0.000999909372761763,
"loss": 32209.6,
"step": 160
},
{
"ce_loss_13": 9.24642186164856,
"ce_loss_26": 9.14689018726349,
"ce_loss_39": 8.978599190711975,
"ce_loss_52": 1.429112258553505,
"ce_loss_7": 9.331455826759338,
"epoch": 0.017,
"grad_norm": 54.90625528920335,
"kl_loss_13": 16142.4,
"kl_loss_26": 15931.2,
"kl_loss_39": 15580.8,
"kl_loss_7": 16315.2,
"learning_rate": 0.0009998766476047546,
"loss": 31964.8,
"step": 170
},
{
"ce_loss_13": 9.187117385864259,
"ce_loss_26": 9.076116013526917,
"ce_loss_39": 8.902227759361267,
"ce_loss_52": 1.3885775536298752,
"ce_loss_7": 9.275272035598755,
"epoch": 0.018,
"grad_norm": 54.60426962776646,
"kl_loss_13": 16072.0,
"kl_loss_26": 15844.8,
"kl_loss_39": 15480.0,
"kl_loss_7": 16262.4,
"learning_rate": 0.0009998388886954545,
"loss": 31645.2,
"step": 180
},
{
"ce_loss_13": 9.131648278236389,
"ce_loss_26": 9.008217167854308,
"ce_loss_39": 8.831042790412903,
"ce_loss_52": 1.4482133895158769,
"ce_loss_7": 9.224077129364014,
"epoch": 0.019,
"grad_norm": 53.93299711922953,
"kl_loss_13": 15870.4,
"kl_loss_26": 15609.6,
"kl_loss_39": 15232.0,
"kl_loss_7": 16067.2,
"learning_rate": 0.0009997960964140947,
"loss": 31408.4,
"step": 190
},
{
"ce_loss_13": 9.050732731819153,
"ce_loss_26": 8.918284726142883,
"ce_loss_39": 8.738019919395446,
"ce_loss_52": 1.4300477087497712,
"ce_loss_7": 9.145042276382446,
"epoch": 0.02,
"grad_norm": 53.732589384741736,
"kl_loss_13": 15728.0,
"kl_loss_26": 15449.6,
"kl_loss_39": 15064.0,
"kl_loss_7": 15928.0,
"learning_rate": 0.0009997482711915926,
"loss": 31145.6,
"step": 200
},
{
"ce_loss_13": 8.988229060173035,
"ce_loss_26": 8.844111323356628,
"ce_loss_39": 8.654908394813537,
"ce_loss_52": 1.4580651924014092,
"ce_loss_7": 9.090379095077514,
"epoch": 0.021,
"grad_norm": 53.04072542826613,
"kl_loss_13": 15550.4,
"kl_loss_26": 15251.2,
"kl_loss_39": 14854.4,
"kl_loss_7": 15771.2,
"learning_rate": 0.0009996954135095479,
"loss": 30853.2,
"step": 210
},
{
"ce_loss_13": 8.945707607269288,
"ce_loss_26": 8.79644329547882,
"ce_loss_39": 8.601721858978271,
"ce_loss_52": 1.4154451981186866,
"ce_loss_7": 9.052277135849,
"epoch": 0.022,
"grad_norm": 53.309057416596275,
"kl_loss_13": 15544.0,
"kl_loss_26": 15219.2,
"kl_loss_39": 14809.6,
"kl_loss_7": 15761.6,
"learning_rate": 0.0009996375239002368,
"loss": 30606.8,
"step": 220
},
{
"ce_loss_13": 8.933341026306152,
"ce_loss_26": 8.773744869232178,
"ce_loss_39": 8.57140781879425,
"ce_loss_52": 1.4167777329683304,
"ce_loss_7": 9.04187982082367,
"epoch": 0.023,
"grad_norm": 53.383646235412414,
"kl_loss_13": 15486.4,
"kl_loss_26": 15156.8,
"kl_loss_39": 14726.4,
"kl_loss_7": 15721.6,
"learning_rate": 0.0009995746029466072,
"loss": 30406.4,
"step": 230
},
{
"ce_loss_13": 8.869291019439697,
"ce_loss_26": 8.693119740486145,
"ce_loss_39": 8.476832008361816,
"ce_loss_52": 1.4153499186038971,
"ce_loss_7": 8.980035424232483,
"epoch": 0.024,
"grad_norm": 52.917730504652305,
"kl_loss_13": 15353.6,
"kl_loss_26": 14985.6,
"kl_loss_39": 14528.0,
"kl_loss_7": 15588.8,
"learning_rate": 0.0009995066512822719,
"loss": 30148.4,
"step": 240
},
{
"ce_loss_13": 8.81713101863861,
"ce_loss_26": 8.636789417266845,
"ce_loss_39": 8.412476801872254,
"ce_loss_52": 1.4529948115348816,
"ce_loss_7": 8.929914593696594,
"epoch": 0.025,
"grad_norm": 53.98592158305785,
"kl_loss_13": 15196.8,
"kl_loss_26": 14820.8,
"kl_loss_39": 14340.8,
"kl_loss_7": 15438.4,
"learning_rate": 0.000999433669591504,
"loss": 29860.8,
"step": 250
},
{
"ce_loss_13": 8.748036527633667,
"ce_loss_26": 8.558162140846253,
"ce_loss_39": 8.331628108024598,
"ce_loss_52": 1.4311662405729293,
"ce_loss_7": 8.864733743667603,
"epoch": 0.026,
"grad_norm": 52.415587650337294,
"kl_loss_13": 15088.0,
"kl_loss_26": 14683.2,
"kl_loss_39": 14200.0,
"kl_loss_7": 15329.6,
"learning_rate": 0.000999355658609228,
"loss": 29636.0,
"step": 260
},
{
"ce_loss_13": 8.692949771881104,
"ce_loss_26": 8.494869589805603,
"ce_loss_39": 8.259693837165832,
"ce_loss_52": 1.4384984374046326,
"ce_loss_7": 8.814060854911805,
"epoch": 0.027,
"grad_norm": 53.356303831580306,
"kl_loss_13": 14976.0,
"kl_loss_26": 14555.2,
"kl_loss_39": 14054.4,
"kl_loss_7": 15230.4,
"learning_rate": 0.0009992726191210138,
"loss": 29438.0,
"step": 270
},
{
"ce_loss_13": 8.67082085609436,
"ce_loss_26": 8.463814663887025,
"ce_loss_39": 8.215038228034974,
"ce_loss_52": 1.4267250567674636,
"ce_loss_7": 8.794116616249084,
"epoch": 0.028,
"grad_norm": 52.94359037736481,
"kl_loss_13": 14902.4,
"kl_loss_26": 14476.8,
"kl_loss_39": 13944.0,
"kl_loss_7": 15174.4,
"learning_rate": 0.0009991845519630679,
"loss": 29276.4,
"step": 280
},
{
"ce_loss_13": 8.61563618183136,
"ce_loss_26": 8.402152299880981,
"ce_loss_39": 8.144541609287263,
"ce_loss_52": 1.4250373497605324,
"ce_loss_7": 8.742781138420105,
"epoch": 0.029,
"grad_norm": 51.566336997103136,
"kl_loss_13": 14817.6,
"kl_loss_26": 14369.6,
"kl_loss_39": 13816.0,
"kl_loss_7": 15089.6,
"learning_rate": 0.0009990914580222257,
"loss": 29010.0,
"step": 290
},
{
"ce_loss_13": 8.572561240196228,
"ce_loss_26": 8.35531551837921,
"ce_loss_39": 8.092759764194488,
"ce_loss_52": 1.4600662559270858,
"ce_loss_7": 8.698393726348877,
"epoch": 0.03,
"grad_norm": 53.1076582900563,
"kl_loss_13": 14704.0,
"kl_loss_26": 14251.2,
"kl_loss_39": 13689.6,
"kl_loss_7": 14974.4,
"learning_rate": 0.0009989933382359422,
"loss": 28776.8,
"step": 300
},
{
"ce_loss_13": 8.491887974739075,
"ce_loss_26": 8.263946199417115,
"ce_loss_39": 7.987871313095093,
"ce_loss_52": 1.4451197743415833,
"ce_loss_7": 8.625923323631287,
"epoch": 0.031,
"grad_norm": 52.58460107915946,
"kl_loss_13": 14537.6,
"kl_loss_26": 14054.4,
"kl_loss_39": 13464.0,
"kl_loss_7": 14825.6,
"learning_rate": 0.0009988901935922825,
"loss": 28548.4,
"step": 310
},
{
"ce_loss_13": 8.474869418144227,
"ce_loss_26": 8.245952117443085,
"ce_loss_39": 7.974157309532165,
"ce_loss_52": 1.4602727562189102,
"ce_loss_7": 8.610798478126526,
"epoch": 0.032,
"grad_norm": 52.331617741046635,
"kl_loss_13": 14480.0,
"kl_loss_26": 14003.2,
"kl_loss_39": 13427.2,
"kl_loss_7": 14760.0,
"learning_rate": 0.0009987820251299122,
"loss": 28364.4,
"step": 320
},
{
"ce_loss_13": 8.44498426914215,
"ce_loss_26": 8.20645843744278,
"ce_loss_39": 7.912872779369354,
"ce_loss_52": 1.4554857224225999,
"ce_loss_7": 8.583872628211974,
"epoch": 0.033,
"grad_norm": 50.87832937020315,
"kl_loss_13": 14411.2,
"kl_loss_26": 13900.8,
"kl_loss_39": 13281.6,
"kl_loss_7": 14700.8,
"learning_rate": 0.0009986688339380862,
"loss": 28109.2,
"step": 330
},
{
"ce_loss_13": 8.38349392414093,
"ce_loss_26": 8.133478546142578,
"ce_loss_39": 7.828318297863007,
"ce_loss_52": 1.425998830795288,
"ce_loss_7": 8.525882768630982,
"epoch": 0.034,
"grad_norm": 51.153122440976325,
"kl_loss_13": 14328.0,
"kl_loss_26": 13809.6,
"kl_loss_39": 13163.2,
"kl_loss_7": 14633.6,
"learning_rate": 0.0009985506211566387,
"loss": 27878.0,
"step": 340
},
{
"ce_loss_13": 8.349605464935303,
"ce_loss_26": 8.097514569759369,
"ce_loss_39": 7.785721278190612,
"ce_loss_52": 1.4315812528133391,
"ce_loss_7": 8.496586155891418,
"epoch": 0.035,
"grad_norm": 51.20256734387486,
"kl_loss_13": 14254.4,
"kl_loss_26": 13721.6,
"kl_loss_39": 13062.4,
"kl_loss_7": 14561.6,
"learning_rate": 0.0009984273879759713,
"loss": 27693.2,
"step": 350
},
{
"ce_loss_13": 8.273482608795167,
"ce_loss_26": 8.020016944408416,
"ce_loss_39": 7.711479115486145,
"ce_loss_52": 1.4499147981405258,
"ce_loss_7": 8.422512984275818,
"epoch": 0.036,
"grad_norm": 52.12411775413645,
"kl_loss_13": 14088.0,
"kl_loss_26": 13556.8,
"kl_loss_39": 12892.8,
"kl_loss_7": 14403.2,
"learning_rate": 0.0009982991356370402,
"loss": 27442.0,
"step": 360
},
{
"ce_loss_13": 8.215481567382813,
"ce_loss_26": 7.953275382518768,
"ce_loss_39": 7.628855121135712,
"ce_loss_52": 1.411750042438507,
"ce_loss_7": 8.368350863456726,
"epoch": 0.037,
"grad_norm": 51.0215332330724,
"kl_loss_13": 14024.0,
"kl_loss_26": 13470.4,
"kl_loss_39": 12779.2,
"kl_loss_7": 14348.8,
"learning_rate": 0.0009981658654313456,
"loss": 27348.0,
"step": 370
},
{
"ce_loss_13": 8.217882227897643,
"ce_loss_26": 7.9462348341941835,
"ce_loss_39": 7.613846385478974,
"ce_loss_52": 1.4831970453262329,
"ce_loss_7": 8.373853397369384,
"epoch": 0.038,
"grad_norm": 50.4571010346743,
"kl_loss_13": 13913.6,
"kl_loss_26": 13345.6,
"kl_loss_39": 12646.4,
"kl_loss_7": 14243.2,
"learning_rate": 0.000998027578700917,
"loss": 27082.8,
"step": 380
},
{
"ce_loss_13": 8.11286985874176,
"ce_loss_26": 7.831897294521331,
"ce_loss_39": 7.493152487277984,
"ce_loss_52": 1.4128454998135567,
"ce_loss_7": 8.275482225418092,
"epoch": 0.039,
"grad_norm": 51.75236464910614,
"kl_loss_13": 13798.4,
"kl_loss_26": 13212.8,
"kl_loss_39": 12491.2,
"kl_loss_7": 14139.2,
"learning_rate": 0.0009978842768382998,
"loss": 26835.6,
"step": 390
},
{
"ce_loss_13": 8.092873919010163,
"ce_loss_26": 7.80686913728714,
"ce_loss_39": 7.462458717823028,
"ce_loss_52": 1.449526023864746,
"ce_loss_7": 8.253998827934264,
"epoch": 0.04,
"grad_norm": 50.588198000151046,
"kl_loss_13": 13680.0,
"kl_loss_26": 13083.2,
"kl_loss_39": 12355.2,
"kl_loss_7": 14019.2,
"learning_rate": 0.0009977359612865424,
"loss": 26670.0,
"step": 400
},
{
"ce_loss_13": 8.073025333881379,
"ce_loss_26": 7.792924261093139,
"ce_loss_39": 7.448240423202515,
"ce_loss_52": 1.4590945556759833,
"ce_loss_7": 8.234287071228028,
"epoch": 0.041,
"grad_norm": 50.388842117598266,
"kl_loss_13": 13630.4,
"kl_loss_26": 13032.0,
"kl_loss_39": 12302.4,
"kl_loss_7": 13969.6,
"learning_rate": 0.0009975826335391806,
"loss": 26457.2,
"step": 410
},
{
"ce_loss_13": 7.959017169475556,
"ce_loss_26": 7.667092227935791,
"ce_loss_39": 7.308180010318756,
"ce_loss_52": 1.3903952419757843,
"ce_loss_7": 8.129511964321136,
"epoch": 0.042,
"grad_norm": 50.86911767207133,
"kl_loss_13": 13540.8,
"kl_loss_26": 12916.8,
"kl_loss_39": 12163.2,
"kl_loss_7": 13888.0,
"learning_rate": 0.0009974242951402235,
"loss": 26197.6,
"step": 420
},
{
"ce_loss_13": 7.940323996543884,
"ce_loss_26": 7.629641830921173,
"ce_loss_39": 7.2696495175361635,
"ce_loss_52": 1.4528310179710389,
"ce_loss_7": 8.112548959255218,
"epoch": 0.043,
"grad_norm": 49.24163454624527,
"kl_loss_13": 13374.4,
"kl_loss_26": 12728.0,
"kl_loss_39": 11958.4,
"kl_loss_7": 13744.0,
"learning_rate": 0.0009972609476841367,
"loss": 25992.4,
"step": 430
},
{
"ce_loss_13": 7.8985715508461,
"ce_loss_26": 7.592843997478485,
"ce_loss_39": 7.205817592144013,
"ce_loss_52": 1.4196556687355042,
"ce_loss_7": 8.078336155414581,
"epoch": 0.044,
"grad_norm": 49.811998634305894,
"kl_loss_13": 13345.6,
"kl_loss_26": 12699.2,
"kl_loss_39": 11881.6,
"kl_loss_7": 13718.4,
"learning_rate": 0.0009970925928158272,
"loss": 25854.4,
"step": 440
},
{
"ce_loss_13": 7.867163848876953,
"ce_loss_26": 7.55372383594513,
"ce_loss_39": 7.173475623130798,
"ce_loss_52": 1.4358820408582686,
"ce_loss_7": 8.0431494474411,
"epoch": 0.045,
"grad_norm": 48.311473749243106,
"kl_loss_13": 13248.0,
"kl_loss_26": 12595.2,
"kl_loss_39": 11792.0,
"kl_loss_7": 13614.4,
"learning_rate": 0.000996919232230627,
"loss": 25620.0,
"step": 450
},
{
"ce_loss_13": 7.794478893280029,
"ce_loss_26": 7.470960378646851,
"ce_loss_39": 7.075640022754669,
"ce_loss_52": 1.4313764542341232,
"ce_loss_7": 7.97986272573471,
"epoch": 0.046,
"grad_norm": 49.75600981611632,
"kl_loss_13": 13113.6,
"kl_loss_26": 12433.6,
"kl_loss_39": 11588.8,
"kl_loss_7": 13496.0,
"learning_rate": 0.0009967408676742752,
"loss": 25367.2,
"step": 460
},
{
"ce_loss_13": 7.780554842948914,
"ce_loss_26": 7.450528597831726,
"ce_loss_39": 7.06024489402771,
"ce_loss_52": 1.4271526962518692,
"ce_loss_7": 7.96198604106903,
"epoch": 0.047,
"grad_norm": 49.73198686766462,
"kl_loss_13": 13089.6,
"kl_loss_26": 12406.4,
"kl_loss_39": 11576.0,
"kl_loss_7": 13475.2,
"learning_rate": 0.0009965575009429006,
"loss": 25186.4,
"step": 470
},
{
"ce_loss_13": 7.771422934532166,
"ce_loss_26": 7.443638646602631,
"ce_loss_39": 7.045053339004516,
"ce_loss_52": 1.4691366642713546,
"ce_loss_7": 7.956920957565307,
"epoch": 0.048,
"grad_norm": 48.6898889709016,
"kl_loss_13": 12971.2,
"kl_loss_26": 12286.4,
"kl_loss_39": 11440.0,
"kl_loss_7": 13366.4,
"learning_rate": 0.0009963691338830043,
"loss": 25028.4,
"step": 480
},
{
"ce_loss_13": 7.7170240640640255,
"ce_loss_26": 7.3867839813232425,
"ce_loss_39": 6.986623299121857,
"ce_loss_52": 1.4700770109891892,
"ce_loss_7": 7.900947248935699,
"epoch": 0.049,
"grad_norm": 47.968754476102596,
"kl_loss_13": 12884.8,
"kl_loss_26": 12195.2,
"kl_loss_39": 11332.8,
"kl_loss_7": 13273.6,
"learning_rate": 0.0009961757683914405,
"loss": 24808.8,
"step": 490
},
{
"ce_loss_13": 7.612575709819794,
"ce_loss_26": 7.270983147621155,
"ce_loss_39": 6.851053369045258,
"ce_loss_52": 1.4072588831186295,
"ce_loss_7": 7.807804656028748,
"epoch": 0.05,
"grad_norm": 49.18975121944083,
"kl_loss_13": 12780.8,
"kl_loss_26": 12060.8,
"kl_loss_39": 11161.6,
"kl_loss_7": 13190.4,
"learning_rate": 0.0009959774064153978,
"loss": 24615.6,
"step": 500
},
{
"ce_loss_13": 7.6113405585289,
"ce_loss_26": 7.257396864891052,
"ce_loss_39": 6.8364926934242245,
"ce_loss_52": 1.405586513876915,
"ce_loss_7": 7.807830440998077,
"epoch": 0.051,
"grad_norm": 48.36038036613293,
"kl_loss_13": 12753.6,
"kl_loss_26": 12017.6,
"kl_loss_39": 11121.6,
"kl_loss_7": 13161.6,
"learning_rate": 0.0009957740499523787,
"loss": 24452.0,
"step": 510
},
{
"ce_loss_13": 7.562924301624298,
"ce_loss_26": 7.205751180648804,
"ce_loss_39": 6.7729793906211855,
"ce_loss_52": 1.441327565908432,
"ce_loss_7": 7.762964737415314,
"epoch": 0.052,
"grad_norm": 48.52091531249349,
"kl_loss_13": 12577.6,
"kl_loss_26": 11833.6,
"kl_loss_39": 10912.0,
"kl_loss_7": 12990.4,
"learning_rate": 0.0009955657010501807,
"loss": 24214.4,
"step": 520
},
{
"ce_loss_13": 7.5027553796768185,
"ce_loss_26": 7.149892139434814,
"ce_loss_39": 6.725725698471069,
"ce_loss_52": 1.4616976886987687,
"ce_loss_7": 7.700168478488922,
"epoch": 0.053,
"grad_norm": 47.609892122251686,
"kl_loss_13": 12451.2,
"kl_loss_26": 11710.4,
"kl_loss_39": 10804.8,
"kl_loss_7": 12872.0,
"learning_rate": 0.000995352361806875,
"loss": 24037.2,
"step": 530
},
{
"ce_loss_13": 7.525733006000519,
"ce_loss_26": 7.1605717778205875,
"ce_loss_39": 6.703774988651276,
"ce_loss_52": 1.42885320186615,
"ce_loss_7": 7.727836930751801,
"epoch": 0.054,
"grad_norm": 47.0111007198644,
"kl_loss_13": 12556.8,
"kl_loss_26": 11790.4,
"kl_loss_39": 10828.8,
"kl_loss_7": 12976.0,
"learning_rate": 0.0009951340343707852,
"loss": 23845.2,
"step": 540
},
{
"ce_loss_13": 7.423776483535766,
"ce_loss_26": 7.0557411193847654,
"ce_loss_39": 6.604493200778961,
"ce_loss_52": 1.4447590827941894,
"ce_loss_7": 7.626477897167206,
"epoch": 0.055,
"grad_norm": 49.45361296474082,
"kl_loss_13": 12328.0,
"kl_loss_26": 11540.8,
"kl_loss_39": 10584.0,
"kl_loss_7": 12747.2,
"learning_rate": 0.0009949107209404665,
"loss": 23664.0,
"step": 550
},
{
"ce_loss_13": 7.434036374092102,
"ce_loss_26": 7.059368348121643,
"ce_loss_39": 6.6053709268569945,
"ce_loss_52": 1.4645269870758058,
"ce_loss_7": 7.6432753801345825,
"epoch": 0.056,
"grad_norm": 47.673368470799254,
"kl_loss_13": 12291.2,
"kl_loss_26": 11512.0,
"kl_loss_39": 10547.2,
"kl_loss_7": 12726.4,
"learning_rate": 0.0009946824237646824,
"loss": 23469.6,
"step": 560
},
{
"ce_loss_13": 7.307004892826081,
"ce_loss_26": 6.927345609664917,
"ce_loss_39": 6.4629304051399235,
"ce_loss_52": 1.437305434048176,
"ce_loss_7": 7.522386133670807,
"epoch": 0.057,
"grad_norm": 46.80952508481597,
"kl_loss_13": 12094.4,
"kl_loss_26": 11300.8,
"kl_loss_39": 10308.8,
"kl_loss_7": 12539.2,
"learning_rate": 0.0009944491451423828,
"loss": 23249.6,
"step": 570
},
{
"ce_loss_13": 7.349401378631592,
"ce_loss_26": 6.964329659938812,
"ce_loss_39": 6.480949449539184,
"ce_loss_52": 1.4452718168497085,
"ce_loss_7": 7.564259791374207,
"epoch": 0.058,
"grad_norm": 46.22867294627436,
"kl_loss_13": 12145.6,
"kl_loss_26": 11340.8,
"kl_loss_39": 10315.2,
"kl_loss_7": 12592.0,
"learning_rate": 0.0009942108874226813,
"loss": 23091.2,
"step": 580
},
{
"ce_loss_13": 7.254886651039124,
"ce_loss_26": 6.858836472034454,
"ce_loss_39": 6.3856946468353275,
"ce_loss_52": 1.4449717432260514,
"ce_loss_7": 7.473185133934021,
"epoch": 0.059,
"grad_norm": 45.84422579202554,
"kl_loss_13": 11969.6,
"kl_loss_26": 11147.2,
"kl_loss_39": 10136.0,
"kl_loss_7": 12424.0,
"learning_rate": 0.00099396765300483,
"loss": 22886.4,
"step": 590
},
{
"ce_loss_13": 7.248957896232605,
"ce_loss_26": 6.855677163600921,
"ce_loss_39": 6.3774519801139835,
"ce_loss_52": 1.477000206708908,
"ce_loss_7": 7.465912497043609,
"epoch": 0.06,
"grad_norm": 46.37348014710593,
"kl_loss_13": 11888.0,
"kl_loss_26": 11064.0,
"kl_loss_39": 10044.8,
"kl_loss_7": 12347.2,
"learning_rate": 0.0009937194443381972,
"loss": 22708.0,
"step": 600
},
{
"ce_loss_13": 7.210493552684784,
"ce_loss_26": 6.8088652968406675,
"ce_loss_39": 6.325126445293426,
"ce_loss_52": 1.444644930958748,
"ce_loss_7": 7.429195690155029,
"epoch": 0.061,
"grad_norm": 44.92499922138711,
"kl_loss_13": 11859.2,
"kl_loss_26": 11019.2,
"kl_loss_39": 9995.2,
"kl_loss_7": 12320.0,
"learning_rate": 0.0009934662639222412,
"loss": 22544.8,
"step": 610
},
{
"ce_loss_13": 7.1185362339019775,
"ce_loss_26": 6.714106225967408,
"ce_loss_39": 6.223515486717224,
"ce_loss_52": 1.4858893424272537,
"ce_loss_7": 7.341545379161834,
"epoch": 0.062,
"grad_norm": 46.45143938897793,
"kl_loss_13": 11601.6,
"kl_loss_26": 10750.4,
"kl_loss_39": 9708.8,
"kl_loss_7": 12072.0,
"learning_rate": 0.000993208114306486,
"loss": 22270.0,
"step": 620
},
{
"ce_loss_13": 7.0913821935653685,
"ce_loss_26": 6.689675974845886,
"ce_loss_39": 6.203632855415345,
"ce_loss_52": 1.4506051570177079,
"ce_loss_7": 7.311544299125671,
"epoch": 0.063,
"grad_norm": 45.34630221197193,
"kl_loss_13": 11592.0,
"kl_loss_26": 10752.0,
"kl_loss_39": 9720.0,
"kl_loss_7": 12067.2,
"learning_rate": 0.0009929449980904952,
"loss": 22153.2,
"step": 630
},
{
"ce_loss_13": 7.083522534370422,
"ce_loss_26": 6.665987038612366,
"ce_loss_39": 6.162657225131989,
"ce_loss_52": 1.4658448547124863,
"ce_loss_7": 7.312107050418854,
"epoch": 0.064,
"grad_norm": 45.471744941742365,
"kl_loss_13": 11552.0,
"kl_loss_26": 10675.2,
"kl_loss_39": 9596.8,
"kl_loss_7": 12032.0,
"learning_rate": 0.0009926769179238466,
"loss": 21949.2,
"step": 640
},
{
"ce_loss_13": 6.994167017936706,
"ce_loss_26": 6.563658082485199,
"ce_loss_39": 6.042373907566071,
"ce_loss_52": 1.4207285180687905,
"ce_loss_7": 7.2311041235923765,
"epoch": 0.065,
"grad_norm": 43.84127734363621,
"kl_loss_13": 11489.6,
"kl_loss_26": 10593.6,
"kl_loss_39": 9488.0,
"kl_loss_7": 11980.8,
"learning_rate": 0.000992403876506104,
"loss": 21796.8,
"step": 650
},
{
"ce_loss_13": 6.9931820154190065,
"ce_loss_26": 6.566097593307495,
"ce_loss_39": 6.045178306102753,
"ce_loss_52": 1.4772068083286285,
"ce_loss_7": 7.2253869533538815,
"epoch": 0.066,
"grad_norm": 43.29636197313948,
"kl_loss_13": 11363.2,
"kl_loss_26": 10462.4,
"kl_loss_39": 9350.4,
"kl_loss_7": 11856.0,
"learning_rate": 0.0009921258765867918,
"loss": 21581.2,
"step": 660
},
{
"ce_loss_13": 6.907565414905548,
"ce_loss_26": 6.471543419361114,
"ce_loss_39": 5.933564639091491,
"ce_loss_52": 1.4364299774169922,
"ce_loss_7": 7.147727394104004,
"epoch": 0.067,
"grad_norm": 45.37835002704289,
"kl_loss_13": 11259.2,
"kl_loss_26": 10348.8,
"kl_loss_39": 9200.0,
"kl_loss_7": 11766.4,
"learning_rate": 0.0009918429209653662,
"loss": 21394.0,
"step": 670
},
{
"ce_loss_13": 6.9164858102798465,
"ce_loss_26": 6.482825660705567,
"ce_loss_39": 5.9588632702827455,
"ce_loss_52": 1.4493420034646989,
"ce_loss_7": 7.152165937423706,
"epoch": 0.068,
"grad_norm": 44.49853682897619,
"kl_loss_13": 11238.4,
"kl_loss_26": 10336.0,
"kl_loss_39": 9201.6,
"kl_loss_7": 11729.6,
"learning_rate": 0.0009915550124911866,
"loss": 21260.8,
"step": 680
},
{
"ce_loss_13": 6.871931791305542,
"ce_loss_26": 6.440780913829803,
"ce_loss_39": 5.910830950736999,
"ce_loss_52": 1.4238717705011368,
"ce_loss_7": 7.117027842998505,
"epoch": 0.069,
"grad_norm": 44.632228248662486,
"kl_loss_13": 11209.6,
"kl_loss_26": 10307.2,
"kl_loss_39": 9184.0,
"kl_loss_7": 11716.8,
"learning_rate": 0.0009912621540634887,
"loss": 21100.4,
"step": 690
},
{
"ce_loss_13": 6.761080467700959,
"ce_loss_26": 6.306544578075409,
"ce_loss_39": 5.754528117179871,
"ce_loss_52": 1.378117674589157,
"ce_loss_7": 7.018208122253418,
"epoch": 0.07,
"grad_norm": 45.73542626422545,
"kl_loss_13": 11040.0,
"kl_loss_26": 10088.0,
"kl_loss_39": 8913.6,
"kl_loss_7": 11569.6,
"learning_rate": 0.0009909643486313534,
"loss": 20851.6,
"step": 700
},
{
"ce_loss_13": 6.78661539554596,
"ce_loss_26": 6.327802836894989,
"ce_loss_39": 5.761592519283295,
"ce_loss_52": 1.411187869310379,
"ce_loss_7": 7.041204571723938,
"epoch": 0.071,
"grad_norm": 42.217908581540186,
"kl_loss_13": 11076.8,
"kl_loss_26": 10112.0,
"kl_loss_39": 8905.6,
"kl_loss_7": 11606.4,
"learning_rate": 0.000990661599193678,
"loss": 20727.2,
"step": 710
},
{
"ce_loss_13": 6.725618660449982,
"ce_loss_26": 6.256143915653229,
"ce_loss_39": 5.684507942199707,
"ce_loss_52": 1.4021562442183495,
"ce_loss_7": 6.98309029340744,
"epoch": 0.072,
"grad_norm": 41.87540936995742,
"kl_loss_13": 10950.4,
"kl_loss_26": 9966.4,
"kl_loss_39": 8756.0,
"kl_loss_7": 11489.6,
"learning_rate": 0.0009903539087991462,
"loss": 20494.0,
"step": 720
},
{
"ce_loss_13": 6.704308211803436,
"ce_loss_26": 6.2481373190879825,
"ce_loss_39": 5.691672837734222,
"ce_loss_52": 1.4353317350149155,
"ce_loss_7": 6.9648723125457765,
"epoch": 0.073,
"grad_norm": 41.50232307107669,
"kl_loss_13": 10825.6,
"kl_loss_26": 9868.8,
"kl_loss_39": 8681.6,
"kl_loss_7": 11364.8,
"learning_rate": 0.0009900412805461966,
"loss": 20435.6,
"step": 730
},
{
"ce_loss_13": 6.661628067493439,
"ce_loss_26": 6.196605837345123,
"ce_loss_39": 5.6325979948043825,
"ce_loss_52": 1.4341969668865204,
"ce_loss_7": 6.919101679325104,
"epoch": 0.074,
"grad_norm": 40.95865426481825,
"kl_loss_13": 10744.0,
"kl_loss_26": 9771.2,
"kl_loss_39": 8564.0,
"kl_loss_7": 11276.8,
"learning_rate": 0.0009897237175829927,
"loss": 20203.6,
"step": 740
},
{
"ce_loss_13": 6.600095963478088,
"ce_loss_26": 6.1362119793891905,
"ce_loss_39": 5.5736886858940125,
"ce_loss_52": 1.408122679591179,
"ce_loss_7": 6.857535183429718,
"epoch": 0.075,
"grad_norm": 39.711086876656914,
"kl_loss_13": 10672.0,
"kl_loss_26": 9696.0,
"kl_loss_39": 8488.0,
"kl_loss_7": 11211.2,
"learning_rate": 0.0009894012231073895,
"loss": 20039.6,
"step": 750
},
{
"ce_loss_13": 6.570180189609528,
"ce_loss_26": 6.098634576797485,
"ce_loss_39": 5.526832151412964,
"ce_loss_52": 1.4715266615152358,
"ce_loss_7": 6.833914375305175,
"epoch": 0.076,
"grad_norm": 43.2480305118225,
"kl_loss_13": 10513.6,
"kl_loss_26": 9529.6,
"kl_loss_39": 8309.6,
"kl_loss_7": 11064.0,
"learning_rate": 0.0009890738003669028,
"loss": 19880.0,
"step": 760
},
{
"ce_loss_13": 6.567346775531769,
"ce_loss_26": 6.0874533414840695,
"ce_loss_39": 5.5076407313346865,
"ce_loss_52": 1.4363629996776581,
"ce_loss_7": 6.83351217508316,
"epoch": 0.077,
"grad_norm": 40.54611020558553,
"kl_loss_13": 10534.4,
"kl_loss_26": 9550.4,
"kl_loss_39": 8325.6,
"kl_loss_7": 11091.2,
"learning_rate": 0.0009887414526586764,
"loss": 19717.2,
"step": 770
},
{
"ce_loss_13": 6.519154870510102,
"ce_loss_26": 6.030243515968323,
"ce_loss_39": 5.4369661688804625,
"ce_loss_52": 1.4315312415361405,
"ce_loss_7": 6.7882112741470335,
"epoch": 0.078,
"grad_norm": 40.32433812744511,
"kl_loss_13": 10425.6,
"kl_loss_26": 9401.6,
"kl_loss_39": 8148.0,
"kl_loss_7": 10985.6,
"learning_rate": 0.0009884041833294476,
"loss": 19528.4,
"step": 780
},
{
"ce_loss_13": 6.458490109443664,
"ce_loss_26": 5.970031499862671,
"ce_loss_39": 5.392513406276703,
"ce_loss_52": 1.4178608924150466,
"ce_loss_7": 6.729024958610535,
"epoch": 0.079,
"grad_norm": 41.89461078246612,
"kl_loss_13": 10355.2,
"kl_loss_26": 9337.6,
"kl_loss_39": 8099.2,
"kl_loss_7": 10920.0,
"learning_rate": 0.000988061995775515,
"loss": 19441.6,
"step": 790
},
{
"ce_loss_13": 6.432489657402039,
"ce_loss_26": 5.9446264743804935,
"ce_loss_39": 5.3700491189956665,
"ce_loss_52": 1.448669496178627,
"ce_loss_7": 6.70353993177414,
"epoch": 0.08,
"grad_norm": 41.284533949329585,
"kl_loss_13": 10246.4,
"kl_loss_26": 9230.4,
"kl_loss_39": 8004.0,
"kl_loss_7": 10811.2,
"learning_rate": 0.0009877148934427035,
"loss": 19206.8,
"step": 800
},
{
"ce_loss_13": 6.43521283864975,
"ce_loss_26": 5.939107716083527,
"ce_loss_39": 5.3414135575294495,
"ce_loss_52": 1.4238088309764863,
"ce_loss_7": 6.7135733485221865,
"epoch": 0.081,
"grad_norm": 39.49115349094681,
"kl_loss_13": 10304.0,
"kl_loss_26": 9264.0,
"kl_loss_39": 8010.4,
"kl_loss_7": 10881.6,
"learning_rate": 0.0009873628798263297,
"loss": 19058.0,
"step": 810
},
{
"ce_loss_13": 6.375123608112335,
"ce_loss_26": 5.867140221595764,
"ce_loss_39": 5.2596719622612,
"ce_loss_52": 1.4445373743772507,
"ce_loss_7": 6.642891383171081,
"epoch": 0.082,
"grad_norm": 39.14925963953466,
"kl_loss_13": 10124.8,
"kl_loss_26": 9064.0,
"kl_loss_39": 7779.2,
"kl_loss_7": 10689.6,
"learning_rate": 0.0009870059584711668,
"loss": 18891.2,
"step": 820
},
{
"ce_loss_13": 6.2935021877288815,
"ce_loss_26": 5.791378605365753,
"ce_loss_39": 5.201059639453888,
"ce_loss_52": 1.424940624833107,
"ce_loss_7": 6.564577507972717,
"epoch": 0.083,
"grad_norm": 39.604616846798116,
"kl_loss_13": 9998.4,
"kl_loss_26": 8953.6,
"kl_loss_39": 7687.2,
"kl_loss_7": 10563.2,
"learning_rate": 0.000986644132971409,
"loss": 18704.0,
"step": 830
},
{
"ce_loss_13": 6.274489688873291,
"ce_loss_26": 5.771354067325592,
"ce_loss_39": 5.155186474323273,
"ce_loss_52": 1.429549178481102,
"ce_loss_7": 6.554718315601349,
"epoch": 0.084,
"grad_norm": 38.382382142673,
"kl_loss_13": 9971.2,
"kl_loss_26": 8913.6,
"kl_loss_39": 7593.6,
"kl_loss_7": 10548.8,
"learning_rate": 0.0009862774069706345,
"loss": 18644.4,
"step": 840
},
{
"ce_loss_13": 6.2195475697517395,
"ce_loss_26": 5.717883968353272,
"ce_loss_39": 5.13158141374588,
"ce_loss_52": 1.4267238914966582,
"ce_loss_7": 6.497097527980804,
"epoch": 0.085,
"grad_norm": 39.67456709505246,
"kl_loss_13": 9820.8,
"kl_loss_26": 8772.8,
"kl_loss_39": 7518.4,
"kl_loss_7": 10401.6,
"learning_rate": 0.000985905784161771,
"loss": 18443.2,
"step": 850
},
{
"ce_loss_13": 6.253428983688354,
"ce_loss_26": 5.746143198013305,
"ce_loss_39": 5.127990126609802,
"ce_loss_52": 1.4239614009857178,
"ce_loss_7": 6.538207769393921,
"epoch": 0.086,
"grad_norm": 39.46700749652253,
"kl_loss_13": 9912.0,
"kl_loss_26": 8857.6,
"kl_loss_39": 7540.0,
"kl_loss_7": 10500.8,
"learning_rate": 0.000985529268287055,
"loss": 18336.4,
"step": 860
},
{
"ce_loss_13": 6.181728804111481,
"ce_loss_26": 5.661606287956237,
"ce_loss_39": 5.042867851257324,
"ce_loss_52": 1.427235585451126,
"ce_loss_7": 6.472940897941589,
"epoch": 0.087,
"grad_norm": 38.42645346767181,
"kl_loss_13": 9798.4,
"kl_loss_26": 8700.0,
"kl_loss_39": 7378.4,
"kl_loss_7": 10409.6,
"learning_rate": 0.0009851478631379982,
"loss": 18167.2,
"step": 870
},
{
"ce_loss_13": 6.113723492622375,
"ce_loss_26": 5.586072051525116,
"ce_loss_39": 4.9395282626152035,
"ce_loss_52": 1.3542900115251542,
"ce_loss_7": 6.410606110095978,
"epoch": 0.088,
"grad_norm": 37.03254074803745,
"kl_loss_13": 9768.0,
"kl_loss_26": 8668.8,
"kl_loss_39": 7310.4,
"kl_loss_7": 10384.0,
"learning_rate": 0.0009847615725553456,
"loss": 18092.4,
"step": 880
},
{
"ce_loss_13": 6.182212936878204,
"ce_loss_26": 5.651630616188049,
"ce_loss_39": 5.018951749801635,
"ce_loss_52": 1.4187958374619485,
"ce_loss_7": 6.474197280406952,
"epoch": 0.089,
"grad_norm": 36.66962934304384,
"kl_loss_13": 9784.0,
"kl_loss_26": 8682.4,
"kl_loss_39": 7329.6,
"kl_loss_7": 10400.0,
"learning_rate": 0.0009843704004290394,
"loss": 18005.6,
"step": 890
},
{
"ce_loss_13": 6.1027270436286924,
"ce_loss_26": 5.578388214111328,
"ce_loss_39": 4.954142212867737,
"ce_loss_52": 1.433653001487255,
"ce_loss_7": 6.3896349430084225,
"epoch": 0.09,
"grad_norm": 37.895271797404135,
"kl_loss_13": 9592.0,
"kl_loss_26": 8488.8,
"kl_loss_39": 7161.6,
"kl_loss_7": 10198.4,
"learning_rate": 0.0009839743506981783,
"loss": 17767.6,
"step": 900
},
{
"ce_loss_13": 6.134694254398346,
"ce_loss_26": 5.600531077384948,
"ce_loss_39": 4.956934368610382,
"ce_loss_52": 1.4616568014025688,
"ce_loss_7": 6.426614594459534,
"epoch": 0.091,
"grad_norm": 35.5095046076979,
"kl_loss_13": 9608.0,
"kl_loss_26": 8489.6,
"kl_loss_39": 7093.6,
"kl_loss_7": 10217.6,
"learning_rate": 0.0009835734273509786,
"loss": 17664.2,
"step": 910
},
{
"ce_loss_13": 6.059712076187134,
"ce_loss_26": 5.526252567768097,
"ce_loss_39": 4.893229007720947,
"ce_loss_52": 1.4395650416612624,
"ce_loss_7": 6.356565976142884,
"epoch": 0.092,
"grad_norm": 34.796923941159505,
"kl_loss_13": 9540.8,
"kl_loss_26": 8412.8,
"kl_loss_39": 7049.6,
"kl_loss_7": 10158.4,
"learning_rate": 0.0009831676344247342,
"loss": 17511.4,
"step": 920
},
{
"ce_loss_13": 6.012007105350494,
"ce_loss_26": 5.470919144153595,
"ce_loss_39": 4.816142636537552,
"ce_loss_52": 1.3806764528155326,
"ce_loss_7": 6.3036043524742125,
"epoch": 0.093,
"grad_norm": 34.520449300223,
"kl_loss_13": 9520.0,
"kl_loss_26": 8383.2,
"kl_loss_39": 6990.4,
"kl_loss_7": 10132.8,
"learning_rate": 0.0009827569760057755,
"loss": 17476.2,
"step": 930
},
{
"ce_loss_13": 6.020643877983093,
"ce_loss_26": 5.4863135576248165,
"ce_loss_39": 4.843132376670837,
"ce_loss_52": 1.4196506530046462,
"ce_loss_7": 6.318761503696441,
"epoch": 0.094,
"grad_norm": 33.89098596439575,
"kl_loss_13": 9425.6,
"kl_loss_26": 8303.2,
"kl_loss_39": 6938.4,
"kl_loss_7": 10048.0,
"learning_rate": 0.000982341456229428,
"loss": 17230.8,
"step": 940
},
{
"ce_loss_13": 5.974271166324615,
"ce_loss_26": 5.436639845371246,
"ce_loss_39": 4.804401755332947,
"ce_loss_52": 1.4617832124233245,
"ce_loss_7": 6.261663150787354,
"epoch": 0.095,
"grad_norm": 33.71164019490533,
"kl_loss_13": 9278.4,
"kl_loss_26": 8148.0,
"kl_loss_39": 6788.0,
"kl_loss_7": 9880.0,
"learning_rate": 0.000981921079279971,
"loss": 17111.2,
"step": 950
},
{
"ce_loss_13": 5.966051030158996,
"ce_loss_26": 5.422689366340637,
"ce_loss_39": 4.77484347820282,
"ce_loss_52": 1.4197688490152358,
"ce_loss_7": 6.259041047096252,
"epoch": 0.096,
"grad_norm": 33.446193055983784,
"kl_loss_13": 9321.6,
"kl_loss_26": 8191.2,
"kl_loss_39": 6796.8,
"kl_loss_7": 9932.8,
"learning_rate": 0.0009814958493905962,
"loss": 17055.2,
"step": 960
},
{
"ce_loss_13": 5.8990898609161375,
"ce_loss_26": 5.356628429889679,
"ce_loss_39": 4.708657902479172,
"ce_loss_52": 1.4259506687521935,
"ce_loss_7": 6.204090213775634,
"epoch": 0.097,
"grad_norm": 32.73849681683031,
"kl_loss_13": 9192.0,
"kl_loss_26": 8055.2,
"kl_loss_39": 6684.0,
"kl_loss_7": 9836.8,
"learning_rate": 0.0009810657708433637,
"loss": 16837.6,
"step": 970
},
{
"ce_loss_13": 5.879060399532318,
"ce_loss_26": 5.346331930160522,
"ce_loss_39": 4.690547597408295,
"ce_loss_52": 1.4302369862794877,
"ce_loss_7": 6.184376800060273,
"epoch": 0.098,
"grad_norm": 32.85977379524165,
"kl_loss_13": 9155.2,
"kl_loss_26": 8032.0,
"kl_loss_39": 6616.8,
"kl_loss_7": 9792.0,
"learning_rate": 0.0009806308479691594,
"loss": 16832.6,
"step": 980
},
{
"ce_loss_13": 5.816424036026001,
"ce_loss_26": 5.264147555828094,
"ce_loss_39": 4.6131664395332335,
"ce_loss_52": 1.4319524437189102,
"ce_loss_7": 6.118786966800689,
"epoch": 0.099,
"grad_norm": 33.426081767419625,
"kl_loss_13": 8992.0,
"kl_loss_26": 7842.4,
"kl_loss_39": 6442.4,
"kl_loss_7": 9635.2,
"learning_rate": 0.0009801910851476522,
"loss": 16728.4,
"step": 990
},
{
"ce_loss_13": 5.814940357208252,
"ce_loss_26": 5.2667844772338865,
"ce_loss_39": 4.6347626686096195,
"ce_loss_52": 1.4415421515703202,
"ce_loss_7": 6.12279201745987,
"epoch": 0.1,
"grad_norm": 33.016085914188245,
"kl_loss_13": 8960.0,
"kl_loss_26": 7812.8,
"kl_loss_39": 6444.0,
"kl_loss_7": 9588.8,
"learning_rate": 0.0009797464868072487,
"loss": 16535.6,
"step": 1000
},
{
"ce_loss_13": 5.8194945573806764,
"ce_loss_26": 5.259810090065002,
"ce_loss_39": 4.592129653692245,
"ce_loss_52": 1.4136200681328774,
"ce_loss_7": 6.127060306072235,
"epoch": 0.101,
"grad_norm": 31.039827427941336,
"kl_loss_13": 9033.6,
"kl_loss_26": 7855.2,
"kl_loss_39": 6432.0,
"kl_loss_7": 9683.2,
"learning_rate": 0.0009792970574250492,
"loss": 16416.6,
"step": 1010
},
{
"ce_loss_13": 5.752316701412201,
"ce_loss_26": 5.182761800289154,
"ce_loss_39": 4.4902693152427675,
"ce_loss_52": 1.37675661444664,
"ce_loss_7": 6.068772268295288,
"epoch": 0.102,
"grad_norm": 30.262078958434195,
"kl_loss_13": 8969.6,
"kl_loss_26": 7772.0,
"kl_loss_39": 6300.8,
"kl_loss_7": 9622.4,
"learning_rate": 0.0009788428015268028,
"loss": 16337.4,
"step": 1020
},
{
"ce_loss_13": 5.781488347053528,
"ce_loss_26": 5.2409987449646,
"ce_loss_39": 4.596257948875428,
"ce_loss_52": 1.4595280766487122,
"ce_loss_7": 6.078718197345734,
"epoch": 0.103,
"grad_norm": 31.79275929639243,
"kl_loss_13": 8864.0,
"kl_loss_26": 7715.2,
"kl_loss_39": 6324.0,
"kl_loss_7": 9491.2,
"learning_rate": 0.0009783837236868609,
"loss": 16174.0,
"step": 1030
},
{
"ce_loss_13": 5.71779420375824,
"ce_loss_26": 5.146243929862976,
"ce_loss_39": 4.4791832447052,
"ce_loss_52": 1.4291063606739045,
"ce_loss_7": 6.026824104785919,
"epoch": 0.104,
"grad_norm": 30.963349252235982,
"kl_loss_13": 8771.2,
"kl_loss_26": 7573.6,
"kl_loss_39": 6160.8,
"kl_loss_7": 9416.0,
"learning_rate": 0.0009779198285281327,
"loss": 16072.0,
"step": 1040
},
{
"ce_loss_13": 5.742816948890686,
"ce_loss_26": 5.189597749710083,
"ce_loss_39": 4.527956926822663,
"ce_loss_52": 1.4477199196815491,
"ce_loss_7": 6.058120143413544,
"epoch": 0.105,
"grad_norm": 32.72479335973728,
"kl_loss_13": 8772.8,
"kl_loss_26": 7620.0,
"kl_loss_39": 6205.6,
"kl_loss_7": 9438.4,
"learning_rate": 0.0009774511207220368,
"loss": 15932.8,
"step": 1050
},
{
"ce_loss_13": 5.728980660438538,
"ce_loss_26": 5.167429828643799,
"ce_loss_39": 4.511177510023117,
"ce_loss_52": 1.4759073287248612,
"ce_loss_7": 6.031959307193756,
"epoch": 0.106,
"grad_norm": 31.226954775299472,
"kl_loss_13": 8736.8,
"kl_loss_26": 7550.4,
"kl_loss_39": 6153.6,
"kl_loss_7": 9363.2,
"learning_rate": 0.0009769776049884564,
"loss": 15802.0,
"step": 1060
},
{
"ce_loss_13": 5.7382616877555845,
"ce_loss_26": 5.175736773014068,
"ce_loss_39": 4.510921847820282,
"ce_loss_52": 1.4512410640716553,
"ce_loss_7": 6.050414621829987,
"epoch": 0.107,
"grad_norm": 30.630877517383688,
"kl_loss_13": 8788.0,
"kl_loss_26": 7622.4,
"kl_loss_39": 6202.4,
"kl_loss_7": 9452.8,
"learning_rate": 0.0009764992860956889,
"loss": 15822.0,
"step": 1070
},
{
"ce_loss_13": 5.645794451236725,
"ce_loss_26": 5.088676834106446,
"ce_loss_39": 4.417802548408508,
"ce_loss_52": 1.4182049363851548,
"ce_loss_7": 5.959036731719971,
"epoch": 0.108,
"grad_norm": 30.457487960812127,
"kl_loss_13": 8681.6,
"kl_loss_26": 7509.6,
"kl_loss_39": 6084.8,
"kl_loss_7": 9332.8,
"learning_rate": 0.0009760161688604008,
"loss": 15627.0,
"step": 1080
},
{
"ce_loss_13": 5.568986439704895,
"ce_loss_26": 5.009959697723389,
"ce_loss_39": 4.367926681041718,
"ce_loss_52": 1.4609902381896973,
"ce_loss_7": 5.883221137523651,
"epoch": 0.109,
"grad_norm": 29.989545158997807,
"kl_loss_13": 8458.4,
"kl_loss_26": 7279.2,
"kl_loss_39": 5895.2,
"kl_loss_7": 9112.0,
"learning_rate": 0.0009755282581475768,
"loss": 15555.0,
"step": 1090
},
{
"ce_loss_13": 5.6291534304618835,
"ce_loss_26": 5.068492126464844,
"ce_loss_39": 4.403285652399063,
"ce_loss_52": 1.445840133726597,
"ce_loss_7": 5.945274484157562,
"epoch": 0.11,
"grad_norm": 30.66383917654591,
"kl_loss_13": 8576.8,
"kl_loss_26": 7396.8,
"kl_loss_39": 5972.8,
"kl_loss_7": 9244.8,
"learning_rate": 0.0009750355588704727,
"loss": 15472.0,
"step": 1100
},
{
"ce_loss_13": 5.522909152507782,
"ce_loss_26": 4.949995934963226,
"ce_loss_39": 4.270336884260177,
"ce_loss_52": 1.407256692647934,
"ce_loss_7": 5.840477633476257,
"epoch": 0.111,
"grad_norm": 29.782295400473814,
"kl_loss_13": 8460.8,
"kl_loss_26": 7250.4,
"kl_loss_39": 5788.8,
"kl_loss_7": 9118.4,
"learning_rate": 0.0009745380759905647,
"loss": 15294.0,
"step": 1110
},
{
"ce_loss_13": 5.517545366287232,
"ce_loss_26": 4.932116758823395,
"ce_loss_39": 4.250882798433304,
"ce_loss_52": 1.3833064809441566,
"ce_loss_7": 5.846422612667084,
"epoch": 0.112,
"grad_norm": 28.73414895119145,
"kl_loss_13": 8486.4,
"kl_loss_26": 7264.0,
"kl_loss_39": 5804.0,
"kl_loss_7": 9168.0,
"learning_rate": 0.0009740358145174998,
"loss": 15318.0,
"step": 1120
},
{
"ce_loss_13": 5.50923570394516,
"ce_loss_26": 4.938242793083191,
"ce_loss_39": 4.259438300132752,
"ce_loss_52": 1.430792199075222,
"ce_loss_7": 5.8306269407272335,
"epoch": 0.113,
"grad_norm": 28.895019773736312,
"kl_loss_13": 8355.2,
"kl_loss_26": 7152.8,
"kl_loss_39": 5700.8,
"kl_loss_7": 9024.0,
"learning_rate": 0.0009735287795090455,
"loss": 15192.0,
"step": 1130
},
{
"ce_loss_13": 5.401988506317139,
"ce_loss_26": 4.814380037784576,
"ce_loss_39": 4.133068162202835,
"ce_loss_52": 1.3917075648903847,
"ce_loss_7": 5.732739126682281,
"epoch": 0.114,
"grad_norm": 28.473930821013894,
"kl_loss_13": 8231.2,
"kl_loss_26": 7010.4,
"kl_loss_39": 5549.6,
"kl_loss_7": 8921.6,
"learning_rate": 0.0009730169760710386,
"loss": 15030.2,
"step": 1140
},
{
"ce_loss_13": 5.538087117671966,
"ce_loss_26": 4.94068056344986,
"ce_loss_39": 4.253137022256851,
"ce_loss_52": 1.4375049352645874,
"ce_loss_7": 5.861488628387451,
"epoch": 0.115,
"grad_norm": 30.345832823062537,
"kl_loss_13": 8408.0,
"kl_loss_26": 7160.0,
"kl_loss_39": 5690.4,
"kl_loss_7": 9096.0,
"learning_rate": 0.0009725004093573342,
"loss": 14951.8,
"step": 1150
},
{
"ce_loss_13": 5.385800528526306,
"ce_loss_26": 4.801717817783356,
"ce_loss_39": 4.136020374298096,
"ce_loss_52": 1.4078958943486213,
"ce_loss_7": 5.722076547145844,
"epoch": 0.116,
"grad_norm": 30.297186235009132,
"kl_loss_13": 8177.6,
"kl_loss_26": 6956.0,
"kl_loss_39": 5514.4,
"kl_loss_7": 8880.0,
"learning_rate": 0.0009719790845697534,
"loss": 14867.6,
"step": 1160
},
{
"ce_loss_13": 5.426836037635804,
"ce_loss_26": 4.832395279407502,
"ce_loss_39": 4.153381270170212,
"ce_loss_52": 1.426569977402687,
"ce_loss_7": 5.755613851547241,
"epoch": 0.117,
"grad_norm": 31.87589996223516,
"kl_loss_13": 8220.8,
"kl_loss_26": 6976.0,
"kl_loss_39": 5544.0,
"kl_loss_7": 8909.6,
"learning_rate": 0.0009714530069580309,
"loss": 14745.8,
"step": 1170
},
{
"ce_loss_13": 5.359652185440064,
"ce_loss_26": 4.760751461982727,
"ce_loss_39": 4.048358517885208,
"ce_loss_52": 1.3907365471124649,
"ce_loss_7": 5.693908452987671,
"epoch": 0.118,
"grad_norm": 27.539122306915463,
"kl_loss_13": 8126.4,
"kl_loss_26": 6876.0,
"kl_loss_39": 5376.0,
"kl_loss_7": 8822.4,
"learning_rate": 0.0009709221818197624,
"loss": 14704.2,
"step": 1180
},
{
"ce_loss_13": 5.350854313373565,
"ce_loss_26": 4.768227469921112,
"ce_loss_39": 4.099184954166413,
"ce_loss_52": 1.4248775228857995,
"ce_loss_7": 5.680926930904389,
"epoch": 0.119,
"grad_norm": 28.933475617899816,
"kl_loss_13": 8027.2,
"kl_loss_26": 6818.4,
"kl_loss_39": 5372.0,
"kl_loss_7": 8727.2,
"learning_rate": 0.0009703866145003512,
"loss": 14583.0,
"step": 1190
},
{
"ce_loss_13": 5.372988939285278,
"ce_loss_26": 4.778137028217316,
"ce_loss_39": 4.089074891805649,
"ce_loss_52": 1.4195260405540466,
"ce_loss_7": 5.714505088329315,
"epoch": 0.12,
"grad_norm": 26.60053169419214,
"kl_loss_13": 8131.2,
"kl_loss_26": 6885.6,
"kl_loss_39": 5404.8,
"kl_loss_7": 8840.8,
"learning_rate": 0.0009698463103929542,
"loss": 14513.0,
"step": 1200
},
{
"ce_loss_13": 5.392605185508728,
"ce_loss_26": 4.798673605918884,
"ce_loss_39": 4.126504504680634,
"ce_loss_52": 1.4746148020029068,
"ce_loss_7": 5.7245006442070006,
"epoch": 0.121,
"grad_norm": 26.989592045291893,
"kl_loss_13": 8018.4,
"kl_loss_26": 6775.2,
"kl_loss_39": 5339.2,
"kl_loss_7": 8718.4,
"learning_rate": 0.0009693012749384279,
"loss": 14383.2,
"step": 1210
},
{
"ce_loss_13": 5.319035434722901,
"ce_loss_26": 4.736347317695618,
"ce_loss_39": 4.0659150838851925,
"ce_loss_52": 1.4397204488515853,
"ce_loss_7": 5.642805421352387,
"epoch": 0.122,
"grad_norm": 29.38130003686817,
"kl_loss_13": 7946.4,
"kl_loss_26": 6723.2,
"kl_loss_39": 5294.4,
"kl_loss_7": 8631.2,
"learning_rate": 0.0009687515136252732,
"loss": 14375.6,
"step": 1220
},
{
"ce_loss_13": 5.3430128455162045,
"ce_loss_26": 4.744695138931275,
"ce_loss_39": 4.071437209844589,
"ce_loss_52": 1.4354561120271683,
"ce_loss_7": 5.681857228279114,
"epoch": 0.123,
"grad_norm": 25.479885446063793,
"kl_loss_13": 8008.0,
"kl_loss_26": 6754.4,
"kl_loss_39": 5308.8,
"kl_loss_7": 8709.6,
"learning_rate": 0.0009681970319895803,
"loss": 14273.4,
"step": 1230
},
{
"ce_loss_13": 5.331636953353882,
"ce_loss_26": 4.735000967979431,
"ce_loss_39": 4.073598688840866,
"ce_loss_52": 1.470110397040844,
"ce_loss_7": 5.655930757522583,
"epoch": 0.124,
"grad_norm": 28.379117971457468,
"kl_loss_13": 7929.6,
"kl_loss_26": 6684.0,
"kl_loss_39": 5260.8,
"kl_loss_7": 8612.8,
"learning_rate": 0.0009676378356149733,
"loss": 14150.8,
"step": 1240
},
{
"ce_loss_13": 5.1874682068824765,
"ce_loss_26": 4.582801806926727,
"ce_loss_39": 3.900476610660553,
"ce_loss_52": 1.4180996417999268,
"ce_loss_7": 5.519565558433532,
"epoch": 0.125,
"grad_norm": 27.465496459767188,
"kl_loss_13": 7764.8,
"kl_loss_26": 6504.0,
"kl_loss_39": 5041.6,
"kl_loss_7": 8455.2,
"learning_rate": 0.0009670739301325534,
"loss": 13985.0,
"step": 1250
},
{
"ce_loss_13": 5.221911752223969,
"ce_loss_26": 4.625354039669037,
"ce_loss_39": 3.9350062906742096,
"ce_loss_52": 1.3881619155406952,
"ce_loss_7": 5.562334418296814,
"epoch": 0.126,
"grad_norm": 26.021158683557974,
"kl_loss_13": 7847.2,
"kl_loss_26": 6599.2,
"kl_loss_39": 5120.0,
"kl_loss_7": 8558.4,
"learning_rate": 0.0009665053212208426,
"loss": 13978.8,
"step": 1260
},
{
"ce_loss_13": 5.201095879077911,
"ce_loss_26": 4.594125282764435,
"ce_loss_39": 3.8948924005031587,
"ce_loss_52": 1.4181114554405212,
"ce_loss_7": 5.552536249160767,
"epoch": 0.127,
"grad_norm": 26.300188898530276,
"kl_loss_13": 7783.2,
"kl_loss_26": 6515.2,
"kl_loss_39": 5012.0,
"kl_loss_7": 8515.2,
"learning_rate": 0.0009659320146057262,
"loss": 13927.6,
"step": 1270
},
{
"ce_loss_13": 5.186312806606293,
"ce_loss_26": 4.5913723587989805,
"ce_loss_39": 3.912171256542206,
"ce_loss_52": 1.4045341789722443,
"ce_loss_7": 5.535152721405029,
"epoch": 0.128,
"grad_norm": 25.74310395170922,
"kl_loss_13": 7757.6,
"kl_loss_26": 6509.6,
"kl_loss_39": 5065.6,
"kl_loss_7": 8486.4,
"learning_rate": 0.0009653540160603955,
"loss": 13929.0,
"step": 1280
},
{
"ce_loss_13": 5.17168892621994,
"ce_loss_26": 4.572988575696945,
"ce_loss_39": 3.902406334877014,
"ce_loss_52": 1.4593060314655304,
"ce_loss_7": 5.513335394859314,
"epoch": 0.129,
"grad_norm": 26.464934956114348,
"kl_loss_13": 7624.0,
"kl_loss_26": 6372.8,
"kl_loss_39": 4939.2,
"kl_loss_7": 8331.2,
"learning_rate": 0.0009647713314052896,
"loss": 13720.2,
"step": 1290
},
{
"ce_loss_13": 5.167734289169312,
"ce_loss_26": 4.5827870786190035,
"ce_loss_39": 3.9150634109973907,
"ce_loss_52": 1.4295778691768646,
"ce_loss_7": 5.517308306694031,
"epoch": 0.13,
"grad_norm": 26.41573865043221,
"kl_loss_13": 7627.2,
"kl_loss_26": 6407.2,
"kl_loss_39": 4972.0,
"kl_loss_7": 8360.0,
"learning_rate": 0.0009641839665080363,
"loss": 13644.6,
"step": 1300
},
{
"ce_loss_13": 5.155666828155518,
"ce_loss_26": 4.557369256019593,
"ce_loss_39": 3.8985717594623566,
"ce_loss_52": 1.4532029300928115,
"ce_loss_7": 5.49907066822052,
"epoch": 0.131,
"grad_norm": 28.09061259972242,
"kl_loss_13": 7600.0,
"kl_loss_26": 6340.8,
"kl_loss_39": 4909.6,
"kl_loss_7": 8327.2,
"learning_rate": 0.0009635919272833937,
"loss": 13575.0,
"step": 1310
},
{
"ce_loss_13": 5.079627573490143,
"ce_loss_26": 4.46733387708664,
"ce_loss_39": 3.7981239676475527,
"ce_loss_52": 1.4149045318365097,
"ce_loss_7": 5.424402499198914,
"epoch": 0.132,
"grad_norm": 29.96516682014439,
"kl_loss_13": 7483.2,
"kl_loss_26": 6202.4,
"kl_loss_39": 4773.6,
"kl_loss_7": 8210.4,
"learning_rate": 0.0009629952196931902,
"loss": 13547.6,
"step": 1320
},
{
"ce_loss_13": 5.090298974514008,
"ce_loss_26": 4.495900344848633,
"ce_loss_39": 3.821765500307083,
"ce_loss_52": 1.4328191310167313,
"ce_loss_7": 5.431738471984863,
"epoch": 0.133,
"grad_norm": 26.14827995707597,
"kl_loss_13": 7504.0,
"kl_loss_26": 6259.2,
"kl_loss_39": 4809.6,
"kl_loss_7": 8225.6,
"learning_rate": 0.0009623938497462645,
"loss": 13496.2,
"step": 1330
},
{
"ce_loss_13": 5.099011301994324,
"ce_loss_26": 4.4848466455936435,
"ce_loss_39": 3.794223016500473,
"ce_loss_52": 1.416146419942379,
"ce_loss_7": 5.462341606616974,
"epoch": 0.134,
"grad_norm": 24.84289392950202,
"kl_loss_13": 7542.4,
"kl_loss_26": 6259.2,
"kl_loss_39": 4785.2,
"kl_loss_7": 8304.8,
"learning_rate": 0.0009617878234984055,
"loss": 13395.2,
"step": 1340
},
{
"ce_loss_13": 5.097129952907562,
"ce_loss_26": 4.498237466812133,
"ce_loss_39": 3.8250812292099,
"ce_loss_52": 1.4416721731424331,
"ce_loss_7": 5.444403338432312,
"epoch": 0.135,
"grad_norm": 26.607564330476333,
"kl_loss_13": 7480.8,
"kl_loss_26": 6224.8,
"kl_loss_39": 4782.4,
"kl_loss_7": 8202.4,
"learning_rate": 0.0009611771470522907,
"loss": 13240.2,
"step": 1350
},
{
"ce_loss_13": 5.043468415737152,
"ce_loss_26": 4.443963885307312,
"ce_loss_39": 3.7804897725582123,
"ce_loss_52": 1.4098651513457299,
"ce_loss_7": 5.407905113697052,
"epoch": 0.136,
"grad_norm": 27.568927473511277,
"kl_loss_13": 7464.8,
"kl_loss_26": 6209.6,
"kl_loss_39": 4773.6,
"kl_loss_7": 8221.6,
"learning_rate": 0.0009605618265574251,
"loss": 13312.2,
"step": 1360
},
{
"ce_loss_13": 5.1134570121765135,
"ce_loss_26": 4.510142356157303,
"ce_loss_39": 3.844588041305542,
"ce_loss_52": 1.4824964210391045,
"ce_loss_7": 5.455760145187378,
"epoch": 0.137,
"grad_norm": 26.586450808382164,
"kl_loss_13": 7431.2,
"kl_loss_26": 6176.0,
"kl_loss_39": 4748.0,
"kl_loss_7": 8151.2,
"learning_rate": 0.0009599418682100792,
"loss": 13171.6,
"step": 1370
},
{
"ce_loss_13": 4.993066036701203,
"ce_loss_26": 4.390464246273041,
"ce_loss_39": 3.7108724772930146,
"ce_loss_52": 1.3996128499507905,
"ce_loss_7": 5.354221343994141,
"epoch": 0.138,
"grad_norm": 24.51436324855179,
"kl_loss_13": 7388.0,
"kl_loss_26": 6120.0,
"kl_loss_39": 4660.8,
"kl_loss_7": 8145.6,
"learning_rate": 0.0009593172782532268,
"loss": 13135.2,
"step": 1380
},
{
"ce_loss_13": 4.9749324202537535,
"ce_loss_26": 4.36228443980217,
"ce_loss_39": 3.71304127573967,
"ce_loss_52": 1.4259393498301507,
"ce_loss_7": 5.328954219818115,
"epoch": 0.139,
"grad_norm": 25.448579155888293,
"kl_loss_13": 7284.8,
"kl_loss_26": 6005.6,
"kl_loss_39": 4599.2,
"kl_loss_7": 8029.6,
"learning_rate": 0.0009586880629764817,
"loss": 13023.4,
"step": 1390
},
{
"ce_loss_13": 5.021213936805725,
"ce_loss_26": 4.392004972696304,
"ce_loss_39": 3.695616716146469,
"ce_loss_52": 1.3939141556620598,
"ce_loss_7": 5.386792302131653,
"epoch": 0.14,
"grad_norm": 27.169552009752685,
"kl_loss_13": 7436.0,
"kl_loss_26": 6132.8,
"kl_loss_39": 4655.2,
"kl_loss_7": 8205.6,
"learning_rate": 0.0009580542287160348,
"loss": 13043.6,
"step": 1400
},
{
"ce_loss_13": 5.006197059154511,
"ce_loss_26": 4.410893344879151,
"ce_loss_39": 3.74367755651474,
"ce_loss_52": 1.4519873589277268,
"ce_loss_7": 5.36526129245758,
"epoch": 0.141,
"grad_norm": 24.865151038825246,
"kl_loss_13": 7283.2,
"kl_loss_26": 6027.2,
"kl_loss_39": 4615.2,
"kl_loss_7": 8029.6,
"learning_rate": 0.0009574157818545901,
"loss": 12913.8,
"step": 1410
},
{
"ce_loss_13": 4.958354568481445,
"ce_loss_26": 4.367397904396057,
"ce_loss_39": 3.7062928318977355,
"ce_loss_52": 1.4099174112081527,
"ce_loss_7": 5.317552924156189,
"epoch": 0.142,
"grad_norm": 24.898155460709848,
"kl_loss_13": 7277.6,
"kl_loss_26": 6040.8,
"kl_loss_39": 4623.6,
"kl_loss_7": 8032.8,
"learning_rate": 0.0009567727288213005,
"loss": 12929.6,
"step": 1420
},
{
"ce_loss_13": 4.984454607963562,
"ce_loss_26": 4.392505377531052,
"ce_loss_39": 3.7512011885643006,
"ce_loss_52": 1.473440769314766,
"ce_loss_7": 5.340136766433716,
"epoch": 0.143,
"grad_norm": 24.34585690638739,
"kl_loss_13": 7221.6,
"kl_loss_26": 5972.8,
"kl_loss_39": 4602.0,
"kl_loss_7": 7973.6,
"learning_rate": 0.0009561250760917027,
"loss": 12830.2,
"step": 1430
},
{
"ce_loss_13": 4.917237496376037,
"ce_loss_26": 4.313831263780594,
"ce_loss_39": 3.6587266325950623,
"ce_loss_52": 1.4092496067285538,
"ce_loss_7": 5.2786689639091495,
"epoch": 0.144,
"grad_norm": 25.288024521189875,
"kl_loss_13": 7198.4,
"kl_loss_26": 5931.2,
"kl_loss_39": 4527.2,
"kl_loss_7": 7960.8,
"learning_rate": 0.0009554728301876525,
"loss": 12688.6,
"step": 1440
},
{
"ce_loss_13": 4.95776047706604,
"ce_loss_26": 4.340852671861649,
"ce_loss_39": 3.657728981971741,
"ce_loss_52": 1.4168317198753357,
"ce_loss_7": 5.322667574882507,
"epoch": 0.145,
"grad_norm": 26.641005752286592,
"kl_loss_13": 7228.8,
"kl_loss_26": 5940.0,
"kl_loss_39": 4485.2,
"kl_loss_7": 8003.2,
"learning_rate": 0.0009548159976772592,
"loss": 12683.8,
"step": 1450
},
{
"ce_loss_13": 4.831417870521546,
"ce_loss_26": 4.231567287445069,
"ce_loss_39": 3.581255227327347,
"ce_loss_52": 1.4485478460788728,
"ce_loss_7": 5.20115841627121,
"epoch": 0.146,
"grad_norm": 24.920691081516484,
"kl_loss_13": 6952.0,
"kl_loss_26": 5699.2,
"kl_loss_39": 4308.4,
"kl_loss_7": 7733.6,
"learning_rate": 0.0009541545851748186,
"loss": 12574.8,
"step": 1460
},
{
"ce_loss_13": 4.8803037166595455,
"ce_loss_26": 4.2741272211074826,
"ce_loss_39": 3.598990321159363,
"ce_loss_52": 1.4145199984312058,
"ce_loss_7": 5.244284570217133,
"epoch": 0.147,
"grad_norm": 25.90739775261194,
"kl_loss_13": 7076.8,
"kl_loss_26": 5806.4,
"kl_loss_39": 4372.0,
"kl_loss_7": 7841.6,
"learning_rate": 0.0009534885993407473,
"loss": 12558.0,
"step": 1470
},
{
"ce_loss_13": 4.854452967643738,
"ce_loss_26": 4.251825517416,
"ce_loss_39": 3.5948518395423887,
"ce_loss_52": 1.428754985332489,
"ce_loss_7": 5.219927191734314,
"epoch": 0.148,
"grad_norm": 24.48718194678669,
"kl_loss_13": 7008.8,
"kl_loss_26": 5743.2,
"kl_loss_39": 4354.4,
"kl_loss_7": 7777.6,
"learning_rate": 0.0009528180468815154,
"loss": 12484.4,
"step": 1480
},
{
"ce_loss_13": 4.884927380084991,
"ce_loss_26": 4.2919243454933165,
"ce_loss_39": 3.642289215326309,
"ce_loss_52": 1.465419703722,
"ce_loss_7": 5.241849565505982,
"epoch": 0.149,
"grad_norm": 24.903440253335923,
"kl_loss_13": 7001.6,
"kl_loss_26": 5763.2,
"kl_loss_39": 4366.0,
"kl_loss_7": 7752.0,
"learning_rate": 0.0009521429345495787,
"loss": 12486.6,
"step": 1490
},
{
"ce_loss_13": 4.82739794254303,
"ce_loss_26": 4.228940737247467,
"ce_loss_39": 3.5767277657985685,
"ce_loss_52": 1.4382753789424896,
"ce_loss_7": 5.209846138954163,
"epoch": 0.15,
"grad_norm": 25.291080092187237,
"kl_loss_13": 6960.8,
"kl_loss_26": 5698.4,
"kl_loss_39": 4297.6,
"kl_loss_7": 7764.0,
"learning_rate": 0.0009514632691433108,
"loss": 12420.2,
"step": 1500
},
{
"ce_loss_13": 4.828064477443695,
"ce_loss_26": 4.213042998313904,
"ce_loss_39": 3.5323724269866945,
"ce_loss_52": 1.3961022228002549,
"ce_loss_7": 5.196406292915344,
"epoch": 0.151,
"grad_norm": 25.466425081780237,
"kl_loss_13": 7046.4,
"kl_loss_26": 5772.8,
"kl_loss_39": 4315.2,
"kl_loss_7": 7825.6,
"learning_rate": 0.0009507790575069346,
"loss": 12387.6,
"step": 1510
},
{
"ce_loss_13": 4.786497128009796,
"ce_loss_26": 4.188397663831711,
"ce_loss_39": 3.536989223957062,
"ce_loss_52": 1.4404057756066322,
"ce_loss_7": 5.156642246246338,
"epoch": 0.152,
"grad_norm": 22.488994996506335,
"kl_loss_13": 6872.8,
"kl_loss_26": 5620.8,
"kl_loss_39": 4225.6,
"kl_loss_7": 7650.4,
"learning_rate": 0.0009500903065304539,
"loss": 12265.4,
"step": 1520
},
{
"ce_loss_13": 4.79404227733612,
"ce_loss_26": 4.1956378519535065,
"ce_loss_39": 3.539849889278412,
"ce_loss_52": 1.447507870197296,
"ce_loss_7": 5.170107614994049,
"epoch": 0.153,
"grad_norm": 24.979481722705945,
"kl_loss_13": 6864.0,
"kl_loss_26": 5609.6,
"kl_loss_39": 4213.6,
"kl_loss_7": 7656.8,
"learning_rate": 0.0009493970231495835,
"loss": 12182.2,
"step": 1530
},
{
"ce_loss_13": 4.754118239879608,
"ce_loss_26": 4.16424406170845,
"ce_loss_39": 3.5151414275169373,
"ce_loss_52": 1.423200336098671,
"ce_loss_7": 5.132595348358154,
"epoch": 0.154,
"grad_norm": 24.139218625352445,
"kl_loss_13": 6807.2,
"kl_loss_26": 5573.6,
"kl_loss_39": 4190.4,
"kl_loss_7": 7594.4,
"learning_rate": 0.0009486992143456792,
"loss": 12152.0,
"step": 1540
},
{
"ce_loss_13": 4.745328724384308,
"ce_loss_26": 4.135520172119141,
"ce_loss_39": 3.4818262457847595,
"ce_loss_52": 1.4286953419446946,
"ce_loss_7": 5.114579677581787,
"epoch": 0.155,
"grad_norm": 24.426109316342576,
"kl_loss_13": 6791.2,
"kl_loss_26": 5516.8,
"kl_loss_39": 4120.0,
"kl_loss_7": 7567.2,
"learning_rate": 0.0009479968871456679,
"loss": 12128.4,
"step": 1550
},
{
"ce_loss_13": 4.7574557065963745,
"ce_loss_26": 4.145674997568131,
"ce_loss_39": 3.476013499498367,
"ce_loss_52": 1.4235228240489959,
"ce_loss_7": 5.133380055427551,
"epoch": 0.156,
"grad_norm": 25.100926583342837,
"kl_loss_13": 6843.2,
"kl_loss_26": 5556.0,
"kl_loss_39": 4126.0,
"kl_loss_7": 7627.2,
"learning_rate": 0.0009472900486219768,
"loss": 12082.2,
"step": 1560
},
{
"ce_loss_13": 4.735032224655152,
"ce_loss_26": 4.128789341449737,
"ce_loss_39": 3.4694815576076508,
"ce_loss_52": 1.4237273722887038,
"ce_loss_7": 5.11258887052536,
"epoch": 0.157,
"grad_norm": 25.10370372986473,
"kl_loss_13": 6792.0,
"kl_loss_26": 5520.0,
"kl_loss_39": 4095.6,
"kl_loss_7": 7585.6,
"learning_rate": 0.000946578705892462,
"loss": 11936.2,
"step": 1570
},
{
"ce_loss_13": 4.741922962665558,
"ce_loss_26": 4.132791459560394,
"ce_loss_39": 3.482679557800293,
"ce_loss_52": 1.4294559836387635,
"ce_loss_7": 5.117163801193238,
"epoch": 0.158,
"grad_norm": 21.844394510796377,
"kl_loss_13": 6799.2,
"kl_loss_26": 5517.6,
"kl_loss_39": 4118.0,
"kl_loss_7": 7581.6,
"learning_rate": 0.0009458628661203367,
"loss": 11944.8,
"step": 1580
},
{
"ce_loss_13": 4.741668605804444,
"ce_loss_26": 4.1376284003257755,
"ce_loss_39": 3.478295695781708,
"ce_loss_52": 1.415444830060005,
"ce_loss_7": 5.117357003688812,
"epoch": 0.159,
"grad_norm": 25.4671883290825,
"kl_loss_13": 6812.0,
"kl_loss_26": 5548.0,
"kl_loss_39": 4136.8,
"kl_loss_7": 7601.6,
"learning_rate": 0.0009451425365140996,
"loss": 11952.4,
"step": 1590
},
{
"ce_loss_13": 4.723819291591644,
"ce_loss_26": 4.128834217786789,
"ce_loss_39": 3.47242848277092,
"ce_loss_52": 1.429117676615715,
"ce_loss_7": 5.096058523654937,
"epoch": 0.16,
"grad_norm": 25.14078013617688,
"kl_loss_13": 6768.0,
"kl_loss_26": 5519.2,
"kl_loss_39": 4101.6,
"kl_loss_7": 7547.2,
"learning_rate": 0.0009444177243274617,
"loss": 11862.0,
"step": 1600
},
{
"ce_loss_13": 4.648782467842102,
"ce_loss_26": 4.0394273698329926,
"ce_loss_39": 3.377221292257309,
"ce_loss_52": 1.4151206001639367,
"ce_loss_7": 5.0250336050987245,
"epoch": 0.161,
"grad_norm": 24.128253336718885,
"kl_loss_13": 6640.0,
"kl_loss_26": 5364.0,
"kl_loss_39": 3953.2,
"kl_loss_7": 7436.8,
"learning_rate": 0.0009436884368592739,
"loss": 11833.0,
"step": 1610
},
{
"ce_loss_13": 4.695314359664917,
"ce_loss_26": 4.099924111366272,
"ce_loss_39": 3.466512751579285,
"ce_loss_52": 1.4766929775476456,
"ce_loss_7": 5.064470827579498,
"epoch": 0.162,
"grad_norm": 23.68843577414951,
"kl_loss_13": 6614.4,
"kl_loss_26": 5368.8,
"kl_loss_39": 3996.8,
"kl_loss_7": 7387.2,
"learning_rate": 0.0009429546814534529,
"loss": 11713.8,
"step": 1620
},
{
"ce_loss_13": 4.7040504813194275,
"ce_loss_26": 4.104205197095871,
"ce_loss_39": 3.4451481282711027,
"ce_loss_52": 1.4428326219320298,
"ce_loss_7": 5.08098030090332,
"epoch": 0.163,
"grad_norm": 23.332187460756046,
"kl_loss_13": 6673.6,
"kl_loss_26": 5408.0,
"kl_loss_39": 4000.0,
"kl_loss_7": 7468.8,
"learning_rate": 0.0009422164654989072,
"loss": 11730.0,
"step": 1630
},
{
"ce_loss_13": 4.6945901870727536,
"ce_loss_26": 4.092492777109146,
"ce_loss_39": 3.4360527455806733,
"ce_loss_52": 1.4436773255467414,
"ce_loss_7": 5.079924070835114,
"epoch": 0.164,
"grad_norm": 25.877563512298988,
"kl_loss_13": 6666.4,
"kl_loss_26": 5404.8,
"kl_loss_39": 4011.6,
"kl_loss_7": 7479.2,
"learning_rate": 0.0009414737964294635,
"loss": 11645.0,
"step": 1640
},
{
"ce_loss_13": 4.614939618110657,
"ce_loss_26": 4.018665736913681,
"ce_loss_39": 3.3586190402507783,
"ce_loss_52": 1.4472161442041398,
"ce_loss_7": 4.990367615222931,
"epoch": 0.165,
"grad_norm": 24.534381720947415,
"kl_loss_13": 6511.2,
"kl_loss_26": 5264.0,
"kl_loss_39": 3849.6,
"kl_loss_7": 7293.6,
"learning_rate": 0.000940726681723791,
"loss": 11568.6,
"step": 1650
},
{
"ce_loss_13": 4.539776319265366,
"ce_loss_26": 3.943451428413391,
"ce_loss_39": 3.281195378303528,
"ce_loss_52": 1.4070941284298897,
"ce_loss_7": 4.923744630813599,
"epoch": 0.166,
"grad_norm": 23.51720209782485,
"kl_loss_13": 6449.6,
"kl_loss_26": 5196.0,
"kl_loss_39": 3786.0,
"kl_loss_7": 7249.6,
"learning_rate": 0.0009399751289053266,
"loss": 11569.4,
"step": 1660
},
{
"ce_loss_13": 4.590777164697647,
"ce_loss_26": 3.9918887853622436,
"ce_loss_39": 3.328452670574188,
"ce_loss_52": 1.4019996047019958,
"ce_loss_7": 4.978202056884766,
"epoch": 0.167,
"grad_norm": 22.82794096581106,
"kl_loss_13": 6550.4,
"kl_loss_26": 5291.2,
"kl_loss_39": 3877.2,
"kl_loss_7": 7354.4,
"learning_rate": 0.0009392191455421988,
"loss": 11557.4,
"step": 1670
},
{
"ce_loss_13": 4.534084904193878,
"ce_loss_26": 3.9383736848831177,
"ce_loss_39": 3.290838527679443,
"ce_loss_52": 1.3803422033786774,
"ce_loss_7": 4.9193053364753725,
"epoch": 0.168,
"grad_norm": 22.01316358613574,
"kl_loss_13": 6469.6,
"kl_loss_26": 5224.0,
"kl_loss_39": 3837.2,
"kl_loss_7": 7273.6,
"learning_rate": 0.0009384587392471515,
"loss": 11454.2,
"step": 1680
},
{
"ce_loss_13": 4.5477269172668455,
"ce_loss_26": 3.9558385491371153,
"ce_loss_39": 3.3075734674930573,
"ce_loss_52": 1.410713329911232,
"ce_loss_7": 4.9310637474060055,
"epoch": 0.169,
"grad_norm": 24.025001534080104,
"kl_loss_13": 6453.6,
"kl_loss_26": 5223.2,
"kl_loss_39": 3830.0,
"kl_loss_7": 7244.0,
"learning_rate": 0.0009376939176774678,
"loss": 11355.2,
"step": 1690
},
{
"ce_loss_13": 4.580456328392029,
"ce_loss_26": 3.996914601325989,
"ce_loss_39": 3.3568237483501435,
"ce_loss_52": 1.4514233976602555,
"ce_loss_7": 4.956906342506409,
"epoch": 0.17,
"grad_norm": 24.061048820242437,
"kl_loss_13": 6424.8,
"kl_loss_26": 5199.2,
"kl_loss_39": 3822.0,
"kl_loss_7": 7210.4,
"learning_rate": 0.0009369246885348925,
"loss": 11365.4,
"step": 1700
},
{
"ce_loss_13": 4.5829225301742555,
"ce_loss_26": 3.973430114984512,
"ce_loss_39": 3.3136274456977843,
"ce_loss_52": 1.4179346442222596,
"ce_loss_7": 4.9602553129196165,
"epoch": 0.171,
"grad_norm": 21.925882863353518,
"kl_loss_13": 6508.0,
"kl_loss_26": 5225.6,
"kl_loss_39": 3821.2,
"kl_loss_7": 7300.0,
"learning_rate": 0.0009361510595655545,
"loss": 11427.8,
"step": 1710
},
{
"ce_loss_13": 4.597618329524994,
"ce_loss_26": 4.014382421970367,
"ce_loss_39": 3.3793311297893522,
"ce_loss_52": 1.4502436846494675,
"ce_loss_7": 4.970238649845124,
"epoch": 0.172,
"grad_norm": 21.861723684559113,
"kl_loss_13": 6463.2,
"kl_loss_26": 5241.6,
"kl_loss_39": 3880.8,
"kl_loss_7": 7242.4,
"learning_rate": 0.0009353730385598887,
"loss": 11300.4,
"step": 1720
},
{
"ce_loss_13": 4.474293851852417,
"ce_loss_26": 3.8755543529987335,
"ce_loss_39": 3.212596780061722,
"ce_loss_52": 1.4004584550857544,
"ce_loss_7": 4.856262743473053,
"epoch": 0.173,
"grad_norm": 23.168666460490822,
"kl_loss_13": 6318.4,
"kl_loss_26": 5065.6,
"kl_loss_39": 3658.4,
"kl_loss_7": 7116.8,
"learning_rate": 0.0009345906333525581,
"loss": 11205.0,
"step": 1730
},
{
"ce_loss_13": 4.5212029337883,
"ce_loss_26": 3.9314939856529234,
"ce_loss_39": 3.301447206735611,
"ce_loss_52": 1.422508242726326,
"ce_loss_7": 4.894874656200409,
"epoch": 0.174,
"grad_norm": 25.870791070867757,
"kl_loss_13": 6358.4,
"kl_loss_26": 5133.6,
"kl_loss_39": 3775.2,
"kl_loss_7": 7142.4,
"learning_rate": 0.0009338038518223745,
"loss": 11159.2,
"step": 1740
},
{
"ce_loss_13": 4.551153075695038,
"ce_loss_26": 3.96026993393898,
"ce_loss_39": 3.326349085569382,
"ce_loss_52": 1.4542756617069243,
"ce_loss_7": 4.919975602626801,
"epoch": 0.175,
"grad_norm": 23.828468964880035,
"kl_loss_13": 6352.8,
"kl_loss_26": 5109.6,
"kl_loss_39": 3757.6,
"kl_loss_7": 7129.6,
"learning_rate": 0.0009330127018922195,
"loss": 11089.0,
"step": 1750
},
{
"ce_loss_13": 4.469128930568695,
"ce_loss_26": 3.8787964940071107,
"ce_loss_39": 3.2338991940021513,
"ce_loss_52": 1.4316335827112199,
"ce_loss_7": 4.848294925689697,
"epoch": 0.176,
"grad_norm": 24.772424094235244,
"kl_loss_13": 6252.8,
"kl_loss_26": 5015.2,
"kl_loss_39": 3643.2,
"kl_loss_7": 7041.6,
"learning_rate": 0.0009322171915289634,
"loss": 11050.6,
"step": 1760
},
{
"ce_loss_13": 4.515468680858612,
"ce_loss_26": 3.9264565110206604,
"ce_loss_39": 3.2920862257480623,
"ce_loss_52": 1.46503643989563,
"ce_loss_7": 4.88274484872818,
"epoch": 0.177,
"grad_norm": 24.580027558725412,
"kl_loss_13": 6243.2,
"kl_loss_26": 5015.2,
"kl_loss_39": 3668.0,
"kl_loss_7": 7024.0,
"learning_rate": 0.0009314173287433873,
"loss": 11083.0,
"step": 1770
},
{
"ce_loss_13": 4.563484919071198,
"ce_loss_26": 3.988471633195877,
"ce_loss_39": 3.3576016187667848,
"ce_loss_52": 1.4738382428884507,
"ce_loss_7": 4.929107296466827,
"epoch": 0.178,
"grad_norm": 23.727065102019264,
"kl_loss_13": 6340.0,
"kl_loss_26": 5134.4,
"kl_loss_39": 3774.0,
"kl_loss_7": 7108.0,
"learning_rate": 0.0009306131215901003,
"loss": 11053.2,
"step": 1780
},
{
"ce_loss_13": 4.485390210151673,
"ce_loss_26": 3.9024369359016418,
"ce_loss_39": 3.277720022201538,
"ce_loss_52": 1.4684919208288192,
"ce_loss_7": 4.849484694004059,
"epoch": 0.179,
"grad_norm": 24.140381804707665,
"kl_loss_13": 6222.4,
"kl_loss_26": 4996.0,
"kl_loss_39": 3639.6,
"kl_loss_7": 6991.2,
"learning_rate": 0.0009298045781674596,
"loss": 10948.8,
"step": 1790
},
{
"ce_loss_13": 4.485648030042649,
"ce_loss_26": 3.8959447860717775,
"ce_loss_39": 3.255040627717972,
"ce_loss_52": 1.41890487074852,
"ce_loss_7": 4.864380013942719,
"epoch": 0.18,
"grad_norm": 25.753548379396687,
"kl_loss_13": 6269.6,
"kl_loss_26": 5029.6,
"kl_loss_39": 3653.2,
"kl_loss_7": 7068.8,
"learning_rate": 0.0009289917066174886,
"loss": 10940.4,
"step": 1800
},
{
"ce_loss_13": 4.4491588294506075,
"ce_loss_26": 3.862889313697815,
"ce_loss_39": 3.203300213813782,
"ce_loss_52": 1.4129745751619338,
"ce_loss_7": 4.8373774766921995,
"epoch": 0.181,
"grad_norm": 23.580007870242206,
"kl_loss_13": 6251.2,
"kl_loss_26": 5015.2,
"kl_loss_39": 3609.2,
"kl_loss_7": 7063.2,
"learning_rate": 0.0009281745151257945,
"loss": 10831.6,
"step": 1810
},
{
"ce_loss_13": 4.4796471238136295,
"ce_loss_26": 3.9034676015377046,
"ce_loss_39": 3.27801650762558,
"ce_loss_52": 1.470898449420929,
"ce_loss_7": 4.846702206134796,
"epoch": 0.182,
"grad_norm": 21.825066910706077,
"kl_loss_13": 6129.6,
"kl_loss_26": 4921.6,
"kl_loss_39": 3590.4,
"kl_loss_7": 6907.2,
"learning_rate": 0.0009273530119214868,
"loss": 10852.6,
"step": 1820
},
{
"ce_loss_13": 4.397759801149368,
"ce_loss_26": 3.809650295972824,
"ce_loss_39": 3.163052296638489,
"ce_loss_52": 1.4123397037386893,
"ce_loss_7": 4.76624493598938,
"epoch": 0.183,
"grad_norm": 23.028395579089935,
"kl_loss_13": 6109.6,
"kl_loss_26": 4884.8,
"kl_loss_39": 3520.0,
"kl_loss_7": 6886.4,
"learning_rate": 0.0009265272052770935,
"loss": 10776.6,
"step": 1830
},
{
"ce_loss_13": 4.409473043680191,
"ce_loss_26": 3.825248968601227,
"ce_loss_39": 3.174910306930542,
"ce_loss_52": 1.4039017781615257,
"ce_loss_7": 4.799298018217087,
"epoch": 0.184,
"grad_norm": 22.60594476207274,
"kl_loss_13": 6165.6,
"kl_loss_26": 4934.4,
"kl_loss_39": 3543.2,
"kl_loss_7": 6977.6,
"learning_rate": 0.0009256971035084784,
"loss": 10733.4,
"step": 1840
},
{
"ce_loss_13": 4.3755183041095735,
"ce_loss_26": 3.797974693775177,
"ce_loss_39": 3.1725789427757265,
"ce_loss_52": 1.4232216864824294,
"ce_loss_7": 4.739054465293885,
"epoch": 0.185,
"grad_norm": 23.627865972104136,
"kl_loss_13": 6060.0,
"kl_loss_26": 4843.2,
"kl_loss_39": 3517.2,
"kl_loss_7": 6827.2,
"learning_rate": 0.0009248627149747573,
"loss": 10698.4,
"step": 1850
},
{
"ce_loss_13": 4.422569459676742,
"ce_loss_26": 3.822605752944946,
"ce_loss_39": 3.1763491451740267,
"ce_loss_52": 1.427757203578949,
"ce_loss_7": 4.793670791387558,
"epoch": 0.186,
"grad_norm": 22.345780165109367,
"kl_loss_13": 6140.0,
"kl_loss_26": 4902.0,
"kl_loss_39": 3525.6,
"kl_loss_7": 6920.0,
"learning_rate": 0.0009240240480782129,
"loss": 10688.6,
"step": 1860
},
{
"ce_loss_13": 4.390002739429474,
"ce_loss_26": 3.8117696583271026,
"ce_loss_39": 3.193506735563278,
"ce_loss_52": 1.4390262439846992,
"ce_loss_7": 4.754710161685944,
"epoch": 0.187,
"grad_norm": 24.270272909834983,
"kl_loss_13": 6056.0,
"kl_loss_26": 4841.6,
"kl_loss_39": 3523.2,
"kl_loss_7": 6828.0,
"learning_rate": 0.0009231811112642122,
"loss": 10605.8,
"step": 1870
},
{
"ce_loss_13": 4.347514522075653,
"ce_loss_26": 3.774775582551956,
"ce_loss_39": 3.1524779438972472,
"ce_loss_52": 1.4184574037790298,
"ce_loss_7": 4.711814332008362,
"epoch": 0.188,
"grad_norm": 23.060486415907942,
"kl_loss_13": 6006.4,
"kl_loss_26": 4801.6,
"kl_loss_39": 3474.8,
"kl_loss_7": 6776.0,
"learning_rate": 0.0009223339130211192,
"loss": 10599.8,
"step": 1880
},
{
"ce_loss_13": 4.280169582366943,
"ce_loss_26": 3.6960571646690368,
"ce_loss_39": 3.0768611639738084,
"ce_loss_52": 1.4011510267853737,
"ce_loss_7": 4.650495028495788,
"epoch": 0.189,
"grad_norm": 23.308893883500843,
"kl_loss_13": 5916.0,
"kl_loss_26": 4693.6,
"kl_loss_39": 3365.6,
"kl_loss_7": 6688.8,
"learning_rate": 0.0009214824618802108,
"loss": 10510.0,
"step": 1890
},
{
"ce_loss_13": 4.426742446422577,
"ce_loss_26": 3.835584044456482,
"ce_loss_39": 3.1762202858924864,
"ce_loss_52": 1.435165250301361,
"ce_loss_7": 4.8001045942306515,
"epoch": 0.19,
"grad_norm": 24.259267724718942,
"kl_loss_13": 6154.4,
"kl_loss_26": 4914.4,
"kl_loss_39": 3499.2,
"kl_loss_7": 6933.6,
"learning_rate": 0.0009206267664155906,
"loss": 10574.0,
"step": 1900
},
{
"ce_loss_13": 4.317660903930664,
"ce_loss_26": 3.736785036325455,
"ce_loss_39": 3.102087676525116,
"ce_loss_52": 1.4297346964478492,
"ce_loss_7": 4.69397531747818,
"epoch": 0.191,
"grad_norm": 23.3562329011761,
"kl_loss_13": 5937.6,
"kl_loss_26": 4723.2,
"kl_loss_39": 3373.2,
"kl_loss_7": 6731.2,
"learning_rate": 0.0009197668352441024,
"loss": 10503.4,
"step": 1910
},
{
"ce_loss_13": 4.334453409910202,
"ce_loss_26": 3.7587957322597503,
"ce_loss_39": 3.123855656385422,
"ce_loss_52": 1.4094826728105545,
"ce_loss_7": 4.709676373004913,
"epoch": 0.192,
"grad_norm": 24.22470876078119,
"kl_loss_13": 5996.8,
"kl_loss_26": 4786.4,
"kl_loss_39": 3430.0,
"kl_loss_7": 6772.8,
"learning_rate": 0.0009189026770252437,
"loss": 10471.0,
"step": 1920
},
{
"ce_loss_13": 4.351706159114838,
"ce_loss_26": 3.7773966193199158,
"ce_loss_39": 3.1497732281684874,
"ce_loss_52": 1.4320787012577056,
"ce_loss_7": 4.7191231608390805,
"epoch": 0.193,
"grad_norm": 23.447904782586527,
"kl_loss_13": 5997.6,
"kl_loss_26": 4794.4,
"kl_loss_39": 3448.4,
"kl_loss_7": 6763.2,
"learning_rate": 0.000918034300461078,
"loss": 10433.4,
"step": 1930
},
{
"ce_loss_13": 4.307104933261871,
"ce_loss_26": 3.7206166088581085,
"ce_loss_39": 3.091316682100296,
"ce_loss_52": 1.4110687702894211,
"ce_loss_7": 4.676908355951309,
"epoch": 0.194,
"grad_norm": 23.93372642527522,
"kl_loss_13": 5951.2,
"kl_loss_26": 4727.6,
"kl_loss_39": 3376.0,
"kl_loss_7": 6721.6,
"learning_rate": 0.0009171617142961477,
"loss": 10442.2,
"step": 1940
},
{
"ce_loss_13": 4.3363093614578245,
"ce_loss_26": 3.750982850790024,
"ce_loss_39": 3.111935979127884,
"ce_loss_52": 1.431942057609558,
"ce_loss_7": 4.707539451122284,
"epoch": 0.195,
"grad_norm": 23.910939036749266,
"kl_loss_13": 5967.2,
"kl_loss_26": 4743.2,
"kl_loss_39": 3382.8,
"kl_loss_7": 6752.0,
"learning_rate": 0.0009162849273173857,
"loss": 10366.8,
"step": 1950
},
{
"ce_loss_13": 4.271794074773789,
"ce_loss_26": 3.7012794077396394,
"ce_loss_39": 3.0882445216178893,
"ce_loss_52": 1.4407746940851212,
"ce_loss_7": 4.636665797233581,
"epoch": 0.196,
"grad_norm": 23.30649566244444,
"kl_loss_13": 5844.8,
"kl_loss_26": 4652.8,
"kl_loss_39": 3337.2,
"kl_loss_7": 6608.8,
"learning_rate": 0.0009154039483540273,
"loss": 10313.0,
"step": 1960
},
{
"ce_loss_13": 4.3892871856689455,
"ce_loss_26": 3.8117631673812866,
"ce_loss_39": 3.1672019243240355,
"ce_loss_52": 1.4633917301893233,
"ce_loss_7": 4.75022611618042,
"epoch": 0.197,
"grad_norm": 22.823988656575857,
"kl_loss_13": 5992.0,
"kl_loss_26": 4784.0,
"kl_loss_39": 3421.6,
"kl_loss_7": 6749.6,
"learning_rate": 0.0009145187862775209,
"loss": 10294.2,
"step": 1970
},
{
"ce_loss_13": 4.251708203554154,
"ce_loss_26": 3.68521209359169,
"ce_loss_39": 3.0638678431510926,
"ce_loss_52": 1.4189983233809471,
"ce_loss_7": 4.615437304973602,
"epoch": 0.198,
"grad_norm": 22.115400900183356,
"kl_loss_13": 5814.4,
"kl_loss_26": 4623.2,
"kl_loss_39": 3291.2,
"kl_loss_7": 6581.6,
"learning_rate": 0.0009136294500014386,
"loss": 10194.8,
"step": 1980
},
{
"ce_loss_13": 4.36306391954422,
"ce_loss_26": 3.7900101482868194,
"ce_loss_39": 3.1458350718021393,
"ce_loss_52": 1.431062677502632,
"ce_loss_7": 4.733654403686524,
"epoch": 0.199,
"grad_norm": 21.64648055888152,
"kl_loss_13": 6001.6,
"kl_loss_26": 4801.6,
"kl_loss_39": 3444.4,
"kl_loss_7": 6778.4,
"learning_rate": 0.000912735948481387,
"loss": 10217.4,
"step": 1990
},
{
"ce_loss_13": 4.2783638596534725,
"ce_loss_26": 3.7015498995780947,
"ce_loss_39": 3.079295587539673,
"ce_loss_52": 1.4367393761873246,
"ce_loss_7": 4.643443429470063,
"epoch": 0.2,
"grad_norm": 22.667053535414237,
"kl_loss_13": 5844.8,
"kl_loss_26": 4641.2,
"kl_loss_39": 3314.8,
"kl_loss_7": 6607.2,
"learning_rate": 0.0009118382907149164,
"loss": 10108.9,
"step": 2000
},
{
"ce_loss_13": 4.298079961538315,
"ce_loss_26": 3.7112639427185057,
"ce_loss_39": 3.0900191485881807,
"ce_loss_52": 1.4447624236345291,
"ce_loss_7": 4.659080803394318,
"epoch": 0.201,
"grad_norm": 21.421967222285037,
"kl_loss_13": 5860.8,
"kl_loss_26": 4647.6,
"kl_loss_39": 3308.4,
"kl_loss_7": 6620.8,
"learning_rate": 0.0009109364857414306,
"loss": 10210.1,
"step": 2010
},
{
"ce_loss_13": 4.298530715703964,
"ce_loss_26": 3.7281334936618804,
"ce_loss_39": 3.103990191221237,
"ce_loss_52": 1.445554968714714,
"ce_loss_7": 4.667031800746917,
"epoch": 0.202,
"grad_norm": 22.186513808055555,
"kl_loss_13": 5863.2,
"kl_loss_26": 4661.6,
"kl_loss_39": 3329.2,
"kl_loss_7": 6627.2,
"learning_rate": 0.0009100305426420956,
"loss": 10090.6,
"step": 2020
},
{
"ce_loss_13": 4.2117482125759125,
"ce_loss_26": 3.6511631190776823,
"ce_loss_39": 3.0435081899166105,
"ce_loss_52": 1.4112246841192246,
"ce_loss_7": 4.573570990562439,
"epoch": 0.203,
"grad_norm": 23.22208055275699,
"kl_loss_13": 5758.4,
"kl_loss_26": 4581.6,
"kl_loss_39": 3281.6,
"kl_loss_7": 6522.4,
"learning_rate": 0.0009091204705397484,
"loss": 10094.4,
"step": 2030
},
{
"ce_loss_13": 4.2797119140625,
"ce_loss_26": 3.7096898019313813,
"ce_loss_39": 3.0815310001373293,
"ce_loss_52": 1.4514381274580956,
"ce_loss_7": 4.634703290462494,
"epoch": 0.204,
"grad_norm": 22.77691157290275,
"kl_loss_13": 5768.8,
"kl_loss_26": 4568.8,
"kl_loss_39": 3229.6,
"kl_loss_7": 6520.8,
"learning_rate": 0.0009082062785988049,
"loss": 10052.8,
"step": 2040
},
{
"ce_loss_13": 4.228492313623429,
"ce_loss_26": 3.649516838788986,
"ce_loss_39": 3.017675918340683,
"ce_loss_52": 1.4015851855278014,
"ce_loss_7": 4.5945727050304415,
"epoch": 0.205,
"grad_norm": 24.043992652900016,
"kl_loss_13": 5788.0,
"kl_loss_26": 4580.8,
"kl_loss_39": 3234.0,
"kl_loss_7": 6555.2,
"learning_rate": 0.0009072879760251679,
"loss": 10047.6,
"step": 2050
},
{
"ce_loss_13": 4.153064209222793,
"ce_loss_26": 3.5778062403202058,
"ce_loss_39": 2.964171904325485,
"ce_loss_52": 1.4066181004047393,
"ce_loss_7": 4.5237502455711365,
"epoch": 0.206,
"grad_norm": 23.14612831170837,
"kl_loss_13": 5666.4,
"kl_loss_26": 4462.4,
"kl_loss_39": 3156.0,
"kl_loss_7": 6428.0,
"learning_rate": 0.0009063655720661341,
"loss": 10022.0,
"step": 2060
},
{
"ce_loss_13": 4.162076050043106,
"ce_loss_26": 3.5850139617919923,
"ce_loss_39": 2.9763170003890993,
"ce_loss_52": 1.4114871382713319,
"ce_loss_7": 4.525000536441803,
"epoch": 0.207,
"grad_norm": 23.335931334507656,
"kl_loss_13": 5684.8,
"kl_loss_26": 4486.4,
"kl_loss_39": 3172.4,
"kl_loss_7": 6440.8,
"learning_rate": 0.000905439076010301,
"loss": 9910.8,
"step": 2070
},
{
"ce_loss_13": 4.203662091493607,
"ce_loss_26": 3.6453104853630065,
"ce_loss_39": 3.031943756341934,
"ce_loss_52": 1.4498305425047875,
"ce_loss_7": 4.567969477176666,
"epoch": 0.208,
"grad_norm": 22.17297694250979,
"kl_loss_13": 5653.6,
"kl_loss_26": 4474.4,
"kl_loss_39": 3178.4,
"kl_loss_7": 6404.0,
"learning_rate": 0.0009045084971874737,
"loss": 9890.1,
"step": 2080
},
{
"ce_loss_13": 4.1253215074539185,
"ce_loss_26": 3.5510447442531587,
"ce_loss_39": 2.9255994498729705,
"ce_loss_52": 1.383307683467865,
"ce_loss_7": 4.503264659643174,
"epoch": 0.209,
"grad_norm": 21.83866221628796,
"kl_loss_13": 5675.2,
"kl_loss_26": 4462.8,
"kl_loss_39": 3118.0,
"kl_loss_7": 6452.0,
"learning_rate": 0.0009035738449685707,
"loss": 9916.2,
"step": 2090
},
{
"ce_loss_13": 4.232741326093674,
"ce_loss_26": 3.6592664182186128,
"ce_loss_39": 3.0402495503425597,
"ce_loss_52": 1.4619457066059112,
"ce_loss_7": 4.595292699337006,
"epoch": 0.21,
"grad_norm": 23.1172808852491,
"kl_loss_13": 5709.6,
"kl_loss_26": 4522.8,
"kl_loss_39": 3192.0,
"kl_loss_7": 6460.8,
"learning_rate": 0.0009026351287655293,
"loss": 9882.4,
"step": 2100
},
{
"ce_loss_13": 4.196552646160126,
"ce_loss_26": 3.6394916236400605,
"ce_loss_39": 3.029403269290924,
"ce_loss_52": 1.431785149872303,
"ce_loss_7": 4.555036389827729,
"epoch": 0.211,
"grad_norm": 22.472192746858727,
"kl_loss_13": 5678.4,
"kl_loss_26": 4513.2,
"kl_loss_39": 3201.6,
"kl_loss_7": 6423.2,
"learning_rate": 0.0009016923580312113,
"loss": 9778.0,
"step": 2110
},
{
"ce_loss_13": 4.267941182851791,
"ce_loss_26": 3.6975386798381806,
"ce_loss_39": 3.0839960873126984,
"ce_loss_52": 1.4794423222541808,
"ce_loss_7": 4.60888249874115,
"epoch": 0.212,
"grad_norm": 23.91558691594894,
"kl_loss_13": 5697.6,
"kl_loss_26": 4502.8,
"kl_loss_39": 3191.2,
"kl_loss_7": 6425.6,
"learning_rate": 0.0009007455422593077,
"loss": 9764.0,
"step": 2120
},
{
"ce_loss_13": 4.170402336120605,
"ce_loss_26": 3.5973034620285036,
"ce_loss_39": 2.9883872270584106,
"ce_loss_52": 1.4295871376991272,
"ce_loss_7": 4.519614219665527,
"epoch": 0.213,
"grad_norm": 22.812352077428177,
"kl_loss_13": 5644.0,
"kl_loss_26": 4456.4,
"kl_loss_39": 3137.2,
"kl_loss_7": 6387.2,
"learning_rate": 0.0008997946909842425,
"loss": 9755.6,
"step": 2130
},
{
"ce_loss_13": 4.10284715294838,
"ce_loss_26": 3.5395869314670563,
"ce_loss_39": 2.9268704533576964,
"ce_loss_52": 1.4086133271455765,
"ce_loss_7": 4.462130695581436,
"epoch": 0.214,
"grad_norm": 22.23702862817867,
"kl_loss_13": 5524.8,
"kl_loss_26": 4340.4,
"kl_loss_39": 3036.0,
"kl_loss_7": 6288.8,
"learning_rate": 0.0008988398137810777,
"loss": 9645.0,
"step": 2140
},
{
"ce_loss_13": 4.079320967197418,
"ce_loss_26": 3.496495473384857,
"ce_loss_39": 2.8816928565502167,
"ce_loss_52": 1.3819068521261215,
"ce_loss_7": 4.434896755218506,
"epoch": 0.215,
"grad_norm": 22.79915028059786,
"kl_loss_13": 5541.6,
"kl_loss_26": 4350.0,
"kl_loss_39": 3045.2,
"kl_loss_7": 6288.8,
"learning_rate": 0.0008978809202654162,
"loss": 9686.6,
"step": 2150
},
{
"ce_loss_13": 4.069333535432816,
"ce_loss_26": 3.5059276044368746,
"ce_loss_39": 2.8949272632598877,
"ce_loss_52": 1.409129326045513,
"ce_loss_7": 4.434026664495468,
"epoch": 0.216,
"grad_norm": 22.908702660837623,
"kl_loss_13": 5464.0,
"kl_loss_26": 4286.8,
"kl_loss_39": 2986.8,
"kl_loss_7": 6228.0,
"learning_rate": 0.0008969180200933046,
"loss": 9665.2,
"step": 2160
},
{
"ce_loss_13": 4.164896643161773,
"ce_loss_26": 3.594840294122696,
"ce_loss_39": 2.9824715733528135,
"ce_loss_52": 1.4398185968399049,
"ce_loss_7": 4.5187140583992,
"epoch": 0.217,
"grad_norm": 22.2992725858673,
"kl_loss_13": 5602.4,
"kl_loss_26": 4409.6,
"kl_loss_39": 3101.6,
"kl_loss_7": 6338.4,
"learning_rate": 0.0008959511229611376,
"loss": 9611.1,
"step": 2170
},
{
"ce_loss_13": 4.1424953758716585,
"ce_loss_26": 3.5846896708011626,
"ce_loss_39": 2.9776509165763856,
"ce_loss_52": 1.4638631641864777,
"ce_loss_7": 4.499098914861679,
"epoch": 0.218,
"grad_norm": 22.569206560566755,
"kl_loss_13": 5520.8,
"kl_loss_26": 4360.4,
"kl_loss_39": 3039.2,
"kl_loss_7": 6262.4,
"learning_rate": 0.0008949802386055581,
"loss": 9598.7,
"step": 2180
},
{
"ce_loss_13": 4.124321860074997,
"ce_loss_26": 3.559172648191452,
"ce_loss_39": 2.942674660682678,
"ce_loss_52": 1.4182981908321382,
"ce_loss_7": 4.483241724967956,
"epoch": 0.219,
"grad_norm": 22.517460780417444,
"kl_loss_13": 5559.2,
"kl_loss_26": 4374.8,
"kl_loss_39": 3066.4,
"kl_loss_7": 6304.0,
"learning_rate": 0.0008940053768033609,
"loss": 9610.7,
"step": 2190
},
{
"ce_loss_13": 4.136555308103562,
"ce_loss_26": 3.5674175798892973,
"ce_loss_39": 2.9441113233566285,
"ce_loss_52": 1.4381250411272049,
"ce_loss_7": 4.481418180465698,
"epoch": 0.22,
"grad_norm": 23.1169100672147,
"kl_loss_13": 5545.6,
"kl_loss_26": 4354.4,
"kl_loss_39": 3035.2,
"kl_loss_7": 6270.4,
"learning_rate": 0.0008930265473713938,
"loss": 9621.3,
"step": 2200
},
{
"ce_loss_13": 4.116154849529266,
"ce_loss_26": 3.5370861172676085,
"ce_loss_39": 2.9127448469400408,
"ce_loss_52": 1.3838467657566071,
"ce_loss_7": 4.479850220680237,
"epoch": 0.221,
"grad_norm": 23.327101471626264,
"kl_loss_13": 5613.6,
"kl_loss_26": 4407.2,
"kl_loss_39": 3084.0,
"kl_loss_7": 6376.8,
"learning_rate": 0.0008920437601664579,
"loss": 9580.3,
"step": 2210
},
{
"ce_loss_13": 4.081603097915649,
"ce_loss_26": 3.533859223127365,
"ce_loss_39": 2.9178696632385255,
"ce_loss_52": 1.4558427572250365,
"ce_loss_7": 4.428021937608719,
"epoch": 0.222,
"grad_norm": 24.571271492626643,
"kl_loss_13": 5416.8,
"kl_loss_26": 4265.2,
"kl_loss_39": 2949.2,
"kl_loss_7": 6150.4,
"learning_rate": 0.0008910570250852097,
"loss": 9535.0,
"step": 2220
},
{
"ce_loss_13": 4.016762095689773,
"ce_loss_26": 3.453756958246231,
"ce_loss_39": 2.8424510210752487,
"ce_loss_52": 1.382763533294201,
"ce_loss_7": 4.369659447669983,
"epoch": 0.223,
"grad_norm": 22.39109803193224,
"kl_loss_13": 5417.6,
"kl_loss_26": 4255.6,
"kl_loss_39": 2951.6,
"kl_loss_7": 6149.6,
"learning_rate": 0.0008900663520640604,
"loss": 9449.7,
"step": 2230
},
{
"ce_loss_13": 4.07697583436966,
"ce_loss_26": 3.5205394327640533,
"ce_loss_39": 2.9201291859149934,
"ce_loss_52": 1.4419079095125198,
"ce_loss_7": 4.432818019390107,
"epoch": 0.224,
"grad_norm": 29.33071925320992,
"kl_loss_13": 5431.2,
"kl_loss_26": 4260.4,
"kl_loss_39": 2980.0,
"kl_loss_7": 6176.8,
"learning_rate": 0.0008890717510790764,
"loss": 9471.4,
"step": 2240
},
{
"ce_loss_13": 4.099857300519943,
"ce_loss_26": 3.5491108179092405,
"ce_loss_39": 2.946030503511429,
"ce_loss_52": 1.4482155337929725,
"ce_loss_7": 4.446840679645538,
"epoch": 0.225,
"grad_norm": 24.393145108562546,
"kl_loss_13": 5448.0,
"kl_loss_26": 4290.4,
"kl_loss_39": 3004.4,
"kl_loss_7": 6176.0,
"learning_rate": 0.0008880732321458784,
"loss": 9429.4,
"step": 2250
},
{
"ce_loss_13": 4.008820396661759,
"ce_loss_26": 3.466299217939377,
"ce_loss_39": 2.8751066744327547,
"ce_loss_52": 1.4349601715803146,
"ce_loss_7": 4.357817393541336,
"epoch": 0.226,
"grad_norm": 23.790321486003762,
"kl_loss_13": 5306.4,
"kl_loss_26": 4167.6,
"kl_loss_39": 2894.0,
"kl_loss_7": 6034.4,
"learning_rate": 0.0008870708053195413,
"loss": 9349.3,
"step": 2260
},
{
"ce_loss_13": 4.0613229155540465,
"ce_loss_26": 3.495078670978546,
"ce_loss_39": 2.889867639541626,
"ce_loss_52": 1.417461496591568,
"ce_loss_7": 4.40853306055069,
"epoch": 0.227,
"grad_norm": 24.394028059861938,
"kl_loss_13": 5412.8,
"kl_loss_26": 4243.2,
"kl_loss_39": 2960.8,
"kl_loss_7": 6130.4,
"learning_rate": 0.0008860644806944918,
"loss": 9352.6,
"step": 2270
},
{
"ce_loss_13": 4.178533679246902,
"ce_loss_26": 3.622497373819351,
"ce_loss_39": 3.006651484966278,
"ce_loss_52": 1.4494876891374588,
"ce_loss_7": 4.527337849140167,
"epoch": 0.228,
"grad_norm": 22.806373163177923,
"kl_loss_13": 5580.0,
"kl_loss_26": 4405.2,
"kl_loss_39": 3104.0,
"kl_loss_7": 6317.6,
"learning_rate": 0.0008850542684044079,
"loss": 9441.9,
"step": 2280
},
{
"ce_loss_13": 4.018020331859589,
"ce_loss_26": 3.458746635913849,
"ce_loss_39": 2.8657322227954865,
"ce_loss_52": 1.4288572728633882,
"ce_loss_7": 4.375551146268845,
"epoch": 0.229,
"grad_norm": 22.45015355344987,
"kl_loss_13": 5303.2,
"kl_loss_26": 4136.0,
"kl_loss_39": 2856.8,
"kl_loss_7": 6051.2,
"learning_rate": 0.0008840401786221159,
"loss": 9343.7,
"step": 2290
},
{
"ce_loss_13": 4.0382424116134645,
"ce_loss_26": 3.4842948436737062,
"ce_loss_39": 2.894715803861618,
"ce_loss_52": 1.4393651276826858,
"ce_loss_7": 4.377675461769104,
"epoch": 0.23,
"grad_norm": 23.31839583792026,
"kl_loss_13": 5351.2,
"kl_loss_26": 4188.4,
"kl_loss_39": 2912.0,
"kl_loss_7": 6064.8,
"learning_rate": 0.000883022221559489,
"loss": 9246.3,
"step": 2300
},
{
"ce_loss_13": 4.038966596126556,
"ce_loss_26": 3.488220602273941,
"ce_loss_39": 2.8812575459480287,
"ce_loss_52": 1.441010195016861,
"ce_loss_7": 4.381568449735641,
"epoch": 0.231,
"grad_norm": 22.254622557882463,
"kl_loss_13": 5335.2,
"kl_loss_26": 4171.2,
"kl_loss_39": 2887.2,
"kl_loss_7": 6052.8,
"learning_rate": 0.0008820004074673434,
"loss": 9220.3,
"step": 2310
},
{
"ce_loss_13": 3.9854084312915803,
"ce_loss_26": 3.4409989297389982,
"ce_loss_39": 2.84497589468956,
"ce_loss_52": 1.4122451767325401,
"ce_loss_7": 4.340046459436417,
"epoch": 0.232,
"grad_norm": 21.074813671439337,
"kl_loss_13": 5296.8,
"kl_loss_26": 4147.6,
"kl_loss_39": 2864.8,
"kl_loss_7": 6036.8,
"learning_rate": 0.0008809747466353355,
"loss": 9279.8,
"step": 2320
},
{
"ce_loss_13": 4.115998637676239,
"ce_loss_26": 3.5584963142871855,
"ce_loss_39": 2.9395908057689666,
"ce_loss_52": 1.465473085641861,
"ce_loss_7": 4.470573830604553,
"epoch": 0.233,
"grad_norm": 22.26955688088229,
"kl_loss_13": 5459.2,
"kl_loss_26": 4288.0,
"kl_loss_39": 2960.4,
"kl_loss_7": 6197.6,
"learning_rate": 0.0008799452493918585,
"loss": 9213.2,
"step": 2330
},
{
"ce_loss_13": 3.9350290656089784,
"ce_loss_26": 3.3855203211307527,
"ce_loss_39": 2.799959135055542,
"ce_loss_52": 1.4289379581809043,
"ce_loss_7": 4.274723726511001,
"epoch": 0.234,
"grad_norm": 22.04159638849698,
"kl_loss_13": 5186.8,
"kl_loss_26": 4026.0,
"kl_loss_39": 2752.2,
"kl_loss_7": 5904.4,
"learning_rate": 0.0008789119261039385,
"loss": 9222.9,
"step": 2340
},
{
"ce_loss_13": 3.977653867006302,
"ce_loss_26": 3.4353197515010834,
"ce_loss_39": 2.8274969339370726,
"ce_loss_52": 1.400892499089241,
"ce_loss_7": 4.322875905036926,
"epoch": 0.235,
"grad_norm": 25.32335755706349,
"kl_loss_13": 5282.4,
"kl_loss_26": 4139.6,
"kl_loss_39": 2849.2,
"kl_loss_7": 6001.6,
"learning_rate": 0.0008778747871771292,
"loss": 9101.4,
"step": 2350
},
{
"ce_loss_13": 3.9699031889438627,
"ce_loss_26": 3.4155133664608,
"ce_loss_39": 2.8127492308616637,
"ce_loss_52": 1.4119284138083459,
"ce_loss_7": 4.322866821289063,
"epoch": 0.236,
"grad_norm": 24.250283920991954,
"kl_loss_13": 5259.2,
"kl_loss_26": 4092.8,
"kl_loss_39": 2816.4,
"kl_loss_7": 6000.8,
"learning_rate": 0.0008768338430554083,
"loss": 9104.0,
"step": 2360
},
{
"ce_loss_13": 3.928439366817474,
"ce_loss_26": 3.382099211215973,
"ce_loss_39": 2.789314305782318,
"ce_loss_52": 1.3916988223791122,
"ce_loss_7": 4.277575564384461,
"epoch": 0.237,
"grad_norm": 23.978839586298704,
"kl_loss_13": 5212.0,
"kl_loss_26": 4068.4,
"kl_loss_39": 2797.2,
"kl_loss_7": 5938.4,
"learning_rate": 0.0008757891042210713,
"loss": 9141.7,
"step": 2370
},
{
"ce_loss_13": 3.9462322175502775,
"ce_loss_26": 3.397873044013977,
"ce_loss_39": 2.8039229214191437,
"ce_loss_52": 1.4037385553121566,
"ce_loss_7": 4.290230017900467,
"epoch": 0.238,
"grad_norm": 23.01247346605362,
"kl_loss_13": 5215.2,
"kl_loss_26": 4079.6,
"kl_loss_39": 2813.2,
"kl_loss_7": 5942.4,
"learning_rate": 0.0008747405811946271,
"loss": 9055.8,
"step": 2380
},
{
"ce_loss_13": 3.98149796128273,
"ce_loss_26": 3.442757821083069,
"ce_loss_39": 2.84624342918396,
"ce_loss_52": 1.445407471060753,
"ce_loss_7": 4.318572920560837,
"epoch": 0.239,
"grad_norm": 22.74122664649998,
"kl_loss_13": 5226.4,
"kl_loss_26": 4089.6,
"kl_loss_39": 2827.6,
"kl_loss_7": 5934.4,
"learning_rate": 0.0008736882845346905,
"loss": 9110.6,
"step": 2390
},
{
"ce_loss_13": 3.9661067545413973,
"ce_loss_26": 3.4294336676597594,
"ce_loss_39": 2.836567336320877,
"ce_loss_52": 1.442602628469467,
"ce_loss_7": 4.3087667465209964,
"epoch": 0.24,
"grad_norm": 23.333126009298994,
"kl_loss_13": 5196.0,
"kl_loss_26": 4051.6,
"kl_loss_39": 2790.0,
"kl_loss_7": 5911.2,
"learning_rate": 0.0008726322248378774,
"loss": 9064.8,
"step": 2400
},
{
"ce_loss_13": 3.988937532901764,
"ce_loss_26": 3.4361318945884705,
"ce_loss_39": 2.833483111858368,
"ce_loss_52": 1.4274606987833978,
"ce_loss_7": 4.3361672222614285,
"epoch": 0.241,
"grad_norm": 21.55988300492865,
"kl_loss_13": 5244.8,
"kl_loss_26": 4091.2,
"kl_loss_39": 2809.2,
"kl_loss_7": 5974.4,
"learning_rate": 0.0008715724127386971,
"loss": 9048.5,
"step": 2410
},
{
"ce_loss_13": 3.93166036605835,
"ce_loss_26": 3.3924847066402437,
"ce_loss_39": 2.8114346325397492,
"ce_loss_52": 1.433423739671707,
"ce_loss_7": 4.277747517824173,
"epoch": 0.242,
"grad_norm": 21.76629675806892,
"kl_loss_13": 5152.8,
"kl_loss_26": 4024.4,
"kl_loss_39": 2775.6,
"kl_loss_7": 5876.0,
"learning_rate": 0.0008705088589094458,
"loss": 8950.6,
"step": 2420
},
{
"ce_loss_13": 4.0298320889472965,
"ce_loss_26": 3.4784019589424133,
"ce_loss_39": 2.8909366130828857,
"ce_loss_52": 1.4593130856752397,
"ce_loss_7": 4.372854852676392,
"epoch": 0.243,
"grad_norm": 22.782714549711034,
"kl_loss_13": 5275.2,
"kl_loss_26": 4132.0,
"kl_loss_39": 2868.8,
"kl_loss_7": 6000.0,
"learning_rate": 0.0008694415740600988,
"loss": 8979.7,
"step": 2430
},
{
"ce_loss_13": 3.957322496175766,
"ce_loss_26": 3.391196775436401,
"ce_loss_39": 2.7942449331283568,
"ce_loss_52": 1.429965654015541,
"ce_loss_7": 4.3002465009689335,
"epoch": 0.244,
"grad_norm": 22.108343623695664,
"kl_loss_13": 5175.2,
"kl_loss_26": 4000.0,
"kl_loss_39": 2735.2,
"kl_loss_7": 5894.4,
"learning_rate": 0.0008683705689382025,
"loss": 8983.5,
"step": 2440
},
{
"ce_loss_13": 3.914830905199051,
"ce_loss_26": 3.371804046630859,
"ce_loss_39": 2.789295125007629,
"ce_loss_52": 1.4458730816841125,
"ce_loss_7": 4.244754731655121,
"epoch": 0.245,
"grad_norm": 22.68476073719735,
"kl_loss_13": 5094.4,
"kl_loss_26": 3954.8,
"kl_loss_39": 2705.0,
"kl_loss_7": 5792.8,
"learning_rate": 0.0008672958543287666,
"loss": 8971.0,
"step": 2450
},
{
"ce_loss_13": 3.910190373659134,
"ce_loss_26": 3.3697587728500364,
"ce_loss_39": 2.7743864953517914,
"ce_loss_52": 1.4169176414608955,
"ce_loss_7": 4.245863050222397,
"epoch": 0.246,
"grad_norm": 23.78530061144511,
"kl_loss_13": 5117.6,
"kl_loss_26": 3979.2,
"kl_loss_39": 2714.8,
"kl_loss_7": 5818.4,
"learning_rate": 0.0008662174410541554,
"loss": 8871.3,
"step": 2460
},
{
"ce_loss_13": 3.905332827568054,
"ce_loss_26": 3.3677509129047394,
"ce_loss_39": 2.785578554868698,
"ce_loss_52": 1.4284577563405036,
"ce_loss_7": 4.243521982431412,
"epoch": 0.247,
"grad_norm": 21.62010382710404,
"kl_loss_13": 5076.0,
"kl_loss_26": 3951.2,
"kl_loss_39": 2706.0,
"kl_loss_7": 5774.4,
"learning_rate": 0.0008651353399739787,
"loss": 8827.8,
"step": 2470
},
{
"ce_loss_13": 3.9418592929840086,
"ce_loss_26": 3.399972987174988,
"ce_loss_39": 2.7943135529756544,
"ce_loss_52": 1.4213671818375588,
"ce_loss_7": 4.285894882678986,
"epoch": 0.248,
"grad_norm": 21.67451956689309,
"kl_loss_13": 5164.0,
"kl_loss_26": 4026.4,
"kl_loss_39": 2746.4,
"kl_loss_7": 5886.4,
"learning_rate": 0.0008640495619849821,
"loss": 8908.6,
"step": 2480
},
{
"ce_loss_13": 3.9583646595478057,
"ce_loss_26": 3.418367612361908,
"ce_loss_39": 2.8201009154319765,
"ce_loss_52": 1.4697474852204322,
"ce_loss_7": 4.3033524513244625,
"epoch": 0.249,
"grad_norm": 23.94241052015279,
"kl_loss_13": 5140.0,
"kl_loss_26": 4010.0,
"kl_loss_39": 2731.6,
"kl_loss_7": 5860.0,
"learning_rate": 0.0008629601180209381,
"loss": 8796.4,
"step": 2490
},
{
"ce_loss_13": 3.9134137570858,
"ce_loss_26": 3.3699698984622954,
"ce_loss_39": 2.781053990125656,
"ce_loss_52": 1.4334075331687928,
"ce_loss_7": 4.242314898967743,
"epoch": 0.25,
"grad_norm": 22.621772280297588,
"kl_loss_13": 5100.0,
"kl_loss_26": 3960.0,
"kl_loss_39": 2699.6,
"kl_loss_7": 5796.0,
"learning_rate": 0.000861867019052535,
"loss": 8802.5,
"step": 2500
},
{
"ce_loss_13": 3.975496470928192,
"ce_loss_26": 3.4352354168891908,
"ce_loss_39": 2.8324910700321198,
"ce_loss_52": 1.469119620323181,
"ce_loss_7": 4.319353139400482,
"epoch": 0.251,
"grad_norm": 24.031546669852546,
"kl_loss_13": 5152.4,
"kl_loss_26": 4019.6,
"kl_loss_39": 2733.2,
"kl_loss_7": 5876.8,
"learning_rate": 0.0008607702760872678,
"loss": 8791.0,
"step": 2510
},
{
"ce_loss_13": 3.970981556177139,
"ce_loss_26": 3.4194670915603638,
"ce_loss_39": 2.826087462902069,
"ce_loss_52": 1.457669761776924,
"ce_loss_7": 4.311935073137283,
"epoch": 0.252,
"grad_norm": 22.68902245721029,
"kl_loss_13": 5173.6,
"kl_loss_26": 4025.2,
"kl_loss_39": 2753.2,
"kl_loss_7": 5881.6,
"learning_rate": 0.0008596699001693256,
"loss": 8797.8,
"step": 2520
},
{
"ce_loss_13": 3.9163833260536194,
"ce_loss_26": 3.378202974796295,
"ce_loss_39": 2.789392131567001,
"ce_loss_52": 1.4278187423944473,
"ce_loss_7": 4.251722925901413,
"epoch": 0.253,
"grad_norm": 23.732161584585768,
"kl_loss_13": 5073.6,
"kl_loss_26": 3944.0,
"kl_loss_39": 2692.8,
"kl_loss_7": 5776.8,
"learning_rate": 0.0008585659023794818,
"loss": 8730.9,
"step": 2530
},
{
"ce_loss_13": 3.8952399492263794,
"ce_loss_26": 3.3577997207641603,
"ce_loss_39": 2.761990362405777,
"ce_loss_52": 1.421005728840828,
"ce_loss_7": 4.233315163850785,
"epoch": 0.254,
"grad_norm": 23.217787871644095,
"kl_loss_13": 5080.0,
"kl_loss_26": 3941.2,
"kl_loss_39": 2674.0,
"kl_loss_7": 5788.8,
"learning_rate": 0.0008574582938349817,
"loss": 8689.0,
"step": 2540
},
{
"ce_loss_13": 3.9610107481479644,
"ce_loss_26": 3.423109310865402,
"ce_loss_39": 2.8520358502864838,
"ce_loss_52": 1.4856882840394974,
"ce_loss_7": 4.297859001159668,
"epoch": 0.255,
"grad_norm": 24.36417114927956,
"kl_loss_13": 5090.4,
"kl_loss_26": 3959.2,
"kl_loss_39": 2721.2,
"kl_loss_7": 5797.6,
"learning_rate": 0.0008563470856894315,
"loss": 8682.7,
"step": 2550
},
{
"ce_loss_13": 3.9392871856689453,
"ce_loss_26": 3.4012055695056915,
"ce_loss_39": 2.808671069145203,
"ce_loss_52": 1.472413820028305,
"ce_loss_7": 4.270819437503815,
"epoch": 0.256,
"grad_norm": 22.260198047396518,
"kl_loss_13": 5046.4,
"kl_loss_26": 3928.4,
"kl_loss_39": 2676.0,
"kl_loss_7": 5743.2,
"learning_rate": 0.0008552322891326845,
"loss": 8696.9,
"step": 2560
},
{
"ce_loss_13": 3.920336198806763,
"ce_loss_26": 3.379600703716278,
"ce_loss_39": 2.7945084452629088,
"ce_loss_52": 1.4492182582616806,
"ce_loss_7": 4.2536624610424045,
"epoch": 0.257,
"grad_norm": 21.726891625639418,
"kl_loss_13": 5079.2,
"kl_loss_26": 3942.4,
"kl_loss_39": 2686.4,
"kl_loss_7": 5777.6,
"learning_rate": 0.0008541139153907296,
"loss": 8637.5,
"step": 2570
},
{
"ce_loss_13": 3.8855518221855165,
"ce_loss_26": 3.3411356985569,
"ce_loss_39": 2.7515164047479628,
"ce_loss_52": 1.444500783085823,
"ce_loss_7": 4.214444124698639,
"epoch": 0.258,
"grad_norm": 21.147965943731403,
"kl_loss_13": 5012.8,
"kl_loss_26": 3868.4,
"kl_loss_39": 2618.4,
"kl_loss_7": 5708.0,
"learning_rate": 0.0008529919757255782,
"loss": 8639.8,
"step": 2580
},
{
"ce_loss_13": 3.906326872110367,
"ce_loss_26": 3.3759279191493987,
"ce_loss_39": 2.794377237558365,
"ce_loss_52": 1.4845586121082306,
"ce_loss_7": 4.233397454023361,
"epoch": 0.259,
"grad_norm": 23.06645203237802,
"kl_loss_13": 4970.4,
"kl_loss_26": 3854.4,
"kl_loss_39": 2612.0,
"kl_loss_7": 5654.4,
"learning_rate": 0.0008518664814351503,
"loss": 8576.9,
"step": 2590
},
{
"ce_loss_13": 3.7816513538360597,
"ce_loss_26": 3.2462404370307922,
"ce_loss_39": 2.6499598264694213,
"ce_loss_52": 1.391408371925354,
"ce_loss_7": 4.128740018606186,
"epoch": 0.26,
"grad_norm": 23.574382647594796,
"kl_loss_13": 4927.2,
"kl_loss_26": 3814.8,
"kl_loss_39": 2554.8,
"kl_loss_7": 5640.8,
"learning_rate": 0.0008507374438531607,
"loss": 8563.9,
"step": 2600
},
{
"ce_loss_13": 3.9538560569286347,
"ce_loss_26": 3.4095658123493195,
"ce_loss_39": 2.8113979279994963,
"ce_loss_52": 1.4748313665390014,
"ce_loss_7": 4.2874442756176,
"epoch": 0.261,
"grad_norm": 22.486126104346287,
"kl_loss_13": 5098.4,
"kl_loss_26": 3961.2,
"kl_loss_39": 2686.8,
"kl_loss_7": 5806.4,
"learning_rate": 0.0008496048743490053,
"loss": 8565.1,
"step": 2610
},
{
"ce_loss_13": 3.806992840766907,
"ce_loss_26": 3.282588803768158,
"ce_loss_39": 2.7138577342033385,
"ce_loss_52": 1.4272374346852303,
"ce_loss_7": 4.137687039375305,
"epoch": 0.262,
"grad_norm": 23.33658890766694,
"kl_loss_13": 4889.6,
"kl_loss_26": 3786.4,
"kl_loss_39": 2568.2,
"kl_loss_7": 5575.2,
"learning_rate": 0.0008484687843276469,
"loss": 8535.4,
"step": 2620
},
{
"ce_loss_13": 3.8657312452793122,
"ce_loss_26": 3.3368504345417023,
"ce_loss_39": 2.7600394666194914,
"ce_loss_52": 1.4648621320724486,
"ce_loss_7": 4.185077089071274,
"epoch": 0.263,
"grad_norm": 21.52255395290807,
"kl_loss_13": 4951.2,
"kl_loss_26": 3845.2,
"kl_loss_39": 2613.2,
"kl_loss_7": 5622.4,
"learning_rate": 0.0008473291852294987,
"loss": 8580.4,
"step": 2630
},
{
"ce_loss_13": 3.8671354949474335,
"ce_loss_26": 3.334774547815323,
"ce_loss_39": 2.757487526535988,
"ce_loss_52": 1.4462923228740692,
"ce_loss_7": 4.198447376489639,
"epoch": 0.264,
"grad_norm": 22.57622271607983,
"kl_loss_13": 4962.4,
"kl_loss_26": 3854.4,
"kl_loss_39": 2616.4,
"kl_loss_7": 5648.8,
"learning_rate": 0.0008461860885303114,
"loss": 8492.7,
"step": 2640
},
{
"ce_loss_13": 3.875018262863159,
"ce_loss_26": 3.3343500018119814,
"ce_loss_39": 2.731833589076996,
"ce_loss_52": 1.4120988547801971,
"ce_loss_7": 4.215745764970779,
"epoch": 0.265,
"grad_norm": 21.329908759369555,
"kl_loss_13": 5048.8,
"kl_loss_26": 3930.4,
"kl_loss_39": 2648.0,
"kl_loss_7": 5764.8,
"learning_rate": 0.000845039505741056,
"loss": 8545.0,
"step": 2650
},
{
"ce_loss_13": 3.8392697393894197,
"ce_loss_26": 3.313974368572235,
"ce_loss_39": 2.743110102415085,
"ce_loss_52": 1.4838833779096603,
"ce_loss_7": 4.162828749418258,
"epoch": 0.266,
"grad_norm": 22.328558189428094,
"kl_loss_13": 4850.4,
"kl_loss_26": 3737.2,
"kl_loss_39": 2504.0,
"kl_loss_7": 5524.8,
"learning_rate": 0.0008438894484078086,
"loss": 8456.0,
"step": 2660
},
{
"ce_loss_13": 3.7446135103702547,
"ce_loss_26": 3.2187088668346404,
"ce_loss_39": 2.6340013802051545,
"ce_loss_52": 1.389008679986,
"ce_loss_7": 4.068695777654648,
"epoch": 0.267,
"grad_norm": 22.22147816002266,
"kl_loss_13": 4894.4,
"kl_loss_26": 3784.0,
"kl_loss_39": 2542.0,
"kl_loss_7": 5576.0,
"learning_rate": 0.0008427359281116334,
"loss": 8425.6,
"step": 2670
},
{
"ce_loss_13": 3.8235996186733248,
"ce_loss_26": 3.292762166261673,
"ce_loss_39": 2.7183104872703554,
"ce_loss_52": 1.4328299894928933,
"ce_loss_7": 4.153199070692063,
"epoch": 0.268,
"grad_norm": 22.48935879447384,
"kl_loss_13": 4909.2,
"kl_loss_26": 3800.0,
"kl_loss_39": 2568.2,
"kl_loss_7": 5595.2,
"learning_rate": 0.0008415789564684673,
"loss": 8422.0,
"step": 2680
},
{
"ce_loss_13": 3.776876950263977,
"ce_loss_26": 3.2493546307086945,
"ce_loss_39": 2.682066896557808,
"ce_loss_52": 1.426028846204281,
"ce_loss_7": 4.102496325969696,
"epoch": 0.269,
"grad_norm": 23.679133499819734,
"kl_loss_13": 4848.8,
"kl_loss_26": 3742.4,
"kl_loss_39": 2509.6,
"kl_loss_7": 5526.4,
"learning_rate": 0.0008404185451290017,
"loss": 8501.8,
"step": 2690
},
{
"ce_loss_13": 3.8134057581424714,
"ce_loss_26": 3.272122323513031,
"ce_loss_39": 2.686330908536911,
"ce_loss_52": 1.4213334619998932,
"ce_loss_7": 4.143577241897583,
"epoch": 0.27,
"grad_norm": 22.090911465136045,
"kl_loss_13": 4938.4,
"kl_loss_26": 3810.8,
"kl_loss_39": 2554.8,
"kl_loss_7": 5636.8,
"learning_rate": 0.0008392547057785661,
"loss": 8351.5,
"step": 2700
},
{
"ce_loss_13": 3.786053466796875,
"ce_loss_26": 3.247877132892609,
"ce_loss_39": 2.6689940333366393,
"ce_loss_52": 1.4167698860168456,
"ce_loss_7": 4.1162903845310215,
"epoch": 0.271,
"grad_norm": 20.5206064618152,
"kl_loss_13": 4872.8,
"kl_loss_26": 3742.0,
"kl_loss_39": 2504.4,
"kl_loss_7": 5563.2,
"learning_rate": 0.0008380874501370098,
"loss": 8427.4,
"step": 2710
},
{
"ce_loss_13": 3.704389762878418,
"ce_loss_26": 3.187553709745407,
"ce_loss_39": 2.624448519945145,
"ce_loss_52": 1.42959221303463,
"ce_loss_7": 4.022905468940735,
"epoch": 0.272,
"grad_norm": 24.53128855683424,
"kl_loss_13": 4716.8,
"kl_loss_26": 3630.0,
"kl_loss_39": 2409.2,
"kl_loss_7": 5390.4,
"learning_rate": 0.0008369167899585841,
"loss": 8346.0,
"step": 2720
},
{
"ce_loss_13": 3.7978334367275237,
"ce_loss_26": 3.257440310716629,
"ce_loss_39": 2.673193109035492,
"ce_loss_52": 1.4194035559892655,
"ce_loss_7": 4.132071840763092,
"epoch": 0.273,
"grad_norm": 22.71970164256676,
"kl_loss_13": 4896.8,
"kl_loss_26": 3759.2,
"kl_loss_39": 2513.4,
"kl_loss_7": 5595.2,
"learning_rate": 0.0008357427370318238,
"loss": 8347.6,
"step": 2730
},
{
"ce_loss_13": 3.7844323456287383,
"ce_loss_26": 3.263492447137833,
"ce_loss_39": 2.6871775329113006,
"ce_loss_52": 1.452097550034523,
"ce_loss_7": 4.105139708518982,
"epoch": 0.274,
"grad_norm": 22.48264422716788,
"kl_loss_13": 4807.2,
"kl_loss_26": 3710.0,
"kl_loss_39": 2469.2,
"kl_loss_7": 5490.4,
"learning_rate": 0.0008345653031794292,
"loss": 8382.9,
"step": 2740
},
{
"ce_loss_13": 3.8110480844974517,
"ce_loss_26": 3.2820364594459535,
"ce_loss_39": 2.7046351432800293,
"ce_loss_52": 1.4519646763801575,
"ce_loss_7": 4.134507310390473,
"epoch": 0.275,
"grad_norm": 21.943498417005777,
"kl_loss_13": 4812.0,
"kl_loss_26": 3713.6,
"kl_loss_39": 2499.8,
"kl_loss_7": 5488.0,
"learning_rate": 0.0008333845002581458,
"loss": 8287.2,
"step": 2750
},
{
"ce_loss_13": 3.822121250629425,
"ce_loss_26": 3.3007264256477358,
"ce_loss_39": 2.73195458650589,
"ce_loss_52": 1.4655994832515717,
"ce_loss_7": 4.141650629043579,
"epoch": 0.276,
"grad_norm": 22.59740270522763,
"kl_loss_13": 4832.8,
"kl_loss_26": 3745.6,
"kl_loss_39": 2538.4,
"kl_loss_7": 5500.0,
"learning_rate": 0.0008322003401586462,
"loss": 8283.1,
"step": 2760
},
{
"ce_loss_13": 3.726576977968216,
"ce_loss_26": 3.214203953742981,
"ce_loss_39": 2.669701686501503,
"ce_loss_52": 1.4409982591867447,
"ce_loss_7": 4.043469870090485,
"epoch": 0.277,
"grad_norm": 21.384417928568727,
"kl_loss_13": 4712.0,
"kl_loss_26": 3643.6,
"kl_loss_39": 2461.0,
"kl_loss_7": 5379.2,
"learning_rate": 0.0008310128348054094,
"loss": 8251.4,
"step": 2770
},
{
"ce_loss_13": 3.768916404247284,
"ce_loss_26": 3.2343318104743957,
"ce_loss_39": 2.6558803230524064,
"ce_loss_52": 1.4221897169947624,
"ce_loss_7": 4.097134619951248,
"epoch": 0.278,
"grad_norm": 21.8508758307847,
"kl_loss_13": 4846.4,
"kl_loss_26": 3737.6,
"kl_loss_39": 2492.4,
"kl_loss_7": 5534.4,
"learning_rate": 0.0008298219961566008,
"loss": 8264.2,
"step": 2780
},
{
"ce_loss_13": 3.73385471701622,
"ce_loss_26": 3.216676640510559,
"ce_loss_39": 2.634915125370026,
"ce_loss_52": 1.4022609382867812,
"ce_loss_7": 4.067033034563065,
"epoch": 0.279,
"grad_norm": 22.23449381616188,
"kl_loss_13": 4806.0,
"kl_loss_26": 3708.8,
"kl_loss_39": 2479.2,
"kl_loss_7": 5499.2,
"learning_rate": 0.0008286278362039527,
"loss": 8184.2,
"step": 2790
},
{
"ce_loss_13": 3.756936568021774,
"ce_loss_26": 3.2383838176727293,
"ce_loss_39": 2.672528338432312,
"ce_loss_52": 1.452534568309784,
"ce_loss_7": 4.076909917593002,
"epoch": 0.28,
"grad_norm": 21.54056853237845,
"kl_loss_13": 4743.2,
"kl_loss_26": 3661.6,
"kl_loss_39": 2456.0,
"kl_loss_7": 5414.4,
"learning_rate": 0.0008274303669726426,
"loss": 8160.7,
"step": 2800
},
{
"ce_loss_13": 3.8688619792461396,
"ce_loss_26": 3.3306061148643495,
"ce_loss_39": 2.7420520305633547,
"ce_loss_52": 1.4561516880989074,
"ce_loss_7": 4.191312706470489,
"epoch": 0.281,
"grad_norm": 23.01011471220724,
"kl_loss_13": 4962.4,
"kl_loss_26": 3836.4,
"kl_loss_39": 2581.6,
"kl_loss_7": 5640.8,
"learning_rate": 0.0008262296005211721,
"loss": 8239.5,
"step": 2810
},
{
"ce_loss_13": 3.7579640209674836,
"ce_loss_26": 3.2256029903888703,
"ce_loss_39": 2.650630474090576,
"ce_loss_52": 1.4400919079780579,
"ce_loss_7": 4.077768385410309,
"epoch": 0.282,
"grad_norm": 21.557554897738267,
"kl_loss_13": 4784.8,
"kl_loss_26": 3661.6,
"kl_loss_39": 2435.2,
"kl_loss_7": 5454.4,
"learning_rate": 0.0008250255489412463,
"loss": 8218.5,
"step": 2820
},
{
"ce_loss_13": 3.7878367722034456,
"ce_loss_26": 3.255573272705078,
"ce_loss_39": 2.667675232887268,
"ce_loss_52": 1.4289155021309852,
"ce_loss_7": 4.111210036277771,
"epoch": 0.283,
"grad_norm": 22.099755132556425,
"kl_loss_13": 4851.2,
"kl_loss_26": 3733.6,
"kl_loss_39": 2480.6,
"kl_loss_7": 5527.2,
"learning_rate": 0.0008238182243576511,
"loss": 8152.9,
"step": 2830
},
{
"ce_loss_13": 3.7699286341667175,
"ce_loss_26": 3.2401221811771395,
"ce_loss_39": 2.6614575743675233,
"ce_loss_52": 1.4347774118185044,
"ce_loss_7": 4.08802090883255,
"epoch": 0.284,
"grad_norm": 21.441617328301042,
"kl_loss_13": 4791.6,
"kl_loss_26": 3695.6,
"kl_loss_39": 2469.8,
"kl_loss_7": 5453.6,
"learning_rate": 0.0008226076389281315,
"loss": 8141.7,
"step": 2840
},
{
"ce_loss_13": 3.692534440755844,
"ce_loss_26": 3.174397534132004,
"ce_loss_39": 2.6222778260707855,
"ce_loss_52": 1.4332606226205826,
"ce_loss_7": 4.00234357714653,
"epoch": 0.285,
"grad_norm": 23.306650885126444,
"kl_loss_13": 4633.6,
"kl_loss_26": 3560.0,
"kl_loss_39": 2376.6,
"kl_loss_7": 5279.2,
"learning_rate": 0.0008213938048432696,
"loss": 8068.6,
"step": 2850
},
{
"ce_loss_13": 3.6946506440639495,
"ce_loss_26": 3.1700410664081575,
"ce_loss_39": 2.597234898805618,
"ce_loss_52": 1.4046493530273438,
"ce_loss_7": 4.0274644792079926,
"epoch": 0.286,
"grad_norm": 21.879949646782595,
"kl_loss_13": 4721.6,
"kl_loss_26": 3628.8,
"kl_loss_39": 2400.4,
"kl_loss_7": 5412.8,
"learning_rate": 0.0008201767343263612,
"loss": 8086.6,
"step": 2860
},
{
"ce_loss_13": 3.7227329850196837,
"ce_loss_26": 3.200405848026276,
"ce_loss_39": 2.637904042005539,
"ce_loss_52": 1.422918725013733,
"ce_loss_7": 4.043233323097229,
"epoch": 0.287,
"grad_norm": 24.428095636864317,
"kl_loss_13": 4732.0,
"kl_loss_26": 3643.6,
"kl_loss_39": 2433.8,
"kl_loss_7": 5399.2,
"learning_rate": 0.0008189564396332927,
"loss": 8066.0,
"step": 2870
},
{
"ce_loss_13": 3.721643441915512,
"ce_loss_26": 3.185835379362106,
"ce_loss_39": 2.6226376593112946,
"ce_loss_52": 1.4448419839143753,
"ce_loss_7": 4.039817118644715,
"epoch": 0.288,
"grad_norm": 22.93644669160459,
"kl_loss_13": 4683.2,
"kl_loss_26": 3561.2,
"kl_loss_39": 2345.2,
"kl_loss_7": 5352.8,
"learning_rate": 0.0008177329330524181,
"loss": 8090.5,
"step": 2880
},
{
"ce_loss_13": 3.732273721694946,
"ce_loss_26": 3.217743480205536,
"ce_loss_39": 2.656236010789871,
"ce_loss_52": 1.4405113011598587,
"ce_loss_7": 4.046578335762024,
"epoch": 0.289,
"grad_norm": 22.27500685105708,
"kl_loss_13": 4702.0,
"kl_loss_26": 3629.6,
"kl_loss_39": 2416.0,
"kl_loss_7": 5360.8,
"learning_rate": 0.0008165062269044352,
"loss": 8083.7,
"step": 2890
},
{
"ce_loss_13": 3.7308314204216004,
"ce_loss_26": 3.2038592040538787,
"ce_loss_39": 2.6387904793024064,
"ce_loss_52": 1.44214668571949,
"ce_loss_7": 4.04458264708519,
"epoch": 0.29,
"grad_norm": 22.45358727511497,
"kl_loss_13": 4700.4,
"kl_loss_26": 3605.2,
"kl_loss_39": 2398.4,
"kl_loss_7": 5358.4,
"learning_rate": 0.0008152763335422613,
"loss": 8063.0,
"step": 2900
},
{
"ce_loss_13": 3.6699211478233336,
"ce_loss_26": 3.153660440444946,
"ce_loss_39": 2.5951134085655214,
"ce_loss_52": 1.4221089735627175,
"ce_loss_7": 3.978937405347824,
"epoch": 0.291,
"grad_norm": 23.44237828375525,
"kl_loss_13": 4614.0,
"kl_loss_26": 3534.4,
"kl_loss_39": 2341.4,
"kl_loss_7": 5261.6,
"learning_rate": 0.0008140432653509088,
"loss": 8001.3,
"step": 2910
},
{
"ce_loss_13": 3.6406539916992187,
"ce_loss_26": 3.1216741025447847,
"ce_loss_39": 2.5570163398981096,
"ce_loss_52": 1.39638482183218,
"ce_loss_7": 3.9557625532150267,
"epoch": 0.292,
"grad_norm": 21.187460458215387,
"kl_loss_13": 4592.8,
"kl_loss_26": 3516.8,
"kl_loss_39": 2326.8,
"kl_loss_7": 5254.0,
"learning_rate": 0.0008128070347473608,
"loss": 7966.5,
"step": 2920
},
{
"ce_loss_13": 3.665645903348923,
"ce_loss_26": 3.149553042650223,
"ce_loss_39": 2.58870205283165,
"ce_loss_52": 1.4208435118198395,
"ce_loss_7": 3.9827320516109466,
"epoch": 0.293,
"grad_norm": 21.300592787802476,
"kl_loss_13": 4619.6,
"kl_loss_26": 3530.8,
"kl_loss_39": 2336.0,
"kl_loss_7": 5284.0,
"learning_rate": 0.0008115676541804455,
"loss": 7990.7,
"step": 2930
},
{
"ce_loss_13": 3.6261947989463805,
"ce_loss_26": 3.1126498699188234,
"ce_loss_39": 2.5498824626207353,
"ce_loss_52": 1.3916691318154335,
"ce_loss_7": 3.9482949018478393,
"epoch": 0.294,
"grad_norm": 21.8788417242541,
"kl_loss_13": 4598.0,
"kl_loss_26": 3513.2,
"kl_loss_39": 2317.4,
"kl_loss_7": 5269.6,
"learning_rate": 0.0008103251361307119,
"loss": 7972.2,
"step": 2940
},
{
"ce_loss_13": 3.6617009818553923,
"ce_loss_26": 3.134212648868561,
"ce_loss_39": 2.569432234764099,
"ce_loss_52": 1.4306001305580138,
"ce_loss_7": 3.978475254774094,
"epoch": 0.295,
"grad_norm": 21.383257731886584,
"kl_loss_13": 4584.8,
"kl_loss_26": 3486.8,
"kl_loss_39": 2289.6,
"kl_loss_7": 5252.0,
"learning_rate": 0.0008090794931103026,
"loss": 7903.9,
"step": 2950
},
{
"ce_loss_13": 3.674825745820999,
"ce_loss_26": 3.1651513874530792,
"ce_loss_39": 2.605163484811783,
"ce_loss_52": 1.434949815273285,
"ce_loss_7": 3.988205587863922,
"epoch": 0.296,
"grad_norm": 21.87171120261939,
"kl_loss_13": 4585.6,
"kl_loss_26": 3519.2,
"kl_loss_39": 2327.0,
"kl_loss_7": 5246.4,
"learning_rate": 0.0008078307376628291,
"loss": 7903.2,
"step": 2960
},
{
"ce_loss_13": 3.6504483819007874,
"ce_loss_26": 3.1346111416816713,
"ce_loss_39": 2.5827776730060577,
"ce_loss_52": 1.4189698368310928,
"ce_loss_7": 3.9625262200832365,
"epoch": 0.297,
"grad_norm": 23.048467847326563,
"kl_loss_13": 4571.2,
"kl_loss_26": 3498.4,
"kl_loss_39": 2319.0,
"kl_loss_7": 5224.0,
"learning_rate": 0.000806578882363245,
"loss": 7901.6,
"step": 2970
},
{
"ce_loss_13": 3.655070722103119,
"ce_loss_26": 3.136745995283127,
"ce_loss_39": 2.5604557782411574,
"ce_loss_52": 1.401164847612381,
"ce_loss_7": 3.973217171430588,
"epoch": 0.298,
"grad_norm": 21.078263907370157,
"kl_loss_13": 4614.4,
"kl_loss_26": 3538.4,
"kl_loss_39": 2311.4,
"kl_loss_7": 5287.2,
"learning_rate": 0.0008053239398177191,
"loss": 7911.8,
"step": 2980
},
{
"ce_loss_13": 3.6555157959461213,
"ce_loss_26": 3.13962464928627,
"ce_loss_39": 2.5709628492593763,
"ce_loss_52": 1.4242349237203598,
"ce_loss_7": 3.9756637513637543,
"epoch": 0.299,
"grad_norm": 22.608350182138345,
"kl_loss_13": 4603.2,
"kl_loss_26": 3520.4,
"kl_loss_39": 2304.0,
"kl_loss_7": 5274.4,
"learning_rate": 0.0008040659226635089,
"loss": 7892.4,
"step": 2990
},
{
"ce_loss_13": 3.6532657563686373,
"ce_loss_26": 3.124306696653366,
"ce_loss_39": 2.557386627793312,
"ce_loss_52": 1.402693158388138,
"ce_loss_7": 3.9695385217666628,
"epoch": 0.3,
"grad_norm": 22.376822604204033,
"kl_loss_13": 4616.4,
"kl_loss_26": 3523.6,
"kl_loss_39": 2321.8,
"kl_loss_7": 5287.2,
"learning_rate": 0.0008028048435688333,
"loss": 7820.7,
"step": 3000
},
{
"ce_loss_13": 3.6811940252780913,
"ce_loss_26": 3.1677908301353455,
"ce_loss_39": 2.607375094294548,
"ce_loss_52": 1.4560914367437363,
"ce_loss_7": 3.9915607273578644,
"epoch": 0.301,
"grad_norm": 21.84188714128543,
"kl_loss_13": 4604.8,
"kl_loss_26": 3538.4,
"kl_loss_39": 2343.2,
"kl_loss_7": 5256.0,
"learning_rate": 0.0008015407152327448,
"loss": 7933.0,
"step": 3010
},
{
"ce_loss_13": 3.737543153762817,
"ce_loss_26": 3.2195757627487183,
"ce_loss_39": 2.6466131448745727,
"ce_loss_52": 1.4449012607336045,
"ce_loss_7": 4.052252840995789,
"epoch": 0.302,
"grad_norm": 22.34664686545947,
"kl_loss_13": 4700.8,
"kl_loss_26": 3618.8,
"kl_loss_39": 2394.4,
"kl_loss_7": 5358.4,
"learning_rate": 0.0008002735503850016,
"loss": 7844.2,
"step": 3020
},
{
"ce_loss_13": 3.6698498368263244,
"ce_loss_26": 3.155323106050491,
"ce_loss_39": 2.579972979426384,
"ce_loss_52": 1.4486516952514648,
"ce_loss_7": 3.9814261555671693,
"epoch": 0.303,
"grad_norm": 22.316774640404955,
"kl_loss_13": 4563.2,
"kl_loss_26": 3486.8,
"kl_loss_39": 2285.2,
"kl_loss_7": 5213.6,
"learning_rate": 0.0007990033617859396,
"loss": 7844.3,
"step": 3030
},
{
"ce_loss_13": 3.661813771724701,
"ce_loss_26": 3.1450955271720886,
"ce_loss_39": 2.5806987404823305,
"ce_loss_52": 1.4304928302764892,
"ce_loss_7": 3.9737633407115935,
"epoch": 0.304,
"grad_norm": 22.094854051528255,
"kl_loss_13": 4595.2,
"kl_loss_26": 3519.2,
"kl_loss_39": 2321.2,
"kl_loss_7": 5248.0,
"learning_rate": 0.000797730162226344,
"loss": 7813.7,
"step": 3040
},
{
"ce_loss_13": 3.6036822319030763,
"ce_loss_26": 3.0875354915857316,
"ce_loss_39": 2.5262934505939483,
"ce_loss_52": 1.3893155947327613,
"ce_loss_7": 3.921951335668564,
"epoch": 0.305,
"grad_norm": 22.896126437016644,
"kl_loss_13": 4538.0,
"kl_loss_26": 3453.6,
"kl_loss_39": 2252.6,
"kl_loss_7": 5210.4,
"learning_rate": 0.0007964539645273203,
"loss": 7783.3,
"step": 3050
},
{
"ce_loss_13": 3.690790832042694,
"ce_loss_26": 3.1806884586811064,
"ce_loss_39": 2.6345931828022002,
"ce_loss_52": 1.4827970415353775,
"ce_loss_7": 3.996461832523346,
"epoch": 0.306,
"grad_norm": 22.12157409866164,
"kl_loss_13": 4558.4,
"kl_loss_26": 3495.6,
"kl_loss_39": 2322.2,
"kl_loss_7": 5202.4,
"learning_rate": 0.000795174781540165,
"loss": 7798.9,
"step": 3060
},
{
"ce_loss_13": 3.6345800876617433,
"ce_loss_26": 3.126866352558136,
"ce_loss_39": 2.5723444908857345,
"ce_loss_52": 1.4505648389458656,
"ce_loss_7": 3.938809943199158,
"epoch": 0.307,
"grad_norm": 21.67276371006888,
"kl_loss_13": 4502.8,
"kl_loss_26": 3435.6,
"kl_loss_39": 2259.2,
"kl_loss_7": 5140.8,
"learning_rate": 0.0007938926261462366,
"loss": 7786.2,
"step": 3070
},
{
"ce_loss_13": 3.6539651334285734,
"ce_loss_26": 3.136826354265213,
"ce_loss_39": 2.5686775982379912,
"ce_loss_52": 1.4312876760959625,
"ce_loss_7": 3.9670185923576353,
"epoch": 0.308,
"grad_norm": 23.264906723037097,
"kl_loss_13": 4571.6,
"kl_loss_26": 3495.6,
"kl_loss_39": 2289.8,
"kl_loss_7": 5231.2,
"learning_rate": 0.0007926075112568258,
"loss": 7773.0,
"step": 3080
},
{
"ce_loss_13": 3.6424070239067077,
"ce_loss_26": 3.13111692070961,
"ce_loss_39": 2.5661837816238404,
"ce_loss_52": 1.4395585834980011,
"ce_loss_7": 3.9482239544391633,
"epoch": 0.309,
"grad_norm": 22.051447045868983,
"kl_loss_13": 4540.0,
"kl_loss_26": 3467.2,
"kl_loss_39": 2265.6,
"kl_loss_7": 5186.4,
"learning_rate": 0.0007913194498130252,
"loss": 7730.0,
"step": 3090
},
{
"ce_loss_13": 3.6187573671340942,
"ce_loss_26": 3.110442912578583,
"ce_loss_39": 2.553048479557037,
"ce_loss_52": 1.4316339492797852,
"ce_loss_7": 3.92513769865036,
"epoch": 0.31,
"grad_norm": 22.041241368180156,
"kl_loss_13": 4504.8,
"kl_loss_26": 3436.0,
"kl_loss_39": 2242.0,
"kl_loss_7": 5140.0,
"learning_rate": 0.0007900284547855992,
"loss": 7742.0,
"step": 3100
},
{
"ce_loss_13": 3.6639523029327394,
"ce_loss_26": 3.1580884575843813,
"ce_loss_39": 2.5835469484329225,
"ce_loss_52": 1.4442215472459794,
"ce_loss_7": 3.976783311367035,
"epoch": 0.311,
"grad_norm": 20.880619592237565,
"kl_loss_13": 4592.0,
"kl_loss_26": 3526.4,
"kl_loss_39": 2312.4,
"kl_loss_7": 5244.8,
"learning_rate": 0.0007887345391748532,
"loss": 7735.3,
"step": 3110
},
{
"ce_loss_13": 3.6283124804496767,
"ce_loss_26": 3.113315612077713,
"ce_loss_39": 2.547743684053421,
"ce_loss_52": 1.42053325176239,
"ce_loss_7": 3.9331269919872285,
"epoch": 0.312,
"grad_norm": 22.15121587945531,
"kl_loss_13": 4543.2,
"kl_loss_26": 3463.2,
"kl_loss_39": 2267.8,
"kl_loss_7": 5184.8,
"learning_rate": 0.0007874377160105036,
"loss": 7729.4,
"step": 3120
},
{
"ce_loss_13": 3.6399169504642486,
"ce_loss_26": 3.135877913236618,
"ce_loss_39": 2.5725889205932617,
"ce_loss_52": 1.4427233994007111,
"ce_loss_7": 3.9606189668178557,
"epoch": 0.313,
"grad_norm": 21.87500401487531,
"kl_loss_13": 4563.2,
"kl_loss_26": 3490.0,
"kl_loss_39": 2276.4,
"kl_loss_7": 5228.0,
"learning_rate": 0.0007861379983515449,
"loss": 7710.9,
"step": 3130
},
{
"ce_loss_13": 3.634021121263504,
"ce_loss_26": 3.111995500326157,
"ce_loss_39": 2.5583092838525774,
"ce_loss_52": 1.4399698421359062,
"ce_loss_7": 3.94167400598526,
"epoch": 0.314,
"grad_norm": 22.854565496538875,
"kl_loss_13": 4504.4,
"kl_loss_26": 3415.2,
"kl_loss_39": 2230.6,
"kl_loss_7": 5153.6,
"learning_rate": 0.0007848353992861195,
"loss": 7710.3,
"step": 3140
},
{
"ce_loss_13": 3.6272457361221315,
"ce_loss_26": 3.116968184709549,
"ce_loss_39": 2.551611191034317,
"ce_loss_52": 1.437747061252594,
"ce_loss_7": 3.9388325929641725,
"epoch": 0.315,
"grad_norm": 21.84748688614269,
"kl_loss_13": 4498.8,
"kl_loss_26": 3427.6,
"kl_loss_39": 2231.6,
"kl_loss_7": 5142.4,
"learning_rate": 0.0007835299319313853,
"loss": 7607.0,
"step": 3150
},
{
"ce_loss_13": 3.613277268409729,
"ce_loss_26": 3.0916620969772337,
"ce_loss_39": 2.5186130821704866,
"ce_loss_52": 1.3888636380434036,
"ce_loss_7": 3.935783725976944,
"epoch": 0.316,
"grad_norm": 21.933561198317395,
"kl_loss_13": 4519.2,
"kl_loss_26": 3438.8,
"kl_loss_39": 2232.0,
"kl_loss_7": 5189.6,
"learning_rate": 0.0007822216094333848,
"loss": 7650.0,
"step": 3160
},
{
"ce_loss_13": 3.658072179555893,
"ce_loss_26": 3.1417903542518615,
"ce_loss_39": 2.577219474315643,
"ce_loss_52": 1.437073315680027,
"ce_loss_7": 3.970378410816193,
"epoch": 0.317,
"grad_norm": 22.034139537965903,
"kl_loss_13": 4566.4,
"kl_loss_26": 3493.2,
"kl_loss_39": 2301.8,
"kl_loss_7": 5224.0,
"learning_rate": 0.0007809104449669101,
"loss": 7644.7,
"step": 3170
},
{
"ce_loss_13": 3.593963289260864,
"ce_loss_26": 3.080548882484436,
"ce_loss_39": 2.5262903541326525,
"ce_loss_52": 1.4362893968820571,
"ce_loss_7": 3.8962223708629606,
"epoch": 0.318,
"grad_norm": 22.12833658126749,
"kl_loss_13": 4417.6,
"kl_loss_26": 3353.2,
"kl_loss_39": 2169.6,
"kl_loss_7": 5054.4,
"learning_rate": 0.0007795964517353734,
"loss": 7580.1,
"step": 3180
},
{
"ce_loss_13": 3.639219433069229,
"ce_loss_26": 3.126335847377777,
"ce_loss_39": 2.5598012149333953,
"ce_loss_52": 1.4479554057121278,
"ce_loss_7": 3.955880182981491,
"epoch": 0.319,
"grad_norm": 21.421584248628356,
"kl_loss_13": 4524.8,
"kl_loss_26": 3445.2,
"kl_loss_39": 2238.4,
"kl_loss_7": 5180.8,
"learning_rate": 0.000778279642970672,
"loss": 7577.4,
"step": 3190
},
{
"ce_loss_13": 3.593672776222229,
"ce_loss_26": 3.076059252023697,
"ce_loss_39": 2.5206238448619844,
"ce_loss_52": 1.4138888984918594,
"ce_loss_7": 3.898124760389328,
"epoch": 0.32,
"grad_norm": 23.27138036145762,
"kl_loss_13": 4477.6,
"kl_loss_26": 3400.0,
"kl_loss_39": 2214.8,
"kl_loss_7": 5123.2,
"learning_rate": 0.0007769600319330552,
"loss": 7595.6,
"step": 3200
},
{
"ce_loss_13": 3.6573951125144957,
"ce_loss_26": 3.166206729412079,
"ce_loss_39": 2.6105258047580717,
"ce_loss_52": 1.47857309281826,
"ce_loss_7": 3.9568731427192687,
"epoch": 0.321,
"grad_norm": 21.35600054948774,
"kl_loss_13": 4470.4,
"kl_loss_26": 3434.4,
"kl_loss_39": 2255.6,
"kl_loss_7": 5100.8,
"learning_rate": 0.0007756376319109917,
"loss": 7610.9,
"step": 3210
},
{
"ce_loss_13": 3.619207721948624,
"ce_loss_26": 3.112484961748123,
"ce_loss_39": 2.5595098197460175,
"ce_loss_52": 1.442267394065857,
"ce_loss_7": 3.9296476364135744,
"epoch": 0.322,
"grad_norm": 21.117056892906756,
"kl_loss_13": 4453.6,
"kl_loss_26": 3396.0,
"kl_loss_39": 2218.0,
"kl_loss_7": 5104.8,
"learning_rate": 0.0007743124562210351,
"loss": 7569.7,
"step": 3220
},
{
"ce_loss_13": 3.612431305646896,
"ce_loss_26": 3.104649418592453,
"ce_loss_39": 2.5444509416818617,
"ce_loss_52": 1.4610149055719375,
"ce_loss_7": 3.9223886907100676,
"epoch": 0.323,
"grad_norm": 22.510814919939268,
"kl_loss_13": 4408.0,
"kl_loss_26": 3338.4,
"kl_loss_39": 2155.4,
"kl_loss_7": 5054.4,
"learning_rate": 0.0007729845182076895,
"loss": 7565.6,
"step": 3230
},
{
"ce_loss_13": 3.5650066912174223,
"ce_loss_26": 3.060505121946335,
"ce_loss_39": 2.5127211630344393,
"ce_loss_52": 1.445562407374382,
"ce_loss_7": 3.8779995679855346,
"epoch": 0.324,
"grad_norm": 24.007681143469355,
"kl_loss_13": 4388.4,
"kl_loss_26": 3323.6,
"kl_loss_39": 2150.2,
"kl_loss_7": 5044.0,
"learning_rate": 0.0007716538312432765,
"loss": 7556.0,
"step": 3240
},
{
"ce_loss_13": 3.5737381398677828,
"ce_loss_26": 3.0725920855998994,
"ce_loss_39": 2.5134449005126953,
"ce_loss_52": 1.4138619631528855,
"ce_loss_7": 3.8855117499828338,
"epoch": 0.325,
"grad_norm": 22.203629206824775,
"kl_loss_13": 4430.8,
"kl_loss_26": 3379.6,
"kl_loss_39": 2197.4,
"kl_loss_7": 5081.6,
"learning_rate": 0.0007703204087277988,
"loss": 7530.7,
"step": 3250
},
{
"ce_loss_13": 3.5474561214447022,
"ce_loss_26": 3.037938302755356,
"ce_loss_39": 2.477022570371628,
"ce_loss_52": 1.3884405881166457,
"ce_loss_7": 3.85917187333107,
"epoch": 0.326,
"grad_norm": 21.98291246151193,
"kl_loss_13": 4437.6,
"kl_loss_26": 3371.6,
"kl_loss_39": 2176.2,
"kl_loss_7": 5089.6,
"learning_rate": 0.0007689842640888063,
"loss": 7519.3,
"step": 3260
},
{
"ce_loss_13": 3.6053310513496397,
"ce_loss_26": 3.0936122059822084,
"ce_loss_39": 2.5414693653583527,
"ce_loss_52": 1.4531731829047203,
"ce_loss_7": 3.913253253698349,
"epoch": 0.327,
"grad_norm": 22.418707773974628,
"kl_loss_13": 4430.8,
"kl_loss_26": 3360.0,
"kl_loss_39": 2184.0,
"kl_loss_7": 5068.4,
"learning_rate": 0.0007676454107812607,
"loss": 7473.1,
"step": 3270
},
{
"ce_loss_13": 3.545606768131256,
"ce_loss_26": 3.0480951845645903,
"ce_loss_39": 2.499777999520302,
"ce_loss_52": 1.4314876705408097,
"ce_loss_7": 3.849948841333389,
"epoch": 0.328,
"grad_norm": 22.389500426390892,
"kl_loss_13": 4402.4,
"kl_loss_26": 3351.6,
"kl_loss_39": 2160.2,
"kl_loss_7": 5035.2,
"learning_rate": 0.0007663038622873999,
"loss": 7510.3,
"step": 3280
},
{
"ce_loss_13": 3.6383297204971314,
"ce_loss_26": 3.127288430929184,
"ce_loss_39": 2.561781680583954,
"ce_loss_52": 1.4617935866117477,
"ce_loss_7": 3.9572836577892305,
"epoch": 0.329,
"grad_norm": 23.105081611165705,
"kl_loss_13": 4472.4,
"kl_loss_26": 3396.8,
"kl_loss_39": 2202.8,
"kl_loss_7": 5139.2,
"learning_rate": 0.0007649596321166025,
"loss": 7473.8,
"step": 3290
},
{
"ce_loss_13": 3.5131199419498444,
"ce_loss_26": 3.0129006803035736,
"ce_loss_39": 2.4699264496564863,
"ce_loss_52": 1.4362694859504699,
"ce_loss_7": 3.8114282488822937,
"epoch": 0.33,
"grad_norm": 22.77133190654901,
"kl_loss_13": 4268.0,
"kl_loss_26": 3223.6,
"kl_loss_39": 2069.4,
"kl_loss_7": 4882.4,
"learning_rate": 0.0007636127338052513,
"loss": 7443.1,
"step": 3300
},
{
"ce_loss_13": 3.5485077798366547,
"ce_loss_26": 3.0334243774414062,
"ce_loss_39": 2.471779704093933,
"ce_loss_52": 1.4008762776851653,
"ce_loss_7": 3.860434752702713,
"epoch": 0.331,
"grad_norm": 22.94544302407564,
"kl_loss_13": 4425.6,
"kl_loss_26": 3344.4,
"kl_loss_39": 2154.8,
"kl_loss_7": 5076.8,
"learning_rate": 0.0007622631809165971,
"loss": 7403.2,
"step": 3310
},
{
"ce_loss_13": 3.611973536014557,
"ce_loss_26": 3.1062149882316588,
"ce_loss_39": 2.5540910184383394,
"ce_loss_52": 1.4812486261129378,
"ce_loss_7": 3.913082367181778,
"epoch": 0.332,
"grad_norm": 21.844212510164496,
"kl_loss_13": 4407.2,
"kl_loss_26": 3357.6,
"kl_loss_39": 2168.2,
"kl_loss_7": 5035.2,
"learning_rate": 0.000760910987040623,
"loss": 7436.1,
"step": 3320
},
{
"ce_loss_13": 3.500666618347168,
"ce_loss_26": 2.992197906970978,
"ce_loss_39": 2.443836176395416,
"ce_loss_52": 1.4172912210226059,
"ce_loss_7": 3.809192955493927,
"epoch": 0.333,
"grad_norm": 22.341971135877618,
"kl_loss_13": 4287.2,
"kl_loss_26": 3229.2,
"kl_loss_39": 2050.8,
"kl_loss_7": 4934.4,
"learning_rate": 0.000759556165793906,
"loss": 7354.8,
"step": 3330
},
{
"ce_loss_13": 3.572561663389206,
"ce_loss_26": 3.0666925728321077,
"ce_loss_39": 2.5258089125156404,
"ce_loss_52": 1.4658059388399125,
"ce_loss_7": 3.8703009307384493,
"epoch": 0.334,
"grad_norm": 20.585398734523825,
"kl_loss_13": 4346.0,
"kl_loss_26": 3292.8,
"kl_loss_39": 2121.6,
"kl_loss_7": 4966.4,
"learning_rate": 0.000758198730819481,
"loss": 7376.9,
"step": 3340
},
{
"ce_loss_13": 3.5921706318855287,
"ce_loss_26": 3.0865486025810243,
"ce_loss_39": 2.5250521272420885,
"ce_loss_52": 1.4276385620236396,
"ce_loss_7": 3.9076746106147766,
"epoch": 0.335,
"grad_norm": 22.48300338159267,
"kl_loss_13": 4447.2,
"kl_loss_26": 3387.2,
"kl_loss_39": 2194.8,
"kl_loss_7": 5095.2,
"learning_rate": 0.0007568386957867032,
"loss": 7407.2,
"step": 3350
},
{
"ce_loss_13": 3.5395087361335755,
"ce_loss_26": 3.0476285994052885,
"ce_loss_39": 2.5039754688739775,
"ce_loss_52": 1.4519936561584472,
"ce_loss_7": 3.8360753774642946,
"epoch": 0.336,
"grad_norm": 22.282680621594952,
"kl_loss_13": 4315.6,
"kl_loss_26": 3284.8,
"kl_loss_39": 2115.8,
"kl_loss_7": 4935.2,
"learning_rate": 0.0007554760743911103,
"loss": 7349.9,
"step": 3360
},
{
"ce_loss_13": 3.5341054499149323,
"ce_loss_26": 3.0255552768707275,
"ce_loss_39": 2.4795517563819884,
"ce_loss_52": 1.43424501568079,
"ce_loss_7": 3.8406670331954955,
"epoch": 0.337,
"grad_norm": 21.82655281544531,
"kl_loss_13": 4317.2,
"kl_loss_26": 3248.8,
"kl_loss_39": 2081.8,
"kl_loss_7": 4959.2,
"learning_rate": 0.0007541108803542846,
"loss": 7352.9,
"step": 3370
},
{
"ce_loss_13": 3.571430027484894,
"ce_loss_26": 3.0674545526504517,
"ce_loss_39": 2.5179436981678007,
"ce_loss_52": 1.4572079569101333,
"ce_loss_7": 3.8665721654891967,
"epoch": 0.338,
"grad_norm": 20.283540782731166,
"kl_loss_13": 4343.6,
"kl_loss_26": 3291.2,
"kl_loss_39": 2129.4,
"kl_loss_7": 4966.4,
"learning_rate": 0.0007527431274237149,
"loss": 7371.7,
"step": 3380
},
{
"ce_loss_13": 3.53710196018219,
"ce_loss_26": 3.0354479968547823,
"ce_loss_39": 2.4885441571474076,
"ce_loss_52": 1.4400692582130432,
"ce_loss_7": 3.8393473029136658,
"epoch": 0.339,
"grad_norm": 21.387970982998613,
"kl_loss_13": 4311.6,
"kl_loss_26": 3256.4,
"kl_loss_39": 2099.8,
"kl_loss_7": 4936.0,
"learning_rate": 0.0007513728293726579,
"loss": 7294.4,
"step": 3390
},
{
"ce_loss_13": 3.523770880699158,
"ce_loss_26": 3.0170785784721375,
"ce_loss_39": 2.4583947211503983,
"ce_loss_52": 1.43256463855505,
"ce_loss_7": 3.822116255760193,
"epoch": 0.34,
"grad_norm": 21.22690089148789,
"kl_loss_13": 4311.6,
"kl_loss_26": 3260.0,
"kl_loss_39": 2066.6,
"kl_loss_7": 4940.0,
"learning_rate": 0.00075,
"loss": 7289.9,
"step": 3400
},
{
"ce_loss_13": 3.495673859119415,
"ce_loss_26": 2.9982276618480683,
"ce_loss_39": 2.444023036956787,
"ce_loss_52": 1.4111162751913071,
"ce_loss_7": 3.805855029821396,
"epoch": 0.341,
"grad_norm": 20.68401947266934,
"kl_loss_13": 4286.0,
"kl_loss_26": 3244.8,
"kl_loss_39": 2061.6,
"kl_loss_7": 4924.4,
"learning_rate": 0.0007486246531301177,
"loss": 7295.1,
"step": 3410
},
{
"ce_loss_13": 3.532993698120117,
"ce_loss_26": 3.0362183272838594,
"ce_loss_39": 2.485447385907173,
"ce_loss_52": 1.4526691198349,
"ce_loss_7": 3.830386519432068,
"epoch": 0.342,
"grad_norm": 22.222401081185772,
"kl_loss_13": 4299.6,
"kl_loss_26": 3249.2,
"kl_loss_39": 2080.4,
"kl_loss_7": 4928.0,
"learning_rate": 0.0007472468026127384,
"loss": 7341.7,
"step": 3420
},
{
"ce_loss_13": 3.463904342055321,
"ce_loss_26": 2.9640887469053268,
"ce_loss_39": 2.421750417351723,
"ce_loss_52": 1.4137367144227029,
"ce_loss_7": 3.7643026977777483,
"epoch": 0.343,
"grad_norm": 21.63494224797145,
"kl_loss_13": 4250.8,
"kl_loss_26": 3196.2,
"kl_loss_39": 2039.9,
"kl_loss_7": 4881.6,
"learning_rate": 0.000745866462322802,
"loss": 7230.95,
"step": 3430
},
{
"ce_loss_13": 3.5996195137500764,
"ce_loss_26": 3.093309980630875,
"ce_loss_39": 2.5379314005374907,
"ce_loss_52": 1.4943716078996658,
"ce_loss_7": 3.899878454208374,
"epoch": 0.344,
"grad_norm": 23.12022506478991,
"kl_loss_13": 4332.0,
"kl_loss_26": 3268.0,
"kl_loss_39": 2085.8,
"kl_loss_7": 4968.0,
"learning_rate": 0.0007444836461603195,
"loss": 7294.5,
"step": 3440
},
{
"ce_loss_13": 3.460267198085785,
"ce_loss_26": 2.979328769445419,
"ce_loss_39": 2.435830682516098,
"ce_loss_52": 1.409597858786583,
"ce_loss_7": 3.7637628614902496,
"epoch": 0.345,
"grad_norm": 22.11179077444158,
"kl_loss_13": 4240.0,
"kl_loss_26": 3220.0,
"kl_loss_39": 2052.4,
"kl_loss_7": 4872.8,
"learning_rate": 0.0007430983680502344,
"loss": 7260.3,
"step": 3450
},
{
"ce_loss_13": 3.484216260910034,
"ce_loss_26": 2.986140418052673,
"ce_loss_39": 2.4419540107250213,
"ce_loss_52": 1.423793789744377,
"ce_loss_7": 3.783113992214203,
"epoch": 0.346,
"grad_norm": 21.48292293909848,
"kl_loss_13": 4242.0,
"kl_loss_26": 3200.0,
"kl_loss_39": 2037.4,
"kl_loss_7": 4868.0,
"learning_rate": 0.0007417106419422819,
"loss": 7210.1,
"step": 3460
},
{
"ce_loss_13": 3.4797898173332213,
"ce_loss_26": 2.9807622492313386,
"ce_loss_39": 2.43132506608963,
"ce_loss_52": 1.4044816851615907,
"ce_loss_7": 3.782477653026581,
"epoch": 0.347,
"grad_norm": 21.84069780374938,
"kl_loss_13": 4286.0,
"kl_loss_26": 3248.0,
"kl_loss_39": 2073.6,
"kl_loss_7": 4917.6,
"learning_rate": 0.0007403204818108486,
"loss": 7232.3,
"step": 3470
},
{
"ce_loss_13": 3.461978626251221,
"ce_loss_26": 2.971747863292694,
"ce_loss_39": 2.4197792381048204,
"ce_loss_52": 1.411722904443741,
"ce_loss_7": 3.7553564965724946,
"epoch": 0.348,
"grad_norm": 20.773261789734953,
"kl_loss_13": 4206.0,
"kl_loss_26": 3177.2,
"kl_loss_39": 2006.4,
"kl_loss_7": 4823.2,
"learning_rate": 0.0007389279016548316,
"loss": 7200.0,
"step": 3480
},
{
"ce_loss_13": 3.412698417901993,
"ce_loss_26": 2.910679543018341,
"ce_loss_39": 2.3608890056610106,
"ce_loss_52": 1.3876498267054558,
"ce_loss_7": 3.7148698210716247,
"epoch": 0.349,
"grad_norm": 21.05607269401212,
"kl_loss_13": 4174.0,
"kl_loss_26": 3125.6,
"kl_loss_39": 1966.0,
"kl_loss_7": 4804.0,
"learning_rate": 0.0007375329154974975,
"loss": 7216.6,
"step": 3490
},
{
"ce_loss_13": 3.474409651756287,
"ce_loss_26": 2.9683692157268524,
"ce_loss_39": 2.418835300207138,
"ce_loss_52": 1.4043258875608444,
"ce_loss_7": 3.774983435869217,
"epoch": 0.35,
"grad_norm": 20.352131735407625,
"kl_loss_13": 4246.8,
"kl_loss_26": 3202.8,
"kl_loss_39": 2042.2,
"kl_loss_7": 4875.2,
"learning_rate": 0.0007361355373863414,
"loss": 7202.7,
"step": 3500
},
{
"ce_loss_13": 3.4584279537200926,
"ce_loss_26": 2.9674349963665008,
"ce_loss_39": 2.422105145454407,
"ce_loss_52": 1.4255526602268218,
"ce_loss_7": 3.767817974090576,
"epoch": 0.351,
"grad_norm": 20.416274052366226,
"kl_loss_13": 4216.8,
"kl_loss_26": 3188.8,
"kl_loss_39": 2018.4,
"kl_loss_7": 4854.4,
"learning_rate": 0.0007347357813929454,
"loss": 7180.1,
"step": 3510
},
{
"ce_loss_13": 3.4778851926326753,
"ce_loss_26": 2.979369193315506,
"ce_loss_39": 2.4347113519906998,
"ce_loss_52": 1.4163423389196397,
"ce_loss_7": 3.7732009410858156,
"epoch": 0.352,
"grad_norm": 24.260880475347793,
"kl_loss_13": 4219.6,
"kl_loss_26": 3191.6,
"kl_loss_39": 2033.6,
"kl_loss_7": 4844.0,
"learning_rate": 0.0007333336616128369,
"loss": 7181.8,
"step": 3520
},
{
"ce_loss_13": 3.479642480611801,
"ce_loss_26": 2.9858607232570646,
"ce_loss_39": 2.429011595249176,
"ce_loss_52": 1.4224095463752746,
"ce_loss_7": 3.779256856441498,
"epoch": 0.353,
"grad_norm": 20.532035835107088,
"kl_loss_13": 4211.6,
"kl_loss_26": 3185.2,
"kl_loss_39": 2008.4,
"kl_loss_7": 4841.6,
"learning_rate": 0.0007319291921653463,
"loss": 7183.4,
"step": 3530
},
{
"ce_loss_13": 3.4610216915607452,
"ce_loss_26": 2.962309718132019,
"ce_loss_39": 2.410178878903389,
"ce_loss_52": 1.4115030318498611,
"ce_loss_7": 3.757075273990631,
"epoch": 0.354,
"grad_norm": 23.640729954280236,
"kl_loss_13": 4236.8,
"kl_loss_26": 3190.0,
"kl_loss_39": 2013.0,
"kl_loss_7": 4864.8,
"learning_rate": 0.0007305223871934656,
"loss": 7181.4,
"step": 3540
},
{
"ce_loss_13": 3.5113906443119047,
"ce_loss_26": 3.016271597146988,
"ce_loss_39": 2.4817953169345857,
"ce_loss_52": 1.4761293560266495,
"ce_loss_7": 3.79822900891304,
"epoch": 0.355,
"grad_norm": 22.47350392427222,
"kl_loss_13": 4203.2,
"kl_loss_26": 3159.6,
"kl_loss_39": 2013.8,
"kl_loss_7": 4804.8,
"learning_rate": 0.0007291132608637052,
"loss": 7117.3,
"step": 3550
},
{
"ce_loss_13": 3.51672882437706,
"ce_loss_26": 3.017621088027954,
"ce_loss_39": 2.471283310651779,
"ce_loss_52": 1.4747960895299912,
"ce_loss_7": 3.8214517176151275,
"epoch": 0.356,
"grad_norm": 22.39492370466417,
"kl_loss_13": 4220.0,
"kl_loss_26": 3165.2,
"kl_loss_39": 2000.0,
"kl_loss_7": 4852.8,
"learning_rate": 0.0007277018273659516,
"loss": 7133.8,
"step": 3560
},
{
"ce_loss_13": 3.5502862453460695,
"ce_loss_26": 3.05435825586319,
"ce_loss_39": 2.5157745271921157,
"ce_loss_52": 1.4953802406787873,
"ce_loss_7": 3.8426124274730684,
"epoch": 0.357,
"grad_norm": 22.306092146539875,
"kl_loss_13": 4247.6,
"kl_loss_26": 3216.4,
"kl_loss_39": 2067.2,
"kl_loss_7": 4859.2,
"learning_rate": 0.0007262881009133242,
"loss": 7135.8,
"step": 3570
},
{
"ce_loss_13": 3.454521042108536,
"ce_loss_26": 2.9510986149311065,
"ce_loss_39": 2.4084193408489227,
"ce_loss_52": 1.4200605943799018,
"ce_loss_7": 3.7586602210998534,
"epoch": 0.358,
"grad_norm": 21.121962853812185,
"kl_loss_13": 4202.8,
"kl_loss_26": 3144.4,
"kl_loss_39": 1986.2,
"kl_loss_7": 4836.0,
"learning_rate": 0.0007248720957420329,
"loss": 7135.9,
"step": 3580
},
{
"ce_loss_13": 3.4299929022789,
"ce_loss_26": 2.9262389481067657,
"ce_loss_39": 2.3823306292295454,
"ce_loss_52": 1.4015884697437286,
"ce_loss_7": 3.7345054388046264,
"epoch": 0.359,
"grad_norm": 21.87103253394757,
"kl_loss_13": 4194.4,
"kl_loss_26": 3138.0,
"kl_loss_39": 1982.0,
"kl_loss_7": 4829.6,
"learning_rate": 0.0007234538261112341,
"loss": 7056.9,
"step": 3590
},
{
"ce_loss_13": 3.47941969037056,
"ce_loss_26": 2.9830207943916323,
"ce_loss_39": 2.429542663693428,
"ce_loss_52": 1.4434623152017594,
"ce_loss_7": 3.7679139375686646,
"epoch": 0.36,
"grad_norm": 21.216900982885303,
"kl_loss_13": 4199.2,
"kl_loss_26": 3151.6,
"kl_loss_39": 1989.4,
"kl_loss_7": 4814.0,
"learning_rate": 0.0007220333063028871,
"loss": 7096.6,
"step": 3600
},
{
"ce_loss_13": 3.3754841923713683,
"ce_loss_26": 2.8831639885902405,
"ce_loss_39": 2.3438637793064117,
"ce_loss_52": 1.3872641950845719,
"ce_loss_7": 3.681108373403549,
"epoch": 0.361,
"grad_norm": 21.80171673212867,
"kl_loss_13": 4118.8,
"kl_loss_26": 3083.2,
"kl_loss_39": 1941.8,
"kl_loss_7": 4758.0,
"learning_rate": 0.0007206105506216106,
"loss": 7029.4,
"step": 3610
},
{
"ce_loss_13": 3.548617047071457,
"ce_loss_26": 3.054288852214813,
"ce_loss_39": 2.504137873649597,
"ce_loss_52": 1.4843981340527534,
"ce_loss_7": 3.850842350721359,
"epoch": 0.362,
"grad_norm": 21.588123109222046,
"kl_loss_13": 4246.4,
"kl_loss_26": 3203.6,
"kl_loss_39": 2032.4,
"kl_loss_7": 4870.4,
"learning_rate": 0.0007191855733945387,
"loss": 7126.1,
"step": 3620
},
{
"ce_loss_13": 3.469277936220169,
"ce_loss_26": 2.985336202383041,
"ce_loss_39": 2.4588325411081313,
"ce_loss_52": 1.4790852904319762,
"ce_loss_7": 3.764431744813919,
"epoch": 0.363,
"grad_norm": 22.204733809635066,
"kl_loss_13": 4129.2,
"kl_loss_26": 3111.6,
"kl_loss_39": 1976.2,
"kl_loss_7": 4744.4,
"learning_rate": 0.0007177583889711762,
"loss": 7054.3,
"step": 3630
},
{
"ce_loss_13": 3.442378747463226,
"ce_loss_26": 2.9400178849697114,
"ce_loss_39": 2.394621509313583,
"ce_loss_52": 1.417646163702011,
"ce_loss_7": 3.743503212928772,
"epoch": 0.364,
"grad_norm": 21.957965313010384,
"kl_loss_13": 4163.6,
"kl_loss_26": 3123.2,
"kl_loss_39": 1959.6,
"kl_loss_7": 4796.0,
"learning_rate": 0.0007163290117232541,
"loss": 7054.5,
"step": 3640
},
{
"ce_loss_13": 3.4392197132110596,
"ce_loss_26": 2.951560914516449,
"ce_loss_39": 2.4180801689624785,
"ce_loss_52": 1.4459613859653473,
"ce_loss_7": 3.7318799614906313,
"epoch": 0.365,
"grad_norm": 21.451785041659093,
"kl_loss_13": 4123.6,
"kl_loss_26": 3098.8,
"kl_loss_39": 1968.6,
"kl_loss_7": 4730.8,
"learning_rate": 0.0007148974560445859,
"loss": 7029.7,
"step": 3650
},
{
"ce_loss_13": 3.4576940476894378,
"ce_loss_26": 2.964629900455475,
"ce_loss_39": 2.4162339717149734,
"ce_loss_52": 1.4277015537023545,
"ce_loss_7": 3.7549045085906982,
"epoch": 0.366,
"grad_norm": 22.817794972487214,
"kl_loss_13": 4165.6,
"kl_loss_26": 3135.6,
"kl_loss_39": 1970.0,
"kl_loss_7": 4792.8,
"learning_rate": 0.0007134637363509209,
"loss": 7013.0,
"step": 3660
},
{
"ce_loss_13": 3.5099750757217407,
"ce_loss_26": 3.0163159906864165,
"ce_loss_39": 2.4729519367218016,
"ce_loss_52": 1.4603912830352783,
"ce_loss_7": 3.8057311475276947,
"epoch": 0.367,
"grad_norm": 21.693779714382707,
"kl_loss_13": 4227.6,
"kl_loss_26": 3203.2,
"kl_loss_39": 2043.8,
"kl_loss_7": 4848.0,
"learning_rate": 0.0007120278670798009,
"loss": 7024.2,
"step": 3670
},
{
"ce_loss_13": 3.4791980743408204,
"ce_loss_26": 2.992640608549118,
"ce_loss_39": 2.4573631793260575,
"ce_loss_52": 1.461830335855484,
"ce_loss_7": 3.7739274382591246,
"epoch": 0.368,
"grad_norm": 22.105670609070703,
"kl_loss_13": 4126.8,
"kl_loss_26": 3112.4,
"kl_loss_39": 1978.0,
"kl_loss_7": 4751.2,
"learning_rate": 0.0007105898626904133,
"loss": 6924.7,
"step": 3680
},
{
"ce_loss_13": 3.4017152190208435,
"ce_loss_26": 2.910390090942383,
"ce_loss_39": 2.374891012907028,
"ce_loss_52": 1.41865316927433,
"ce_loss_7": 3.6938551664352417,
"epoch": 0.369,
"grad_norm": 20.08460323542704,
"kl_loss_13": 4101.6,
"kl_loss_26": 3071.6,
"kl_loss_39": 1938.0,
"kl_loss_7": 4706.8,
"learning_rate": 0.0007091497376634463,
"loss": 6952.1,
"step": 3690
},
{
"ce_loss_13": 3.4051457762718202,
"ce_loss_26": 2.9142957627773285,
"ce_loss_39": 2.3769975334405897,
"ce_loss_52": 1.4461660206317901,
"ce_loss_7": 3.7018753468990324,
"epoch": 0.37,
"grad_norm": 21.75095718081699,
"kl_loss_13": 4034.8,
"kl_loss_26": 3006.4,
"kl_loss_39": 1877.2,
"kl_loss_7": 4647.6,
"learning_rate": 0.0007077075065009433,
"loss": 6973.3,
"step": 3700
},
{
"ce_loss_13": 3.407693642377853,
"ce_loss_26": 2.9163559854030607,
"ce_loss_39": 2.3682307243347167,
"ce_loss_52": 1.3965442717075347,
"ce_loss_7": 3.7011303901672363,
"epoch": 0.371,
"grad_norm": 21.90346982831121,
"kl_loss_13": 4126.8,
"kl_loss_26": 3098.4,
"kl_loss_39": 1951.8,
"kl_loss_7": 4740.4,
"learning_rate": 0.0007062631837261557,
"loss": 6968.9,
"step": 3710
},
{
"ce_loss_13": 3.4375191271305083,
"ce_loss_26": 2.9387122094631195,
"ce_loss_39": 2.4049195408821107,
"ce_loss_52": 1.4491820633411407,
"ce_loss_7": 3.7319699347019197,
"epoch": 0.372,
"grad_norm": 22.15813884607567,
"kl_loss_13": 4096.8,
"kl_loss_26": 3062.8,
"kl_loss_39": 1923.8,
"kl_loss_7": 4716.0,
"learning_rate": 0.0007048167838833977,
"loss": 6892.9,
"step": 3720
},
{
"ce_loss_13": 3.443445736169815,
"ce_loss_26": 2.9415276020765306,
"ce_loss_39": 2.3946647971868513,
"ce_loss_52": 1.4324263527989387,
"ce_loss_7": 3.7482150912284853,
"epoch": 0.373,
"grad_norm": 20.533639539523726,
"kl_loss_13": 4160.0,
"kl_loss_26": 3107.6,
"kl_loss_39": 1932.2,
"kl_loss_7": 4794.8,
"learning_rate": 0.0007033683215379002,
"loss": 6994.9,
"step": 3730
},
{
"ce_loss_13": 3.440624713897705,
"ce_loss_26": 2.932874071598053,
"ce_loss_39": 2.384758135676384,
"ce_loss_52": 1.4341223761439323,
"ce_loss_7": 3.7404758751392366,
"epoch": 0.374,
"grad_norm": 22.169142032653717,
"kl_loss_13": 4178.4,
"kl_loss_26": 3120.4,
"kl_loss_39": 1940.6,
"kl_loss_7": 4809.6,
"learning_rate": 0.0007019178112756625,
"loss": 6960.1,
"step": 3740
},
{
"ce_loss_13": 3.479549217224121,
"ce_loss_26": 2.9821768522262575,
"ce_loss_39": 2.4379994481801988,
"ce_loss_52": 1.4455731570720673,
"ce_loss_7": 3.779719626903534,
"epoch": 0.375,
"grad_norm": 22.88722443184811,
"kl_loss_13": 4190.8,
"kl_loss_26": 3160.8,
"kl_loss_39": 2000.2,
"kl_loss_7": 4822.4,
"learning_rate": 0.0007004652677033068,
"loss": 6922.4,
"step": 3750
},
{
"ce_loss_13": 3.5048573672771455,
"ce_loss_26": 2.996897077560425,
"ce_loss_39": 2.4514291107654573,
"ce_loss_52": 1.4677145808935166,
"ce_loss_7": 3.8070162892341615,
"epoch": 0.376,
"grad_norm": 20.379791798469622,
"kl_loss_13": 4200.4,
"kl_loss_26": 3149.2,
"kl_loss_39": 1988.0,
"kl_loss_7": 4828.8,
"learning_rate": 0.0006990107054479312,
"loss": 6948.5,
"step": 3760
},
{
"ce_loss_13": 3.3857189416885376,
"ce_loss_26": 2.8971070766448976,
"ce_loss_39": 2.3661463767290116,
"ce_loss_52": 1.4227028042078018,
"ce_loss_7": 3.679090714454651,
"epoch": 0.377,
"grad_norm": 21.179119667844127,
"kl_loss_13": 4050.8,
"kl_loss_26": 3030.4,
"kl_loss_39": 1892.4,
"kl_loss_7": 4666.8,
"learning_rate": 0.000697554139156961,
"loss": 6941.0,
"step": 3770
},
{
"ce_loss_13": 3.512388813495636,
"ce_loss_26": 3.0138610899448395,
"ce_loss_39": 2.4606133818626406,
"ce_loss_52": 1.4959001630544662,
"ce_loss_7": 3.807480573654175,
"epoch": 0.378,
"grad_norm": 22.362162135534977,
"kl_loss_13": 4145.2,
"kl_loss_26": 3107.2,
"kl_loss_39": 1951.0,
"kl_loss_7": 4758.0,
"learning_rate": 0.0006960955834980027,
"loss": 6874.7,
"step": 3780
},
{
"ce_loss_13": 3.411291944980621,
"ce_loss_26": 2.907497102022171,
"ce_loss_39": 2.355439043045044,
"ce_loss_52": 1.4057017982006073,
"ce_loss_7": 3.7079379081726076,
"epoch": 0.379,
"grad_norm": 20.519862733845507,
"kl_loss_13": 4121.2,
"kl_loss_26": 3082.0,
"kl_loss_39": 1917.0,
"kl_loss_7": 4746.0,
"learning_rate": 0.0006946350531586958,
"loss": 6891.8,
"step": 3790
},
{
"ce_loss_13": 3.365473288297653,
"ce_loss_26": 2.8626536786556245,
"ce_loss_39": 2.3194735169410707,
"ce_loss_52": 1.3925662517547608,
"ce_loss_7": 3.661379265785217,
"epoch": 0.38,
"grad_norm": 21.211701089479526,
"kl_loss_13": 4084.0,
"kl_loss_26": 3048.4,
"kl_loss_39": 1891.2,
"kl_loss_7": 4702.0,
"learning_rate": 0.0006931725628465643,
"loss": 6889.0,
"step": 3800
},
{
"ce_loss_13": 3.375334745645523,
"ce_loss_26": 2.891470319032669,
"ce_loss_39": 2.3472714513540267,
"ce_loss_52": 1.4055952280759811,
"ce_loss_7": 3.667448806762695,
"epoch": 0.381,
"grad_norm": 22.083234786813115,
"kl_loss_13": 4056.4,
"kl_loss_26": 3038.0,
"kl_loss_39": 1887.0,
"kl_loss_7": 4669.2,
"learning_rate": 0.0006917081272888696,
"loss": 6821.1,
"step": 3810
},
{
"ce_loss_13": 3.413332349061966,
"ce_loss_26": 2.918791648745537,
"ce_loss_39": 2.3821532160043715,
"ce_loss_52": 1.426029135286808,
"ce_loss_7": 3.70468533039093,
"epoch": 0.382,
"grad_norm": 21.70379003852508,
"kl_loss_13": 4066.4,
"kl_loss_26": 3040.8,
"kl_loss_39": 1897.2,
"kl_loss_7": 4676.0,
"learning_rate": 0.0006902417612324615,
"loss": 6817.3,
"step": 3820
},
{
"ce_loss_13": 3.448424202203751,
"ce_loss_26": 2.9532420337200165,
"ce_loss_39": 2.389399054646492,
"ce_loss_52": 1.4127800971269608,
"ce_loss_7": 3.7562515437602997,
"epoch": 0.383,
"grad_norm": 22.611090782774035,
"kl_loss_13": 4219.6,
"kl_loss_26": 3187.2,
"kl_loss_39": 2000.4,
"kl_loss_7": 4858.4,
"learning_rate": 0.00068877347944363,
"loss": 6892.2,
"step": 3830
},
{
"ce_loss_13": 3.42295760512352,
"ce_loss_26": 2.926942157745361,
"ce_loss_39": 2.3873476177453994,
"ce_loss_52": 1.439600521326065,
"ce_loss_7": 3.719656354188919,
"epoch": 0.384,
"grad_norm": 20.756696465620674,
"kl_loss_13": 4109.6,
"kl_loss_26": 3061.2,
"kl_loss_39": 1912.2,
"kl_loss_7": 4727.6,
"learning_rate": 0.0006873032967079561,
"loss": 6876.7,
"step": 3840
},
{
"ce_loss_13": 3.4390079021453857,
"ce_loss_26": 2.9532729268074034,
"ce_loss_39": 2.4051371097564695,
"ce_loss_52": 1.446793320775032,
"ce_loss_7": 3.740493839979172,
"epoch": 0.385,
"grad_norm": 20.683464166323773,
"kl_loss_13": 4102.8,
"kl_loss_26": 3080.8,
"kl_loss_39": 1919.2,
"kl_loss_7": 4729.6,
"learning_rate": 0.0006858312278301637,
"loss": 6878.2,
"step": 3850
},
{
"ce_loss_13": 3.368044465780258,
"ce_loss_26": 2.8783551871776583,
"ce_loss_39": 2.35613272190094,
"ce_loss_52": 1.4377225756645202,
"ce_loss_7": 3.657758867740631,
"epoch": 0.386,
"grad_norm": 22.01788101919845,
"kl_loss_13": 3983.6,
"kl_loss_26": 2964.0,
"kl_loss_39": 1838.2,
"kl_loss_7": 4592.4,
"learning_rate": 0.0006843572876339704,
"loss": 6809.2,
"step": 3860
},
{
"ce_loss_13": 3.3198332667350767,
"ce_loss_26": 2.8321444630622863,
"ce_loss_39": 2.2942576706409454,
"ce_loss_52": 1.394131037592888,
"ce_loss_7": 3.6043868601322173,
"epoch": 0.387,
"grad_norm": 23.448962125354107,
"kl_loss_13": 3965.6,
"kl_loss_26": 2956.0,
"kl_loss_39": 1822.8,
"kl_loss_7": 4560.0,
"learning_rate": 0.0006828814909619373,
"loss": 6798.0,
"step": 3870
},
{
"ce_loss_13": 3.3714381575584413,
"ce_loss_26": 2.88772599697113,
"ce_loss_39": 2.3584464609622957,
"ce_loss_52": 1.4422439962625504,
"ce_loss_7": 3.662185198068619,
"epoch": 0.388,
"grad_norm": 22.031425075321017,
"kl_loss_13": 3995.6,
"kl_loss_26": 2987.6,
"kl_loss_39": 1858.6,
"kl_loss_7": 4603.2,
"learning_rate": 0.0006814038526753205,
"loss": 6790.2,
"step": 3880
},
{
"ce_loss_13": 3.4292624831199645,
"ce_loss_26": 2.9387787103652956,
"ce_loss_39": 2.392472392320633,
"ce_loss_52": 1.458945381641388,
"ce_loss_7": 3.722714525461197,
"epoch": 0.389,
"grad_norm": 21.623492145702706,
"kl_loss_13": 4048.0,
"kl_loss_26": 3026.8,
"kl_loss_39": 1877.2,
"kl_loss_7": 4660.8,
"learning_rate": 0.0006799243876539213,
"loss": 6774.2,
"step": 3890
},
{
"ce_loss_13": 3.398139035701752,
"ce_loss_26": 2.903027367591858,
"ce_loss_39": 2.351736932992935,
"ce_loss_52": 1.4211576133966446,
"ce_loss_7": 3.6922240018844605,
"epoch": 0.39,
"grad_norm": 20.862431117162615,
"kl_loss_13": 4048.8,
"kl_loss_26": 3010.0,
"kl_loss_39": 1853.6,
"kl_loss_7": 4666.4,
"learning_rate": 0.0006784431107959359,
"loss": 6774.2,
"step": 3900
},
{
"ce_loss_13": 3.442742919921875,
"ce_loss_26": 2.950921058654785,
"ce_loss_39": 2.407829362154007,
"ce_loss_52": 1.4671964168548584,
"ce_loss_7": 3.7354746580123903,
"epoch": 0.391,
"grad_norm": 22.203660063445458,
"kl_loss_13": 4065.6,
"kl_loss_26": 3044.4,
"kl_loss_39": 1893.8,
"kl_loss_7": 4682.8,
"learning_rate": 0.0006769600370178059,
"loss": 6751.0,
"step": 3910
},
{
"ce_loss_13": 3.3321305394172667,
"ce_loss_26": 2.8416285693645476,
"ce_loss_39": 2.3153179585933685,
"ce_loss_52": 1.3940225571393967,
"ce_loss_7": 3.628729373216629,
"epoch": 0.392,
"grad_norm": 20.321578181458975,
"kl_loss_13": 3993.6,
"kl_loss_26": 2972.0,
"kl_loss_39": 1845.6,
"kl_loss_7": 4610.8,
"learning_rate": 0.0006754751812540679,
"loss": 6716.4,
"step": 3920
},
{
"ce_loss_13": 3.382316732406616,
"ce_loss_26": 2.8903696179389953,
"ce_loss_39": 2.353957489132881,
"ce_loss_52": 1.433479717373848,
"ce_loss_7": 3.671325671672821,
"epoch": 0.393,
"grad_norm": 21.271101889195098,
"kl_loss_13": 4022.4,
"kl_loss_26": 2998.4,
"kl_loss_39": 1861.0,
"kl_loss_7": 4629.6,
"learning_rate": 0.0006739885584572025,
"loss": 6776.3,
"step": 3930
},
{
"ce_loss_13": 3.300927424430847,
"ce_loss_26": 2.820311403274536,
"ce_loss_39": 2.2964976727962494,
"ce_loss_52": 1.4115092635154725,
"ce_loss_7": 3.5904260516166686,
"epoch": 0.394,
"grad_norm": 20.82909920071906,
"kl_loss_13": 3946.8,
"kl_loss_26": 2937.6,
"kl_loss_39": 1809.0,
"kl_loss_7": 4550.0,
"learning_rate": 0.0006725001835974853,
"loss": 6768.3,
"step": 3940
},
{
"ce_loss_13": 3.3879170179367066,
"ce_loss_26": 2.897449654340744,
"ce_loss_39": 2.3513225704431533,
"ce_loss_52": 1.423241639137268,
"ce_loss_7": 3.6786913871765137,
"epoch": 0.395,
"grad_norm": 21.851496416724263,
"kl_loss_13": 4040.0,
"kl_loss_26": 3011.6,
"kl_loss_39": 1852.2,
"kl_loss_7": 4646.4,
"learning_rate": 0.0006710100716628344,
"loss": 6704.8,
"step": 3950
},
{
"ce_loss_13": 3.365324836969376,
"ce_loss_26": 2.8627363234758376,
"ce_loss_39": 2.30602003633976,
"ce_loss_52": 1.3922662898898124,
"ce_loss_7": 3.670176440477371,
"epoch": 0.396,
"grad_norm": 19.95099507420605,
"kl_loss_13": 4070.0,
"kl_loss_26": 3011.6,
"kl_loss_39": 1834.6,
"kl_loss_7": 4694.8,
"learning_rate": 0.0006695182376586602,
"loss": 6737.9,
"step": 3960
},
{
"ce_loss_13": 3.3035158634185793,
"ce_loss_26": 2.820575511455536,
"ce_loss_39": 2.2792143374681473,
"ce_loss_52": 1.3631332144141197,
"ce_loss_7": 3.5984981656074524,
"epoch": 0.397,
"grad_norm": 21.419651305345628,
"kl_loss_13": 4016.4,
"kl_loss_26": 2992.4,
"kl_loss_39": 1851.6,
"kl_loss_7": 4639.2,
"learning_rate": 0.000668024696607715,
"loss": 6659.1,
"step": 3970
},
{
"ce_loss_13": 3.2616395235061644,
"ce_loss_26": 2.784494936466217,
"ce_loss_39": 2.2700316429138185,
"ce_loss_52": 1.3974194526672363,
"ce_loss_7": 3.548482429981232,
"epoch": 0.398,
"grad_norm": 20.984610674219567,
"kl_loss_13": 3844.8,
"kl_loss_26": 2848.4,
"kl_loss_39": 1755.2,
"kl_loss_7": 4441.6,
"learning_rate": 0.0006665294635499404,
"loss": 6600.0,
"step": 3980
},
{
"ce_loss_13": 3.3228425204753878,
"ce_loss_26": 2.836167597770691,
"ce_loss_39": 2.3086318761110305,
"ce_loss_52": 1.431598064303398,
"ce_loss_7": 3.617774724960327,
"epoch": 0.399,
"grad_norm": 20.48469634280121,
"kl_loss_13": 3879.6,
"kl_loss_26": 2876.0,
"kl_loss_39": 1769.2,
"kl_loss_7": 4495.2,
"learning_rate": 0.0006650325535423167,
"loss": 6653.5,
"step": 3990
},
{
"ce_loss_13": 3.3134547114372253,
"ce_loss_26": 2.8307457506656646,
"ce_loss_39": 2.29368577003479,
"ce_loss_52": 1.3969372153282165,
"ce_loss_7": 3.6135133028030397,
"epoch": 0.4,
"grad_norm": 21.23260680511818,
"kl_loss_13": 3962.8,
"kl_loss_26": 2951.2,
"kl_loss_39": 1800.2,
"kl_loss_7": 4588.0,
"learning_rate": 0.0006635339816587109,
"loss": 6715.2,
"step": 4000
},
{
"ce_loss_13": 3.478778451681137,
"ce_loss_26": 2.984225571155548,
"ce_loss_39": 2.432309350371361,
"ce_loss_52": 1.4644457131624222,
"ce_loss_7": 3.782983124256134,
"epoch": 0.401,
"grad_norm": 21.3701964180473,
"kl_loss_13": 4117.6,
"kl_loss_26": 3090.4,
"kl_loss_39": 1932.4,
"kl_loss_7": 4754.8,
"learning_rate": 0.0006620337629897252,
"loss": 6698.2,
"step": 4010
},
{
"ce_loss_13": 3.3284165620803834,
"ce_loss_26": 2.8434600114822386,
"ce_loss_39": 2.311116448044777,
"ce_loss_52": 1.4240004986524581,
"ce_loss_7": 3.625540155172348,
"epoch": 0.402,
"grad_norm": 20.004792328254855,
"kl_loss_13": 3939.2,
"kl_loss_26": 2918.4,
"kl_loss_39": 1780.6,
"kl_loss_7": 4553.2,
"learning_rate": 0.0006605319126425454,
"loss": 6664.7,
"step": 4020
},
{
"ce_loss_13": 3.366453301906586,
"ce_loss_26": 2.86657951772213,
"ce_loss_39": 2.3278372526168822,
"ce_loss_52": 1.4299451738595963,
"ce_loss_7": 3.6654918253421784,
"epoch": 0.403,
"grad_norm": 20.588190398863947,
"kl_loss_13": 4014.8,
"kl_loss_26": 2970.0,
"kl_loss_39": 1823.4,
"kl_loss_7": 4628.8,
"learning_rate": 0.0006590284457407876,
"loss": 6644.4,
"step": 4030
},
{
"ce_loss_13": 3.3478448331356048,
"ce_loss_26": 2.870484399795532,
"ce_loss_39": 2.341677349805832,
"ce_loss_52": 1.4598551213741302,
"ce_loss_7": 3.644811862707138,
"epoch": 0.404,
"grad_norm": 20.8221732353937,
"kl_loss_13": 3908.8,
"kl_loss_26": 2901.6,
"kl_loss_39": 1782.0,
"kl_loss_7": 4526.4,
"learning_rate": 0.0006575233774243465,
"loss": 6645.55,
"step": 4040
},
{
"ce_loss_13": 3.28169704079628,
"ce_loss_26": 2.7861692667007447,
"ce_loss_39": 2.2428950667381287,
"ce_loss_52": 1.374313686788082,
"ce_loss_7": 3.5797726988792418,
"epoch": 0.405,
"grad_norm": 21.110763703238916,
"kl_loss_13": 3936.4,
"kl_loss_26": 2902.0,
"kl_loss_39": 1754.0,
"kl_loss_7": 4561.2,
"learning_rate": 0.0006560167228492435,
"loss": 6646.3,
"step": 4050
},
{
"ce_loss_13": 3.44179083108902,
"ce_loss_26": 2.9405432820320128,
"ce_loss_39": 2.3968080401420595,
"ce_loss_52": 1.4659699857234956,
"ce_loss_7": 3.736131912469864,
"epoch": 0.406,
"grad_norm": 20.50041516447212,
"kl_loss_13": 4061.2,
"kl_loss_26": 3015.6,
"kl_loss_39": 1863.6,
"kl_loss_7": 4672.4,
"learning_rate": 0.0006545084971874737,
"loss": 6655.9,
"step": 4060
},
{
"ce_loss_13": 3.360958731174469,
"ce_loss_26": 2.870532661676407,
"ce_loss_39": 2.321683007478714,
"ce_loss_52": 1.4091651737689972,
"ce_loss_7": 3.6588110864162444,
"epoch": 0.407,
"grad_norm": 20.463583112470857,
"kl_loss_13": 3997.2,
"kl_loss_26": 2979.2,
"kl_loss_39": 1826.4,
"kl_loss_7": 4618.8,
"learning_rate": 0.0006529987156268526,
"loss": 6617.7,
"step": 4070
},
{
"ce_loss_13": 3.2686664044857023,
"ce_loss_26": 2.7765056490898132,
"ce_loss_39": 2.238715943694115,
"ce_loss_52": 1.362110722064972,
"ce_loss_7": 3.557782357931137,
"epoch": 0.408,
"grad_norm": 21.232766427379584,
"kl_loss_13": 3924.8,
"kl_loss_26": 2908.8,
"kl_loss_39": 1777.2,
"kl_loss_7": 4538.0,
"learning_rate": 0.0006514873933708637,
"loss": 6653.7,
"step": 4080
},
{
"ce_loss_13": 3.275262689590454,
"ce_loss_26": 2.7900135934352877,
"ce_loss_39": 2.2641283214092254,
"ce_loss_52": 1.3853250756859778,
"ce_loss_7": 3.584109377861023,
"epoch": 0.409,
"grad_norm": 21.583109836521306,
"kl_loss_13": 3887.6,
"kl_loss_26": 2868.8,
"kl_loss_39": 1755.0,
"kl_loss_7": 4526.4,
"learning_rate": 0.0006499745456385053,
"loss": 6553.8,
"step": 4090
},
{
"ce_loss_13": 3.3286924988031386,
"ce_loss_26": 2.8467950344085695,
"ce_loss_39": 2.312630409002304,
"ce_loss_52": 1.424817180633545,
"ce_loss_7": 3.6197818219661713,
"epoch": 0.41,
"grad_norm": 20.90973793223202,
"kl_loss_13": 3933.2,
"kl_loss_26": 2921.2,
"kl_loss_39": 1789.5,
"kl_loss_7": 4539.2,
"learning_rate": 0.0006484601876641375,
"loss": 6620.35,
"step": 4100
},
{
"ce_loss_13": 3.414019340276718,
"ce_loss_26": 2.922485715150833,
"ce_loss_39": 2.3825585186481475,
"ce_loss_52": 1.4558052003383637,
"ce_loss_7": 3.709526652097702,
"epoch": 0.411,
"grad_norm": 21.026331487000416,
"kl_loss_13": 4013.6,
"kl_loss_26": 2986.4,
"kl_loss_39": 1846.6,
"kl_loss_7": 4632.0,
"learning_rate": 0.000646944334697328,
"loss": 6576.6,
"step": 4110
},
{
"ce_loss_13": 3.3258216440677644,
"ce_loss_26": 2.848787486553192,
"ce_loss_39": 2.3223425179719923,
"ce_loss_52": 1.4663115084171294,
"ce_loss_7": 3.604213911294937,
"epoch": 0.412,
"grad_norm": 20.97048408186008,
"kl_loss_13": 3820.4,
"kl_loss_26": 2832.4,
"kl_loss_39": 1718.8,
"kl_loss_7": 4410.0,
"learning_rate": 0.0006454270020026995,
"loss": 6611.1,
"step": 4120
},
{
"ce_loss_13": 3.3510149538517,
"ce_loss_26": 2.853396385908127,
"ce_loss_39": 2.3119456827640534,
"ce_loss_52": 1.430704912543297,
"ce_loss_7": 3.642722541093826,
"epoch": 0.413,
"grad_norm": 21.962054178499802,
"kl_loss_13": 3953.6,
"kl_loss_26": 2916.4,
"kl_loss_39": 1780.4,
"kl_loss_7": 4556.4,
"learning_rate": 0.0006439082048597755,
"loss": 6584.4,
"step": 4130
},
{
"ce_loss_13": 3.3082414746284483,
"ce_loss_26": 2.8318909227848055,
"ce_loss_39": 2.309774273633957,
"ce_loss_52": 1.4488343179225922,
"ce_loss_7": 3.6024456560611724,
"epoch": 0.414,
"grad_norm": 21.685181722314038,
"kl_loss_13": 3830.8,
"kl_loss_26": 2848.8,
"kl_loss_39": 1746.2,
"kl_loss_7": 4442.8,
"learning_rate": 0.0006423879585628261,
"loss": 6547.1,
"step": 4140
},
{
"ce_loss_13": 3.3523667633533476,
"ce_loss_26": 2.855451303720474,
"ce_loss_39": 2.3214808642864226,
"ce_loss_52": 1.434419831633568,
"ce_loss_7": 3.63877694606781,
"epoch": 0.415,
"grad_norm": 20.289217196894946,
"kl_loss_13": 3971.6,
"kl_loss_26": 2938.8,
"kl_loss_39": 1799.8,
"kl_loss_7": 4575.6,
"learning_rate": 0.0006408662784207149,
"loss": 6535.7,
"step": 4150
},
{
"ce_loss_13": 3.351851773262024,
"ce_loss_26": 2.864642024040222,
"ce_loss_39": 2.3303563445806503,
"ce_loss_52": 1.417774812877178,
"ce_loss_7": 3.6486425340175628,
"epoch": 0.416,
"grad_norm": 20.99970502270892,
"kl_loss_13": 4014.4,
"kl_loss_26": 2990.4,
"kl_loss_39": 1838.4,
"kl_loss_7": 4632.8,
"learning_rate": 0.0006393431797567439,
"loss": 6546.0,
"step": 4160
},
{
"ce_loss_13": 3.3471143901348115,
"ce_loss_26": 2.866150665283203,
"ce_loss_39": 2.3319087445735933,
"ce_loss_52": 1.4416520655155183,
"ce_loss_7": 3.6357293486595155,
"epoch": 0.417,
"grad_norm": 21.192503416792082,
"kl_loss_13": 3909.6,
"kl_loss_26": 2915.8,
"kl_loss_39": 1798.8,
"kl_loss_7": 4510.8,
"learning_rate": 0.0006378186779084996,
"loss": 6527.0,
"step": 4170
},
{
"ce_loss_13": 3.316433811187744,
"ce_loss_26": 2.8411558747291563,
"ce_loss_39": 2.3257210671901705,
"ce_loss_52": 1.4415073692798615,
"ce_loss_7": 3.604754400253296,
"epoch": 0.418,
"grad_norm": 20.807157262193087,
"kl_loss_13": 3869.2,
"kl_loss_26": 2884.4,
"kl_loss_39": 1787.2,
"kl_loss_7": 4470.0,
"learning_rate": 0.0006362927882276989,
"loss": 6561.5,
"step": 4180
},
{
"ce_loss_13": 3.339698684215546,
"ce_loss_26": 2.85506985783577,
"ce_loss_39": 2.311668387055397,
"ce_loss_52": 1.4213855370879174,
"ce_loss_7": 3.6274226009845734,
"epoch": 0.419,
"grad_norm": 22.02061084804507,
"kl_loss_13": 3937.6,
"kl_loss_26": 2928.0,
"kl_loss_39": 1790.0,
"kl_loss_7": 4535.2,
"learning_rate": 0.000634765526080034,
"loss": 6534.9,
"step": 4190
},
{
"ce_loss_13": 3.304267328977585,
"ce_loss_26": 2.8152280390262603,
"ce_loss_39": 2.277574297785759,
"ce_loss_52": 1.3985714688897133,
"ce_loss_7": 3.598735523223877,
"epoch": 0.42,
"grad_norm": 19.98512507622475,
"kl_loss_13": 3911.6,
"kl_loss_26": 2888.0,
"kl_loss_39": 1761.0,
"kl_loss_7": 4520.8,
"learning_rate": 0.0006332369068450174,
"loss": 6522.4,
"step": 4200
},
{
"ce_loss_13": 3.2766413748264314,
"ce_loss_26": 2.793798440694809,
"ce_loss_39": 2.2647649705410005,
"ce_loss_52": 1.404912966489792,
"ce_loss_7": 3.5614658653736115,
"epoch": 0.421,
"grad_norm": 21.98548722652707,
"kl_loss_13": 3862.8,
"kl_loss_26": 2858.0,
"kl_loss_39": 1744.0,
"kl_loss_7": 4457.6,
"learning_rate": 0.0006317069459158283,
"loss": 6461.5,
"step": 4210
},
{
"ce_loss_13": 3.303953742980957,
"ce_loss_26": 2.817542538046837,
"ce_loss_39": 2.2884632468223574,
"ce_loss_52": 1.414811021089554,
"ce_loss_7": 3.5892197132110595,
"epoch": 0.422,
"grad_norm": 21.22519434260025,
"kl_loss_13": 3878.0,
"kl_loss_26": 2873.4,
"kl_loss_39": 1750.6,
"kl_loss_7": 4478.4,
"learning_rate": 0.0006301756586991561,
"loss": 6510.1,
"step": 4220
},
{
"ce_loss_13": 3.3445753276348116,
"ce_loss_26": 2.8686348736286162,
"ce_loss_39": 2.3429405450820924,
"ce_loss_52": 1.4785997077822686,
"ce_loss_7": 3.6385815382003783,
"epoch": 0.423,
"grad_norm": 19.644468709446457,
"kl_loss_13": 3856.4,
"kl_loss_26": 2870.4,
"kl_loss_39": 1760.0,
"kl_loss_7": 4462.4,
"learning_rate": 0.0006286430606150459,
"loss": 6493.9,
"step": 4230
},
{
"ce_loss_13": 3.3064311265945436,
"ce_loss_26": 2.8393130600452423,
"ce_loss_39": 2.321084627509117,
"ce_loss_52": 1.4542193472385407,
"ce_loss_7": 3.591093236207962,
"epoch": 0.424,
"grad_norm": 19.629096721329073,
"kl_loss_13": 3821.6,
"kl_loss_26": 2834.0,
"kl_loss_39": 1732.2,
"kl_loss_7": 4414.0,
"learning_rate": 0.0006271091670967436,
"loss": 6458.7,
"step": 4240
},
{
"ce_loss_13": 3.335456043481827,
"ce_loss_26": 2.8595637679100037,
"ce_loss_39": 2.319040137529373,
"ce_loss_52": 1.4537455767393113,
"ce_loss_7": 3.6277152955532075,
"epoch": 0.425,
"grad_norm": 22.23231840847551,
"kl_loss_13": 3861.2,
"kl_loss_26": 2865.2,
"kl_loss_39": 1725.4,
"kl_loss_7": 4470.8,
"learning_rate": 0.0006255739935905395,
"loss": 6438.9,
"step": 4250
},
{
"ce_loss_13": 3.312756323814392,
"ce_loss_26": 2.828430265188217,
"ce_loss_39": 2.290999186038971,
"ce_loss_52": 1.4147280350327491,
"ce_loss_7": 3.606942754983902,
"epoch": 0.426,
"grad_norm": 22.096816976355754,
"kl_loss_13": 3909.6,
"kl_loss_26": 2901.6,
"kl_loss_39": 1772.8,
"kl_loss_7": 4524.4,
"learning_rate": 0.0006240375555556145,
"loss": 6443.7,
"step": 4260
},
{
"ce_loss_13": 3.2455935895442964,
"ce_loss_26": 2.764835333824158,
"ce_loss_39": 2.2400916039943697,
"ce_loss_52": 1.3983588561415672,
"ce_loss_7": 3.5336900293827056,
"epoch": 0.427,
"grad_norm": 21.016189257560278,
"kl_loss_13": 3811.6,
"kl_loss_26": 2807.4,
"kl_loss_39": 1694.2,
"kl_loss_7": 4414.0,
"learning_rate": 0.000622499868463882,
"loss": 6395.1,
"step": 4270
},
{
"ce_loss_13": 3.3086226165294645,
"ce_loss_26": 2.823494350910187,
"ce_loss_39": 2.301421931385994,
"ce_loss_52": 1.4462398916482926,
"ce_loss_7": 3.595276767015457,
"epoch": 0.428,
"grad_norm": 21.54081801528653,
"kl_loss_13": 3848.8,
"kl_loss_26": 2835.6,
"kl_loss_39": 1725.6,
"kl_loss_7": 4449.6,
"learning_rate": 0.0006209609477998338,
"loss": 6429.2,
"step": 4280
},
{
"ce_loss_13": 3.34853395819664,
"ce_loss_26": 2.8662350177764893,
"ce_loss_39": 2.340099334716797,
"ce_loss_52": 1.455578488111496,
"ce_loss_7": 3.6379907071590423,
"epoch": 0.429,
"grad_norm": 22.431607304032426,
"kl_loss_13": 3907.2,
"kl_loss_26": 2905.6,
"kl_loss_39": 1784.0,
"kl_loss_7": 4512.8,
"learning_rate": 0.0006194208090603844,
"loss": 6469.9,
"step": 4290
},
{
"ce_loss_13": 3.2378364205360413,
"ce_loss_26": 2.771626591682434,
"ce_loss_39": 2.2533502638339997,
"ce_loss_52": 1.4288378104567527,
"ce_loss_7": 3.523706406354904,
"epoch": 0.43,
"grad_norm": 19.478272699999504,
"kl_loss_13": 3771.6,
"kl_loss_26": 2794.0,
"kl_loss_39": 1687.8,
"kl_loss_7": 4368.8,
"learning_rate": 0.0006178794677547138,
"loss": 6399.1,
"step": 4300
},
{
"ce_loss_13": 3.3320172011852263,
"ce_loss_26": 2.8396951615810395,
"ce_loss_39": 2.305786609649658,
"ce_loss_52": 1.434322476387024,
"ce_loss_7": 3.622057580947876,
"epoch": 0.431,
"grad_norm": 21.288095261764262,
"kl_loss_13": 3913.6,
"kl_loss_26": 2898.0,
"kl_loss_39": 1761.0,
"kl_loss_7": 4522.8,
"learning_rate": 0.0006163369394041111,
"loss": 6430.5,
"step": 4310
},
{
"ce_loss_13": 3.2775667309761047,
"ce_loss_26": 2.79736185669899,
"ce_loss_39": 2.2730204701423644,
"ce_loss_52": 1.4288703322410583,
"ce_loss_7": 3.562648755311966,
"epoch": 0.432,
"grad_norm": 22.037654673864616,
"kl_loss_13": 3816.0,
"kl_loss_26": 2814.0,
"kl_loss_39": 1690.8,
"kl_loss_7": 4412.8,
"learning_rate": 0.0006147932395418205,
"loss": 6392.0,
"step": 4320
},
{
"ce_loss_13": 3.2999020636081697,
"ce_loss_26": 2.8127501010894775,
"ce_loss_39": 2.2844816505908967,
"ce_loss_52": 1.4203194737434388,
"ce_loss_7": 3.5926522493362425,
"epoch": 0.433,
"grad_norm": 23.105450049389578,
"kl_loss_13": 3851.2,
"kl_loss_26": 2850.0,
"kl_loss_39": 1733.2,
"kl_loss_7": 4468.8,
"learning_rate": 0.0006132483837128823,
"loss": 6416.4,
"step": 4330
},
{
"ce_loss_13": 3.314070051908493,
"ce_loss_26": 2.8264743953943254,
"ce_loss_39": 2.295166790485382,
"ce_loss_52": 1.4510357692837714,
"ce_loss_7": 3.5982041239738463,
"epoch": 0.434,
"grad_norm": 21.907818649309228,
"kl_loss_13": 3842.0,
"kl_loss_26": 2836.0,
"kl_loss_39": 1709.4,
"kl_loss_7": 4428.8,
"learning_rate": 0.0006117023874739772,
"loss": 6437.0,
"step": 4340
},
{
"ce_loss_13": 3.296399414539337,
"ce_loss_26": 2.8059658110141754,
"ce_loss_39": 2.271949994564056,
"ce_loss_52": 1.4154132261872292,
"ce_loss_7": 3.588996112346649,
"epoch": 0.435,
"grad_norm": 21.889212253545118,
"kl_loss_13": 3898.8,
"kl_loss_26": 2887.2,
"kl_loss_39": 1746.6,
"kl_loss_7": 4508.4,
"learning_rate": 0.0006101552663932703,
"loss": 6431.3,
"step": 4350
},
{
"ce_loss_13": 3.306167459487915,
"ce_loss_26": 2.8240922570228575,
"ce_loss_39": 2.2931392163038256,
"ce_loss_52": 1.4310514152050018,
"ce_loss_7": 3.596520256996155,
"epoch": 0.436,
"grad_norm": 21.05075754740656,
"kl_loss_13": 3860.0,
"kl_loss_26": 2856.4,
"kl_loss_39": 1737.6,
"kl_loss_7": 4459.2,
"learning_rate": 0.0006086070360502539,
"loss": 6370.7,
"step": 4360
},
{
"ce_loss_13": 3.2818971514701842,
"ce_loss_26": 2.8096925973892213,
"ce_loss_39": 2.2902305334806443,
"ce_loss_52": 1.457671320438385,
"ce_loss_7": 3.567191207408905,
"epoch": 0.437,
"grad_norm": 19.98373918443626,
"kl_loss_13": 3770.8,
"kl_loss_26": 2788.4,
"kl_loss_39": 1692.0,
"kl_loss_7": 4367.2,
"learning_rate": 0.0006070577120355903,
"loss": 6341.1,
"step": 4370
},
{
"ce_loss_13": 3.317246896028519,
"ce_loss_26": 2.84905891418457,
"ce_loss_39": 2.322802722454071,
"ce_loss_52": 1.4916655078530312,
"ce_loss_7": 3.593036550283432,
"epoch": 0.438,
"grad_norm": 20.010722762004242,
"kl_loss_13": 3781.2,
"kl_loss_26": 2799.2,
"kl_loss_39": 1686.2,
"kl_loss_7": 4360.0,
"learning_rate": 0.0006055073099509549,
"loss": 6355.5,
"step": 4380
},
{
"ce_loss_13": 3.2864172756671906,
"ce_loss_26": 2.8085566580295565,
"ce_loss_39": 2.2875818789005278,
"ce_loss_52": 1.4407859086990356,
"ce_loss_7": 3.5741762936115267,
"epoch": 0.439,
"grad_norm": 21.10222653748024,
"kl_loss_13": 3824.4,
"kl_loss_26": 2822.4,
"kl_loss_39": 1713.2,
"kl_loss_7": 4426.0,
"learning_rate": 0.0006039558454088796,
"loss": 6354.5,
"step": 4390
},
{
"ce_loss_13": 3.2985159277915956,
"ce_loss_26": 2.817130261659622,
"ce_loss_39": 2.2846481442451476,
"ce_loss_52": 1.4308176964521409,
"ce_loss_7": 3.5788950502872465,
"epoch": 0.44,
"grad_norm": 22.10167857860873,
"kl_loss_13": 3852.4,
"kl_loss_26": 2856.8,
"kl_loss_39": 1727.6,
"kl_loss_7": 4434.8,
"learning_rate": 0.0006024033340325954,
"loss": 6381.3,
"step": 4400
},
{
"ce_loss_13": 3.2772581815719604,
"ce_loss_26": 2.7952946066856383,
"ce_loss_39": 2.259748488664627,
"ce_loss_52": 1.4122898250818252,
"ce_loss_7": 3.564402920007706,
"epoch": 0.441,
"grad_norm": 21.766013784294948,
"kl_loss_13": 3854.4,
"kl_loss_26": 2847.6,
"kl_loss_39": 1727.0,
"kl_loss_7": 4457.2,
"learning_rate": 0.0006008497914558743,
"loss": 6338.3,
"step": 4410
},
{
"ce_loss_13": 3.310656875371933,
"ce_loss_26": 2.825407701730728,
"ce_loss_39": 2.3054873913526537,
"ce_loss_52": 1.4574851334095,
"ce_loss_7": 3.595499175786972,
"epoch": 0.442,
"grad_norm": 23.195817032303225,
"kl_loss_13": 3826.4,
"kl_loss_26": 2816.8,
"kl_loss_39": 1707.6,
"kl_loss_7": 4415.6,
"learning_rate": 0.0005992952333228728,
"loss": 6415.4,
"step": 4420
},
{
"ce_loss_13": 3.147319358587265,
"ce_loss_26": 2.6695513784885407,
"ce_loss_39": 2.153283026814461,
"ce_loss_52": 1.362196257710457,
"ce_loss_7": 3.4255874812602998,
"epoch": 0.443,
"grad_norm": 21.365484698145746,
"kl_loss_13": 3677.2,
"kl_loss_26": 2685.6,
"kl_loss_39": 1588.2,
"kl_loss_7": 4260.0,
"learning_rate": 0.0005977396752879741,
"loss": 6284.8,
"step": 4430
},
{
"ce_loss_13": 3.2730862379074095,
"ce_loss_26": 2.790391606092453,
"ce_loss_39": 2.257637658715248,
"ce_loss_52": 1.4242565602064132,
"ce_loss_7": 3.5511350512504576,
"epoch": 0.444,
"grad_norm": 20.84050157821156,
"kl_loss_13": 3810.8,
"kl_loss_26": 2811.6,
"kl_loss_39": 1686.6,
"kl_loss_7": 4399.2,
"learning_rate": 0.0005961831330156305,
"loss": 6282.4,
"step": 4440
},
{
"ce_loss_13": 3.303426647186279,
"ce_loss_26": 2.8194876074790955,
"ce_loss_39": 2.286717027425766,
"ce_loss_52": 1.4367665380239487,
"ce_loss_7": 3.589170789718628,
"epoch": 0.445,
"grad_norm": 22.09664086851361,
"kl_loss_13": 3832.4,
"kl_loss_26": 2828.0,
"kl_loss_39": 1702.2,
"kl_loss_7": 4435.6,
"learning_rate": 0.0005946256221802051,
"loss": 6310.7,
"step": 4450
},
{
"ce_loss_13": 3.2220256984233857,
"ce_loss_26": 2.747016179561615,
"ce_loss_39": 2.2203843981027602,
"ce_loss_52": 1.4144678741693497,
"ce_loss_7": 3.5019364655017853,
"epoch": 0.446,
"grad_norm": 20.80212123158213,
"kl_loss_13": 3738.0,
"kl_loss_26": 2737.6,
"kl_loss_39": 1635.2,
"kl_loss_7": 4324.0,
"learning_rate": 0.0005930671584658151,
"loss": 6275.9,
"step": 4460
},
{
"ce_loss_13": 3.262040966749191,
"ce_loss_26": 2.7848617672920226,
"ce_loss_39": 2.2554671108722686,
"ce_loss_52": 1.4137398272752761,
"ce_loss_7": 3.5524380266666413,
"epoch": 0.447,
"grad_norm": 21.070671360315192,
"kl_loss_13": 3794.4,
"kl_loss_26": 2805.6,
"kl_loss_39": 1697.0,
"kl_loss_7": 4396.4,
"learning_rate": 0.0005915077575661722,
"loss": 6360.7,
"step": 4470
},
{
"ce_loss_13": 3.2109140872955324,
"ce_loss_26": 2.7298059910535812,
"ce_loss_39": 2.2085951179265977,
"ce_loss_52": 1.3873766094446183,
"ce_loss_7": 3.4926227211952208,
"epoch": 0.448,
"grad_norm": 21.38920961497166,
"kl_loss_13": 3765.2,
"kl_loss_26": 2759.4,
"kl_loss_39": 1656.0,
"kl_loss_7": 4359.6,
"learning_rate": 0.000589947435184427,
"loss": 6255.15,
"step": 4480
},
{
"ce_loss_13": 3.2468604743480682,
"ce_loss_26": 2.7669826805591584,
"ce_loss_39": 2.2381924211978914,
"ce_loss_52": 1.4454800367355347,
"ce_loss_7": 3.5310903012752535,
"epoch": 0.449,
"grad_norm": 23.74435148547982,
"kl_loss_13": 3708.0,
"kl_loss_26": 2716.8,
"kl_loss_39": 1595.4,
"kl_loss_7": 4307.2,
"learning_rate": 0.0005883862070330078,
"loss": 6262.9,
"step": 4490
},
{
"ce_loss_13": 3.2490183234214784,
"ce_loss_26": 2.775378829240799,
"ce_loss_39": 2.259204548597336,
"ce_loss_52": 1.4259025424718856,
"ce_loss_7": 3.532491201162338,
"epoch": 0.45,
"grad_norm": 19.921742072679248,
"kl_loss_13": 3736.0,
"kl_loss_26": 2748.4,
"kl_loss_39": 1665.2,
"kl_loss_7": 4324.4,
"learning_rate": 0.0005868240888334653,
"loss": 6279.4,
"step": 4500
},
{
"ce_loss_13": 3.2022728264331817,
"ce_loss_26": 2.7251765221357345,
"ce_loss_39": 2.2201029896736144,
"ce_loss_52": 1.4243690267205238,
"ce_loss_7": 3.4856902956962585,
"epoch": 0.451,
"grad_norm": 22.336812688943994,
"kl_loss_13": 3701.6,
"kl_loss_26": 2716.6,
"kl_loss_39": 1627.0,
"kl_loss_7": 4291.6,
"learning_rate": 0.0005852610963163119,
"loss": 6274.9,
"step": 4510
},
{
"ce_loss_13": 3.2056246638298034,
"ce_loss_26": 2.735306566953659,
"ce_loss_39": 2.2278982251882553,
"ce_loss_52": 1.4315001338720321,
"ce_loss_7": 3.4921528518199922,
"epoch": 0.452,
"grad_norm": 21.324834799180188,
"kl_loss_13": 3671.2,
"kl_loss_26": 2692.4,
"kl_loss_39": 1610.6,
"kl_loss_7": 4263.2,
"learning_rate": 0.0005836972452208654,
"loss": 6241.8,
"step": 4520
},
{
"ce_loss_13": 3.2711530566215514,
"ce_loss_26": 2.794381695985794,
"ce_loss_39": 2.2675902634859084,
"ce_loss_52": 1.4365027844905853,
"ce_loss_7": 3.5576207876205443,
"epoch": 0.453,
"grad_norm": 21.88775011761487,
"kl_loss_13": 3792.8,
"kl_loss_26": 2804.8,
"kl_loss_39": 1700.8,
"kl_loss_7": 4388.0,
"learning_rate": 0.0005821325512950885,
"loss": 6283.8,
"step": 4530
},
{
"ce_loss_13": 3.2904800713062285,
"ce_loss_26": 2.8134379625320434,
"ce_loss_39": 2.2945436596870423,
"ce_loss_52": 1.476692470908165,
"ce_loss_7": 3.5703530073165894,
"epoch": 0.454,
"grad_norm": 20.942055908262446,
"kl_loss_13": 3751.6,
"kl_loss_26": 2758.0,
"kl_loss_39": 1655.0,
"kl_loss_7": 4336.0,
"learning_rate": 0.0005805670302954321,
"loss": 6268.6,
"step": 4540
},
{
"ce_loss_13": 3.2201909184455872,
"ce_loss_26": 2.737530159950256,
"ce_loss_39": 2.2144886016845704,
"ce_loss_52": 1.4106894597411155,
"ce_loss_7": 3.5021199345588685,
"epoch": 0.455,
"grad_norm": 21.81892564093558,
"kl_loss_13": 3758.8,
"kl_loss_26": 2750.8,
"kl_loss_39": 1624.2,
"kl_loss_7": 4358.4,
"learning_rate": 0.000579000697986675,
"loss": 6232.8,
"step": 4550
},
{
"ce_loss_13": 3.2489894032478333,
"ce_loss_26": 2.780492717027664,
"ce_loss_39": 2.2484638780355453,
"ce_loss_52": 1.4397764205932617,
"ce_loss_7": 3.5364687144756317,
"epoch": 0.456,
"grad_norm": 21.02937623207538,
"kl_loss_13": 3754.8,
"kl_loss_26": 2776.8,
"kl_loss_39": 1651.6,
"kl_loss_7": 4345.6,
"learning_rate": 0.0005774335701417662,
"loss": 6241.6,
"step": 4560
},
{
"ce_loss_13": 3.2101799607276917,
"ce_loss_26": 2.7431035935878754,
"ce_loss_39": 2.2130339086055755,
"ce_loss_52": 1.4163681983947753,
"ce_loss_7": 3.4954857528209686,
"epoch": 0.457,
"grad_norm": 19.83299768002262,
"kl_loss_13": 3705.2,
"kl_loss_26": 2723.2,
"kl_loss_39": 1618.4,
"kl_loss_7": 4301.2,
"learning_rate": 0.0005758656625416658,
"loss": 6247.2,
"step": 4570
},
{
"ce_loss_13": 3.2813266932964327,
"ce_loss_26": 2.793111354112625,
"ce_loss_39": 2.2625322908163072,
"ce_loss_52": 1.4499622374773025,
"ce_loss_7": 3.5644542396068575,
"epoch": 0.458,
"grad_norm": 21.53754747250698,
"kl_loss_13": 3797.6,
"kl_loss_26": 2782.8,
"kl_loss_39": 1647.4,
"kl_loss_7": 4391.2,
"learning_rate": 0.0005742969909751859,
"loss": 6266.1,
"step": 4580
},
{
"ce_loss_13": 3.3478006780147553,
"ce_loss_26": 2.8727428793907164,
"ce_loss_39": 2.3303479075431826,
"ce_loss_52": 1.4829013347625732,
"ce_loss_7": 3.6364180862903597,
"epoch": 0.459,
"grad_norm": 21.34326820051386,
"kl_loss_13": 3844.0,
"kl_loss_26": 2840.4,
"kl_loss_39": 1703.4,
"kl_loss_7": 4441.6,
"learning_rate": 0.0005727275712388318,
"loss": 6209.7,
"step": 4590
},
{
"ce_loss_13": 3.285200160741806,
"ce_loss_26": 2.807385641336441,
"ce_loss_39": 2.275821554660797,
"ce_loss_52": 1.4415770262479781,
"ce_loss_7": 3.563661777973175,
"epoch": 0.46,
"grad_norm": 20.952340509651894,
"kl_loss_13": 3807.6,
"kl_loss_26": 2814.0,
"kl_loss_39": 1683.4,
"kl_loss_7": 4390.0,
"learning_rate": 0.0005711574191366427,
"loss": 6174.2,
"step": 4600
},
{
"ce_loss_13": 3.2419202089309693,
"ce_loss_26": 2.7672870814800263,
"ce_loss_39": 2.2521429657936096,
"ce_loss_52": 1.4457757875323296,
"ce_loss_7": 3.5175400972366333,
"epoch": 0.461,
"grad_norm": 20.667083707625775,
"kl_loss_13": 3685.6,
"kl_loss_26": 2706.8,
"kl_loss_39": 1618.2,
"kl_loss_7": 4262.4,
"learning_rate": 0.0005695865504800327,
"loss": 6154.6,
"step": 4610
},
{
"ce_loss_13": 3.2078490257263184,
"ce_loss_26": 2.7373934209346773,
"ce_loss_39": 2.2357589691877364,
"ce_loss_52": 1.4443277925252915,
"ce_loss_7": 3.4876007556915285,
"epoch": 0.462,
"grad_norm": 21.07091453525893,
"kl_loss_13": 3645.6,
"kl_loss_26": 2666.8,
"kl_loss_39": 1598.2,
"kl_loss_7": 4226.0,
"learning_rate": 0.0005680149810876322,
"loss": 6178.4,
"step": 4620
},
{
"ce_loss_13": 3.2486107409000398,
"ce_loss_26": 2.7601176381111143,
"ce_loss_39": 2.227588337659836,
"ce_loss_52": 1.4004375696182252,
"ce_loss_7": 3.532775843143463,
"epoch": 0.463,
"grad_norm": 21.569532668695917,
"kl_loss_13": 3786.8,
"kl_loss_26": 2780.4,
"kl_loss_39": 1664.6,
"kl_loss_7": 4386.0,
"learning_rate": 0.0005664427267851271,
"loss": 6215.6,
"step": 4630
},
{
"ce_loss_13": 3.2331403851509095,
"ce_loss_26": 2.7579665184020996,
"ce_loss_39": 2.233389773964882,
"ce_loss_52": 1.4353074416518212,
"ce_loss_7": 3.5171454668045046,
"epoch": 0.464,
"grad_norm": 21.55234160622014,
"kl_loss_13": 3697.6,
"kl_loss_26": 2708.8,
"kl_loss_39": 1599.2,
"kl_loss_7": 4292.8,
"learning_rate": 0.0005648698034051009,
"loss": 6233.0,
"step": 4640
},
{
"ce_loss_13": 3.264119005203247,
"ce_loss_26": 2.784970927238464,
"ce_loss_39": 2.2625187635421753,
"ce_loss_52": 1.4551048219203948,
"ce_loss_7": 3.536862540245056,
"epoch": 0.465,
"grad_norm": 21.943732618524454,
"kl_loss_13": 3715.6,
"kl_loss_26": 2727.6,
"kl_loss_39": 1626.8,
"kl_loss_7": 4294.8,
"learning_rate": 0.0005632962267868747,
"loss": 6180.9,
"step": 4650
},
{
"ce_loss_13": 3.124424380064011,
"ce_loss_26": 2.670001748204231,
"ce_loss_39": 2.161810302734375,
"ce_loss_52": 1.3920277938246728,
"ce_loss_7": 3.4072051107883454,
"epoch": 0.466,
"grad_norm": 19.992372851535457,
"kl_loss_13": 3608.0,
"kl_loss_26": 2641.4,
"kl_loss_39": 1573.8,
"kl_loss_7": 4190.0,
"learning_rate": 0.0005617220127763474,
"loss": 6158.8,
"step": 4660
},
{
"ce_loss_13": 3.2301677465438843,
"ce_loss_26": 2.7488952726125717,
"ce_loss_39": 2.2431567162275314,
"ce_loss_52": 1.4423212110996246,
"ce_loss_7": 3.5069880545139314,
"epoch": 0.467,
"grad_norm": 20.78261479761198,
"kl_loss_13": 3680.4,
"kl_loss_26": 2686.0,
"kl_loss_39": 1603.8,
"kl_loss_7": 4262.0,
"learning_rate": 0.0005601471772258368,
"loss": 6129.5,
"step": 4670
},
{
"ce_loss_13": 3.2057377636432647,
"ce_loss_26": 2.7412378191947937,
"ce_loss_39": 2.2341296702623366,
"ce_loss_52": 1.4304189920425414,
"ce_loss_7": 3.4817180752754213,
"epoch": 0.468,
"grad_norm": 20.848135049030358,
"kl_loss_13": 3672.8,
"kl_loss_26": 2706.4,
"kl_loss_39": 1624.6,
"kl_loss_7": 4245.6,
"learning_rate": 0.0005585717359939192,
"loss": 6123.3,
"step": 4680
},
{
"ce_loss_13": 3.233567637205124,
"ce_loss_26": 2.768052551150322,
"ce_loss_39": 2.25777924656868,
"ce_loss_52": 1.4504828751087189,
"ce_loss_7": 3.5125366508960725,
"epoch": 0.469,
"grad_norm": 20.709517983153315,
"kl_loss_13": 3660.4,
"kl_loss_26": 2696.0,
"kl_loss_39": 1619.6,
"kl_loss_7": 4241.2,
"learning_rate": 0.0005569957049452703,
"loss": 6101.6,
"step": 4690
},
{
"ce_loss_13": 3.2725342512130737,
"ce_loss_26": 2.7866754591464997,
"ce_loss_39": 2.2480324536561964,
"ce_loss_52": 1.404937854409218,
"ce_loss_7": 3.5641084611415863,
"epoch": 0.47,
"grad_norm": 20.633433200619724,
"kl_loss_13": 3863.2,
"kl_loss_26": 2851.6,
"kl_loss_39": 1720.6,
"kl_loss_7": 4468.0,
"learning_rate": 0.0005554190999505056,
"loss": 6211.0,
"step": 4700
},
{
"ce_loss_13": 3.2052918612957,
"ce_loss_26": 2.732904624938965,
"ce_loss_39": 2.2116665810346605,
"ce_loss_52": 1.4257652133703231,
"ce_loss_7": 3.4880705952644346,
"epoch": 0.471,
"grad_norm": 20.706622626666558,
"kl_loss_13": 3660.0,
"kl_loss_26": 2670.8,
"kl_loss_39": 1571.8,
"kl_loss_7": 4252.4,
"learning_rate": 0.0005538419368860196,
"loss": 6097.3,
"step": 4710
},
{
"ce_loss_13": 3.201172482967377,
"ce_loss_26": 2.726384937763214,
"ce_loss_39": 2.198946151137352,
"ce_loss_52": 1.4113327443599701,
"ce_loss_7": 3.487533462047577,
"epoch": 0.472,
"grad_norm": 21.400203482886983,
"kl_loss_13": 3675.6,
"kl_loss_26": 2693.2,
"kl_loss_39": 1591.2,
"kl_loss_7": 4272.0,
"learning_rate": 0.0005522642316338268,
"loss": 6121.3,
"step": 4720
},
{
"ce_loss_13": 3.224886018037796,
"ce_loss_26": 2.7551407277584077,
"ce_loss_39": 2.2328554034233092,
"ce_loss_52": 1.4563202857971191,
"ce_loss_7": 3.5022718131542208,
"epoch": 0.473,
"grad_norm": 21.66295321363831,
"kl_loss_13": 3648.0,
"kl_loss_26": 2662.8,
"kl_loss_39": 1575.8,
"kl_loss_7": 4219.6,
"learning_rate": 0.0005506860000814017,
"loss": 6051.3,
"step": 4730
},
{
"ce_loss_13": 3.2025643050670625,
"ce_loss_26": 2.7286852061748506,
"ce_loss_39": 2.21729561984539,
"ce_loss_52": 1.453881350159645,
"ce_loss_7": 3.4832063794136046,
"epoch": 0.474,
"grad_norm": 20.603608171505254,
"kl_loss_13": 3623.6,
"kl_loss_26": 2642.4,
"kl_loss_39": 1549.2,
"kl_loss_7": 4212.8,
"learning_rate": 0.0005491072581215186,
"loss": 6098.5,
"step": 4740
},
{
"ce_loss_13": 3.212699604034424,
"ce_loss_26": 2.732209050655365,
"ce_loss_39": 2.2089115262031553,
"ce_loss_52": 1.4185278177261353,
"ce_loss_7": 3.4970239818096163,
"epoch": 0.475,
"grad_norm": 20.455172908061705,
"kl_loss_13": 3701.2,
"kl_loss_26": 2708.4,
"kl_loss_39": 1600.0,
"kl_loss_7": 4294.0,
"learning_rate": 0.0005475280216520913,
"loss": 6092.7,
"step": 4750
},
{
"ce_loss_13": 3.161313956975937,
"ce_loss_26": 2.6922851324081423,
"ce_loss_39": 2.185682702064514,
"ce_loss_52": 1.4132045745849608,
"ce_loss_7": 3.4361780524253844,
"epoch": 0.476,
"grad_norm": 21.048159476746886,
"kl_loss_13": 3632.8,
"kl_loss_26": 2655.2,
"kl_loss_39": 1574.6,
"kl_loss_7": 4208.4,
"learning_rate": 0.0005459483065760138,
"loss": 6159.3,
"step": 4760
},
{
"ce_loss_13": 3.2172334492206573,
"ce_loss_26": 2.739536887407303,
"ce_loss_39": 2.2192301630973814,
"ce_loss_52": 1.4282803654670715,
"ce_loss_7": 3.5064621806144713,
"epoch": 0.477,
"grad_norm": 20.62269454966768,
"kl_loss_13": 3708.4,
"kl_loss_26": 2710.0,
"kl_loss_39": 1610.2,
"kl_loss_7": 4301.6,
"learning_rate": 0.0005443681288009991,
"loss": 6104.1,
"step": 4770
},
{
"ce_loss_13": 3.217255789041519,
"ce_loss_26": 2.7369415044784544,
"ce_loss_39": 2.2001087069511414,
"ce_loss_52": 1.401385571062565,
"ce_loss_7": 3.5032376050949097,
"epoch": 0.478,
"grad_norm": 20.40853719962087,
"kl_loss_13": 3760.0,
"kl_loss_26": 2763.2,
"kl_loss_39": 1625.0,
"kl_loss_7": 4353.2,
"learning_rate": 0.0005427875042394199,
"loss": 6064.0,
"step": 4780
},
{
"ce_loss_13": 3.199622023105621,
"ce_loss_26": 2.734530872106552,
"ce_loss_39": 2.2225404649972917,
"ce_loss_52": 1.4552370458841324,
"ce_loss_7": 3.475612831115723,
"epoch": 0.479,
"grad_norm": 21.111949592982874,
"kl_loss_13": 3598.8,
"kl_loss_26": 2623.2,
"kl_loss_39": 1545.4,
"kl_loss_7": 4177.6,
"learning_rate": 0.0005412064488081482,
"loss": 6074.2,
"step": 4790
},
{
"ce_loss_13": 3.1485446810722353,
"ce_loss_26": 2.6845290422439576,
"ce_loss_39": 2.1669752955436707,
"ce_loss_52": 1.4116393029689789,
"ce_loss_7": 3.423712509870529,
"epoch": 0.48,
"grad_norm": 20.355440951189763,
"kl_loss_13": 3606.4,
"kl_loss_26": 2638.4,
"kl_loss_39": 1543.2,
"kl_loss_7": 4182.4,
"learning_rate": 0.0005396249784283942,
"loss": 6051.0,
"step": 4800
},
{
"ce_loss_13": 3.1923361301422117,
"ce_loss_26": 2.71637277007103,
"ce_loss_39": 2.1978514790534973,
"ce_loss_52": 1.4327621147036553,
"ce_loss_7": 3.47632372379303,
"epoch": 0.481,
"grad_norm": 22.229636039543518,
"kl_loss_13": 3644.0,
"kl_loss_26": 2653.2,
"kl_loss_39": 1555.8,
"kl_loss_7": 4232.8,
"learning_rate": 0.0005380431090255476,
"loss": 6143.3,
"step": 4810
},
{
"ce_loss_13": 3.232038801908493,
"ce_loss_26": 2.7648274183273314,
"ce_loss_39": 2.2573492497205736,
"ce_loss_52": 1.4371329843997955,
"ce_loss_7": 3.5092617154121397,
"epoch": 0.482,
"grad_norm": 21.36587062891148,
"kl_loss_13": 3704.8,
"kl_loss_26": 2740.8,
"kl_loss_39": 1651.2,
"kl_loss_7": 4281.6,
"learning_rate": 0.0005364608565290155,
"loss": 6031.2,
"step": 4820
},
{
"ce_loss_13": 3.250445681810379,
"ce_loss_26": 2.773394727706909,
"ce_loss_39": 2.243200385570526,
"ce_loss_52": 1.4556994497776032,
"ce_loss_7": 3.536140114068985,
"epoch": 0.483,
"grad_norm": 20.760727607225178,
"kl_loss_13": 3706.4,
"kl_loss_26": 2711.2,
"kl_loss_39": 1595.8,
"kl_loss_7": 4300.0,
"learning_rate": 0.0005348782368720626,
"loss": 6094.3,
"step": 4830
},
{
"ce_loss_13": 3.2234844088554384,
"ce_loss_26": 2.7573211640119553,
"ce_loss_39": 2.244653856754303,
"ce_loss_52": 1.4427421689033508,
"ce_loss_7": 3.497196841239929,
"epoch": 0.484,
"grad_norm": 20.726340018616938,
"kl_loss_13": 3685.6,
"kl_loss_26": 2708.0,
"kl_loss_39": 1610.0,
"kl_loss_7": 4262.4,
"learning_rate": 0.000533295265991652,
"loss": 6062.9,
"step": 4840
},
{
"ce_loss_13": 3.152006584405899,
"ce_loss_26": 2.678810328245163,
"ce_loss_39": 2.166279435157776,
"ce_loss_52": 1.3957384467124938,
"ce_loss_7": 3.4304138660430907,
"epoch": 0.485,
"grad_norm": 21.522248137711077,
"kl_loss_13": 3630.8,
"kl_loss_26": 2639.0,
"kl_loss_39": 1547.4,
"kl_loss_7": 4218.0,
"learning_rate": 0.0005317119598282822,
"loss": 6033.9,
"step": 4850
},
{
"ce_loss_13": 3.2258131086826323,
"ce_loss_26": 2.7538253903388976,
"ce_loss_39": 2.237151172757149,
"ce_loss_52": 1.4640001267194749,
"ce_loss_7": 3.493991768360138,
"epoch": 0.486,
"grad_norm": 19.71152116189248,
"kl_loss_13": 3656.8,
"kl_loss_26": 2681.2,
"kl_loss_39": 1581.0,
"kl_loss_7": 4221.6,
"learning_rate": 0.0005301283343258293,
"loss": 6062.4,
"step": 4860
},
{
"ce_loss_13": 3.188614493608475,
"ce_loss_26": 2.7119751185178758,
"ce_loss_39": 2.189143994450569,
"ce_loss_52": 1.422459150850773,
"ce_loss_7": 3.468545514345169,
"epoch": 0.487,
"grad_norm": 20.57479406007355,
"kl_loss_13": 3638.4,
"kl_loss_26": 2642.2,
"kl_loss_39": 1539.5,
"kl_loss_7": 4222.0,
"learning_rate": 0.000528544405431384,
"loss": 6047.1,
"step": 4870
},
{
"ce_loss_13": 3.1630406379699707,
"ce_loss_26": 2.6917948126792908,
"ce_loss_39": 2.187080183625221,
"ce_loss_52": 1.430996198952198,
"ce_loss_7": 3.437908464670181,
"epoch": 0.488,
"grad_norm": 20.266256252328688,
"kl_loss_13": 3589.2,
"kl_loss_26": 2606.0,
"kl_loss_39": 1535.4,
"kl_loss_7": 4160.0,
"learning_rate": 0.000526960189095093,
"loss": 6056.9,
"step": 4880
},
{
"ce_loss_13": 3.1363641381263734,
"ce_loss_26": 2.6843234658241273,
"ce_loss_39": 2.1881404638290407,
"ce_loss_52": 1.4287778049707414,
"ce_loss_7": 3.403191590309143,
"epoch": 0.489,
"grad_norm": 20.737227217707265,
"kl_loss_13": 3534.8,
"kl_loss_26": 2597.2,
"kl_loss_39": 1535.4,
"kl_loss_7": 4094.0,
"learning_rate": 0.0005253757012699972,
"loss": 6013.8,
"step": 4890
},
{
"ce_loss_13": 3.2066462457180025,
"ce_loss_26": 2.7318670630455015,
"ce_loss_39": 2.2165933042764663,
"ce_loss_52": 1.4400058209896087,
"ce_loss_7": 3.4839820206165313,
"epoch": 0.49,
"grad_norm": 20.942033187869956,
"kl_loss_13": 3651.6,
"kl_loss_26": 2660.8,
"kl_loss_39": 1565.0,
"kl_loss_7": 4233.6,
"learning_rate": 0.0005237909579118712,
"loss": 5973.0,
"step": 4900
},
{
"ce_loss_13": 3.2107683062553405,
"ce_loss_26": 2.723215198516846,
"ce_loss_39": 2.2134319245815277,
"ce_loss_52": 1.4413373351097107,
"ce_loss_7": 3.498019593954086,
"epoch": 0.491,
"grad_norm": 19.852386563944762,
"kl_loss_13": 3647.2,
"kl_loss_26": 2634.4,
"kl_loss_39": 1545.0,
"kl_loss_7": 4246.4,
"learning_rate": 0.0005222059749790631,
"loss": 5997.8,
"step": 4910
},
{
"ce_loss_13": 3.2151939988136293,
"ce_loss_26": 2.7481437802314757,
"ce_loss_39": 2.2312227368354796,
"ce_loss_52": 1.4501317411661148,
"ce_loss_7": 3.4992719650268556,
"epoch": 0.492,
"grad_norm": 21.940050434667352,
"kl_loss_13": 3627.6,
"kl_loss_26": 2655.2,
"kl_loss_39": 1571.4,
"kl_loss_7": 4213.6,
"learning_rate": 0.0005206207684323337,
"loss": 5989.6,
"step": 4920
},
{
"ce_loss_13": 3.14618239402771,
"ce_loss_26": 2.6644466161727904,
"ce_loss_39": 2.1522305369377137,
"ce_loss_52": 1.4040746569633484,
"ce_loss_7": 3.430801051855087,
"epoch": 0.493,
"grad_norm": 21.866819048126402,
"kl_loss_13": 3609.6,
"kl_loss_26": 2614.8,
"kl_loss_39": 1523.4,
"kl_loss_7": 4205.6,
"learning_rate": 0.000519035354234695,
"loss": 5971.3,
"step": 4930
},
{
"ce_loss_13": 3.2634301006793978,
"ce_loss_26": 2.7843244314193725,
"ce_loss_39": 2.264421299099922,
"ce_loss_52": 1.4652716666460037,
"ce_loss_7": 3.5409990727901457,
"epoch": 0.494,
"grad_norm": 22.02758833423459,
"kl_loss_13": 3711.6,
"kl_loss_26": 2710.6,
"kl_loss_39": 1607.3,
"kl_loss_7": 4290.4,
"learning_rate": 0.0005174497483512506,
"loss": 6017.3,
"step": 4940
},
{
"ce_loss_13": 3.2103551268577575,
"ce_loss_26": 2.7440166890621187,
"ce_loss_39": 2.2352791130542755,
"ce_loss_52": 1.4523784220218658,
"ce_loss_7": 3.486135560274124,
"epoch": 0.495,
"grad_norm": 23.749732875931574,
"kl_loss_13": 3618.0,
"kl_loss_26": 2656.6,
"kl_loss_39": 1573.0,
"kl_loss_7": 4201.2,
"learning_rate": 0.0005158639667490339,
"loss": 5989.9,
"step": 4950
},
{
"ce_loss_13": 3.118060350418091,
"ce_loss_26": 2.6498723566532134,
"ce_loss_39": 2.1354818284511565,
"ce_loss_52": 1.3788613289594651,
"ce_loss_7": 3.403479200601578,
"epoch": 0.496,
"grad_norm": 20.524190680845354,
"kl_loss_13": 3599.2,
"kl_loss_26": 2620.4,
"kl_loss_39": 1541.0,
"kl_loss_7": 4189.6,
"learning_rate": 0.0005142780253968481,
"loss": 5973.2,
"step": 4960
},
{
"ce_loss_13": 3.1573639094829558,
"ce_loss_26": 2.6952777743339538,
"ce_loss_39": 2.184722366929054,
"ce_loss_52": 1.4360924899578094,
"ce_loss_7": 3.4324650526046754,
"epoch": 0.497,
"grad_norm": 21.848899371522094,
"kl_loss_13": 3594.0,
"kl_loss_26": 2620.8,
"kl_loss_39": 1532.4,
"kl_loss_7": 4163.6,
"learning_rate": 0.0005126919402651053,
"loss": 5950.9,
"step": 4970
},
{
"ce_loss_13": 3.148969703912735,
"ce_loss_26": 2.6792631447315216,
"ce_loss_39": 2.168180876970291,
"ce_loss_52": 1.4135416984558105,
"ce_loss_7": 3.4292350709438324,
"epoch": 0.498,
"grad_norm": 21.082558392676134,
"kl_loss_13": 3576.4,
"kl_loss_26": 2608.8,
"kl_loss_39": 1527.0,
"kl_loss_7": 4166.4,
"learning_rate": 0.0005111057273256647,
"loss": 5924.7,
"step": 4980
},
{
"ce_loss_13": 3.1738288044929504,
"ce_loss_26": 2.7059105813503264,
"ce_loss_39": 2.194283801317215,
"ce_loss_52": 1.4388620942831039,
"ce_loss_7": 3.461875486373901,
"epoch": 0.499,
"grad_norm": 21.062049454907296,
"kl_loss_13": 3574.8,
"kl_loss_26": 2594.8,
"kl_loss_39": 1521.0,
"kl_loss_7": 4165.2,
"learning_rate": 0.0005095194025516733,
"loss": 5935.8,
"step": 4990
},
{
"ce_loss_13": 3.2131927073001862,
"ce_loss_26": 2.7427126079797746,
"ce_loss_39": 2.2358015894889833,
"ce_loss_52": 1.4697326198220253,
"ce_loss_7": 3.48975727558136,
"epoch": 0.5,
"grad_norm": 19.951336775236356,
"kl_loss_13": 3620.8,
"kl_loss_26": 2633.2,
"kl_loss_39": 1549.0,
"kl_loss_7": 4195.6,
"learning_rate": 0.000507932981917404,
"loss": 5955.8,
"step": 5000
},
{
"ce_loss_13": 3.064238077402115,
"ce_loss_26": 2.605297487974167,
"ce_loss_39": 2.106147512793541,
"ce_loss_52": 1.3699454009532928,
"ce_loss_7": 3.3354556441307066,
"epoch": 0.501,
"grad_norm": 22.434294806872032,
"kl_loss_13": 3513.6,
"kl_loss_26": 2556.0,
"kl_loss_39": 1493.0,
"kl_loss_7": 4079.2,
"learning_rate": 0.0005063464813980949,
"loss": 5921.7,
"step": 5010
},
{
"ce_loss_13": 3.120131802558899,
"ce_loss_26": 2.6457916140556335,
"ce_loss_39": 2.136381095647812,
"ce_loss_52": 1.3973354250192642,
"ce_loss_7": 3.3927165508270263,
"epoch": 0.502,
"grad_norm": 20.534145152408744,
"kl_loss_13": 3558.4,
"kl_loss_26": 2578.4,
"kl_loss_39": 1497.8,
"kl_loss_7": 4132.8,
"learning_rate": 0.0005047599169697884,
"loss": 5945.8,
"step": 5020
},
{
"ce_loss_13": 3.155858016014099,
"ce_loss_26": 2.683425110578537,
"ce_loss_39": 2.1650480359792708,
"ce_loss_52": 1.426735344529152,
"ce_loss_7": 3.4369399666786196,
"epoch": 0.503,
"grad_norm": 20.583275474881205,
"kl_loss_13": 3593.2,
"kl_loss_26": 2606.4,
"kl_loss_39": 1509.8,
"kl_loss_7": 4176.0,
"learning_rate": 0.000503173304609171,
"loss": 5949.4,
"step": 5030
},
{
"ce_loss_13": 3.224624240398407,
"ce_loss_26": 2.742176574468613,
"ce_loss_39": 2.2156084358692167,
"ce_loss_52": 1.4474807173013686,
"ce_loss_7": 3.508391612768173,
"epoch": 0.504,
"grad_norm": 20.860727938912092,
"kl_loss_13": 3678.4,
"kl_loss_26": 2680.8,
"kl_loss_39": 1568.2,
"kl_loss_7": 4265.2,
"learning_rate": 0.0005015866602934111,
"loss": 5957.4,
"step": 5040
},
{
"ce_loss_13": 3.1227844834327696,
"ce_loss_26": 2.661714029312134,
"ce_loss_39": 2.1653817743062973,
"ce_loss_52": 1.4367017298936844,
"ce_loss_7": 3.3902225315570833,
"epoch": 0.505,
"grad_norm": 19.797756106985307,
"kl_loss_13": 3490.8,
"kl_loss_26": 2530.0,
"kl_loss_39": 1472.6,
"kl_loss_7": 4055.6,
"learning_rate": 0.0005,
"loss": 5927.3,
"step": 5050
},
{
"ce_loss_13": 3.177449029684067,
"ce_loss_26": 2.716005155444145,
"ce_loss_39": 2.205168914794922,
"ce_loss_52": 1.442250807583332,
"ce_loss_7": 3.453594130277634,
"epoch": 0.506,
"grad_norm": 20.433104016560257,
"kl_loss_13": 3588.0,
"kl_loss_26": 2622.2,
"kl_loss_39": 1546.2,
"kl_loss_7": 4161.6,
"learning_rate": 0.0004984133397065889,
"loss": 5913.9,
"step": 5060
},
{
"ce_loss_13": 3.1423951983451843,
"ce_loss_26": 2.674048882722855,
"ce_loss_39": 2.1621575862169267,
"ce_loss_52": 1.4322402387857438,
"ce_loss_7": 3.416734743118286,
"epoch": 0.507,
"grad_norm": 20.444836977919294,
"kl_loss_13": 3545.2,
"kl_loss_26": 2564.8,
"kl_loss_39": 1486.0,
"kl_loss_7": 4119.6,
"learning_rate": 0.0004968266953908291,
"loss": 5880.6,
"step": 5070
},
{
"ce_loss_13": 3.070253336429596,
"ce_loss_26": 2.6069840848445893,
"ce_loss_39": 2.1048853427171705,
"ce_loss_52": 1.3886964708566665,
"ce_loss_7": 3.350748908519745,
"epoch": 0.508,
"grad_norm": 21.18309883625556,
"kl_loss_13": 3487.2,
"kl_loss_26": 2520.8,
"kl_loss_39": 1460.8,
"kl_loss_7": 4069.6,
"learning_rate": 0.0004952400830302117,
"loss": 5885.3,
"step": 5080
},
{
"ce_loss_13": 3.077636110782623,
"ce_loss_26": 2.6177482545375823,
"ce_loss_39": 2.1197337061166763,
"ce_loss_52": 1.3917517423629762,
"ce_loss_7": 3.3575133979320526,
"epoch": 0.509,
"grad_norm": 19.77626859818484,
"kl_loss_13": 3496.0,
"kl_loss_26": 2536.8,
"kl_loss_39": 1477.0,
"kl_loss_7": 4073.6,
"learning_rate": 0.0004936535186019053,
"loss": 5872.1,
"step": 5090
},
{
"ce_loss_13": 3.178175300359726,
"ce_loss_26": 2.701016789674759,
"ce_loss_39": 2.1893070548772813,
"ce_loss_52": 1.4174780696630478,
"ce_loss_7": 3.4626995623111725,
"epoch": 0.51,
"grad_norm": 19.62760678285289,
"kl_loss_13": 3640.4,
"kl_loss_26": 2642.0,
"kl_loss_39": 1551.2,
"kl_loss_7": 4232.4,
"learning_rate": 0.000492067018082596,
"loss": 5937.7,
"step": 5100
},
{
"ce_loss_13": 3.169191563129425,
"ce_loss_26": 2.708436530828476,
"ce_loss_39": 2.1886946499347686,
"ce_loss_52": 1.4288851469755173,
"ce_loss_7": 3.4514395534992217,
"epoch": 0.511,
"grad_norm": 20.552371425986603,
"kl_loss_13": 3588.0,
"kl_loss_26": 2634.8,
"kl_loss_39": 1546.0,
"kl_loss_7": 4178.4,
"learning_rate": 0.0004904805974483267,
"loss": 5867.4,
"step": 5110
},
{
"ce_loss_13": 3.152217388153076,
"ce_loss_26": 2.6922530949115755,
"ce_loss_39": 2.181437623500824,
"ce_loss_52": 1.4460914835333825,
"ce_loss_7": 3.429844158887863,
"epoch": 0.512,
"grad_norm": 20.36189036420726,
"kl_loss_13": 3504.0,
"kl_loss_26": 2543.2,
"kl_loss_39": 1481.0,
"kl_loss_7": 4075.2,
"learning_rate": 0.0004888942726743353,
"loss": 5848.7,
"step": 5120
},
{
"ce_loss_13": 3.1243839859962463,
"ce_loss_26": 2.6554999887943267,
"ce_loss_39": 2.1515370845794677,
"ce_loss_52": 1.4089675784111022,
"ce_loss_7": 3.4115704774856566,
"epoch": 0.513,
"grad_norm": 20.218379503515084,
"kl_loss_13": 3534.0,
"kl_loss_26": 2557.4,
"kl_loss_39": 1487.0,
"kl_loss_7": 4121.6,
"learning_rate": 0.0004873080597348947,
"loss": 5856.5,
"step": 5130
},
{
"ce_loss_13": 3.223481798171997,
"ce_loss_26": 2.754591333866119,
"ce_loss_39": 2.236158034205437,
"ce_loss_52": 1.4544740557670592,
"ce_loss_7": 3.5026029109954835,
"epoch": 0.514,
"grad_norm": 20.810099829480396,
"kl_loss_13": 3656.8,
"kl_loss_26": 2670.8,
"kl_loss_39": 1575.8,
"kl_loss_7": 4237.2,
"learning_rate": 0.0004857219746031519,
"loss": 5882.8,
"step": 5140
},
{
"ce_loss_13": 3.146134835481644,
"ce_loss_26": 2.680527698993683,
"ce_loss_39": 2.168422257900238,
"ce_loss_52": 1.4367385059595108,
"ce_loss_7": 3.423633599281311,
"epoch": 0.515,
"grad_norm": 20.424421047481275,
"kl_loss_13": 3516.8,
"kl_loss_26": 2539.6,
"kl_loss_39": 1472.0,
"kl_loss_7": 4094.8,
"learning_rate": 0.0004841360332509663,
"loss": 5895.45,
"step": 5150
},
{
"ce_loss_13": 3.1724873065948485,
"ce_loss_26": 2.7010417520999908,
"ce_loss_39": 2.188371130824089,
"ce_loss_52": 1.4464277178049088,
"ce_loss_7": 3.4471147775650026,
"epoch": 0.516,
"grad_norm": 20.221560032680266,
"kl_loss_13": 3555.2,
"kl_loss_26": 2575.2,
"kl_loss_39": 1495.4,
"kl_loss_7": 4124.8,
"learning_rate": 0.0004825502516487497,
"loss": 5883.8,
"step": 5160
},
{
"ce_loss_13": 3.1568028390407563,
"ce_loss_26": 2.695826065540314,
"ce_loss_39": 2.1907377928495406,
"ce_loss_52": 1.467595374584198,
"ce_loss_7": 3.4282791554927825,
"epoch": 0.517,
"grad_norm": 21.375776905692355,
"kl_loss_13": 3490.8,
"kl_loss_26": 2528.8,
"kl_loss_39": 1463.6,
"kl_loss_7": 4049.2,
"learning_rate": 0.00048096464576530507,
"loss": 5813.7,
"step": 5170
},
{
"ce_loss_13": 3.0959118604660034,
"ce_loss_26": 2.6389028072357177,
"ce_loss_39": 2.1422775775194167,
"ce_loss_52": 1.4158973768353462,
"ce_loss_7": 3.3716647744178774,
"epoch": 0.518,
"grad_norm": 20.796367284367612,
"kl_loss_13": 3490.0,
"kl_loss_26": 2527.0,
"kl_loss_39": 1467.8,
"kl_loss_7": 4068.4,
"learning_rate": 0.00047937923156766646,
"loss": 5832.2,
"step": 5180
},
{
"ce_loss_13": 3.181716579198837,
"ce_loss_26": 2.7091287076473236,
"ce_loss_39": 2.205903950333595,
"ce_loss_52": 1.4591009467840195,
"ce_loss_7": 3.4553593516349794,
"epoch": 0.519,
"grad_norm": 21.655670358447043,
"kl_loss_13": 3570.4,
"kl_loss_26": 2594.8,
"kl_loss_39": 1522.8,
"kl_loss_7": 4142.4,
"learning_rate": 0.00047779402502093696,
"loss": 5844.1,
"step": 5190
},
{
"ce_loss_13": 3.114813321828842,
"ce_loss_26": 2.6511843532323836,
"ce_loss_39": 2.1350393682718276,
"ce_loss_52": 1.4030846193432809,
"ce_loss_7": 3.3881891489028932,
"epoch": 0.52,
"grad_norm": 21.261317204954832,
"kl_loss_13": 3537.2,
"kl_loss_26": 2569.4,
"kl_loss_39": 1490.2,
"kl_loss_7": 4112.8,
"learning_rate": 0.0004762090420881289,
"loss": 5904.5,
"step": 5200
},
{
"ce_loss_13": 3.178787976503372,
"ce_loss_26": 2.716679725050926,
"ce_loss_39": 2.214344197511673,
"ce_loss_52": 1.4766788110136986,
"ce_loss_7": 3.445727747678757,
"epoch": 0.521,
"grad_norm": 21.5237930218517,
"kl_loss_13": 3533.2,
"kl_loss_26": 2579.6,
"kl_loss_39": 1511.4,
"kl_loss_7": 4104.0,
"learning_rate": 0.00047462429873000296,
"loss": 5807.7,
"step": 5210
},
{
"ce_loss_13": 3.1838007628917695,
"ce_loss_26": 2.7048393905162813,
"ce_loss_39": 2.19879055917263,
"ce_loss_52": 1.4492772698402405,
"ce_loss_7": 3.460488295555115,
"epoch": 0.522,
"grad_norm": 23.509628530732027,
"kl_loss_13": 3558.8,
"kl_loss_26": 2571.2,
"kl_loss_39": 1503.6,
"kl_loss_7": 4141.6,
"learning_rate": 0.0004730398109049071,
"loss": 5850.6,
"step": 5220
},
{
"ce_loss_13": 3.1778542578220366,
"ce_loss_26": 2.723250871896744,
"ce_loss_39": 2.2311396062374116,
"ce_loss_52": 1.4870843648910523,
"ce_loss_7": 3.4500561714172364,
"epoch": 0.523,
"grad_norm": 20.540010606623795,
"kl_loss_13": 3511.6,
"kl_loss_26": 2561.4,
"kl_loss_39": 1495.8,
"kl_loss_7": 4082.4,
"learning_rate": 0.000471455594568616,
"loss": 5864.9,
"step": 5230
},
{
"ce_loss_13": 3.184338331222534,
"ce_loss_26": 2.719081574678421,
"ce_loss_39": 2.1989813148975372,
"ce_loss_52": 1.4533298462629318,
"ce_loss_7": 3.465100187063217,
"epoch": 0.524,
"grad_norm": 19.66913851254436,
"kl_loss_13": 3588.4,
"kl_loss_26": 2616.0,
"kl_loss_39": 1518.0,
"kl_loss_7": 4170.0,
"learning_rate": 0.00046987166567417086,
"loss": 5881.2,
"step": 5240
},
{
"ce_loss_13": 3.097227877378464,
"ce_loss_26": 2.6349131643772123,
"ce_loss_39": 2.1339926183223725,
"ce_loss_52": 1.3901747956871986,
"ce_loss_7": 3.3714997708797454,
"epoch": 0.525,
"grad_norm": 21.094189724694242,
"kl_loss_13": 3503.6,
"kl_loss_26": 2546.6,
"kl_loss_39": 1491.4,
"kl_loss_7": 4079.2,
"learning_rate": 0.00046828804017171776,
"loss": 5869.8,
"step": 5250
},
{
"ce_loss_13": 3.1320539236068727,
"ce_loss_26": 2.667567166686058,
"ce_loss_39": 2.17643720805645,
"ce_loss_52": 1.450567391514778,
"ce_loss_7": 3.4080025017261506,
"epoch": 0.526,
"grad_norm": 20.61619717149242,
"kl_loss_13": 3507.6,
"kl_loss_26": 2536.4,
"kl_loss_39": 1483.6,
"kl_loss_7": 4082.4,
"learning_rate": 0.00046670473400834805,
"loss": 5811.0,
"step": 5260
},
{
"ce_loss_13": 3.123244607448578,
"ce_loss_26": 2.6622200667858125,
"ce_loss_39": 2.1446547359228134,
"ce_loss_52": 1.4058131739497184,
"ce_loss_7": 3.393282580375671,
"epoch": 0.527,
"grad_norm": 20.13067058218258,
"kl_loss_13": 3536.4,
"kl_loss_26": 2574.0,
"kl_loss_39": 1499.8,
"kl_loss_7": 4103.6,
"learning_rate": 0.00046512176312793734,
"loss": 5799.9,
"step": 5270
},
{
"ce_loss_13": 3.0799066185951234,
"ce_loss_26": 2.620718148350716,
"ce_loss_39": 2.1284131199121474,
"ce_loss_52": 1.4004437893629074,
"ce_loss_7": 3.357691395282745,
"epoch": 0.528,
"grad_norm": 20.729785702601426,
"kl_loss_13": 3458.8,
"kl_loss_26": 2500.6,
"kl_loss_39": 1456.6,
"kl_loss_7": 4039.2,
"learning_rate": 0.00046353914347098467,
"loss": 5784.9,
"step": 5280
},
{
"ce_loss_13": 3.100508135557175,
"ce_loss_26": 2.645443448424339,
"ce_loss_39": 2.140786075592041,
"ce_loss_52": 1.4068747192621232,
"ce_loss_7": 3.3814845025539397,
"epoch": 0.529,
"grad_norm": 20.87154357218591,
"kl_loss_13": 3506.0,
"kl_loss_26": 2553.6,
"kl_loss_39": 1489.5,
"kl_loss_7": 4091.2,
"learning_rate": 0.0004619568909744524,
"loss": 5772.2,
"step": 5290
},
{
"ce_loss_13": 3.127873086929321,
"ce_loss_26": 2.6634323090314864,
"ce_loss_39": 2.157263731956482,
"ce_loss_52": 1.4283392548561096,
"ce_loss_7": 3.3975966036319734,
"epoch": 0.53,
"grad_norm": 20.20322543048614,
"kl_loss_13": 3490.8,
"kl_loss_26": 2524.4,
"kl_loss_39": 1474.6,
"kl_loss_7": 4055.2,
"learning_rate": 0.00046037502157160573,
"loss": 5795.9,
"step": 5300
},
{
"ce_loss_13": 3.076627087593079,
"ce_loss_26": 2.616869166493416,
"ce_loss_39": 2.117774197459221,
"ce_loss_52": 1.4120985105633737,
"ce_loss_7": 3.3510021567344666,
"epoch": 0.531,
"grad_norm": 20.811499078870163,
"kl_loss_13": 3444.0,
"kl_loss_26": 2482.4,
"kl_loss_39": 1422.8,
"kl_loss_7": 4016.0,
"learning_rate": 0.00045879355119185207,
"loss": 5749.7,
"step": 5310
},
{
"ce_loss_13": 3.1226239800453186,
"ce_loss_26": 2.662443572282791,
"ce_loss_39": 2.156531369686127,
"ce_loss_52": 1.4430431425571442,
"ce_loss_7": 3.396123135089874,
"epoch": 0.532,
"grad_norm": 18.59996382548897,
"kl_loss_13": 3496.0,
"kl_loss_26": 2528.8,
"kl_loss_39": 1445.2,
"kl_loss_7": 4069.2,
"learning_rate": 0.0004572124957605803,
"loss": 5776.5,
"step": 5320
},
{
"ce_loss_13": 3.1100788176059724,
"ce_loss_26": 2.647669917345047,
"ce_loss_39": 2.140716627240181,
"ce_loss_52": 1.4189704924821853,
"ce_loss_7": 3.387048715353012,
"epoch": 0.533,
"grad_norm": 20.905976368739214,
"kl_loss_13": 3497.6,
"kl_loss_26": 2533.0,
"kl_loss_39": 1466.2,
"kl_loss_7": 4079.6,
"learning_rate": 0.00045563187119900103,
"loss": 5752.6,
"step": 5330
},
{
"ce_loss_13": 3.06461244225502,
"ce_loss_26": 2.6009975552558897,
"ce_loss_39": 2.1015468716621397,
"ce_loss_52": 1.3909726276993752,
"ce_loss_7": 3.3485201001167297,
"epoch": 0.534,
"grad_norm": 19.831333266576646,
"kl_loss_13": 3462.4,
"kl_loss_26": 2487.0,
"kl_loss_39": 1427.1,
"kl_loss_7": 4053.2,
"learning_rate": 0.00045405169342398633,
"loss": 5804.75,
"step": 5340
},
{
"ce_loss_13": 3.1465537667274477,
"ce_loss_26": 2.6862030625343323,
"ce_loss_39": 2.1878136694431305,
"ce_loss_52": 1.4572405338287353,
"ce_loss_7": 3.4159956216812133,
"epoch": 0.535,
"grad_norm": 20.130648070413336,
"kl_loss_13": 3486.8,
"kl_loss_26": 2529.2,
"kl_loss_39": 1463.2,
"kl_loss_7": 4050.4,
"learning_rate": 0.0004524719783479088,
"loss": 5785.8,
"step": 5350
},
{
"ce_loss_13": 3.1608322679996492,
"ce_loss_26": 2.6899396955966948,
"ce_loss_39": 2.192571198940277,
"ce_loss_52": 1.466444182395935,
"ce_loss_7": 3.4340961396694185,
"epoch": 0.536,
"grad_norm": 21.175578440541553,
"kl_loss_13": 3511.2,
"kl_loss_26": 2537.2,
"kl_loss_39": 1482.0,
"kl_loss_7": 4079.6,
"learning_rate": 0.00045089274187848144,
"loss": 5831.5,
"step": 5360
},
{
"ce_loss_13": 3.122362142801285,
"ce_loss_26": 2.6539461642503737,
"ce_loss_39": 2.1548154592514037,
"ce_loss_52": 1.4296748742461205,
"ce_loss_7": 3.403215527534485,
"epoch": 0.537,
"grad_norm": 21.426385415702033,
"kl_loss_13": 3491.6,
"kl_loss_26": 2524.6,
"kl_loss_39": 1464.2,
"kl_loss_7": 4068.8,
"learning_rate": 0.00044931399991859835,
"loss": 5775.7,
"step": 5370
},
{
"ce_loss_13": 3.0924407064914705,
"ce_loss_26": 2.6285818815231323,
"ce_loss_39": 2.12394041121006,
"ce_loss_52": 1.4268731981515885,
"ce_loss_7": 3.365303188562393,
"epoch": 0.538,
"grad_norm": 21.06287805296543,
"kl_loss_13": 3469.6,
"kl_loss_26": 2495.8,
"kl_loss_39": 1421.6,
"kl_loss_7": 4039.2,
"learning_rate": 0.00044773576836617336,
"loss": 5748.1,
"step": 5380
},
{
"ce_loss_13": 3.0936496675014498,
"ce_loss_26": 2.6234502464532854,
"ce_loss_39": 2.112909361720085,
"ce_loss_52": 1.3892342567443847,
"ce_loss_7": 3.3770604133605957,
"epoch": 0.539,
"grad_norm": 20.257845085428166,
"kl_loss_13": 3512.0,
"kl_loss_26": 2533.6,
"kl_loss_39": 1454.2,
"kl_loss_7": 4101.2,
"learning_rate": 0.00044615806311398056,
"loss": 5750.7,
"step": 5390
},
{
"ce_loss_13": 3.1293481528759,
"ce_loss_26": 2.661091110110283,
"ce_loss_39": 2.1563747018575667,
"ce_loss_52": 1.4326880395412445,
"ce_loss_7": 3.4015913248062133,
"epoch": 0.54,
"grad_norm": 19.884995716311128,
"kl_loss_13": 3510.0,
"kl_loss_26": 2528.6,
"kl_loss_39": 1461.8,
"kl_loss_7": 4087.2,
"learning_rate": 0.00044458090004949454,
"loss": 5775.2,
"step": 5400
},
{
"ce_loss_13": 3.1498380303382874,
"ce_loss_26": 2.6860203623771666,
"ce_loss_39": 2.184649482369423,
"ce_loss_52": 1.4619063645601273,
"ce_loss_7": 3.425897455215454,
"epoch": 0.541,
"grad_norm": 21.787073133904556,
"kl_loss_13": 3476.0,
"kl_loss_26": 2516.4,
"kl_loss_39": 1456.6,
"kl_loss_7": 4048.0,
"learning_rate": 0.0004430042950547297,
"loss": 5755.9,
"step": 5410
},
{
"ce_loss_13": 3.1863462030887604,
"ce_loss_26": 2.726431465148926,
"ce_loss_39": 2.2207835763692856,
"ce_loss_52": 1.5021536648273468,
"ce_loss_7": 3.454101949930191,
"epoch": 0.542,
"grad_norm": 20.7568922984159,
"kl_loss_13": 3510.0,
"kl_loss_26": 2532.8,
"kl_loss_39": 1455.8,
"kl_loss_7": 4068.8,
"learning_rate": 0.0004414282640060809,
"loss": 5749.1,
"step": 5420
},
{
"ce_loss_13": 3.1338580727577208,
"ce_loss_26": 2.6763171195983886,
"ce_loss_39": 2.181287834048271,
"ce_loss_52": 1.4664452508091927,
"ce_loss_7": 3.4055596351623536,
"epoch": 0.543,
"grad_norm": 19.620114685157933,
"kl_loss_13": 3421.6,
"kl_loss_26": 2476.8,
"kl_loss_39": 1437.8,
"kl_loss_7": 3988.8,
"learning_rate": 0.0004398528227741633,
"loss": 5704.8,
"step": 5430
},
{
"ce_loss_13": 3.104156017303467,
"ce_loss_26": 2.639963132143021,
"ce_loss_39": 2.127584692835808,
"ce_loss_52": 1.4150341883301736,
"ce_loss_7": 3.390649217367172,
"epoch": 0.544,
"grad_norm": 20.487750099372004,
"kl_loss_13": 3479.6,
"kl_loss_26": 2511.2,
"kl_loss_39": 1437.2,
"kl_loss_7": 4085.6,
"learning_rate": 0.00043827798722365264,
"loss": 5688.3,
"step": 5440
},
{
"ce_loss_13": 3.0290561497211455,
"ce_loss_26": 2.5664966076612474,
"ce_loss_39": 2.065689593553543,
"ce_loss_52": 1.3703163504600524,
"ce_loss_7": 3.3105548918247223,
"epoch": 0.545,
"grad_norm": 20.36704365885761,
"kl_loss_13": 3438.0,
"kl_loss_26": 2471.0,
"kl_loss_39": 1405.0,
"kl_loss_7": 4010.4,
"learning_rate": 0.00043670377321312535,
"loss": 5715.9,
"step": 5450
},
{
"ce_loss_13": 3.1363044023513793,
"ce_loss_26": 2.674453055858612,
"ce_loss_39": 2.169647827744484,
"ce_loss_52": 1.440093258023262,
"ce_loss_7": 3.409278839826584,
"epoch": 0.546,
"grad_norm": 20.32480480357714,
"kl_loss_13": 3496.8,
"kl_loss_26": 2524.8,
"kl_loss_39": 1466.2,
"kl_loss_7": 4078.0,
"learning_rate": 0.0004351301965948991,
"loss": 5722.2,
"step": 5460
},
{
"ce_loss_13": 3.152593141794205,
"ce_loss_26": 2.6999820828437806,
"ce_loss_39": 2.1983327239751818,
"ce_loss_52": 1.4628954619169234,
"ce_loss_7": 3.4299716293811797,
"epoch": 0.547,
"grad_norm": 19.856114793460193,
"kl_loss_13": 3499.2,
"kl_loss_26": 2548.2,
"kl_loss_39": 1495.2,
"kl_loss_7": 4066.8,
"learning_rate": 0.000433557273214873,
"loss": 5741.7,
"step": 5470
},
{
"ce_loss_13": 3.064018839597702,
"ce_loss_26": 2.6090955317020414,
"ce_loss_39": 2.1058148056268693,
"ce_loss_52": 1.4195792496204376,
"ce_loss_7": 3.344401216506958,
"epoch": 0.548,
"grad_norm": 20.616048950091404,
"kl_loss_13": 3411.2,
"kl_loss_26": 2459.2,
"kl_loss_39": 1389.6,
"kl_loss_7": 3992.4,
"learning_rate": 0.000431985018912368,
"loss": 5718.6,
"step": 5480
},
{
"ce_loss_13": 3.0607047379016876,
"ce_loss_26": 2.597879120707512,
"ce_loss_39": 2.108409595489502,
"ce_loss_52": 1.4123182266950607,
"ce_loss_7": 3.336356836557388,
"epoch": 0.549,
"grad_norm": 20.036021472042393,
"kl_loss_13": 3404.4,
"kl_loss_26": 2447.2,
"kl_loss_39": 1413.8,
"kl_loss_7": 3983.2,
"learning_rate": 0.0004304134495199674,
"loss": 5692.5,
"step": 5490
},
{
"ce_loss_13": 3.052510768175125,
"ce_loss_26": 2.6108580827713013,
"ce_loss_39": 2.1228879362344744,
"ce_loss_52": 1.4244433492422104,
"ce_loss_7": 3.3247893154621124,
"epoch": 0.55,
"grad_norm": 20.158536855725483,
"kl_loss_13": 3391.6,
"kl_loss_26": 2469.0,
"kl_loss_39": 1430.9,
"kl_loss_7": 3959.6,
"learning_rate": 0.0004288425808633575,
"loss": 5660.8,
"step": 5500
},
{
"ce_loss_13": 3.164420074224472,
"ce_loss_26": 2.6986677646636963,
"ce_loss_39": 2.192535865306854,
"ce_loss_52": 1.4702347338199615,
"ce_loss_7": 3.4305537164211275,
"epoch": 0.551,
"grad_norm": 21.34549178191646,
"kl_loss_13": 3508.0,
"kl_loss_26": 2541.6,
"kl_loss_39": 1463.2,
"kl_loss_7": 4075.6,
"learning_rate": 0.0004272724287611684,
"loss": 5697.3,
"step": 5510
},
{
"ce_loss_13": 3.1066312968730925,
"ce_loss_26": 2.6526737749576568,
"ce_loss_39": 2.1508010149002077,
"ce_loss_52": 1.4371357694268228,
"ce_loss_7": 3.381526863574982,
"epoch": 0.552,
"grad_norm": 20.017257048745616,
"kl_loss_13": 3449.6,
"kl_loss_26": 2497.2,
"kl_loss_39": 1440.4,
"kl_loss_7": 4027.2,
"learning_rate": 0.00042570300902481425,
"loss": 5661.1,
"step": 5520
},
{
"ce_loss_13": 3.0768387794494627,
"ce_loss_26": 2.6105665415525436,
"ce_loss_39": 2.1041111290454864,
"ce_loss_52": 1.3769602328538895,
"ce_loss_7": 3.3504232287406923,
"epoch": 0.553,
"grad_norm": 20.71287480099847,
"kl_loss_13": 3497.6,
"kl_loss_26": 2529.0,
"kl_loss_39": 1464.6,
"kl_loss_7": 4067.6,
"learning_rate": 0.00042413433745833423,
"loss": 5675.2,
"step": 5530
},
{
"ce_loss_13": 3.078362447023392,
"ce_loss_26": 2.6186123132705688,
"ce_loss_39": 2.1204461604356766,
"ce_loss_52": 1.42312493622303,
"ce_loss_7": 3.3565984547138212,
"epoch": 0.554,
"grad_norm": 20.243506516231662,
"kl_loss_13": 3424.0,
"kl_loss_26": 2463.4,
"kl_loss_39": 1411.4,
"kl_loss_7": 4003.6,
"learning_rate": 0.0004225664298582339,
"loss": 5650.4,
"step": 5540
},
{
"ce_loss_13": 3.104430967569351,
"ce_loss_26": 2.6466626435518266,
"ce_loss_39": 2.149568349123001,
"ce_loss_52": 1.4317003712058067,
"ce_loss_7": 3.380819743871689,
"epoch": 0.555,
"grad_norm": 20.086456724534386,
"kl_loss_13": 3478.8,
"kl_loss_26": 2528.0,
"kl_loss_39": 1462.6,
"kl_loss_7": 4051.6,
"learning_rate": 0.000420999302013325,
"loss": 5686.3,
"step": 5550
},
{
"ce_loss_13": 3.09715017080307,
"ce_loss_26": 2.6249477684497835,
"ce_loss_39": 2.112127733230591,
"ce_loss_52": 1.3998657062649726,
"ce_loss_7": 3.3848826706409456,
"epoch": 0.556,
"grad_norm": 19.84836283457165,
"kl_loss_13": 3500.0,
"kl_loss_26": 2520.0,
"kl_loss_39": 1432.4,
"kl_loss_7": 4098.4,
"learning_rate": 0.000419432969704568,
"loss": 5721.05,
"step": 5560
},
{
"ce_loss_13": 3.1611645042896273,
"ce_loss_26": 2.7037321001291277,
"ce_loss_39": 2.2008607923984527,
"ce_loss_52": 1.4731642618775367,
"ce_loss_7": 3.4352382242679598,
"epoch": 0.557,
"grad_norm": 19.486736620492533,
"kl_loss_13": 3494.8,
"kl_loss_26": 2542.2,
"kl_loss_39": 1471.9,
"kl_loss_7": 4071.6,
"learning_rate": 0.00041786744870491154,
"loss": 5664.5,
"step": 5570
},
{
"ce_loss_13": 3.1723786175251005,
"ce_loss_26": 2.712140661478043,
"ce_loss_39": 2.2021081149578094,
"ce_loss_52": 1.4713574051856995,
"ce_loss_7": 3.4434271275997164,
"epoch": 0.558,
"grad_norm": 21.812924852409434,
"kl_loss_13": 3518.4,
"kl_loss_26": 2552.0,
"kl_loss_39": 1476.8,
"kl_loss_7": 4082.0,
"learning_rate": 0.0004163027547791347,
"loss": 5667.5,
"step": 5580
},
{
"ce_loss_13": 3.0999899983406065,
"ce_loss_26": 2.6524887919425963,
"ce_loss_39": 2.159899726510048,
"ce_loss_52": 1.457381361722946,
"ce_loss_7": 3.376901388168335,
"epoch": 0.559,
"grad_norm": 20.855137229386383,
"kl_loss_13": 3402.8,
"kl_loss_26": 2469.2,
"kl_loss_39": 1417.0,
"kl_loss_7": 3983.6,
"learning_rate": 0.0004147389036836881,
"loss": 5623.2,
"step": 5590
},
{
"ce_loss_13": 3.048000919818878,
"ce_loss_26": 2.6028879463672636,
"ce_loss_39": 2.110449159145355,
"ce_loss_52": 1.4273749262094497,
"ce_loss_7": 3.3209989249706267,
"epoch": 0.56,
"grad_norm": 21.293221784840117,
"kl_loss_13": 3370.8,
"kl_loss_26": 2436.6,
"kl_loss_39": 1393.6,
"kl_loss_7": 3935.6,
"learning_rate": 0.00041317591116653486,
"loss": 5661.4,
"step": 5600
},
{
"ce_loss_13": 3.117449927330017,
"ce_loss_26": 2.6561968684196473,
"ce_loss_39": 2.1393496483564376,
"ce_loss_52": 1.4263568341732025,
"ce_loss_7": 3.3912305176258086,
"epoch": 0.561,
"grad_norm": 19.806130661521266,
"kl_loss_13": 3510.8,
"kl_loss_26": 2543.6,
"kl_loss_39": 1446.0,
"kl_loss_7": 4074.8,
"learning_rate": 0.0004116137929669921,
"loss": 5646.7,
"step": 5610
},
{
"ce_loss_13": 3.032761037349701,
"ce_loss_26": 2.5814107984304426,
"ce_loss_39": 2.092069110274315,
"ce_loss_52": 1.4175067842006683,
"ce_loss_7": 3.3051693975925445,
"epoch": 0.562,
"grad_norm": 21.265424868671836,
"kl_loss_13": 3342.0,
"kl_loss_26": 2397.0,
"kl_loss_39": 1360.0,
"kl_loss_7": 3902.8,
"learning_rate": 0.00041005256481557305,
"loss": 5649.3,
"step": 5620
},
{
"ce_loss_13": 3.1011641681194306,
"ce_loss_26": 2.6472670078277587,
"ce_loss_39": 2.1640954107046126,
"ce_loss_52": 1.4572645276784897,
"ce_loss_7": 3.3653019249439238,
"epoch": 0.563,
"grad_norm": 19.60469852329931,
"kl_loss_13": 3397.2,
"kl_loss_26": 2452.6,
"kl_loss_39": 1426.2,
"kl_loss_7": 3957.6,
"learning_rate": 0.00040849224243382767,
"loss": 5635.6,
"step": 5630
},
{
"ce_loss_13": 3.0702085912227632,
"ce_loss_26": 2.6029874324798583,
"ce_loss_39": 2.1070942997932436,
"ce_loss_52": 1.4102862730622292,
"ce_loss_7": 3.345056527853012,
"epoch": 0.564,
"grad_norm": 19.66351329562965,
"kl_loss_13": 3444.8,
"kl_loss_26": 2478.8,
"kl_loss_39": 1416.0,
"kl_loss_7": 4010.8,
"learning_rate": 0.000406932841534185,
"loss": 5678.4,
"step": 5640
},
{
"ce_loss_13": 3.1035412073135378,
"ce_loss_26": 2.6404273927211763,
"ce_loss_39": 2.148519089818001,
"ce_loss_52": 1.4659966766834258,
"ce_loss_7": 3.3797259271144866,
"epoch": 0.565,
"grad_norm": 20.61367124543832,
"kl_loss_13": 3418.4,
"kl_loss_26": 2452.6,
"kl_loss_39": 1399.6,
"kl_loss_7": 3989.2,
"learning_rate": 0.0004053743778197951,
"loss": 5619.4,
"step": 5650
},
{
"ce_loss_13": 3.074153536558151,
"ce_loss_26": 2.6227691769599915,
"ce_loss_39": 2.12532425224781,
"ce_loss_52": 1.4269753962755203,
"ce_loss_7": 3.3447964787483215,
"epoch": 0.566,
"grad_norm": 20.08816441246758,
"kl_loss_13": 3398.8,
"kl_loss_26": 2455.2,
"kl_loss_39": 1406.8,
"kl_loss_7": 3968.8,
"learning_rate": 0.0004038168669843697,
"loss": 5607.4,
"step": 5660
},
{
"ce_loss_13": 3.117138236761093,
"ce_loss_26": 2.6556679248809814,
"ce_loss_39": 2.1434233844280244,
"ce_loss_52": 1.4418231889605522,
"ce_loss_7": 3.3935614347457888,
"epoch": 0.567,
"grad_norm": 19.629663422829108,
"kl_loss_13": 3460.4,
"kl_loss_26": 2499.6,
"kl_loss_39": 1423.0,
"kl_loss_7": 4038.4,
"learning_rate": 0.000402260324712026,
"loss": 5653.85,
"step": 5670
},
{
"ce_loss_13": 3.0241773188114167,
"ce_loss_26": 2.5764558911323547,
"ce_loss_39": 2.0826069891452788,
"ce_loss_52": 1.4262234181165696,
"ce_loss_7": 3.292912298440933,
"epoch": 0.568,
"grad_norm": 19.863333175376273,
"kl_loss_13": 3317.2,
"kl_loss_26": 2380.6,
"kl_loss_39": 1341.9,
"kl_loss_7": 3879.6,
"learning_rate": 0.00040070476667712743,
"loss": 5602.0,
"step": 5680
},
{
"ce_loss_13": 3.1207240760326385,
"ce_loss_26": 2.667350098490715,
"ce_loss_39": 2.1637227922677993,
"ce_loss_52": 1.4507419973611833,
"ce_loss_7": 3.3947570443153383,
"epoch": 0.569,
"grad_norm": 20.18457161613193,
"kl_loss_13": 3470.8,
"kl_loss_26": 2511.8,
"kl_loss_39": 1453.2,
"kl_loss_7": 4038.0,
"learning_rate": 0.0003991502085441259,
"loss": 5637.8,
"step": 5690
},
{
"ce_loss_13": 3.052716261148453,
"ce_loss_26": 2.5999310851097106,
"ce_loss_39": 2.103394716978073,
"ce_loss_52": 1.4248915880918502,
"ce_loss_7": 3.3296144127845766,
"epoch": 0.57,
"grad_norm": 21.32933636042312,
"kl_loss_13": 3384.4,
"kl_loss_26": 2434.4,
"kl_loss_39": 1385.3,
"kl_loss_7": 3961.6,
"learning_rate": 0.0003975966659674047,
"loss": 5572.65,
"step": 5700
},
{
"ce_loss_13": 3.0565085709095,
"ce_loss_26": 2.612689271569252,
"ce_loss_39": 2.133056679368019,
"ce_loss_52": 1.456464058160782,
"ce_loss_7": 3.3283673584461213,
"epoch": 0.571,
"grad_norm": 20.795332274261725,
"kl_loss_13": 3328.4,
"kl_loss_26": 2402.4,
"kl_loss_39": 1384.6,
"kl_loss_7": 3893.6,
"learning_rate": 0.0003960441545911204,
"loss": 5637.0,
"step": 5710
},
{
"ce_loss_13": 3.100340133905411,
"ce_loss_26": 2.6507264733314515,
"ce_loss_39": 2.1515814483165743,
"ce_loss_52": 1.4689666152000427,
"ce_loss_7": 3.36507533788681,
"epoch": 0.572,
"grad_norm": 19.479562193670343,
"kl_loss_13": 3367.6,
"kl_loss_26": 2430.4,
"kl_loss_39": 1381.6,
"kl_loss_7": 3924.4,
"learning_rate": 0.0003944926900490452,
"loss": 5586.7,
"step": 5720
},
{
"ce_loss_13": 3.035090607404709,
"ce_loss_26": 2.5765923827886583,
"ce_loss_39": 2.0693789660930633,
"ce_loss_52": 1.394070391356945,
"ce_loss_7": 3.3126678228378297,
"epoch": 0.573,
"grad_norm": 20.769836690311337,
"kl_loss_13": 3418.0,
"kl_loss_26": 2455.8,
"kl_loss_39": 1385.6,
"kl_loss_7": 3996.8,
"learning_rate": 0.0003929422879644099,
"loss": 5607.2,
"step": 5730
},
{
"ce_loss_13": 3.0840226650238036,
"ce_loss_26": 2.6376162350177763,
"ce_loss_39": 2.1402535855770113,
"ce_loss_52": 1.4594999521970748,
"ce_loss_7": 3.3528446674346926,
"epoch": 0.574,
"grad_norm": 19.55271310482914,
"kl_loss_13": 3396.0,
"kl_loss_26": 2453.0,
"kl_loss_39": 1389.4,
"kl_loss_7": 3971.6,
"learning_rate": 0.0003913929639497462,
"loss": 5561.5,
"step": 5740
},
{
"ce_loss_13": 3.0994504928588866,
"ce_loss_26": 2.645833945274353,
"ce_loss_39": 2.151085004210472,
"ce_loss_52": 1.453307920694351,
"ce_loss_7": 3.375282955169678,
"epoch": 0.575,
"grad_norm": 20.394651449213793,
"kl_loss_13": 3410.8,
"kl_loss_26": 2459.0,
"kl_loss_39": 1418.8,
"kl_loss_7": 3977.2,
"learning_rate": 0.00038984473360672965,
"loss": 5587.6,
"step": 5750
},
{
"ce_loss_13": 3.063554251194,
"ce_loss_26": 2.602804532647133,
"ce_loss_39": 2.112149253487587,
"ce_loss_52": 1.4378221184015274,
"ce_loss_7": 3.3263466000556945,
"epoch": 0.576,
"grad_norm": 20.732107583485522,
"kl_loss_13": 3335.2,
"kl_loss_26": 2397.0,
"kl_loss_39": 1362.5,
"kl_loss_7": 3885.6,
"learning_rate": 0.0003882976125260229,
"loss": 5618.5,
"step": 5760
},
{
"ce_loss_13": 3.011310315132141,
"ce_loss_26": 2.5538589358329773,
"ce_loss_39": 2.0608473628759385,
"ce_loss_52": 1.3871852427721023,
"ce_loss_7": 3.2823293566703797,
"epoch": 0.577,
"grad_norm": 20.222489437839133,
"kl_loss_13": 3356.0,
"kl_loss_26": 2403.4,
"kl_loss_39": 1363.6,
"kl_loss_7": 3921.2,
"learning_rate": 0.00038675161628711776,
"loss": 5583.6,
"step": 5770
},
{
"ce_loss_13": 3.060704934597015,
"ce_loss_26": 2.6082344591617583,
"ce_loss_39": 2.1105374455451966,
"ce_loss_52": 1.4133148401975633,
"ce_loss_7": 3.337174046039581,
"epoch": 0.578,
"grad_norm": 20.5258419136392,
"kl_loss_13": 3407.2,
"kl_loss_26": 2468.6,
"kl_loss_39": 1416.8,
"kl_loss_7": 3978.0,
"learning_rate": 0.0003852067604581794,
"loss": 5550.9,
"step": 5780
},
{
"ce_loss_13": 3.0313555896282196,
"ce_loss_26": 2.5763757705688475,
"ce_loss_39": 2.090648338198662,
"ce_loss_52": 1.4237483531236648,
"ce_loss_7": 3.303388088941574,
"epoch": 0.579,
"grad_norm": 19.375379659731948,
"kl_loss_13": 3338.4,
"kl_loss_26": 2400.6,
"kl_loss_39": 1371.4,
"kl_loss_7": 3902.8,
"learning_rate": 0.0003836630605958888,
"loss": 5548.1,
"step": 5790
},
{
"ce_loss_13": 3.0871520400047303,
"ce_loss_26": 2.628482538461685,
"ce_loss_39": 2.1345098853111266,
"ce_loss_52": 1.439907690882683,
"ce_loss_7": 3.360053616762161,
"epoch": 0.58,
"grad_norm": 20.217419963891068,
"kl_loss_13": 3384.8,
"kl_loss_26": 2429.6,
"kl_loss_39": 1387.0,
"kl_loss_7": 3952.0,
"learning_rate": 0.0003821205322452863,
"loss": 5608.2,
"step": 5800
},
{
"ce_loss_13": 3.098143881559372,
"ce_loss_26": 2.648523300886154,
"ce_loss_39": 2.1544575184583663,
"ce_loss_52": 1.4684545397758484,
"ce_loss_7": 3.369327354431152,
"epoch": 0.581,
"grad_norm": 20.306772003501575,
"kl_loss_13": 3376.0,
"kl_loss_26": 2423.6,
"kl_loss_39": 1374.4,
"kl_loss_7": 3943.2,
"learning_rate": 0.0003805791909396155,
"loss": 5609.7,
"step": 5810
},
{
"ce_loss_13": 3.03250247836113,
"ce_loss_26": 2.5763048112392424,
"ce_loss_39": 2.0811164885759355,
"ce_loss_52": 1.4048690304160119,
"ce_loss_7": 3.2992306888103484,
"epoch": 0.582,
"grad_norm": 19.674023919479502,
"kl_loss_13": 3376.8,
"kl_loss_26": 2428.6,
"kl_loss_39": 1384.2,
"kl_loss_7": 3936.8,
"learning_rate": 0.0003790390522001662,
"loss": 5494.5,
"step": 5820
},
{
"ce_loss_13": 3.0025113344192507,
"ce_loss_26": 2.55022137761116,
"ce_loss_39": 2.0565065026283262,
"ce_loss_52": 1.377510306239128,
"ce_loss_7": 3.2718186736106873,
"epoch": 0.583,
"grad_norm": 20.728894667471106,
"kl_loss_13": 3354.8,
"kl_loss_26": 2406.2,
"kl_loss_39": 1367.9,
"kl_loss_7": 3917.6,
"learning_rate": 0.0003775001315361183,
"loss": 5559.7,
"step": 5830
},
{
"ce_loss_13": 3.0971179008483887,
"ce_loss_26": 2.6451155722141264,
"ce_loss_39": 2.1508567333221436,
"ce_loss_52": 1.4824247717857362,
"ce_loss_7": 3.37031666636467,
"epoch": 0.584,
"grad_norm": 20.600074401568932,
"kl_loss_13": 3337.2,
"kl_loss_26": 2403.8,
"kl_loss_39": 1359.2,
"kl_loss_7": 3907.2,
"learning_rate": 0.0003759624444443858,
"loss": 5519.4,
"step": 5840
},
{
"ce_loss_13": 3.0961884200572967,
"ce_loss_26": 2.626956915855408,
"ce_loss_39": 2.1310097485780717,
"ce_loss_52": 1.4370131075382233,
"ce_loss_7": 3.371414542198181,
"epoch": 0.585,
"grad_norm": 21.119848921717185,
"kl_loss_13": 3437.6,
"kl_loss_26": 2463.6,
"kl_loss_39": 1412.8,
"kl_loss_7": 4010.4,
"learning_rate": 0.00037442600640946044,
"loss": 5564.2,
"step": 5850
},
{
"ce_loss_13": 3.022693085670471,
"ce_loss_26": 2.556842041015625,
"ce_loss_39": 2.054610991477966,
"ce_loss_52": 1.3797120869159698,
"ce_loss_7": 3.2991441190242767,
"epoch": 0.586,
"grad_norm": 20.829952423189983,
"kl_loss_13": 3400.0,
"kl_loss_26": 2436.4,
"kl_loss_39": 1378.6,
"kl_loss_7": 3976.4,
"learning_rate": 0.00037289083290325663,
"loss": 5555.3,
"step": 5860
},
{
"ce_loss_13": 3.048868530988693,
"ce_loss_26": 2.597111147642136,
"ce_loss_39": 2.1075122743844985,
"ce_loss_52": 1.4477093726396562,
"ce_loss_7": 3.3170620620250704,
"epoch": 0.587,
"grad_norm": 19.743999290482414,
"kl_loss_13": 3295.2,
"kl_loss_26": 2349.8,
"kl_loss_39": 1314.6,
"kl_loss_7": 3862.0,
"learning_rate": 0.0003713569393849543,
"loss": 5582.2,
"step": 5870
},
{
"ce_loss_13": 3.0275582671165466,
"ce_loss_26": 2.5641341865062715,
"ce_loss_39": 2.075681546330452,
"ce_loss_52": 1.413079009950161,
"ce_loss_7": 3.30134374499321,
"epoch": 0.588,
"grad_norm": 19.60417578779171,
"kl_loss_13": 3342.8,
"kl_loss_26": 2385.2,
"kl_loss_39": 1348.4,
"kl_loss_7": 3911.6,
"learning_rate": 0.00036982434130084397,
"loss": 5547.2,
"step": 5880
},
{
"ce_loss_13": 3.0114518344402312,
"ce_loss_26": 2.5489202946424485,
"ce_loss_39": 2.051037350296974,
"ce_loss_52": 1.3947512209415436,
"ce_loss_7": 3.2770422041416167,
"epoch": 0.589,
"grad_norm": 19.943871445513693,
"kl_loss_13": 3324.8,
"kl_loss_26": 2358.6,
"kl_loss_39": 1318.4,
"kl_loss_7": 3886.4,
"learning_rate": 0.00036829305408417166,
"loss": 5519.0,
"step": 5890
},
{
"ce_loss_13": 2.9831090033054353,
"ce_loss_26": 2.5218098521232606,
"ce_loss_39": 2.0276191979646683,
"ce_loss_52": 1.3660520613193512,
"ce_loss_7": 3.262023079395294,
"epoch": 0.59,
"grad_norm": 20.136107811495098,
"kl_loss_13": 3337.2,
"kl_loss_26": 2377.0,
"kl_loss_39": 1333.6,
"kl_loss_7": 3918.4,
"learning_rate": 0.0003667630931549826,
"loss": 5547.1,
"step": 5900
},
{
"ce_loss_13": 3.1185491621494292,
"ce_loss_26": 2.6609552204608917,
"ce_loss_39": 2.154519048333168,
"ce_loss_52": 1.4638203248381614,
"ce_loss_7": 3.3888748466968535,
"epoch": 0.591,
"grad_norm": 21.04398596009474,
"kl_loss_13": 3422.0,
"kl_loss_26": 2461.2,
"kl_loss_39": 1398.6,
"kl_loss_7": 3996.8,
"learning_rate": 0.00036523447391996613,
"loss": 5529.25,
"step": 5910
},
{
"ce_loss_13": 3.01438627243042,
"ce_loss_26": 2.5740538477897643,
"ce_loss_39": 2.0915403455495833,
"ce_loss_52": 1.431156474351883,
"ce_loss_7": 3.2843648850917817,
"epoch": 0.592,
"grad_norm": 21.11836249464285,
"kl_loss_13": 3292.4,
"kl_loss_26": 2358.6,
"kl_loss_39": 1331.3,
"kl_loss_7": 3853.2,
"learning_rate": 0.00036370721177230114,
"loss": 5565.45,
"step": 5920
},
{
"ce_loss_13": 3.069197976589203,
"ce_loss_26": 2.619757717847824,
"ce_loss_39": 2.1231694549322127,
"ce_loss_52": 1.4499645620584487,
"ce_loss_7": 3.339762735366821,
"epoch": 0.593,
"grad_norm": 19.81588610039522,
"kl_loss_13": 3318.4,
"kl_loss_26": 2373.0,
"kl_loss_39": 1346.8,
"kl_loss_7": 3881.2,
"learning_rate": 0.00036218132209150044,
"loss": 5483.7,
"step": 5930
},
{
"ce_loss_13": 3.0346123337745667,
"ce_loss_26": 2.5827606499195097,
"ce_loss_39": 2.090969371795654,
"ce_loss_52": 1.4306629955768586,
"ce_loss_7": 3.3081043541431425,
"epoch": 0.594,
"grad_norm": 20.579777117155064,
"kl_loss_13": 3338.4,
"kl_loss_26": 2404.8,
"kl_loss_39": 1361.4,
"kl_loss_7": 3897.6,
"learning_rate": 0.0003606568202432562,
"loss": 5508.2,
"step": 5940
},
{
"ce_loss_13": 2.9628920376300814,
"ce_loss_26": 2.5103672593832016,
"ce_loss_39": 2.028717193007469,
"ce_loss_52": 1.3798889175057412,
"ce_loss_7": 3.238497519493103,
"epoch": 0.595,
"grad_norm": 19.336805098702218,
"kl_loss_13": 3271.2,
"kl_loss_26": 2333.8,
"kl_loss_39": 1324.3,
"kl_loss_7": 3845.6,
"learning_rate": 0.0003591337215792851,
"loss": 5512.3,
"step": 5950
},
{
"ce_loss_13": 3.0788431644439695,
"ce_loss_26": 2.608420321345329,
"ce_loss_39": 2.1091116696596144,
"ce_loss_52": 1.4199562400579453,
"ce_loss_7": 3.359624499082565,
"epoch": 0.596,
"grad_norm": 20.533566806186286,
"kl_loss_13": 3426.4,
"kl_loss_26": 2447.8,
"kl_loss_39": 1399.7,
"kl_loss_7": 4004.4,
"learning_rate": 0.00035761204143717383,
"loss": 5551.7,
"step": 5960
},
{
"ce_loss_13": 3.03274342417717,
"ce_loss_26": 2.585944026708603,
"ce_loss_39": 2.096950164437294,
"ce_loss_52": 1.416708105802536,
"ce_loss_7": 3.3088025748729706,
"epoch": 0.597,
"grad_norm": 19.695670321869642,
"kl_loss_13": 3339.2,
"kl_loss_26": 2400.2,
"kl_loss_39": 1368.0,
"kl_loss_7": 3908.0,
"learning_rate": 0.0003560917951402245,
"loss": 5483.9,
"step": 5970
},
{
"ce_loss_13": 3.01766916513443,
"ce_loss_26": 2.561279395222664,
"ce_loss_39": 2.06943539083004,
"ce_loss_52": 1.418130737543106,
"ce_loss_7": 3.2899491429328918,
"epoch": 0.598,
"grad_norm": 21.072429989583483,
"kl_loss_13": 3344.4,
"kl_loss_26": 2382.6,
"kl_loss_39": 1331.0,
"kl_loss_7": 3913.6,
"learning_rate": 0.00035457299799730046,
"loss": 5551.3,
"step": 5980
},
{
"ce_loss_13": 3.0659261345863342,
"ce_loss_26": 2.600133925676346,
"ce_loss_39": 2.110896447300911,
"ce_loss_52": 1.437103909254074,
"ce_loss_7": 3.3336906135082245,
"epoch": 0.599,
"grad_norm": 24.325101698795564,
"kl_loss_13": 3380.0,
"kl_loss_26": 2416.4,
"kl_loss_39": 1376.0,
"kl_loss_7": 3938.8,
"learning_rate": 0.0003530556653026721,
"loss": 5495.3,
"step": 5990
},
{
"ce_loss_13": 3.0716091096401215,
"ce_loss_26": 2.614444798231125,
"ce_loss_39": 2.1203446626663207,
"ce_loss_52": 1.4388466402888298,
"ce_loss_7": 3.3488605439662935,
"epoch": 0.6,
"grad_norm": 20.521363612152747,
"kl_loss_13": 3384.4,
"kl_loss_26": 2432.4,
"kl_loss_39": 1382.6,
"kl_loss_7": 3959.2,
"learning_rate": 0.00035153981233586274,
"loss": 5545.9,
"step": 6000
},
{
"ce_loss_13": 3.106086379289627,
"ce_loss_26": 2.6430875420570374,
"ce_loss_39": 2.1337583631277086,
"ce_loss_52": 1.4439469754695893,
"ce_loss_7": 3.3797987580299376,
"epoch": 0.601,
"grad_norm": 20.80497849503253,
"kl_loss_13": 3453.6,
"kl_loss_26": 2484.8,
"kl_loss_39": 1398.0,
"kl_loss_7": 4028.4,
"learning_rate": 0.00035002545436149473,
"loss": 5491.4,
"step": 6010
},
{
"ce_loss_13": 3.000385183095932,
"ce_loss_26": 2.5445862293243406,
"ce_loss_39": 2.045133265852928,
"ce_loss_52": 1.387436880171299,
"ce_loss_7": 3.271744018793106,
"epoch": 0.602,
"grad_norm": 32.49284937945306,
"kl_loss_13": 3335.6,
"kl_loss_26": 2380.8,
"kl_loss_39": 1330.6,
"kl_loss_7": 3907.2,
"learning_rate": 0.0003485126066291364,
"loss": 5483.7,
"step": 6020
},
{
"ce_loss_13": 3.0256562530994415,
"ce_loss_26": 2.577129301428795,
"ce_loss_39": 2.0889712274074554,
"ce_loss_52": 1.4449194520711899,
"ce_loss_7": 3.2831216752529144,
"epoch": 0.603,
"grad_norm": 20.23621815480592,
"kl_loss_13": 3272.0,
"kl_loss_26": 2339.4,
"kl_loss_39": 1312.8,
"kl_loss_7": 3817.2,
"learning_rate": 0.0003470012843731476,
"loss": 5461.5,
"step": 6030
},
{
"ce_loss_13": 3.0063592195510864,
"ce_loss_26": 2.5461477816104887,
"ce_loss_39": 2.047996437549591,
"ce_loss_52": 1.4004468455910684,
"ce_loss_7": 3.2747744262218474,
"epoch": 0.604,
"grad_norm": 20.161854907584466,
"kl_loss_13": 3331.2,
"kl_loss_26": 2365.2,
"kl_loss_39": 1318.3,
"kl_loss_7": 3894.4,
"learning_rate": 0.00034549150281252633,
"loss": 5446.25,
"step": 6040
},
{
"ce_loss_13": 3.0043693661689757,
"ce_loss_26": 2.5464318215847017,
"ce_loss_39": 2.054910770058632,
"ce_loss_52": 1.402955588698387,
"ce_loss_7": 3.277425891160965,
"epoch": 0.605,
"grad_norm": 19.39479695015959,
"kl_loss_13": 3303.6,
"kl_loss_26": 2362.4,
"kl_loss_39": 1315.5,
"kl_loss_7": 3867.2,
"learning_rate": 0.0003439832771507565,
"loss": 5513.7,
"step": 6050
},
{
"ce_loss_13": 3.0073243379592896,
"ce_loss_26": 2.552249348163605,
"ce_loss_39": 2.0556343287229537,
"ce_loss_52": 1.4056400418281556,
"ce_loss_7": 3.276465517282486,
"epoch": 0.606,
"grad_norm": 20.632328630604196,
"kl_loss_13": 3315.2,
"kl_loss_26": 2367.2,
"kl_loss_39": 1320.7,
"kl_loss_7": 3890.0,
"learning_rate": 0.0003424766225756537,
"loss": 5437.2,
"step": 6060
},
{
"ce_loss_13": 3.0342160046100615,
"ce_loss_26": 2.5702687412500382,
"ce_loss_39": 2.0748631983995436,
"ce_loss_52": 1.4053118824958801,
"ce_loss_7": 3.30224769115448,
"epoch": 0.607,
"grad_norm": 20.381462378536757,
"kl_loss_13": 3375.6,
"kl_loss_26": 2412.6,
"kl_loss_39": 1368.5,
"kl_loss_7": 3938.0,
"learning_rate": 0.00034097155425921255,
"loss": 5453.95,
"step": 6070
},
{
"ce_loss_13": 2.979208827018738,
"ce_loss_26": 2.5391657948493958,
"ce_loss_39": 2.0675252109766005,
"ce_loss_52": 1.4293861359357833,
"ce_loss_7": 3.243548810482025,
"epoch": 0.608,
"grad_norm": 20.059894251099692,
"kl_loss_13": 3201.6,
"kl_loss_26": 2277.2,
"kl_loss_39": 1276.3,
"kl_loss_7": 3759.2,
"learning_rate": 0.0003394680873574546,
"loss": 5463.6,
"step": 6080
},
{
"ce_loss_13": 3.0087065517902376,
"ce_loss_26": 2.5543720543384554,
"ce_loss_39": 2.0706711769104005,
"ce_loss_52": 1.4312650561332703,
"ce_loss_7": 3.274740469455719,
"epoch": 0.609,
"grad_norm": 19.682020137215275,
"kl_loss_13": 3278.0,
"kl_loss_26": 2334.8,
"kl_loss_39": 1311.6,
"kl_loss_7": 3829.2,
"learning_rate": 0.0003379662370102747,
"loss": 5485.1,
"step": 6090
},
{
"ce_loss_13": 2.9708913624286652,
"ce_loss_26": 2.526490569114685,
"ce_loss_39": 2.045279270410538,
"ce_loss_52": 1.406798492372036,
"ce_loss_7": 3.2348900735378265,
"epoch": 0.61,
"grad_norm": 20.816822332244506,
"kl_loss_13": 3220.0,
"kl_loss_26": 2302.2,
"kl_loss_39": 1290.7,
"kl_loss_7": 3776.8,
"learning_rate": 0.0003364660183412892,
"loss": 5434.0,
"step": 6100
},
{
"ce_loss_13": 3.034948408603668,
"ce_loss_26": 2.5749203741550444,
"ce_loss_39": 2.083053132891655,
"ce_loss_52": 1.4239173114299775,
"ce_loss_7": 3.308141976594925,
"epoch": 0.611,
"grad_norm": 19.1619951590758,
"kl_loss_13": 3322.8,
"kl_loss_26": 2375.8,
"kl_loss_39": 1335.6,
"kl_loss_7": 3891.2,
"learning_rate": 0.0003349674464576834,
"loss": 5422.4,
"step": 6110
},
{
"ce_loss_13": 3.004230409860611,
"ce_loss_26": 2.559123533964157,
"ce_loss_39": 2.0680998235940935,
"ce_loss_52": 1.416846913099289,
"ce_loss_7": 3.268228167295456,
"epoch": 0.612,
"grad_norm": 19.814048572070753,
"kl_loss_13": 3277.2,
"kl_loss_26": 2352.2,
"kl_loss_39": 1329.4,
"kl_loss_7": 3827.6,
"learning_rate": 0.00033347053645005966,
"loss": 5408.25,
"step": 6120
},
{
"ce_loss_13": 3.043248528242111,
"ce_loss_26": 2.5893827825784683,
"ce_loss_39": 2.106617513298988,
"ce_loss_52": 1.4531341344118118,
"ce_loss_7": 3.3129510939121247,
"epoch": 0.613,
"grad_norm": 20.76807127463998,
"kl_loss_13": 3286.4,
"kl_loss_26": 2356.0,
"kl_loss_39": 1332.4,
"kl_loss_7": 3850.8,
"learning_rate": 0.00033197530339228485,
"loss": 5370.55,
"step": 6130
},
{
"ce_loss_13": 3.00436030626297,
"ce_loss_26": 2.551770430803299,
"ce_loss_39": 2.0617963910102843,
"ce_loss_52": 1.4093145355582237,
"ce_loss_7": 3.287700629234314,
"epoch": 0.614,
"grad_norm": 20.13277338184701,
"kl_loss_13": 3319.2,
"kl_loss_26": 2374.0,
"kl_loss_39": 1331.1,
"kl_loss_7": 3899.6,
"learning_rate": 0.00033048176234133967,
"loss": 5468.4,
"step": 6140
},
{
"ce_loss_13": 3.0555618822574617,
"ce_loss_26": 2.6027767241001127,
"ce_loss_39": 2.1256050765514374,
"ce_loss_52": 1.4570606127381325,
"ce_loss_7": 3.3252673983573913,
"epoch": 0.615,
"grad_norm": 20.812161363431123,
"kl_loss_13": 3324.4,
"kl_loss_26": 2376.0,
"kl_loss_39": 1350.9,
"kl_loss_7": 3884.4,
"learning_rate": 0.0003289899283371657,
"loss": 5469.45,
"step": 6150
},
{
"ce_loss_13": 2.9699956268072127,
"ce_loss_26": 2.5214530378580093,
"ce_loss_39": 2.047743359208107,
"ce_loss_52": 1.404917973279953,
"ce_loss_7": 3.242190235853195,
"epoch": 0.616,
"grad_norm": 21.707368727671295,
"kl_loss_13": 3253.6,
"kl_loss_26": 2330.0,
"kl_loss_39": 1321.7,
"kl_loss_7": 3830.4,
"learning_rate": 0.0003274998164025148,
"loss": 5448.5,
"step": 6160
},
{
"ce_loss_13": 3.1136968553066255,
"ce_loss_26": 2.6612686932086946,
"ce_loss_39": 2.1661961168050765,
"ce_loss_52": 1.4886637568473815,
"ce_loss_7": 3.390312284231186,
"epoch": 0.617,
"grad_norm": 20.098473719799063,
"kl_loss_13": 3356.0,
"kl_loss_26": 2410.6,
"kl_loss_39": 1369.8,
"kl_loss_7": 3926.4,
"learning_rate": 0.0003260114415427975,
"loss": 5420.0,
"step": 6170
},
{
"ce_loss_13": 3.0537400960922243,
"ce_loss_26": 2.6002777397632597,
"ce_loss_39": 2.0997320264577866,
"ce_loss_52": 1.425080481171608,
"ce_loss_7": 3.3301878392696382,
"epoch": 0.618,
"grad_norm": 20.959268913691595,
"kl_loss_13": 3376.8,
"kl_loss_26": 2431.0,
"kl_loss_39": 1376.8,
"kl_loss_7": 3956.0,
"learning_rate": 0.0003245248187459323,
"loss": 5467.5,
"step": 6180
},
{
"ce_loss_13": 3.0195475459098815,
"ce_loss_26": 2.5703806400299074,
"ce_loss_39": 2.0828246504068373,
"ce_loss_52": 1.4389306217432023,
"ce_loss_7": 3.2870283126831055,
"epoch": 0.619,
"grad_norm": 19.77508773242922,
"kl_loss_13": 3278.4,
"kl_loss_26": 2325.6,
"kl_loss_39": 1301.5,
"kl_loss_7": 3832.0,
"learning_rate": 0.00032303996298219416,
"loss": 5436.5,
"step": 6190
},
{
"ce_loss_13": 3.080602079629898,
"ce_loss_26": 2.6279105126857756,
"ce_loss_39": 2.1348745226860046,
"ce_loss_52": 1.4668798118829727,
"ce_loss_7": 3.349226105213165,
"epoch": 0.62,
"grad_norm": 20.105835424543073,
"kl_loss_13": 3342.4,
"kl_loss_26": 2402.4,
"kl_loss_39": 1358.2,
"kl_loss_7": 3903.6,
"learning_rate": 0.00032155688920406414,
"loss": 5427.1,
"step": 6200
},
{
"ce_loss_13": 2.980887794494629,
"ce_loss_26": 2.5403966814279557,
"ce_loss_39": 2.06268994808197,
"ce_loss_52": 1.4154802724719047,
"ce_loss_7": 3.252926254272461,
"epoch": 0.621,
"grad_norm": 19.282334771133197,
"kl_loss_13": 3252.8,
"kl_loss_26": 2327.6,
"kl_loss_39": 1309.6,
"kl_loss_7": 3818.4,
"learning_rate": 0.0003200756123460788,
"loss": 5482.55,
"step": 6210
},
{
"ce_loss_13": 2.9776609361171724,
"ce_loss_26": 2.5325854122638702,
"ce_loss_39": 2.0461337983608248,
"ce_loss_52": 1.410691213607788,
"ce_loss_7": 3.24890678524971,
"epoch": 0.622,
"grad_norm": 20.106994551943693,
"kl_loss_13": 3252.0,
"kl_loss_26": 2326.2,
"kl_loss_39": 1296.8,
"kl_loss_7": 3812.4,
"learning_rate": 0.00031859614732467957,
"loss": 5416.4,
"step": 6220
},
{
"ce_loss_13": 3.032334786653519,
"ce_loss_26": 2.579844218492508,
"ce_loss_39": 2.1053399711847307,
"ce_loss_52": 1.4573373839259147,
"ce_loss_7": 3.2939690172672274,
"epoch": 0.623,
"grad_norm": 19.37466323417316,
"kl_loss_13": 3254.0,
"kl_loss_26": 2315.6,
"kl_loss_39": 1307.4,
"kl_loss_7": 3800.0,
"learning_rate": 0.00031711850903806275,
"loss": 5384.0,
"step": 6230
},
{
"ce_loss_13": 3.0033844828605654,
"ce_loss_26": 2.554124391078949,
"ce_loss_39": 2.072583147883415,
"ce_loss_52": 1.405614359676838,
"ce_loss_7": 3.272351396083832,
"epoch": 0.624,
"grad_norm": 19.68798944815722,
"kl_loss_13": 3318.8,
"kl_loss_26": 2383.4,
"kl_loss_39": 1353.1,
"kl_loss_7": 3878.4,
"learning_rate": 0.0003156427123660297,
"loss": 5409.1,
"step": 6240
},
{
"ce_loss_13": 3.044550156593323,
"ce_loss_26": 2.5933004200458525,
"ce_loss_39": 2.0951038181781767,
"ce_loss_52": 1.432724517583847,
"ce_loss_7": 3.31173922419548,
"epoch": 0.625,
"grad_norm": 19.588532833188335,
"kl_loss_13": 3324.0,
"kl_loss_26": 2390.2,
"kl_loss_39": 1351.9,
"kl_loss_7": 3884.8,
"learning_rate": 0.0003141687721698363,
"loss": 5410.2,
"step": 6250
},
{
"ce_loss_13": 3.0133075952529906,
"ce_loss_26": 2.562904554605484,
"ce_loss_39": 2.084453445672989,
"ce_loss_52": 1.4354033678770066,
"ce_loss_7": 3.2846658766269683,
"epoch": 0.626,
"grad_norm": 20.175418509715428,
"kl_loss_13": 3256.4,
"kl_loss_26": 2322.2,
"kl_loss_39": 1302.4,
"kl_loss_7": 3820.0,
"learning_rate": 0.00031269670329204396,
"loss": 5413.4,
"step": 6260
},
{
"ce_loss_13": 3.020740455389023,
"ce_loss_26": 2.576273998618126,
"ce_loss_39": 2.094499522447586,
"ce_loss_52": 1.46863095164299,
"ce_loss_7": 3.2838824689388275,
"epoch": 0.627,
"grad_norm": 19.159006125141875,
"kl_loss_13": 3217.6,
"kl_loss_26": 2295.2,
"kl_loss_39": 1276.9,
"kl_loss_7": 3769.2,
"learning_rate": 0.00031122652055637015,
"loss": 5419.65,
"step": 6270
},
{
"ce_loss_13": 2.9717007994651796,
"ce_loss_26": 2.5309597969055178,
"ce_loss_39": 2.045916485786438,
"ce_loss_52": 1.4202698469161987,
"ce_loss_7": 3.244756191968918,
"epoch": 0.628,
"grad_norm": 20.34275756584657,
"kl_loss_13": 3237.2,
"kl_loss_26": 2312.2,
"kl_loss_39": 1284.9,
"kl_loss_7": 3804.0,
"learning_rate": 0.0003097582387675385,
"loss": 5361.6,
"step": 6280
},
{
"ce_loss_13": 2.959231287240982,
"ce_loss_26": 2.5172866880893707,
"ce_loss_39": 2.043312183022499,
"ce_loss_52": 1.4159017190337182,
"ce_loss_7": 3.220345306396484,
"epoch": 0.629,
"grad_norm": 20.150529720020295,
"kl_loss_13": 3215.2,
"kl_loss_26": 2296.4,
"kl_loss_39": 1276.2,
"kl_loss_7": 3763.2,
"learning_rate": 0.00030829187271113034,
"loss": 5363.8,
"step": 6290
},
{
"ce_loss_13": 2.9990767776966094,
"ce_loss_26": 2.5505200415849685,
"ce_loss_39": 2.060671201348305,
"ce_loss_52": 1.395907147228718,
"ce_loss_7": 3.2725743770599367,
"epoch": 0.63,
"grad_norm": 19.518127150050482,
"kl_loss_13": 3309.2,
"kl_loss_26": 2380.2,
"kl_loss_39": 1347.4,
"kl_loss_7": 3878.4,
"learning_rate": 0.00030682743715343565,
"loss": 5435.15,
"step": 6300
},
{
"ce_loss_13": 3.080632323026657,
"ce_loss_26": 2.6302684545516968,
"ce_loss_39": 2.135207986831665,
"ce_loss_52": 1.4774031162261962,
"ce_loss_7": 3.3441965878009796,
"epoch": 0.631,
"grad_norm": 21.06860944259249,
"kl_loss_13": 3302.8,
"kl_loss_26": 2366.6,
"kl_loss_39": 1332.6,
"kl_loss_7": 3862.8,
"learning_rate": 0.0003053649468413043,
"loss": 5425.25,
"step": 6310
},
{
"ce_loss_13": 3.0259739339351652,
"ce_loss_26": 2.570155072212219,
"ce_loss_39": 2.078566926717758,
"ce_loss_52": 1.4384133130311967,
"ce_loss_7": 3.288107806444168,
"epoch": 0.632,
"grad_norm": 20.78357258068192,
"kl_loss_13": 3282.0,
"kl_loss_26": 2335.8,
"kl_loss_39": 1293.6,
"kl_loss_7": 3833.2,
"learning_rate": 0.00030390441650199725,
"loss": 5412.2,
"step": 6320
},
{
"ce_loss_13": 2.9379296779632567,
"ce_loss_26": 2.49056881070137,
"ce_loss_39": 2.0068995296955108,
"ce_loss_52": 1.3864078581333161,
"ce_loss_7": 3.203891623020172,
"epoch": 0.633,
"grad_norm": 20.174137035703254,
"kl_loss_13": 3206.8,
"kl_loss_26": 2268.4,
"kl_loss_39": 1251.1,
"kl_loss_7": 3762.4,
"learning_rate": 0.00030244586084303903,
"loss": 5352.9,
"step": 6330
},
{
"ce_loss_13": 2.9555823683738707,
"ce_loss_26": 2.501230263710022,
"ce_loss_39": 2.0173334002494814,
"ce_loss_52": 1.3891687452793122,
"ce_loss_7": 3.2268748760223387,
"epoch": 0.634,
"grad_norm": 20.047186620123167,
"kl_loss_13": 3264.8,
"kl_loss_26": 2316.6,
"kl_loss_39": 1274.1,
"kl_loss_7": 3843.2,
"learning_rate": 0.00030098929455206903,
"loss": 5365.4,
"step": 6340
},
{
"ce_loss_13": 2.9806570291519163,
"ce_loss_26": 2.533248084783554,
"ce_loss_39": 2.0557660490274428,
"ce_loss_52": 1.4183998316526414,
"ce_loss_7": 3.248631852865219,
"epoch": 0.635,
"grad_norm": 19.42792815463783,
"kl_loss_13": 3226.0,
"kl_loss_26": 2301.0,
"kl_loss_39": 1289.2,
"kl_loss_7": 3784.4,
"learning_rate": 0.00029953473229669324,
"loss": 5429.0,
"step": 6350
},
{
"ce_loss_13": 3.008418655395508,
"ce_loss_26": 2.5531763255596163,
"ce_loss_39": 2.069964846968651,
"ce_loss_52": 1.4419535219669342,
"ce_loss_7": 3.274876070022583,
"epoch": 0.636,
"grad_norm": 20.040461482265158,
"kl_loss_13": 3248.4,
"kl_loss_26": 2315.4,
"kl_loss_39": 1287.5,
"kl_loss_7": 3802.8,
"learning_rate": 0.00029808218872433767,
"loss": 5390.5,
"step": 6360
},
{
"ce_loss_13": 2.9624147057533263,
"ce_loss_26": 2.509669789671898,
"ce_loss_39": 2.017057329416275,
"ce_loss_52": 1.392769531905651,
"ce_loss_7": 3.2405923306941986,
"epoch": 0.637,
"grad_norm": 19.969518630784094,
"kl_loss_13": 3281.6,
"kl_loss_26": 2321.8,
"kl_loss_39": 1284.8,
"kl_loss_7": 3857.6,
"learning_rate": 0.0002966316784621,
"loss": 5344.4,
"step": 6370
},
{
"ce_loss_13": 2.9690242230892183,
"ce_loss_26": 2.5119642555713653,
"ce_loss_39": 2.022391200065613,
"ce_loss_52": 1.3881384432315826,
"ce_loss_7": 3.2436072409152983,
"epoch": 0.638,
"grad_norm": 19.443805471212187,
"kl_loss_13": 3260.4,
"kl_loss_26": 2314.8,
"kl_loss_39": 1281.4,
"kl_loss_7": 3835.6,
"learning_rate": 0.0002951832161166024,
"loss": 5333.0,
"step": 6380
},
{
"ce_loss_13": 3.019889771938324,
"ce_loss_26": 2.574093183875084,
"ce_loss_39": 2.085198149085045,
"ce_loss_52": 1.4531068801879883,
"ce_loss_7": 3.286811703443527,
"epoch": 0.639,
"grad_norm": 19.50308932074232,
"kl_loss_13": 3246.0,
"kl_loss_26": 2313.6,
"kl_loss_39": 1281.1,
"kl_loss_7": 3800.8,
"learning_rate": 0.0002937368162738445,
"loss": 5358.7,
"step": 6390
},
{
"ce_loss_13": 2.979416298866272,
"ce_loss_26": 2.5201680839061735,
"ce_loss_39": 2.0343465119600297,
"ce_loss_52": 1.4125050336122513,
"ce_loss_7": 3.2494026124477386,
"epoch": 0.64,
"grad_norm": 19.887826117374196,
"kl_loss_13": 3261.2,
"kl_loss_26": 2302.8,
"kl_loss_39": 1277.2,
"kl_loss_7": 3827.6,
"learning_rate": 0.0002922924934990568,
"loss": 5361.0,
"step": 6400
},
{
"ce_loss_13": 2.966394138336182,
"ce_loss_26": 2.5159747898578644,
"ce_loss_39": 2.0343314677476885,
"ce_loss_52": 1.3983295410871506,
"ce_loss_7": 3.2374337732791902,
"epoch": 0.641,
"grad_norm": 21.205292313379594,
"kl_loss_13": 3264.4,
"kl_loss_26": 2317.8,
"kl_loss_39": 1301.9,
"kl_loss_7": 3828.8,
"learning_rate": 0.0002908502623365536,
"loss": 5348.95,
"step": 6410
},
{
"ce_loss_13": 3.0186184704303742,
"ce_loss_26": 2.5620053589344023,
"ce_loss_39": 2.079856187105179,
"ce_loss_52": 1.437650018930435,
"ce_loss_7": 3.2890258550643923,
"epoch": 0.642,
"grad_norm": 20.335253629932936,
"kl_loss_13": 3264.4,
"kl_loss_26": 2325.4,
"kl_loss_39": 1302.1,
"kl_loss_7": 3830.0,
"learning_rate": 0.0002894101373095867,
"loss": 5303.0,
"step": 6420
},
{
"ce_loss_13": 3.0677368700504304,
"ce_loss_26": 2.6209602475166323,
"ce_loss_39": 2.1306197196245193,
"ce_loss_52": 1.4929826736450196,
"ce_loss_7": 3.3314808785915373,
"epoch": 0.643,
"grad_norm": 20.265575824381624,
"kl_loss_13": 3289.6,
"kl_loss_26": 2355.4,
"kl_loss_39": 1318.2,
"kl_loss_7": 3835.2,
"learning_rate": 0.00028797213292019926,
"loss": 5380.85,
"step": 6430
},
{
"ce_loss_13": 2.966922277212143,
"ce_loss_26": 2.51933411359787,
"ce_loss_39": 2.040864047408104,
"ce_loss_52": 1.4254619121551513,
"ce_loss_7": 3.232788211107254,
"epoch": 0.644,
"grad_norm": 19.660532004484757,
"kl_loss_13": 3209.2,
"kl_loss_26": 2273.0,
"kl_loss_39": 1260.8,
"kl_loss_7": 3770.0,
"learning_rate": 0.0002865362636490791,
"loss": 5309.3,
"step": 6440
},
{
"ce_loss_13": 3.0011440992355345,
"ce_loss_26": 2.5491324365139008,
"ce_loss_39": 2.057476672530174,
"ce_loss_52": 1.4156641319394112,
"ce_loss_7": 3.2698193073272703,
"epoch": 0.645,
"grad_norm": 20.528114278014176,
"kl_loss_13": 3284.8,
"kl_loss_26": 2344.4,
"kl_loss_39": 1307.4,
"kl_loss_7": 3853.6,
"learning_rate": 0.0002851025439554142,
"loss": 5329.3,
"step": 6450
},
{
"ce_loss_13": 3.0585977435112,
"ce_loss_26": 2.593171867728233,
"ce_loss_39": 2.1089387238025665,
"ce_loss_52": 1.4590320155024528,
"ce_loss_7": 3.3243721425533295,
"epoch": 0.646,
"grad_norm": 19.177895864651298,
"kl_loss_13": 3308.0,
"kl_loss_26": 2342.6,
"kl_loss_39": 1306.8,
"kl_loss_7": 3858.4,
"learning_rate": 0.00028367098827674573,
"loss": 5399.4,
"step": 6460
},
{
"ce_loss_13": 3.0068358182907104,
"ce_loss_26": 2.559490966796875,
"ce_loss_39": 2.0818791508674623,
"ce_loss_52": 1.4510639190673829,
"ce_loss_7": 3.2726912021636965,
"epoch": 0.647,
"grad_norm": 20.001351203231668,
"kl_loss_13": 3231.2,
"kl_loss_26": 2299.0,
"kl_loss_39": 1281.6,
"kl_loss_7": 3785.2,
"learning_rate": 0.00028224161102882397,
"loss": 5353.5,
"step": 6470
},
{
"ce_loss_13": 3.0042510509490965,
"ce_loss_26": 2.545697581768036,
"ce_loss_39": 2.0517130315303804,
"ce_loss_52": 1.4082761898636817,
"ce_loss_7": 3.2736884713172913,
"epoch": 0.648,
"grad_norm": 19.176592480543317,
"kl_loss_13": 3304.8,
"kl_loss_26": 2344.6,
"kl_loss_39": 1308.6,
"kl_loss_7": 3870.8,
"learning_rate": 0.00028081442660546124,
"loss": 5357.45,
"step": 6480
},
{
"ce_loss_13": 2.9604024648666383,
"ce_loss_26": 2.5067259430885316,
"ce_loss_39": 2.0217309921979902,
"ce_loss_52": 1.396033638715744,
"ce_loss_7": 3.229094612598419,
"epoch": 0.649,
"grad_norm": 20.132685826085943,
"kl_loss_13": 3230.8,
"kl_loss_26": 2292.8,
"kl_loss_39": 1272.4,
"kl_loss_7": 3788.8,
"learning_rate": 0.0002793894493783892,
"loss": 5337.2,
"step": 6490
},
{
"ce_loss_13": 3.038835954666138,
"ce_loss_26": 2.5828086912631987,
"ce_loss_39": 2.094580352306366,
"ce_loss_52": 1.4504825562238692,
"ce_loss_7": 3.3101982474327087,
"epoch": 0.65,
"grad_norm": 20.26204607317675,
"kl_loss_13": 3292.0,
"kl_loss_26": 2347.2,
"kl_loss_39": 1312.5,
"kl_loss_7": 3857.2,
"learning_rate": 0.0002779666936971129,
"loss": 5341.9,
"step": 6500
},
{
"ce_loss_13": 2.981122875213623,
"ce_loss_26": 2.5436301648616793,
"ce_loss_39": 2.06348480284214,
"ce_loss_52": 1.4388723462820052,
"ce_loss_7": 3.2536712110042574,
"epoch": 0.651,
"grad_norm": 19.527282619134503,
"kl_loss_13": 3203.2,
"kl_loss_26": 2279.0,
"kl_loss_39": 1260.5,
"kl_loss_7": 3763.6,
"learning_rate": 0.00027654617388876614,
"loss": 5303.55,
"step": 6510
},
{
"ce_loss_13": 2.985329604148865,
"ce_loss_26": 2.5429716140031813,
"ce_loss_39": 2.0698666363954543,
"ce_loss_52": 1.43270433396101,
"ce_loss_7": 3.2484419345855713,
"epoch": 0.652,
"grad_norm": 19.361430489337188,
"kl_loss_13": 3187.2,
"kl_loss_26": 2277.2,
"kl_loss_39": 1278.4,
"kl_loss_7": 3732.4,
"learning_rate": 0.0002751279042579672,
"loss": 5316.3,
"step": 6520
},
{
"ce_loss_13": 2.9774204194545746,
"ce_loss_26": 2.5208486020565033,
"ce_loss_39": 2.0273024052381516,
"ce_loss_52": 1.402597150206566,
"ce_loss_7": 3.249239844083786,
"epoch": 0.653,
"grad_norm": 19.527542591573294,
"kl_loss_13": 3262.4,
"kl_loss_26": 2313.8,
"kl_loss_39": 1281.3,
"kl_loss_7": 3834.0,
"learning_rate": 0.00027371189908667604,
"loss": 5336.1,
"step": 6530
},
{
"ce_loss_13": 3.003592276573181,
"ce_loss_26": 2.5556287467479706,
"ce_loss_39": 2.075370451807976,
"ce_loss_52": 1.4352567225694657,
"ce_loss_7": 3.271352219581604,
"epoch": 0.654,
"grad_norm": 19.924679150402795,
"kl_loss_13": 3252.0,
"kl_loss_26": 2323.8,
"kl_loss_39": 1299.3,
"kl_loss_7": 3811.2,
"learning_rate": 0.00027229817263404863,
"loss": 5288.8,
"step": 6540
},
{
"ce_loss_13": 2.995857471227646,
"ce_loss_26": 2.5282726138830185,
"ce_loss_39": 2.0456599622964857,
"ce_loss_52": 1.4150889962911606,
"ce_loss_7": 3.259833812713623,
"epoch": 0.655,
"grad_norm": 19.56917393810092,
"kl_loss_13": 3242.4,
"kl_loss_26": 2291.2,
"kl_loss_39": 1266.0,
"kl_loss_7": 3806.4,
"learning_rate": 0.0002708867391362948,
"loss": 5328.1,
"step": 6550
},
{
"ce_loss_13": 3.004334282875061,
"ce_loss_26": 2.553117799758911,
"ce_loss_39": 2.0656849920749663,
"ce_loss_52": 1.4380108654499053,
"ce_loss_7": 3.275963246822357,
"epoch": 0.656,
"grad_norm": 19.95397617147254,
"kl_loss_13": 3232.8,
"kl_loss_26": 2292.0,
"kl_loss_39": 1262.4,
"kl_loss_7": 3798.0,
"learning_rate": 0.0002694776128065345,
"loss": 5289.15,
"step": 6560
},
{
"ce_loss_13": 3.0339253902435304,
"ce_loss_26": 2.5763088524341584,
"ce_loss_39": 2.0788813173770904,
"ce_loss_52": 1.4449120432138443,
"ce_loss_7": 3.3040917217731476,
"epoch": 0.657,
"grad_norm": 20.212973328584127,
"kl_loss_13": 3268.4,
"kl_loss_26": 2314.6,
"kl_loss_39": 1279.8,
"kl_loss_7": 3840.8,
"learning_rate": 0.00026807080783465374,
"loss": 5293.2,
"step": 6570
},
{
"ce_loss_13": 3.032737511396408,
"ce_loss_26": 2.5693029284477236,
"ce_loss_39": 2.081709760427475,
"ce_loss_52": 1.4346143543720244,
"ce_loss_7": 3.308923304080963,
"epoch": 0.658,
"grad_norm": 19.880750606313658,
"kl_loss_13": 3330.4,
"kl_loss_26": 2365.8,
"kl_loss_39": 1316.5,
"kl_loss_7": 3904.0,
"learning_rate": 0.00026666633838716316,
"loss": 5330.1,
"step": 6580
},
{
"ce_loss_13": 3.0193063259124755,
"ce_loss_26": 2.576409709453583,
"ce_loss_39": 2.0992994725704195,
"ce_loss_52": 1.4646779403090477,
"ce_loss_7": 3.285114985704422,
"epoch": 0.659,
"grad_norm": 20.363457370030773,
"kl_loss_13": 3237.2,
"kl_loss_26": 2313.0,
"kl_loss_39": 1303.8,
"kl_loss_7": 3788.4,
"learning_rate": 0.00026526421860705474,
"loss": 5307.4,
"step": 6590
},
{
"ce_loss_13": 2.9948873639106752,
"ce_loss_26": 2.563684010505676,
"ce_loss_39": 2.0847960352897643,
"ce_loss_52": 1.4657811507582665,
"ce_loss_7": 3.2635042905807494,
"epoch": 0.66,
"grad_norm": 20.73724151705583,
"kl_loss_13": 3176.8,
"kl_loss_26": 2267.2,
"kl_loss_39": 1259.0,
"kl_loss_7": 3732.8,
"learning_rate": 0.0002638644626136587,
"loss": 5326.5,
"step": 6600
},
{
"ce_loss_13": 3.007009822130203,
"ce_loss_26": 2.5652425408363344,
"ce_loss_39": 2.085008403658867,
"ce_loss_52": 1.4418020695447922,
"ce_loss_7": 3.2768814861774445,
"epoch": 0.661,
"grad_norm": 19.59865222649701,
"kl_loss_13": 3192.4,
"kl_loss_26": 2272.6,
"kl_loss_39": 1265.8,
"kl_loss_7": 3758.8,
"learning_rate": 0.00026246708450250255,
"loss": 5252.1,
"step": 6610
},
{
"ce_loss_13": 3.0007415533065798,
"ce_loss_26": 2.57885719537735,
"ce_loss_39": 2.1149094998836517,
"ce_loss_52": 1.4889407217502595,
"ce_loss_7": 3.26429398059845,
"epoch": 0.662,
"grad_norm": 19.791378915341742,
"kl_loss_13": 3154.4,
"kl_loss_26": 2261.8,
"kl_loss_39": 1265.9,
"kl_loss_7": 3702.8,
"learning_rate": 0.00026107209834516854,
"loss": 5253.75,
"step": 6620
},
{
"ce_loss_13": 3.023489362001419,
"ce_loss_26": 2.565022760629654,
"ce_loss_39": 2.0684966832399367,
"ce_loss_52": 1.4181891351938247,
"ce_loss_7": 3.295238083600998,
"epoch": 0.663,
"grad_norm": 18.83951798457028,
"kl_loss_13": 3308.8,
"kl_loss_26": 2356.8,
"kl_loss_39": 1317.4,
"kl_loss_7": 3874.4,
"learning_rate": 0.0002596795181891514,
"loss": 5303.2,
"step": 6630
},
{
"ce_loss_13": 2.948693299293518,
"ce_loss_26": 2.511568069458008,
"ce_loss_39": 2.0302646070718766,
"ce_loss_52": 1.40511611700058,
"ce_loss_7": 3.215245670080185,
"epoch": 0.664,
"grad_norm": 19.938510106571005,
"kl_loss_13": 3190.8,
"kl_loss_26": 2276.8,
"kl_loss_39": 1269.2,
"kl_loss_7": 3747.6,
"learning_rate": 0.000258289358057718,
"loss": 5355.55,
"step": 6640
},
{
"ce_loss_13": 2.964204251766205,
"ce_loss_26": 2.5196115612983703,
"ce_loss_39": 2.032116264104843,
"ce_loss_52": 1.397587490081787,
"ce_loss_7": 3.234144788980484,
"epoch": 0.665,
"grad_norm": 19.688819745922057,
"kl_loss_13": 3241.6,
"kl_loss_26": 2310.8,
"kl_loss_39": 1285.8,
"kl_loss_7": 3812.0,
"learning_rate": 0.0002569016319497657,
"loss": 5275.7,
"step": 6650
},
{
"ce_loss_13": 3.0219891548156737,
"ce_loss_26": 2.568303269147873,
"ce_loss_39": 2.0779166162014007,
"ce_loss_52": 1.4420379608869554,
"ce_loss_7": 3.2859797060489653,
"epoch": 0.666,
"grad_norm": 19.909960186263373,
"kl_loss_13": 3259.2,
"kl_loss_26": 2308.2,
"kl_loss_39": 1277.3,
"kl_loss_7": 3822.8,
"learning_rate": 0.00025551635383968066,
"loss": 5336.5,
"step": 6660
},
{
"ce_loss_13": 2.994647592306137,
"ce_loss_26": 2.5398232668638228,
"ce_loss_39": 2.0492498099803926,
"ce_loss_52": 1.434773786365986,
"ce_loss_7": 3.264130574464798,
"epoch": 0.667,
"grad_norm": 20.004167920061033,
"kl_loss_13": 3214.0,
"kl_loss_26": 2272.2,
"kl_loss_39": 1256.6,
"kl_loss_7": 3778.0,
"learning_rate": 0.00025413353767719804,
"loss": 5257.5,
"step": 6670
},
{
"ce_loss_13": 2.96993693113327,
"ce_loss_26": 2.533370888233185,
"ce_loss_39": 2.055029663443565,
"ce_loss_52": 1.4544675678014756,
"ce_loss_7": 3.2342797338962557,
"epoch": 0.668,
"grad_norm": 20.025849444564383,
"kl_loss_13": 3142.4,
"kl_loss_26": 2230.8,
"kl_loss_39": 1222.0,
"kl_loss_7": 3698.4,
"learning_rate": 0.0002527531973872617,
"loss": 5248.5,
"step": 6680
},
{
"ce_loss_13": 2.9402814984321592,
"ce_loss_26": 2.4974717676639555,
"ce_loss_39": 2.0207353264093397,
"ce_loss_52": 1.4053010821342469,
"ce_loss_7": 3.211796945333481,
"epoch": 0.669,
"grad_norm": 20.57900906104847,
"kl_loss_13": 3184.0,
"kl_loss_26": 2262.6,
"kl_loss_39": 1252.7,
"kl_loss_7": 3753.6,
"learning_rate": 0.0002513753468698826,
"loss": 5296.7,
"step": 6690
},
{
"ce_loss_13": 3.049540191888809,
"ce_loss_26": 2.5854183793067933,
"ce_loss_39": 2.092196524143219,
"ce_loss_52": 1.4520713061094284,
"ce_loss_7": 3.3153574585914614,
"epoch": 0.67,
"grad_norm": 20.064255813843516,
"kl_loss_13": 3277.6,
"kl_loss_26": 2320.0,
"kl_loss_39": 1288.6,
"kl_loss_7": 3837.2,
"learning_rate": 0.0002500000000000001,
"loss": 5320.3,
"step": 6700
},
{
"ce_loss_13": 2.944129317998886,
"ce_loss_26": 2.503596860170364,
"ce_loss_39": 2.0286095440387726,
"ce_loss_52": 1.4282706409692765,
"ce_loss_7": 3.2095106482505797,
"epoch": 0.671,
"grad_norm": 20.12603370941262,
"kl_loss_13": 3178.8,
"kl_loss_26": 2257.6,
"kl_loss_39": 1240.0,
"kl_loss_7": 3734.8,
"learning_rate": 0.0002486271706273421,
"loss": 5232.2,
"step": 6710
},
{
"ce_loss_13": 2.9713922500610352,
"ce_loss_26": 2.5247735172510146,
"ce_loss_39": 2.050862190127373,
"ce_loss_52": 1.449659252166748,
"ce_loss_7": 3.2344084203243257,
"epoch": 0.672,
"grad_norm": 20.72667693004533,
"kl_loss_13": 3149.6,
"kl_loss_26": 2225.2,
"kl_loss_39": 1217.0,
"kl_loss_7": 3703.2,
"learning_rate": 0.0002472568725762853,
"loss": 5273.45,
"step": 6720
},
{
"ce_loss_13": 2.9712363362312315,
"ce_loss_26": 2.5258888751268387,
"ce_loss_39": 2.038512706756592,
"ce_loss_52": 1.4098822742700576,
"ce_loss_7": 3.238377648591995,
"epoch": 0.673,
"grad_norm": 19.364942978516346,
"kl_loss_13": 3241.6,
"kl_loss_26": 2313.0,
"kl_loss_39": 1283.6,
"kl_loss_7": 3804.0,
"learning_rate": 0.00024588911964571554,
"loss": 5264.25,
"step": 6730
},
{
"ce_loss_13": 3.0029661655426025,
"ce_loss_26": 2.5618400514125823,
"ce_loss_39": 2.0779304295778274,
"ce_loss_52": 1.4597969472408294,
"ce_loss_7": 3.2699401795864107,
"epoch": 0.674,
"grad_norm": 19.386861595432553,
"kl_loss_13": 3201.2,
"kl_loss_26": 2283.2,
"kl_loss_39": 1260.7,
"kl_loss_7": 3760.0,
"learning_rate": 0.00024452392560888974,
"loss": 5256.1,
"step": 6740
},
{
"ce_loss_13": 2.9799251735210417,
"ce_loss_26": 2.5340505450963975,
"ce_loss_39": 2.049144822359085,
"ce_loss_52": 1.4137116000056267,
"ce_loss_7": 3.257823657989502,
"epoch": 0.675,
"grad_norm": 19.909504320065746,
"kl_loss_13": 3246.4,
"kl_loss_26": 2320.6,
"kl_loss_39": 1295.2,
"kl_loss_7": 3816.4,
"learning_rate": 0.00024316130421329695,
"loss": 5221.1,
"step": 6750
},
{
"ce_loss_13": 2.9629843533039093,
"ce_loss_26": 2.524860253930092,
"ce_loss_39": 2.042747235298157,
"ce_loss_52": 1.4334406018257142,
"ce_loss_7": 3.2345532715320586,
"epoch": 0.676,
"grad_norm": 20.369497806638545,
"kl_loss_13": 3186.8,
"kl_loss_26": 2271.6,
"kl_loss_39": 1245.2,
"kl_loss_7": 3754.4,
"learning_rate": 0.00024180126918051909,
"loss": 5236.3,
"step": 6760
},
{
"ce_loss_13": 2.9732554376125337,
"ce_loss_26": 2.5324235647916793,
"ce_loss_39": 2.0435447841882706,
"ce_loss_52": 1.4181665301322937,
"ce_loss_7": 3.2470718741416933,
"epoch": 0.677,
"grad_norm": 20.49515152965971,
"kl_loss_13": 3219.2,
"kl_loss_26": 2299.4,
"kl_loss_39": 1278.7,
"kl_loss_7": 3786.8,
"learning_rate": 0.00024044383420609406,
"loss": 5319.65,
"step": 6770
},
{
"ce_loss_13": 2.9884051978588104,
"ce_loss_26": 2.552768051624298,
"ce_loss_39": 2.0788974314928055,
"ce_loss_52": 1.460537651181221,
"ce_loss_7": 3.251654601097107,
"epoch": 0.678,
"grad_norm": 19.11355169384201,
"kl_loss_13": 3167.6,
"kl_loss_26": 2248.8,
"kl_loss_39": 1249.3,
"kl_loss_7": 3718.4,
"learning_rate": 0.00023908901295937712,
"loss": 5270.2,
"step": 6780
},
{
"ce_loss_13": 2.974563705921173,
"ce_loss_26": 2.536205679178238,
"ce_loss_39": 2.059639421105385,
"ce_loss_52": 1.4519853800535203,
"ce_loss_7": 3.2307840466499327,
"epoch": 0.679,
"grad_norm": 19.49943649381752,
"kl_loss_13": 3131.2,
"kl_loss_26": 2224.6,
"kl_loss_39": 1227.3,
"kl_loss_7": 3672.8,
"learning_rate": 0.00023773681908340283,
"loss": 5293.35,
"step": 6790
},
{
"ce_loss_13": 2.961294001340866,
"ce_loss_26": 2.5129422783851623,
"ce_loss_39": 2.0338284403085707,
"ce_loss_52": 1.409618005156517,
"ce_loss_7": 3.226576966047287,
"epoch": 0.68,
"grad_norm": 19.714496760455777,
"kl_loss_13": 3218.4,
"kl_loss_26": 2291.6,
"kl_loss_39": 1267.1,
"kl_loss_7": 3768.8,
"learning_rate": 0.00023638726619474876,
"loss": 5250.5,
"step": 6800
},
{
"ce_loss_13": 3.0715033173561097,
"ce_loss_26": 2.626942425966263,
"ce_loss_39": 2.1481954157352448,
"ce_loss_52": 1.5187551528215408,
"ce_loss_7": 3.336813968420029,
"epoch": 0.681,
"grad_norm": 19.805927066338636,
"kl_loss_13": 3238.0,
"kl_loss_26": 2300.0,
"kl_loss_39": 1278.2,
"kl_loss_7": 3790.0,
"learning_rate": 0.0002350403678833976,
"loss": 5234.9,
"step": 6810
},
{
"ce_loss_13": 2.957927519083023,
"ce_loss_26": 2.5146523237228395,
"ce_loss_39": 2.0373351722955704,
"ce_loss_52": 1.42108353972435,
"ce_loss_7": 3.223799991607666,
"epoch": 0.682,
"grad_norm": 20.479900710283268,
"kl_loss_13": 3202.4,
"kl_loss_26": 2281.6,
"kl_loss_39": 1265.3,
"kl_loss_7": 3751.2,
"learning_rate": 0.00023369613771260007,
"loss": 5258.8,
"step": 6820
},
{
"ce_loss_13": 2.9837976515293123,
"ce_loss_26": 2.5472346246242523,
"ce_loss_39": 2.0789157301187515,
"ce_loss_52": 1.4713156789541244,
"ce_loss_7": 3.2464165806770326,
"epoch": 0.683,
"grad_norm": 19.479700971519588,
"kl_loss_13": 3160.0,
"kl_loss_26": 2251.4,
"kl_loss_39": 1251.2,
"kl_loss_7": 3708.8,
"learning_rate": 0.00023235458921873925,
"loss": 5205.3,
"step": 6830
},
{
"ce_loss_13": 2.9887463808059693,
"ce_loss_26": 2.54727523624897,
"ce_loss_39": 2.0671548724174498,
"ce_loss_52": 1.429240283370018,
"ce_loss_7": 3.2527148902416227,
"epoch": 0.684,
"grad_norm": 19.517242754730322,
"kl_loss_13": 3196.4,
"kl_loss_26": 2285.2,
"kl_loss_39": 1272.4,
"kl_loss_7": 3748.4,
"learning_rate": 0.0002310157359111938,
"loss": 5234.8,
"step": 6840
},
{
"ce_loss_13": 2.916054058074951,
"ce_loss_26": 2.4669763922691343,
"ce_loss_39": 1.992121958732605,
"ce_loss_52": 1.390305233001709,
"ce_loss_7": 3.179300290346146,
"epoch": 0.685,
"grad_norm": 20.2160173629293,
"kl_loss_13": 3168.4,
"kl_loss_26": 2234.6,
"kl_loss_39": 1230.6,
"kl_loss_7": 3719.2,
"learning_rate": 0.0002296795912722014,
"loss": 5214.55,
"step": 6850
},
{
"ce_loss_13": 2.9123338878154756,
"ce_loss_26": 2.4650843650102616,
"ce_loss_39": 1.9903331339359283,
"ce_loss_52": 1.3830609425902367,
"ce_loss_7": 3.174854850769043,
"epoch": 0.686,
"grad_norm": 19.3573154940235,
"kl_loss_13": 3154.0,
"kl_loss_26": 2228.4,
"kl_loss_39": 1226.6,
"kl_loss_7": 3712.0,
"learning_rate": 0.0002283461687567236,
"loss": 5186.2,
"step": 6860
},
{
"ce_loss_13": 2.9503078758716583,
"ce_loss_26": 2.5050206154584886,
"ce_loss_39": 2.0353992134332657,
"ce_loss_52": 1.4260726869106293,
"ce_loss_7": 3.2181301593780516,
"epoch": 0.687,
"grad_norm": 19.436405773017547,
"kl_loss_13": 3173.6,
"kl_loss_26": 2249.4,
"kl_loss_39": 1239.4,
"kl_loss_7": 3725.2,
"learning_rate": 0.00022701548179231045,
"loss": 5180.9,
"step": 6870
},
{
"ce_loss_13": 2.989210718870163,
"ce_loss_26": 2.5456956744194033,
"ce_loss_39": 2.07299542427063,
"ce_loss_52": 1.45269995033741,
"ce_loss_7": 3.25973704457283,
"epoch": 0.688,
"grad_norm": 19.21415326438658,
"kl_loss_13": 3172.0,
"kl_loss_26": 2245.6,
"kl_loss_39": 1252.7,
"kl_loss_7": 3735.6,
"learning_rate": 0.00022568754377896516,
"loss": 5258.6,
"step": 6880
},
{
"ce_loss_13": 2.9914295256137846,
"ce_loss_26": 2.5424347430467606,
"ce_loss_39": 2.0552540928125382,
"ce_loss_52": 1.4221117675304413,
"ce_loss_7": 3.2662317156791687,
"epoch": 0.689,
"grad_norm": 19.29554445155232,
"kl_loss_13": 3243.2,
"kl_loss_26": 2310.8,
"kl_loss_39": 1282.1,
"kl_loss_7": 3819.2,
"learning_rate": 0.00022436236808900844,
"loss": 5241.3,
"step": 6890
},
{
"ce_loss_13": 2.9910283386707306,
"ce_loss_26": 2.550189185142517,
"ce_loss_39": 2.07351476252079,
"ce_loss_52": 1.4624590903520585,
"ce_loss_7": 3.2594147861003875,
"epoch": 0.69,
"grad_norm": 19.896412241424787,
"kl_loss_13": 3197.2,
"kl_loss_26": 2269.0,
"kl_loss_39": 1261.5,
"kl_loss_7": 3754.8,
"learning_rate": 0.00022303996806694487,
"loss": 5245.0,
"step": 6900
},
{
"ce_loss_13": 2.9984730899333956,
"ce_loss_26": 2.563878893852234,
"ce_loss_39": 2.073988217115402,
"ce_loss_52": 1.4545040100812912,
"ce_loss_7": 3.266710376739502,
"epoch": 0.691,
"grad_norm": 18.309749884904267,
"kl_loss_13": 3220.8,
"kl_loss_26": 2301.4,
"kl_loss_39": 1269.6,
"kl_loss_7": 3776.4,
"learning_rate": 0.00022172035702932823,
"loss": 5221.5,
"step": 6910
},
{
"ce_loss_13": 2.9589293122291567,
"ce_loss_26": 2.5164781630039217,
"ce_loss_39": 2.045464962720871,
"ce_loss_52": 1.4442868947982788,
"ce_loss_7": 3.2192385673522947,
"epoch": 0.692,
"grad_norm": 19.310059013922874,
"kl_loss_13": 3138.4,
"kl_loss_26": 2227.4,
"kl_loss_39": 1231.9,
"kl_loss_7": 3680.8,
"learning_rate": 0.00022040354826462666,
"loss": 5190.7,
"step": 6920
},
{
"ce_loss_13": 2.947145390510559,
"ce_loss_26": 2.5085155785083773,
"ce_loss_39": 2.0424805164337156,
"ce_loss_52": 1.443164300918579,
"ce_loss_7": 3.206177592277527,
"epoch": 0.693,
"grad_norm": 20.439949582745886,
"kl_loss_13": 3135.6,
"kl_loss_26": 2219.4,
"kl_loss_39": 1222.2,
"kl_loss_7": 3681.2,
"learning_rate": 0.0002190895550330899,
"loss": 5252.8,
"step": 6930
},
{
"ce_loss_13": 2.951523560285568,
"ce_loss_26": 2.4968682497739794,
"ce_loss_39": 2.021780180931091,
"ce_loss_52": 1.4140155717730523,
"ce_loss_7": 3.2223109781742094,
"epoch": 0.694,
"grad_norm": 19.632700567273133,
"kl_loss_13": 3171.6,
"kl_loss_26": 2224.6,
"kl_loss_39": 1221.6,
"kl_loss_7": 3730.8,
"learning_rate": 0.00021777839056661552,
"loss": 5204.95,
"step": 6940
},
{
"ce_loss_13": 2.9996495246887207,
"ce_loss_26": 2.545914036035538,
"ce_loss_39": 2.0629312634468078,
"ce_loss_52": 1.4569539099931716,
"ce_loss_7": 3.260616344213486,
"epoch": 0.695,
"grad_norm": 19.802798519542876,
"kl_loss_13": 3188.0,
"kl_loss_26": 2252.8,
"kl_loss_39": 1238.7,
"kl_loss_7": 3738.8,
"learning_rate": 0.0002164700680686147,
"loss": 5219.0,
"step": 6950
},
{
"ce_loss_13": 2.965209072828293,
"ce_loss_26": 2.523294594883919,
"ce_loss_39": 2.0499251425266265,
"ce_loss_52": 1.450638398528099,
"ce_loss_7": 3.224820476770401,
"epoch": 0.696,
"grad_norm": 19.91434345309615,
"kl_loss_13": 3134.0,
"kl_loss_26": 2218.0,
"kl_loss_39": 1215.6,
"kl_loss_7": 3669.6,
"learning_rate": 0.0002151646007138806,
"loss": 5247.2,
"step": 6960
},
{
"ce_loss_13": 2.989179176092148,
"ce_loss_26": 2.5318516552448274,
"ce_loss_39": 2.049258217215538,
"ce_loss_52": 1.4338771492242812,
"ce_loss_7": 3.2527658343315125,
"epoch": 0.697,
"grad_norm": 19.22493467335688,
"kl_loss_13": 3224.0,
"kl_loss_26": 2281.6,
"kl_loss_39": 1260.3,
"kl_loss_7": 3768.8,
"learning_rate": 0.00021386200164845526,
"loss": 5208.2,
"step": 6970
},
{
"ce_loss_13": 2.9675691723823547,
"ce_loss_26": 2.524250292778015,
"ce_loss_39": 2.0449122846126557,
"ce_loss_52": 1.4266346216201782,
"ce_loss_7": 3.238176566362381,
"epoch": 0.698,
"grad_norm": 19.28582875590594,
"kl_loss_13": 3208.4,
"kl_loss_26": 2279.6,
"kl_loss_39": 1258.2,
"kl_loss_7": 3762.8,
"learning_rate": 0.0002125622839894964,
"loss": 5196.95,
"step": 6980
},
{
"ce_loss_13": 3.08748916387558,
"ce_loss_26": 2.6341689109802244,
"ce_loss_39": 2.131711891293526,
"ce_loss_52": 1.4756682693958283,
"ce_loss_7": 3.3573212742805483,
"epoch": 0.699,
"grad_norm": 19.570816095455605,
"kl_loss_13": 3319.2,
"kl_loss_26": 2375.0,
"kl_loss_39": 1332.9,
"kl_loss_7": 3878.0,
"learning_rate": 0.00021126546082514663,
"loss": 5264.6,
"step": 6990
},
{
"ce_loss_13": 2.9623800575733186,
"ce_loss_26": 2.528759664297104,
"ce_loss_39": 2.0456511676311493,
"ce_loss_52": 1.4381566911935806,
"ce_loss_7": 3.2283441185951234,
"epoch": 0.7,
"grad_norm": 20.0544191043279,
"kl_loss_13": 3144.4,
"kl_loss_26": 2245.4,
"kl_loss_39": 1237.7,
"kl_loss_7": 3701.6,
"learning_rate": 0.00020997154521440098,
"loss": 5184.75,
"step": 7000
},
{
"ce_loss_13": 2.9258078813552855,
"ce_loss_26": 2.5000339925289152,
"ce_loss_39": 2.030132883787155,
"ce_loss_52": 1.432218487560749,
"ce_loss_7": 3.1859234631061555,
"epoch": 0.701,
"grad_norm": 20.032367176949514,
"kl_loss_13": 3112.4,
"kl_loss_26": 2211.2,
"kl_loss_39": 1212.0,
"kl_loss_7": 3653.6,
"learning_rate": 0.0002086805501869749,
"loss": 5163.7,
"step": 7010
},
{
"ce_loss_13": 2.9877611219882967,
"ce_loss_26": 2.5467711210250856,
"ce_loss_39": 2.0729553580284117,
"ce_loss_52": 1.4694935828447342,
"ce_loss_7": 3.2482242822647094,
"epoch": 0.702,
"grad_norm": 19.48021495423088,
"kl_loss_13": 3139.2,
"kl_loss_26": 2221.2,
"kl_loss_39": 1226.6,
"kl_loss_7": 3688.4,
"learning_rate": 0.0002073924887431744,
"loss": 5172.1,
"step": 7020
},
{
"ce_loss_13": 2.908235615491867,
"ce_loss_26": 2.477229207754135,
"ce_loss_39": 2.0130053520202638,
"ce_loss_52": 1.4191944628953934,
"ce_loss_7": 3.1711674451828005,
"epoch": 0.703,
"grad_norm": 19.67889818109448,
"kl_loss_13": 3069.6,
"kl_loss_26": 2175.0,
"kl_loss_39": 1200.3,
"kl_loss_7": 3607.2,
"learning_rate": 0.00020610737385376348,
"loss": 5178.0,
"step": 7030
},
{
"ce_loss_13": 2.925868648290634,
"ce_loss_26": 2.4821185052394865,
"ce_loss_39": 2.013263535499573,
"ce_loss_52": 1.407904815673828,
"ce_loss_7": 3.184937173128128,
"epoch": 0.704,
"grad_norm": 19.315710978547724,
"kl_loss_13": 3152.4,
"kl_loss_26": 2238.2,
"kl_loss_39": 1233.7,
"kl_loss_7": 3694.8,
"learning_rate": 0.00020482521845983521,
"loss": 5182.5,
"step": 7040
},
{
"ce_loss_13": 2.978561645746231,
"ce_loss_26": 2.537975686788559,
"ce_loss_39": 2.068689134716988,
"ce_loss_52": 1.449235063791275,
"ce_loss_7": 3.239302319288254,
"epoch": 0.705,
"grad_norm": 20.022921411442997,
"kl_loss_13": 3163.2,
"kl_loss_26": 2247.8,
"kl_loss_39": 1249.9,
"kl_loss_7": 3709.6,
"learning_rate": 0.00020354603547267987,
"loss": 5191.65,
"step": 7050
},
{
"ce_loss_13": 2.926995551586151,
"ce_loss_26": 2.4728329688310624,
"ce_loss_39": 2.000428321957588,
"ce_loss_52": 1.4053510591387748,
"ce_loss_7": 3.1873682618141173,
"epoch": 0.706,
"grad_norm": 20.203929178538456,
"kl_loss_13": 3170.4,
"kl_loss_26": 2222.2,
"kl_loss_39": 1210.8,
"kl_loss_7": 3714.0,
"learning_rate": 0.00020226983777365604,
"loss": 5154.3,
"step": 7060
},
{
"ce_loss_13": 2.9693270325660706,
"ce_loss_26": 2.520361030101776,
"ce_loss_39": 2.040296342968941,
"ce_loss_52": 1.4353806316852569,
"ce_loss_7": 3.2412941575050356,
"epoch": 0.707,
"grad_norm": 19.721352799273095,
"kl_loss_13": 3196.4,
"kl_loss_26": 2259.8,
"kl_loss_39": 1250.8,
"kl_loss_7": 3758.8,
"learning_rate": 0.00020099663821406056,
"loss": 5217.7,
"step": 7070
},
{
"ce_loss_13": 2.9820376515388487,
"ce_loss_26": 2.5476091861724854,
"ce_loss_39": 2.072761395573616,
"ce_loss_52": 1.4543047964572906,
"ce_loss_7": 3.2506080687046053,
"epoch": 0.708,
"grad_norm": 20.324232804485565,
"kl_loss_13": 3159.6,
"kl_loss_26": 2249.8,
"kl_loss_39": 1247.5,
"kl_loss_7": 3714.4,
"learning_rate": 0.00019972644961499853,
"loss": 5197.1,
"step": 7080
},
{
"ce_loss_13": 2.9332118809223173,
"ce_loss_26": 2.488256406784058,
"ce_loss_39": 2.0110603511333465,
"ce_loss_52": 1.4073437690734862,
"ce_loss_7": 3.202910542488098,
"epoch": 0.709,
"grad_norm": 19.810367107777207,
"kl_loss_13": 3178.8,
"kl_loss_26": 2255.4,
"kl_loss_39": 1244.9,
"kl_loss_7": 3745.2,
"learning_rate": 0.00019845928476725522,
"loss": 5159.15,
"step": 7090
},
{
"ce_loss_13": 2.963654935359955,
"ce_loss_26": 2.530976951122284,
"ce_loss_39": 2.0635117918252943,
"ce_loss_52": 1.466596108675003,
"ce_loss_7": 3.219101697206497,
"epoch": 0.71,
"grad_norm": 20.1039471333501,
"kl_loss_13": 3130.4,
"kl_loss_26": 2221.8,
"kl_loss_39": 1222.3,
"kl_loss_7": 3668.4,
"learning_rate": 0.00019719515643116677,
"loss": 5138.7,
"step": 7100
},
{
"ce_loss_13": 2.9432359755039217,
"ce_loss_26": 2.497118225693703,
"ce_loss_39": 2.0123680919408797,
"ce_loss_52": 1.3938605546951295,
"ce_loss_7": 3.214134621620178,
"epoch": 0.711,
"grad_norm": 20.875475364068677,
"kl_loss_13": 3169.6,
"kl_loss_26": 2236.6,
"kl_loss_39": 1232.8,
"kl_loss_7": 3728.8,
"learning_rate": 0.0001959340773364911,
"loss": 5177.25,
"step": 7110
},
{
"ce_loss_13": 2.937902510166168,
"ce_loss_26": 2.5005611896514894,
"ce_loss_39": 2.025138959288597,
"ce_loss_52": 1.4121045261621474,
"ce_loss_7": 3.21354022026062,
"epoch": 0.712,
"grad_norm": 19.300987998871754,
"kl_loss_13": 3168.4,
"kl_loss_26": 2254.8,
"kl_loss_39": 1247.8,
"kl_loss_7": 3737.2,
"learning_rate": 0.0001946760601822809,
"loss": 5183.35,
"step": 7120
},
{
"ce_loss_13": 2.9532769322395325,
"ce_loss_26": 2.5094838380813598,
"ce_loss_39": 2.042011481523514,
"ce_loss_52": 1.4359831362962723,
"ce_loss_7": 3.2172477781772613,
"epoch": 0.713,
"grad_norm": 19.806306622567096,
"kl_loss_13": 3130.8,
"kl_loss_26": 2213.4,
"kl_loss_39": 1210.7,
"kl_loss_7": 3690.4,
"learning_rate": 0.00019342111763675512,
"loss": 5121.55,
"step": 7130
},
{
"ce_loss_13": 2.9316843450069427,
"ce_loss_26": 2.4771865159273148,
"ce_loss_39": 1.9945536375045776,
"ce_loss_52": 1.3953458324074746,
"ce_loss_7": 3.1933025121688843,
"epoch": 0.714,
"grad_norm": 19.89107625803987,
"kl_loss_13": 3143.2,
"kl_loss_26": 2214.4,
"kl_loss_39": 1207.5,
"kl_loss_7": 3687.6,
"learning_rate": 0.00019216926233717085,
"loss": 5175.1,
"step": 7140
},
{
"ce_loss_13": 2.9416719019412993,
"ce_loss_26": 2.4982976377010346,
"ce_loss_39": 2.018396332859993,
"ce_loss_52": 1.4147447228431702,
"ce_loss_7": 3.206580412387848,
"epoch": 0.715,
"grad_norm": 19.473553749134638,
"kl_loss_13": 3170.0,
"kl_loss_26": 2250.0,
"kl_loss_39": 1236.5,
"kl_loss_7": 3716.0,
"learning_rate": 0.00019092050688969737,
"loss": 5168.85,
"step": 7150
},
{
"ce_loss_13": 2.93907487988472,
"ce_loss_26": 2.5087509632110594,
"ce_loss_39": 2.036549669504166,
"ce_loss_52": 1.4339062184095384,
"ce_loss_7": 3.2007872402668,
"epoch": 0.716,
"grad_norm": 18.582597661219776,
"kl_loss_13": 3097.2,
"kl_loss_26": 2211.2,
"kl_loss_39": 1215.8,
"kl_loss_7": 3638.4,
"learning_rate": 0.00018967486386928817,
"loss": 5158.35,
"step": 7160
},
{
"ce_loss_13": 2.945317584276199,
"ce_loss_26": 2.4993871986865996,
"ce_loss_39": 2.0176479905843734,
"ce_loss_52": 1.4336241394281388,
"ce_loss_7": 3.20940922498703,
"epoch": 0.717,
"grad_norm": 20.943645323187056,
"kl_loss_13": 3133.6,
"kl_loss_26": 2211.4,
"kl_loss_39": 1198.8,
"kl_loss_7": 3684.0,
"learning_rate": 0.00018843234581955443,
"loss": 5165.1,
"step": 7170
},
{
"ce_loss_13": 2.9533539593219755,
"ce_loss_26": 2.5154259234666823,
"ce_loss_39": 2.0371447414159776,
"ce_loss_52": 1.4300806164741515,
"ce_loss_7": 3.2220456659793855,
"epoch": 0.718,
"grad_norm": 20.352991453837664,
"kl_loss_13": 3138.0,
"kl_loss_26": 2233.8,
"kl_loss_39": 1229.9,
"kl_loss_7": 3696.4,
"learning_rate": 0.00018719296525263924,
"loss": 5165.1,
"step": 7180
},
{
"ce_loss_13": 2.8928813517093657,
"ce_loss_26": 2.4584825813770292,
"ce_loss_39": 1.9816097348928452,
"ce_loss_52": 1.4100207000970841,
"ce_loss_7": 3.1512379109859467,
"epoch": 0.719,
"grad_norm": 19.73288927838942,
"kl_loss_13": 3091.2,
"kl_loss_26": 2168.2,
"kl_loss_39": 1162.9,
"kl_loss_7": 3634.8,
"learning_rate": 0.0001859567346490913,
"loss": 5125.45,
"step": 7190
},
{
"ce_loss_13": 3.013565558195114,
"ce_loss_26": 2.5717740774154665,
"ce_loss_39": 2.0986361503601074,
"ce_loss_52": 1.4729616045951843,
"ce_loss_7": 3.270446163415909,
"epoch": 0.72,
"grad_norm": 19.301343533343292,
"kl_loss_13": 3201.2,
"kl_loss_26": 2278.0,
"kl_loss_39": 1269.9,
"kl_loss_7": 3744.0,
"learning_rate": 0.0001847236664577389,
"loss": 5151.05,
"step": 7200
},
{
"ce_loss_13": 2.8901243984699247,
"ce_loss_26": 2.447618916630745,
"ce_loss_39": 1.9805465787649155,
"ce_loss_52": 1.393562839925289,
"ce_loss_7": 3.152049034833908,
"epoch": 0.721,
"grad_norm": 19.80481816325832,
"kl_loss_13": 3108.8,
"kl_loss_26": 2184.6,
"kl_loss_39": 1197.1,
"kl_loss_7": 3651.2,
"learning_rate": 0.00018349377309556487,
"loss": 5147.6,
"step": 7210
},
{
"ce_loss_13": 2.935315173864365,
"ce_loss_26": 2.4844827204942703,
"ce_loss_39": 2.014389392733574,
"ce_loss_52": 1.4218608409166336,
"ce_loss_7": 3.202117031812668,
"epoch": 0.722,
"grad_norm": 21.29811195159757,
"kl_loss_13": 3145.6,
"kl_loss_26": 2210.6,
"kl_loss_39": 1204.5,
"kl_loss_7": 3703.6,
"learning_rate": 0.00018226706694758193,
"loss": 5128.1,
"step": 7220
},
{
"ce_loss_13": 2.9786236941814423,
"ce_loss_26": 2.536505568027496,
"ce_loss_39": 2.0584143906831742,
"ce_loss_52": 1.4622384160757065,
"ce_loss_7": 3.243917632102966,
"epoch": 0.723,
"grad_norm": 19.6163049246679,
"kl_loss_13": 3157.6,
"kl_loss_26": 2230.8,
"kl_loss_39": 1218.2,
"kl_loss_7": 3710.4,
"learning_rate": 0.0001810435603667075,
"loss": 5135.45,
"step": 7230
},
{
"ce_loss_13": 2.9429832458496095,
"ce_loss_26": 2.4910512387752535,
"ce_loss_39": 2.0214726239442826,
"ce_loss_52": 1.4324376732110977,
"ce_loss_7": 3.201977092027664,
"epoch": 0.724,
"grad_norm": 19.933959617154258,
"kl_loss_13": 3130.8,
"kl_loss_26": 2197.0,
"kl_loss_39": 1197.6,
"kl_loss_7": 3676.0,
"learning_rate": 0.0001798232656736389,
"loss": 5101.6,
"step": 7240
},
{
"ce_loss_13": 3.017507255077362,
"ce_loss_26": 2.5613476634025574,
"ce_loss_39": 2.065951904654503,
"ce_loss_52": 1.4529381558299064,
"ce_loss_7": 3.284585565328598,
"epoch": 0.725,
"grad_norm": 19.69163026795737,
"kl_loss_13": 3244.0,
"kl_loss_26": 2294.8,
"kl_loss_39": 1252.0,
"kl_loss_7": 3804.4,
"learning_rate": 0.0001786061951567303,
"loss": 5145.8,
"step": 7250
},
{
"ce_loss_13": 2.8927790343761446,
"ce_loss_26": 2.4552118331193924,
"ce_loss_39": 1.9874976933002473,
"ce_loss_52": 1.3983306601643561,
"ce_loss_7": 3.160378706455231,
"epoch": 0.726,
"grad_norm": 19.959972483591027,
"kl_loss_13": 3097.2,
"kl_loss_26": 2189.6,
"kl_loss_39": 1199.8,
"kl_loss_7": 3653.2,
"learning_rate": 0.00017739236107186857,
"loss": 5152.65,
"step": 7260
},
{
"ce_loss_13": 2.938109403848648,
"ce_loss_26": 2.4923312455415725,
"ce_loss_39": 2.0182687640190125,
"ce_loss_52": 1.4262833833694457,
"ce_loss_7": 3.2080613017082213,
"epoch": 0.727,
"grad_norm": 19.42416009314478,
"kl_loss_13": 3165.6,
"kl_loss_26": 2229.8,
"kl_loss_39": 1226.4,
"kl_loss_7": 3724.0,
"learning_rate": 0.00017618177564234904,
"loss": 5132.0,
"step": 7270
},
{
"ce_loss_13": 2.932406869530678,
"ce_loss_26": 2.481098806858063,
"ce_loss_39": 1.9998224407434464,
"ce_loss_52": 1.4021466106176377,
"ce_loss_7": 3.192963147163391,
"epoch": 0.728,
"grad_norm": 19.542374311814292,
"kl_loss_13": 3155.2,
"kl_loss_26": 2223.4,
"kl_loss_39": 1216.9,
"kl_loss_7": 3704.0,
"learning_rate": 0.00017497445105875377,
"loss": 5186.8,
"step": 7280
},
{
"ce_loss_13": 2.9192141771316527,
"ce_loss_26": 2.490318274497986,
"ce_loss_39": 2.027893853187561,
"ce_loss_52": 1.43733262270689,
"ce_loss_7": 3.177316850423813,
"epoch": 0.729,
"grad_norm": 20.20715160517786,
"kl_loss_13": 3084.0,
"kl_loss_26": 2177.6,
"kl_loss_39": 1196.3,
"kl_loss_7": 3620.0,
"learning_rate": 0.000173770399478828,
"loss": 5076.85,
"step": 7290
},
{
"ce_loss_13": 2.9109710931777952,
"ce_loss_26": 2.4745964229106905,
"ce_loss_39": 2.007509797811508,
"ce_loss_52": 1.4228856399655343,
"ce_loss_7": 3.179339534044266,
"epoch": 0.73,
"grad_norm": 19.31391716230683,
"kl_loss_13": 3123.2,
"kl_loss_26": 2212.0,
"kl_loss_39": 1198.3,
"kl_loss_7": 3681.6,
"learning_rate": 0.0001725696330273575,
"loss": 5123.9,
"step": 7300
},
{
"ce_loss_13": 2.9543818056583406,
"ce_loss_26": 2.5187111288309096,
"ce_loss_39": 2.046798062324524,
"ce_loss_52": 1.4316737815737723,
"ce_loss_7": 3.2196085810661317,
"epoch": 0.731,
"grad_norm": 19.333490210710867,
"kl_loss_13": 3136.4,
"kl_loss_26": 2223.6,
"kl_loss_39": 1228.2,
"kl_loss_7": 3683.6,
"learning_rate": 0.00017137216379604724,
"loss": 5093.05,
"step": 7310
},
{
"ce_loss_13": 2.991909348964691,
"ce_loss_26": 2.533888804912567,
"ce_loss_39": 2.048043805360794,
"ce_loss_52": 1.426993179321289,
"ce_loss_7": 3.2596666753292083,
"epoch": 0.732,
"grad_norm": 18.669638559883268,
"kl_loss_13": 3225.6,
"kl_loss_26": 2287.6,
"kl_loss_39": 1264.8,
"kl_loss_7": 3784.0,
"learning_rate": 0.00017017800384339925,
"loss": 5127.2,
"step": 7320
},
{
"ce_loss_13": 2.919608438014984,
"ce_loss_26": 2.475746387243271,
"ce_loss_39": 2.0119441866874697,
"ce_loss_52": 1.416624790430069,
"ce_loss_7": 3.1857059836387633,
"epoch": 0.733,
"grad_norm": 19.375624695020313,
"kl_loss_13": 3109.2,
"kl_loss_26": 2190.6,
"kl_loss_39": 1207.2,
"kl_loss_7": 3667.2,
"learning_rate": 0.00016898716519459073,
"loss": 5204.9,
"step": 7330
},
{
"ce_loss_13": 2.978672456741333,
"ce_loss_26": 2.5233275532722472,
"ce_loss_39": 2.0352649986743927,
"ce_loss_52": 1.396337878704071,
"ce_loss_7": 3.2537453293800356,
"epoch": 0.734,
"grad_norm": 19.487833479563225,
"kl_loss_13": 3269.6,
"kl_loss_26": 2319.4,
"kl_loss_39": 1294.6,
"kl_loss_7": 3847.2,
"learning_rate": 0.00016779965984135375,
"loss": 5141.65,
"step": 7340
},
{
"ce_loss_13": 2.946110498905182,
"ce_loss_26": 2.507070618867874,
"ce_loss_39": 2.0425552487373353,
"ce_loss_52": 1.4492309480905532,
"ce_loss_7": 3.2022292137146,
"epoch": 0.735,
"grad_norm": 19.428920750438415,
"kl_loss_13": 3113.2,
"kl_loss_26": 2205.0,
"kl_loss_39": 1210.6,
"kl_loss_7": 3652.4,
"learning_rate": 0.00016661549974185424,
"loss": 5094.6,
"step": 7350
},
{
"ce_loss_13": 2.9823583602905273,
"ce_loss_26": 2.539780905842781,
"ce_loss_39": 2.0619786471128463,
"ce_loss_52": 1.45176909416914,
"ce_loss_7": 3.249054718017578,
"epoch": 0.736,
"grad_norm": 19.839368194621894,
"kl_loss_13": 3204.0,
"kl_loss_26": 2280.6,
"kl_loss_39": 1256.4,
"kl_loss_7": 3753.6,
"learning_rate": 0.00016543469682057105,
"loss": 5196.95,
"step": 7360
},
{
"ce_loss_13": 2.958816784620285,
"ce_loss_26": 2.526939642429352,
"ce_loss_39": 2.0634298622608185,
"ce_loss_52": 1.4754424065351486,
"ce_loss_7": 3.223200261592865,
"epoch": 0.737,
"grad_norm": 19.922895259671222,
"kl_loss_13": 3096.0,
"kl_loss_26": 2185.8,
"kl_loss_39": 1192.1,
"kl_loss_7": 3641.6,
"learning_rate": 0.00016425726296817632,
"loss": 5155.3,
"step": 7370
},
{
"ce_loss_13": 2.9623453855514525,
"ce_loss_26": 2.5124567419290544,
"ce_loss_39": 2.032202622294426,
"ce_loss_52": 1.4377738699316978,
"ce_loss_7": 3.22493896484375,
"epoch": 0.738,
"grad_norm": 19.913040297451975,
"kl_loss_13": 3150.8,
"kl_loss_26": 2213.6,
"kl_loss_39": 1202.5,
"kl_loss_7": 3703.2,
"learning_rate": 0.00016308321004141607,
"loss": 5152.6,
"step": 7380
},
{
"ce_loss_13": 2.9184991478919984,
"ce_loss_26": 2.4769508123397825,
"ce_loss_39": 2.0019295692443846,
"ce_loss_52": 1.423092892765999,
"ce_loss_7": 3.185350716114044,
"epoch": 0.739,
"grad_norm": 19.322834578437774,
"kl_loss_13": 3095.6,
"kl_loss_26": 2174.4,
"kl_loss_39": 1176.6,
"kl_loss_7": 3652.0,
"learning_rate": 0.00016191254986299043,
"loss": 5134.25,
"step": 7390
},
{
"ce_loss_13": 2.8498477935791016,
"ce_loss_26": 2.4134464621543885,
"ce_loss_39": 1.952101919054985,
"ce_loss_52": 1.3852853626012802,
"ce_loss_7": 3.117431342601776,
"epoch": 0.74,
"grad_norm": 20.243774871674358,
"kl_loss_13": 3055.2,
"kl_loss_26": 2147.0,
"kl_loss_39": 1163.3,
"kl_loss_7": 3606.8,
"learning_rate": 0.00016074529422143398,
"loss": 5086.95,
"step": 7400
},
{
"ce_loss_13": 2.999220699071884,
"ce_loss_26": 2.550427186489105,
"ce_loss_39": 2.0656515032052996,
"ce_loss_52": 1.4574634283781052,
"ce_loss_7": 3.2632993936538695,
"epoch": 0.741,
"grad_norm": 20.693157593916027,
"kl_loss_13": 3181.6,
"kl_loss_26": 2251.2,
"kl_loss_39": 1241.3,
"kl_loss_7": 3731.6,
"learning_rate": 0.0001595814548709983,
"loss": 5127.4,
"step": 7410
},
{
"ce_loss_13": 2.9262797057628633,
"ce_loss_26": 2.4955521285533906,
"ce_loss_39": 2.043664366006851,
"ce_loss_52": 1.4607123613357544,
"ce_loss_7": 3.178546887636185,
"epoch": 0.742,
"grad_norm": 19.251560643481486,
"kl_loss_13": 3060.8,
"kl_loss_26": 2159.0,
"kl_loss_39": 1190.2,
"kl_loss_7": 3588.8,
"learning_rate": 0.00015842104353153285,
"loss": 5092.2,
"step": 7420
},
{
"ce_loss_13": 3.0160707533359528,
"ce_loss_26": 2.5663387060165403,
"ce_loss_39": 2.0802172899246214,
"ce_loss_52": 1.4715656280517577,
"ce_loss_7": 3.2800197422504427,
"epoch": 0.743,
"grad_norm": 19.587217025482605,
"kl_loss_13": 3190.4,
"kl_loss_26": 2255.8,
"kl_loss_39": 1233.7,
"kl_loss_7": 3735.6,
"learning_rate": 0.0001572640718883667,
"loss": 5115.1,
"step": 7430
},
{
"ce_loss_13": 2.9469042241573336,
"ce_loss_26": 2.504935991764069,
"ce_loss_39": 2.0331138372421265,
"ce_loss_52": 1.4291576787829399,
"ce_loss_7": 3.2098668992519377,
"epoch": 0.744,
"grad_norm": 18.92686745034597,
"kl_loss_13": 3124.8,
"kl_loss_26": 2209.8,
"kl_loss_39": 1212.4,
"kl_loss_7": 3675.2,
"learning_rate": 0.0001561105515921915,
"loss": 5076.55,
"step": 7440
},
{
"ce_loss_13": 2.924536573886871,
"ce_loss_26": 2.487199380993843,
"ce_loss_39": 2.0280190229415895,
"ce_loss_52": 1.4308366000652313,
"ce_loss_7": 3.184017467498779,
"epoch": 0.745,
"grad_norm": 20.464433868383605,
"kl_loss_13": 3079.6,
"kl_loss_26": 2173.0,
"kl_loss_39": 1203.3,
"kl_loss_7": 3634.0,
"learning_rate": 0.0001549604942589441,
"loss": 5072.9,
"step": 7450
},
{
"ce_loss_13": 2.929040068387985,
"ce_loss_26": 2.4782379269599915,
"ce_loss_39": 1.997541171312332,
"ce_loss_52": 1.4025927037000656,
"ce_loss_7": 3.1986856281757357,
"epoch": 0.746,
"grad_norm": 19.749937853484084,
"kl_loss_13": 3136.4,
"kl_loss_26": 2195.8,
"kl_loss_39": 1197.5,
"kl_loss_7": 3701.6,
"learning_rate": 0.00015381391146968864,
"loss": 5119.05,
"step": 7460
},
{
"ce_loss_13": 2.9494260370731356,
"ce_loss_26": 2.5060392141342165,
"ce_loss_39": 2.0337665289640428,
"ce_loss_52": 1.437817743420601,
"ce_loss_7": 3.2058426082134246,
"epoch": 0.747,
"grad_norm": 20.32794285148468,
"kl_loss_13": 3134.4,
"kl_loss_26": 2216.0,
"kl_loss_39": 1213.4,
"kl_loss_7": 3678.0,
"learning_rate": 0.00015267081477050133,
"loss": 5102.65,
"step": 7470
},
{
"ce_loss_13": 2.921141803264618,
"ce_loss_26": 2.4805154383182524,
"ce_loss_39": 2.0182774633169176,
"ce_loss_52": 1.4364572942256928,
"ce_loss_7": 3.1770897448062896,
"epoch": 0.748,
"grad_norm": 19.06438842103908,
"kl_loss_13": 3083.4,
"kl_loss_26": 2171.6,
"kl_loss_39": 1187.2,
"kl_loss_7": 3614.4,
"learning_rate": 0.00015153121567235335,
"loss": 5127.55,
"step": 7480
},
{
"ce_loss_13": 2.913339024782181,
"ce_loss_26": 2.4667980909347533,
"ce_loss_39": 1.9971046984195708,
"ce_loss_52": 1.4167528375983238,
"ce_loss_7": 3.1835066616535186,
"epoch": 0.749,
"grad_norm": 19.868010576940023,
"kl_loss_13": 3104.0,
"kl_loss_26": 2178.6,
"kl_loss_39": 1188.7,
"kl_loss_7": 3662.4,
"learning_rate": 0.00015039512565099468,
"loss": 5094.65,
"step": 7490
},
{
"ce_loss_13": 2.914568355679512,
"ce_loss_26": 2.469626322388649,
"ce_loss_39": 2.0020422458648683,
"ce_loss_52": 1.4171572998166084,
"ce_loss_7": 3.1783276200294495,
"epoch": 0.75,
"grad_norm": 19.266335144116216,
"kl_loss_13": 3099.6,
"kl_loss_26": 2178.4,
"kl_loss_39": 1189.1,
"kl_loss_7": 3658.8,
"learning_rate": 0.00014926255614683932,
"loss": 5132.95,
"step": 7500
},
{
"ce_loss_13": 2.9255091905593873,
"ce_loss_26": 2.4897490620613096,
"ce_loss_39": 2.020438665151596,
"ce_loss_52": 1.43346728682518,
"ce_loss_7": 3.188784825801849,
"epoch": 0.751,
"grad_norm": 18.946748872518604,
"kl_loss_13": 3108.8,
"kl_loss_26": 2190.0,
"kl_loss_39": 1197.3,
"kl_loss_7": 3652.8,
"learning_rate": 0.0001481335185648498,
"loss": 5140.95,
"step": 7510
},
{
"ce_loss_13": 2.9898211777210237,
"ce_loss_26": 2.541801372170448,
"ce_loss_39": 2.068241673707962,
"ce_loss_52": 1.474491646885872,
"ce_loss_7": 3.2494523525238037,
"epoch": 0.752,
"grad_norm": 19.297834732191596,
"kl_loss_13": 3108.0,
"kl_loss_26": 2187.0,
"kl_loss_39": 1193.8,
"kl_loss_7": 3655.2,
"learning_rate": 0.0001470080242744218,
"loss": 5080.45,
"step": 7520
},
{
"ce_loss_13": 2.989736980199814,
"ce_loss_26": 2.5422766327857973,
"ce_loss_39": 2.0724924355745316,
"ce_loss_52": 1.4764755725860597,
"ce_loss_7": 3.2583333015441895,
"epoch": 0.753,
"grad_norm": 19.759302349149518,
"kl_loss_13": 3127.2,
"kl_loss_26": 2207.8,
"kl_loss_39": 1205.1,
"kl_loss_7": 3686.4,
"learning_rate": 0.0001458860846092705,
"loss": 5089.25,
"step": 7530
},
{
"ce_loss_13": 2.9518058955669404,
"ce_loss_26": 2.5123462677001953,
"ce_loss_39": 2.033254536986351,
"ce_loss_52": 1.430069674551487,
"ce_loss_7": 3.216610902547836,
"epoch": 0.754,
"grad_norm": 19.101743494796594,
"kl_loss_13": 3130.4,
"kl_loss_26": 2214.4,
"kl_loss_39": 1206.1,
"kl_loss_7": 3688.8,
"learning_rate": 0.00014476771086731566,
"loss": 5132.95,
"step": 7540
},
{
"ce_loss_13": 2.9499751746654512,
"ce_loss_26": 2.5148087441921234,
"ce_loss_39": 2.0469634413719175,
"ce_loss_52": 1.4645337551832198,
"ce_loss_7": 3.2114050924777984,
"epoch": 0.755,
"grad_norm": 18.95244057854832,
"kl_loss_13": 3084.4,
"kl_loss_26": 2166.2,
"kl_loss_39": 1178.4,
"kl_loss_7": 3625.2,
"learning_rate": 0.00014365291431056872,
"loss": 5111.9,
"step": 7550
},
{
"ce_loss_13": 2.915192812681198,
"ce_loss_26": 2.470223453640938,
"ce_loss_39": 1.9942311495542526,
"ce_loss_52": 1.4130516573786736,
"ce_loss_7": 3.186279386281967,
"epoch": 0.756,
"grad_norm": 19.87294750540145,
"kl_loss_13": 3138.4,
"kl_loss_26": 2214.2,
"kl_loss_39": 1201.4,
"kl_loss_7": 3700.0,
"learning_rate": 0.00014254170616501827,
"loss": 5096.4,
"step": 7560
},
{
"ce_loss_13": 2.954612511396408,
"ce_loss_26": 2.5071854114532472,
"ce_loss_39": 2.035946971178055,
"ce_loss_52": 1.439925280213356,
"ce_loss_7": 3.2222863495349885,
"epoch": 0.757,
"grad_norm": 20.90881340934633,
"kl_loss_13": 3122.0,
"kl_loss_26": 2194.2,
"kl_loss_39": 1192.5,
"kl_loss_7": 3677.6,
"learning_rate": 0.0001414340976205183,
"loss": 5060.45,
"step": 7570
},
{
"ce_loss_13": 2.9101392149925234,
"ce_loss_26": 2.471102824807167,
"ce_loss_39": 2.0064873933792113,
"ce_loss_52": 1.4280226349830627,
"ce_loss_7": 3.1714185059070585,
"epoch": 0.758,
"grad_norm": 19.49522818126467,
"kl_loss_13": 3076.0,
"kl_loss_26": 2159.8,
"kl_loss_39": 1176.2,
"kl_loss_7": 3620.4,
"learning_rate": 0.00014033009983067452,
"loss": 5108.35,
"step": 7580
},
{
"ce_loss_13": 2.966978985071182,
"ce_loss_26": 2.5087331235408783,
"ce_loss_39": 2.0336913764476776,
"ce_loss_52": 1.4274784743785858,
"ce_loss_7": 3.2379914104938505,
"epoch": 0.759,
"grad_norm": 18.693022063405696,
"kl_loss_13": 3203.2,
"kl_loss_26": 2255.0,
"kl_loss_39": 1243.9,
"kl_loss_7": 3762.0,
"learning_rate": 0.00013922972391273224,
"loss": 5094.65,
"step": 7590
},
{
"ce_loss_13": 2.9872674524784086,
"ce_loss_26": 2.5407335460186005,
"ce_loss_39": 2.061782196164131,
"ce_loss_52": 1.4462621062994003,
"ce_loss_7": 3.253601038455963,
"epoch": 0.76,
"grad_norm": 19.707863398571995,
"kl_loss_13": 3172.0,
"kl_loss_26": 2252.8,
"kl_loss_39": 1251.4,
"kl_loss_7": 3726.4,
"learning_rate": 0.0001381329809474649,
"loss": 5098.7,
"step": 7600
},
{
"ce_loss_13": 2.8771551668643953,
"ce_loss_26": 2.4450180411338804,
"ce_loss_39": 1.9818484753370285,
"ce_loss_52": 1.4145073384046554,
"ce_loss_7": 3.1438543021678926,
"epoch": 0.761,
"grad_norm": 18.46540807586579,
"kl_loss_13": 3029.2,
"kl_loss_26": 2126.8,
"kl_loss_39": 1154.6,
"kl_loss_7": 3583.2,
"learning_rate": 0.0001370398819790621,
"loss": 5084.25,
"step": 7610
},
{
"ce_loss_13": 2.960028713941574,
"ce_loss_26": 2.525826930999756,
"ce_loss_39": 2.0549875289201736,
"ce_loss_52": 1.4615961879491806,
"ce_loss_7": 3.2287797749042513,
"epoch": 0.762,
"grad_norm": 19.79273649753162,
"kl_loss_13": 3108.0,
"kl_loss_26": 2202.8,
"kl_loss_39": 1209.5,
"kl_loss_7": 3652.8,
"learning_rate": 0.00013595043801501794,
"loss": 5052.75,
"step": 7620
},
{
"ce_loss_13": 2.9331182718276976,
"ce_loss_26": 2.4871296346187592,
"ce_loss_39": 2.019395884871483,
"ce_loss_52": 1.4329772531986236,
"ce_loss_7": 3.19427090883255,
"epoch": 0.763,
"grad_norm": 20.265530214115834,
"kl_loss_13": 3124.0,
"kl_loss_26": 2195.4,
"kl_loss_39": 1199.7,
"kl_loss_7": 3669.6,
"learning_rate": 0.00013486466002602133,
"loss": 5092.15,
"step": 7630
},
{
"ce_loss_13": 2.86139075756073,
"ce_loss_26": 2.4176136374473574,
"ce_loss_39": 1.941940224170685,
"ce_loss_52": 1.3821519583463668,
"ce_loss_7": 3.1196699738502502,
"epoch": 0.764,
"grad_norm": 19.727213627271716,
"kl_loss_13": 3072.8,
"kl_loss_26": 2145.6,
"kl_loss_39": 1144.7,
"kl_loss_7": 3617.2,
"learning_rate": 0.00013378255894584462,
"loss": 5002.6,
"step": 7640
},
{
"ce_loss_13": 2.9353716015815734,
"ce_loss_26": 2.490555015206337,
"ce_loss_39": 2.019370597600937,
"ce_loss_52": 1.4347517609596252,
"ce_loss_7": 3.196541225910187,
"epoch": 0.765,
"grad_norm": 20.424012523901062,
"kl_loss_13": 3127.6,
"kl_loss_26": 2205.8,
"kl_loss_39": 1202.7,
"kl_loss_7": 3670.0,
"learning_rate": 0.0001327041456712334,
"loss": 5085.1,
"step": 7650
},
{
"ce_loss_13": 2.994156318902969,
"ce_loss_26": 2.56048826277256,
"ce_loss_39": 2.084024053812027,
"ce_loss_52": 1.487925711274147,
"ce_loss_7": 3.2512714982032778,
"epoch": 0.766,
"grad_norm": 19.650498118539073,
"kl_loss_13": 3125.6,
"kl_loss_26": 2221.6,
"kl_loss_39": 1220.7,
"kl_loss_7": 3665.2,
"learning_rate": 0.00013162943106179747,
"loss": 5105.9,
"step": 7660
},
{
"ce_loss_13": 2.9769886791706086,
"ce_loss_26": 2.5290128916501997,
"ce_loss_39": 2.0444375783205033,
"ce_loss_52": 1.4427102521061896,
"ce_loss_7": 3.2454589188098906,
"epoch": 0.767,
"grad_norm": 19.69057867231776,
"kl_loss_13": 3203.2,
"kl_loss_26": 2269.2,
"kl_loss_39": 1243.2,
"kl_loss_7": 3758.4,
"learning_rate": 0.00013055842593990132,
"loss": 5067.35,
"step": 7670
},
{
"ce_loss_13": 2.9738565921783446,
"ce_loss_26": 2.5335902631282807,
"ce_loss_39": 2.06500606238842,
"ce_loss_52": 1.4531759321689606,
"ce_loss_7": 3.2406944632530212,
"epoch": 0.768,
"grad_norm": 19.854778454902203,
"kl_loss_13": 3164.0,
"kl_loss_26": 2244.2,
"kl_loss_39": 1253.0,
"kl_loss_7": 3714.8,
"learning_rate": 0.00012949114109055414,
"loss": 5080.45,
"step": 7680
},
{
"ce_loss_13": 2.8718224823474885,
"ce_loss_26": 2.434892734885216,
"ce_loss_39": 1.9763070404529572,
"ce_loss_52": 1.4119910702109337,
"ce_loss_7": 3.1341715812683106,
"epoch": 0.769,
"grad_norm": 19.065166102671203,
"kl_loss_13": 3037.6,
"kl_loss_26": 2134.2,
"kl_loss_39": 1156.1,
"kl_loss_7": 3588.0,
"learning_rate": 0.00012842758726130281,
"loss": 5110.75,
"step": 7690
},
{
"ce_loss_13": 2.9209058582782745,
"ce_loss_26": 2.4878934979438783,
"ce_loss_39": 2.0205056190490724,
"ce_loss_52": 1.4421792283654213,
"ce_loss_7": 3.182013803720474,
"epoch": 0.77,
"grad_norm": 19.547331822021178,
"kl_loss_13": 3063.6,
"kl_loss_26": 2150.4,
"kl_loss_39": 1168.8,
"kl_loss_7": 3608.0,
"learning_rate": 0.00012736777516212267,
"loss": 5073.5,
"step": 7700
},
{
"ce_loss_13": 2.9334555983543398,
"ce_loss_26": 2.493560019135475,
"ce_loss_39": 2.0158845692873,
"ce_loss_52": 1.4151214450597762,
"ce_loss_7": 3.2045696437358857,
"epoch": 0.771,
"grad_norm": 18.662664559432073,
"kl_loss_13": 3145.2,
"kl_loss_26": 2230.4,
"kl_loss_39": 1228.4,
"kl_loss_7": 3706.0,
"learning_rate": 0.00012631171546530968,
"loss": 5058.75,
"step": 7710
},
{
"ce_loss_13": 2.944778233766556,
"ce_loss_26": 2.5031532883644103,
"ce_loss_39": 2.0205067574977873,
"ce_loss_52": 1.414848119020462,
"ce_loss_7": 3.2128884732723235,
"epoch": 0.772,
"grad_norm": 19.409023945233233,
"kl_loss_13": 3144.0,
"kl_loss_26": 2229.4,
"kl_loss_39": 1221.0,
"kl_loss_7": 3700.4,
"learning_rate": 0.00012525941880537307,
"loss": 5071.55,
"step": 7720
},
{
"ce_loss_13": 2.937892961502075,
"ce_loss_26": 2.492938667535782,
"ce_loss_39": 2.020972582697868,
"ce_loss_52": 1.4354220196604728,
"ce_loss_7": 3.2008834302425386,
"epoch": 0.773,
"grad_norm": 19.46905923212589,
"kl_loss_13": 3121.2,
"kl_loss_26": 2201.4,
"kl_loss_39": 1204.2,
"kl_loss_7": 3662.4,
"learning_rate": 0.00012421089577892869,
"loss": 5040.15,
"step": 7730
},
{
"ce_loss_13": 2.949704957008362,
"ce_loss_26": 2.5045041382312774,
"ce_loss_39": 2.016797697544098,
"ce_loss_52": 1.4014781221747399,
"ce_loss_7": 3.212642914056778,
"epoch": 0.774,
"grad_norm": 19.613908494270376,
"kl_loss_13": 3187.2,
"kl_loss_26": 2266.0,
"kl_loss_39": 1252.6,
"kl_loss_7": 3732.0,
"learning_rate": 0.0001231661569445919,
"loss": 5076.45,
"step": 7740
},
{
"ce_loss_13": 2.955250400304794,
"ce_loss_26": 2.5274777173995973,
"ce_loss_39": 2.067678835988045,
"ce_loss_52": 1.4838283985853196,
"ce_loss_7": 3.2114856481552123,
"epoch": 0.775,
"grad_norm": 19.844798674533834,
"kl_loss_13": 3084.8,
"kl_loss_26": 2190.6,
"kl_loss_39": 1199.8,
"kl_loss_7": 3616.4,
"learning_rate": 0.00012212521282287093,
"loss": 5060.4,
"step": 7750
},
{
"ce_loss_13": 2.9763452112674713,
"ce_loss_26": 2.5314755111932756,
"ce_loss_39": 2.0540303111076357,
"ce_loss_52": 1.450203076004982,
"ce_loss_7": 3.2422658264636994,
"epoch": 0.776,
"grad_norm": 20.432260851236787,
"kl_loss_13": 3181.2,
"kl_loss_26": 2258.4,
"kl_loss_39": 1248.7,
"kl_loss_7": 3726.0,
"learning_rate": 0.00012108807389606158,
"loss": 5084.95,
"step": 7760
},
{
"ce_loss_13": 2.920662760734558,
"ce_loss_26": 2.475513318181038,
"ce_loss_39": 2.0011812478303908,
"ce_loss_52": 1.4144959792494773,
"ce_loss_7": 3.1814299404621122,
"epoch": 0.777,
"grad_norm": 19.88207983374875,
"kl_loss_13": 3106.4,
"kl_loss_26": 2178.8,
"kl_loss_39": 1189.6,
"kl_loss_7": 3652.0,
"learning_rate": 0.00012005475060814159,
"loss": 5075.25,
"step": 7770
},
{
"ce_loss_13": 2.982200914621353,
"ce_loss_26": 2.5349232286214827,
"ce_loss_39": 2.069617584347725,
"ce_loss_52": 1.4744601517915725,
"ce_loss_7": 3.2433891892433167,
"epoch": 0.778,
"grad_norm": 19.290497638146196,
"kl_loss_13": 3128.4,
"kl_loss_26": 2198.0,
"kl_loss_39": 1199.9,
"kl_loss_7": 3677.2,
"learning_rate": 0.00011902525336466464,
"loss": 5053.05,
"step": 7780
},
{
"ce_loss_13": 2.9154593467712404,
"ce_loss_26": 2.477250945568085,
"ce_loss_39": 2.0098533272743224,
"ce_loss_52": 1.4167337000370026,
"ce_loss_7": 3.1773332476615908,
"epoch": 0.779,
"grad_norm": 19.252580297991088,
"kl_loss_13": 3096.4,
"kl_loss_26": 2190.8,
"kl_loss_39": 1202.1,
"kl_loss_7": 3643.6,
"learning_rate": 0.00011799959253265668,
"loss": 5067.65,
"step": 7790
},
{
"ce_loss_13": 2.9013588786125184,
"ce_loss_26": 2.468735784292221,
"ce_loss_39": 2.0137620836496355,
"ce_loss_52": 1.4429293110966683,
"ce_loss_7": 3.1603996396064757,
"epoch": 0.78,
"grad_norm": 18.773016641793355,
"kl_loss_13": 3040.0,
"kl_loss_26": 2138.8,
"kl_loss_39": 1157.9,
"kl_loss_7": 3579.6,
"learning_rate": 0.00011697777844051105,
"loss": 5056.85,
"step": 7800
},
{
"ce_loss_13": 2.99123472571373,
"ce_loss_26": 2.5481519401073456,
"ce_loss_39": 2.0644855082035063,
"ce_loss_52": 1.4605911195278167,
"ce_loss_7": 3.2550659775733948,
"epoch": 0.781,
"grad_norm": 19.108089078575876,
"kl_loss_13": 3156.0,
"kl_loss_26": 2232.8,
"kl_loss_39": 1225.5,
"kl_loss_7": 3705.6,
"learning_rate": 0.00011595982137788402,
"loss": 5045.05,
"step": 7810
},
{
"ce_loss_13": 2.979940289258957,
"ce_loss_26": 2.5384896367788317,
"ce_loss_39": 2.066192331910133,
"ce_loss_52": 1.4709408730268478,
"ce_loss_7": 3.2419922232627867,
"epoch": 0.782,
"grad_norm": 19.283597029818793,
"kl_loss_13": 3151.6,
"kl_loss_26": 2222.2,
"kl_loss_39": 1217.6,
"kl_loss_7": 3697.2,
"learning_rate": 0.00011494573159559212,
"loss": 5088.85,
"step": 7820
},
{
"ce_loss_13": 2.9142948031425475,
"ce_loss_26": 2.4835946947336196,
"ce_loss_39": 2.015666288137436,
"ce_loss_52": 1.433475723862648,
"ce_loss_7": 3.1793313324451447,
"epoch": 0.783,
"grad_norm": 18.805294673266925,
"kl_loss_13": 3065.2,
"kl_loss_26": 2169.8,
"kl_loss_39": 1195.1,
"kl_loss_7": 3620.0,
"learning_rate": 0.00011393551930550828,
"loss": 5021.8,
"step": 7830
},
{
"ce_loss_13": 2.942464643716812,
"ce_loss_26": 2.4966187834739686,
"ce_loss_39": 2.018853786587715,
"ce_loss_52": 1.4241285115480422,
"ce_loss_7": 3.209713137149811,
"epoch": 0.784,
"grad_norm": 18.802417227998035,
"kl_loss_13": 3153.6,
"kl_loss_26": 2218.8,
"kl_loss_39": 1214.7,
"kl_loss_7": 3709.6,
"learning_rate": 0.00011292919468045875,
"loss": 5056.2,
"step": 7840
},
{
"ce_loss_13": 2.9324662506580355,
"ce_loss_26": 2.490780544281006,
"ce_loss_39": 2.014774057269096,
"ce_loss_52": 1.4378075569868087,
"ce_loss_7": 3.2002897918224336,
"epoch": 0.785,
"grad_norm": 18.31675592106317,
"kl_loss_13": 3120.4,
"kl_loss_26": 2191.6,
"kl_loss_39": 1189.3,
"kl_loss_7": 3674.4,
"learning_rate": 0.00011192676785412154,
"loss": 5025.65,
"step": 7850
},
{
"ce_loss_13": 2.9004018545150756,
"ce_loss_26": 2.4679500609636307,
"ce_loss_39": 2.0106911092996596,
"ce_loss_52": 1.454608330130577,
"ce_loss_7": 3.1605159759521486,
"epoch": 0.786,
"grad_norm": 21.238240444429067,
"kl_loss_13": 2987.2,
"kl_loss_26": 2093.8,
"kl_loss_39": 1136.1,
"kl_loss_7": 3522.4,
"learning_rate": 0.00011092824892092374,
"loss": 4990.8,
"step": 7860
},
{
"ce_loss_13": 3.0144290030002594,
"ce_loss_26": 2.5742201924324037,
"ce_loss_39": 2.095407247543335,
"ce_loss_52": 1.4918664544820786,
"ce_loss_7": 3.2818655967712402,
"epoch": 0.787,
"grad_norm": 20.48506344270259,
"kl_loss_13": 3166.0,
"kl_loss_26": 2239.6,
"kl_loss_39": 1229.4,
"kl_loss_7": 3722.0,
"learning_rate": 0.0001099336479359398,
"loss": 5084.85,
"step": 7870
},
{
"ce_loss_13": 2.9180380165576936,
"ce_loss_26": 2.4719971120357513,
"ce_loss_39": 1.9984097123146056,
"ce_loss_52": 1.4270031958818437,
"ce_loss_7": 3.1842149913311006,
"epoch": 0.788,
"grad_norm": 19.617074455019072,
"kl_loss_13": 3109.6,
"kl_loss_26": 2182.8,
"kl_loss_39": 1178.3,
"kl_loss_7": 3662.0,
"learning_rate": 0.00010894297491479043,
"loss": 5092.2,
"step": 7880
},
{
"ce_loss_13": 2.8819404006004334,
"ce_loss_26": 2.449340745806694,
"ce_loss_39": 1.9862482339143752,
"ce_loss_52": 1.413575354218483,
"ce_loss_7": 3.143266361951828,
"epoch": 0.789,
"grad_norm": 19.73420487885875,
"kl_loss_13": 3037.2,
"kl_loss_26": 2140.0,
"kl_loss_39": 1164.1,
"kl_loss_7": 3572.0,
"learning_rate": 0.00010795623983354214,
"loss": 5012.15,
"step": 7890
},
{
"ce_loss_13": 2.936946928501129,
"ce_loss_26": 2.50938241481781,
"ce_loss_39": 2.0399589776992797,
"ce_loss_52": 1.4433409079909325,
"ce_loss_7": 3.1964890122413636,
"epoch": 0.79,
"grad_norm": 20.456776413214342,
"kl_loss_13": 3094.8,
"kl_loss_26": 2194.4,
"kl_loss_39": 1203.3,
"kl_loss_7": 3650.0,
"learning_rate": 0.00010697345262860636,
"loss": 5033.95,
"step": 7900
},
{
"ce_loss_13": 2.9234113335609435,
"ce_loss_26": 2.486256945133209,
"ce_loss_39": 2.014375075697899,
"ce_loss_52": 1.449743601679802,
"ce_loss_7": 3.18265563249588,
"epoch": 0.791,
"grad_norm": 19.900735615836812,
"kl_loss_13": 3074.8,
"kl_loss_26": 2163.0,
"kl_loss_39": 1161.1,
"kl_loss_7": 3612.8,
"learning_rate": 0.00010599462319663906,
"loss": 5038.65,
"step": 7910
},
{
"ce_loss_13": 2.9553000926971436,
"ce_loss_26": 2.5179340064525606,
"ce_loss_39": 2.0430867671966553,
"ce_loss_52": 1.444689854979515,
"ce_loss_7": 3.21486856341362,
"epoch": 0.792,
"grad_norm": 19.250533773319045,
"kl_loss_13": 3122.0,
"kl_loss_26": 2206.2,
"kl_loss_39": 1213.4,
"kl_loss_7": 3668.8,
"learning_rate": 0.00010501976139444191,
"loss": 5048.55,
"step": 7920
},
{
"ce_loss_13": 2.946609389781952,
"ce_loss_26": 2.5028085887432097,
"ce_loss_39": 2.0339547246694565,
"ce_loss_52": 1.4416055083274841,
"ce_loss_7": 3.2140405058860777,
"epoch": 0.793,
"grad_norm": 20.61899934712358,
"kl_loss_13": 3133.6,
"kl_loss_26": 2209.4,
"kl_loss_39": 1208.3,
"kl_loss_7": 3694.0,
"learning_rate": 0.0001040488770388625,
"loss": 5057.15,
"step": 7930
},
{
"ce_loss_13": 2.866725343465805,
"ce_loss_26": 2.424889090657234,
"ce_loss_39": 1.9532368332147598,
"ce_loss_52": 1.37542584836483,
"ce_loss_7": 3.1246279418468474,
"epoch": 0.794,
"grad_norm": 19.267832695141472,
"kl_loss_13": 3083.6,
"kl_loss_26": 2164.8,
"kl_loss_39": 1176.3,
"kl_loss_7": 3624.4,
"learning_rate": 0.00010308197990669538,
"loss": 5026.95,
"step": 7940
},
{
"ce_loss_13": 2.901643234491348,
"ce_loss_26": 2.4638597697019575,
"ce_loss_39": 1.9964057832956315,
"ce_loss_52": 1.4079250425100327,
"ce_loss_7": 3.1619919717311857,
"epoch": 0.795,
"grad_norm": 19.24818590589668,
"kl_loss_13": 3123.2,
"kl_loss_26": 2199.2,
"kl_loss_39": 1202.4,
"kl_loss_7": 3666.4,
"learning_rate": 0.0001021190797345839,
"loss": 5013.5,
"step": 7950
},
{
"ce_loss_13": 2.967918246984482,
"ce_loss_26": 2.533867511153221,
"ce_loss_39": 2.069682791829109,
"ce_loss_52": 1.4818589851260184,
"ce_loss_7": 3.222521889209747,
"epoch": 0.796,
"grad_norm": 19.75622423793478,
"kl_loss_13": 3058.8,
"kl_loss_26": 2155.2,
"kl_loss_39": 1175.7,
"kl_loss_7": 3592.0,
"learning_rate": 0.00010116018621892236,
"loss": 5009.6,
"step": 7960
},
{
"ce_loss_13": 2.8812991797924044,
"ce_loss_26": 2.4400356858968735,
"ce_loss_39": 1.9724171191453934,
"ce_loss_52": 1.4085147365927697,
"ce_loss_7": 3.1415765941143037,
"epoch": 0.797,
"grad_norm": 19.13497167869677,
"kl_loss_13": 3057.6,
"kl_loss_26": 2142.8,
"kl_loss_39": 1159.2,
"kl_loss_7": 3602.8,
"learning_rate": 0.00010020530901575753,
"loss": 5020.4,
"step": 7970
},
{
"ce_loss_13": 2.904066652059555,
"ce_loss_26": 2.4755689650774,
"ce_loss_39": 2.0152323603630067,
"ce_loss_52": 1.4373595044016838,
"ce_loss_7": 3.1696866393089294,
"epoch": 0.798,
"grad_norm": 20.01532447536976,
"kl_loss_13": 3044.0,
"kl_loss_26": 2150.0,
"kl_loss_39": 1170.5,
"kl_loss_7": 3601.6,
"learning_rate": 9.925445774069231e-05,
"loss": 5018.45,
"step": 7980
},
{
"ce_loss_13": 2.9288057029247283,
"ce_loss_26": 2.4822009325027468,
"ce_loss_39": 2.011521789431572,
"ce_loss_52": 1.4153838574886322,
"ce_loss_7": 3.1911131501197816,
"epoch": 0.799,
"grad_norm": 19.205851361122782,
"kl_loss_13": 3126.0,
"kl_loss_26": 2203.4,
"kl_loss_39": 1210.7,
"kl_loss_7": 3672.4,
"learning_rate": 9.830764196878872e-05,
"loss": 5069.4,
"step": 7990
},
{
"ce_loss_13": 2.9970316886901855,
"ce_loss_26": 2.5632322430610657,
"ce_loss_39": 2.089646649360657,
"ce_loss_52": 1.4764455169439317,
"ce_loss_7": 3.2621945440769196,
"epoch": 0.8,
"grad_norm": 18.817128068716947,
"kl_loss_13": 3164.4,
"kl_loss_26": 2259.8,
"kl_loss_39": 1248.2,
"kl_loss_7": 3716.4,
"learning_rate": 9.736487123447069e-05,
"loss": 5026.75,
"step": 8000
},
{
"ce_loss_13": 2.955092731118202,
"ce_loss_26": 2.5211679935455322,
"ce_loss_39": 2.061183473467827,
"ce_loss_52": 1.4790756076574325,
"ce_loss_7": 3.211963188648224,
"epoch": 0.801,
"grad_norm": 19.13731613570068,
"kl_loss_13": 3058.4,
"kl_loss_26": 2157.0,
"kl_loss_39": 1183.6,
"kl_loss_7": 3589.2,
"learning_rate": 9.642615503142926e-05,
"loss": 5013.8,
"step": 8010
},
{
"ce_loss_13": 2.8814174115657805,
"ce_loss_26": 2.451471582055092,
"ce_loss_39": 1.9925315648317337,
"ce_loss_52": 1.4264169082045555,
"ce_loss_7": 3.150370055437088,
"epoch": 0.802,
"grad_norm": 19.95644855086655,
"kl_loss_13": 3015.2,
"kl_loss_26": 2114.4,
"kl_loss_39": 1144.7,
"kl_loss_7": 3572.8,
"learning_rate": 9.549150281252633e-05,
"loss": 5066.0,
"step": 8020
},
{
"ce_loss_13": 2.9213546574115754,
"ce_loss_26": 2.488295114040375,
"ce_loss_39": 2.0217175424098968,
"ce_loss_52": 1.448304545879364,
"ce_loss_7": 3.177370023727417,
"epoch": 0.803,
"grad_norm": 19.167189329215354,
"kl_loss_13": 3046.0,
"kl_loss_26": 2140.4,
"kl_loss_39": 1161.6,
"kl_loss_7": 3582.8,
"learning_rate": 9.4560923989699e-05,
"loss": 5040.7,
"step": 8030
},
{
"ce_loss_13": 2.8780395865440367,
"ce_loss_26": 2.43271960914135,
"ce_loss_39": 1.966169360280037,
"ce_loss_52": 1.3888942331075669,
"ce_loss_7": 3.1494656085968016,
"epoch": 0.804,
"grad_norm": 18.853764898775175,
"kl_loss_13": 3103.6,
"kl_loss_26": 2173.0,
"kl_loss_39": 1171.5,
"kl_loss_7": 3672.8,
"learning_rate": 9.363442793386607e-05,
"loss": 5021.25,
"step": 8040
},
{
"ce_loss_13": 2.9130735754966737,
"ce_loss_26": 2.480276498198509,
"ce_loss_39": 2.0196522653102873,
"ce_loss_52": 1.4448804795742034,
"ce_loss_7": 3.1761886417865752,
"epoch": 0.805,
"grad_norm": 18.76619687601556,
"kl_loss_13": 3048.8,
"kl_loss_26": 2147.8,
"kl_loss_39": 1165.5,
"kl_loss_7": 3590.8,
"learning_rate": 9.271202397483213e-05,
"loss": 4983.8,
"step": 8050
},
{
"ce_loss_13": 2.960085618495941,
"ce_loss_26": 2.516904118657112,
"ce_loss_39": 2.0490771383047104,
"ce_loss_52": 1.4702347993850708,
"ce_loss_7": 3.2195077538490295,
"epoch": 0.806,
"grad_norm": 20.341613718883853,
"kl_loss_13": 3080.0,
"kl_loss_26": 2163.0,
"kl_loss_39": 1179.4,
"kl_loss_7": 3617.6,
"learning_rate": 9.179372140119524e-05,
"loss": 5044.25,
"step": 8060
},
{
"ce_loss_13": 2.895679956674576,
"ce_loss_26": 2.447220724821091,
"ce_loss_39": 1.978733304142952,
"ce_loss_52": 1.4091844826936721,
"ce_loss_7": 3.1559928357601166,
"epoch": 0.807,
"grad_norm": 19.648457937725187,
"kl_loss_13": 3088.0,
"kl_loss_26": 2160.6,
"kl_loss_39": 1172.1,
"kl_loss_7": 3631.6,
"learning_rate": 9.087952946025175e-05,
"loss": 5019.35,
"step": 8070
},
{
"ce_loss_13": 2.8867613554000853,
"ce_loss_26": 2.4542810022830963,
"ce_loss_39": 1.997492003440857,
"ce_loss_52": 1.4237906068563462,
"ce_loss_7": 3.1490099489688874,
"epoch": 0.808,
"grad_norm": 19.108440716050485,
"kl_loss_13": 3031.2,
"kl_loss_26": 2132.0,
"kl_loss_39": 1161.2,
"kl_loss_7": 3573.2,
"learning_rate": 8.996945735790446e-05,
"loss": 5081.55,
"step": 8080
},
{
"ce_loss_13": 2.903727024793625,
"ce_loss_26": 2.4667694687843325,
"ce_loss_39": 2.0024666130542754,
"ce_loss_52": 1.4252464413642882,
"ce_loss_7": 3.1627338111400602,
"epoch": 0.809,
"grad_norm": 19.45516980399187,
"kl_loss_13": 3065.2,
"kl_loss_26": 2156.8,
"kl_loss_39": 1173.6,
"kl_loss_7": 3607.6,
"learning_rate": 8.906351425856951e-05,
"loss": 5032.55,
"step": 8090
},
{
"ce_loss_13": 2.9971861839294434,
"ce_loss_26": 2.558201992511749,
"ce_loss_39": 2.0943466246128084,
"ce_loss_52": 1.5071399331092834,
"ce_loss_7": 3.2508726358413695,
"epoch": 0.81,
"grad_norm": 18.578032258449234,
"kl_loss_13": 3080.0,
"kl_loss_26": 2167.8,
"kl_loss_39": 1188.5,
"kl_loss_7": 3612.8,
"learning_rate": 8.816170928508365e-05,
"loss": 5060.5,
"step": 8100
},
{
"ce_loss_13": 2.9514600038528442,
"ce_loss_26": 2.511053240299225,
"ce_loss_39": 2.036428925395012,
"ce_loss_52": 1.4632433116436006,
"ce_loss_7": 3.2164508640766143,
"epoch": 0.811,
"grad_norm": 19.46007345669389,
"kl_loss_13": 3077.6,
"kl_loss_26": 2160.6,
"kl_loss_39": 1164.9,
"kl_loss_7": 3629.6,
"learning_rate": 8.7264051518613e-05,
"loss": 5036.75,
"step": 8110
},
{
"ce_loss_13": 2.8350981414318084,
"ce_loss_26": 2.3989670783281327,
"ce_loss_39": 1.9394948929548264,
"ce_loss_52": 1.385464173555374,
"ce_loss_7": 3.0943558514118195,
"epoch": 0.812,
"grad_norm": 20.429118003347835,
"kl_loss_13": 3019.6,
"kl_loss_26": 2118.2,
"kl_loss_39": 1139.6,
"kl_loss_7": 3559.6,
"learning_rate": 8.637054999856148e-05,
"loss": 5033.2,
"step": 8120
},
{
"ce_loss_13": 2.956312870979309,
"ce_loss_26": 2.510516768693924,
"ce_loss_39": 2.0456418454647065,
"ce_loss_52": 1.454368954896927,
"ce_loss_7": 3.2136961221694946,
"epoch": 0.813,
"grad_norm": 19.290479115918554,
"kl_loss_13": 3110.4,
"kl_loss_26": 2193.4,
"kl_loss_39": 1206.0,
"kl_loss_7": 3651.6,
"learning_rate": 8.548121372247918e-05,
"loss": 5042.35,
"step": 8130
},
{
"ce_loss_13": 2.8997408151626587,
"ce_loss_26": 2.4578535914421082,
"ce_loss_39": 1.9877970427274705,
"ce_loss_52": 1.4171391934156419,
"ce_loss_7": 3.1653897404670714,
"epoch": 0.814,
"grad_norm": 19.231104242907662,
"kl_loss_13": 3062.4,
"kl_loss_26": 2154.2,
"kl_loss_39": 1161.8,
"kl_loss_7": 3620.0,
"learning_rate": 8.459605164597267e-05,
"loss": 4990.3,
"step": 8140
},
{
"ce_loss_13": 2.8967735528945924,
"ce_loss_26": 2.462614360451698,
"ce_loss_39": 1.9984548151493073,
"ce_loss_52": 1.438458850979805,
"ce_loss_7": 3.1581345558166505,
"epoch": 0.815,
"grad_norm": 19.45006377816174,
"kl_loss_13": 3056.8,
"kl_loss_26": 2153.0,
"kl_loss_39": 1160.6,
"kl_loss_7": 3602.0,
"learning_rate": 8.371507268261436e-05,
"loss": 4980.2,
"step": 8150
},
{
"ce_loss_13": 2.936253345012665,
"ce_loss_26": 2.5057778120040894,
"ce_loss_39": 2.038064029812813,
"ce_loss_52": 1.4622955560684203,
"ce_loss_7": 3.1979693949222563,
"epoch": 0.816,
"grad_norm": 18.91111939228637,
"kl_loss_13": 3060.8,
"kl_loss_26": 2153.2,
"kl_loss_39": 1175.6,
"kl_loss_7": 3612.0,
"learning_rate": 8.283828570385238e-05,
"loss": 5006.35,
"step": 8160
},
{
"ce_loss_13": 2.9529894649982453,
"ce_loss_26": 2.5044034361839294,
"ce_loss_39": 2.042719992995262,
"ce_loss_52": 1.4732781440019607,
"ce_loss_7": 3.2085989713668823,
"epoch": 0.817,
"grad_norm": 18.881507142327827,
"kl_loss_13": 3068.4,
"kl_loss_26": 2134.6,
"kl_loss_39": 1154.6,
"kl_loss_7": 3608.0,
"learning_rate": 8.196569953892202e-05,
"loss": 5023.2,
"step": 8170
},
{
"ce_loss_13": 2.901773339509964,
"ce_loss_26": 2.46816024184227,
"ce_loss_39": 2.0061379730701447,
"ce_loss_52": 1.4472751855850219,
"ce_loss_7": 3.162723332643509,
"epoch": 0.818,
"grad_norm": 19.58512082927694,
"kl_loss_13": 3021.6,
"kl_loss_26": 2116.2,
"kl_loss_39": 1138.3,
"kl_loss_7": 3567.6,
"learning_rate": 8.109732297475635e-05,
"loss": 5011.1,
"step": 8180
},
{
"ce_loss_13": 2.9283832788467405,
"ce_loss_26": 2.495754861831665,
"ce_loss_39": 2.0315854638814925,
"ce_loss_52": 1.4544063314795495,
"ce_loss_7": 3.184633868932724,
"epoch": 0.819,
"grad_norm": 20.338540288781616,
"kl_loss_13": 3056.0,
"kl_loss_26": 2152.0,
"kl_loss_39": 1164.7,
"kl_loss_7": 3604.8,
"learning_rate": 8.023316475589754e-05,
"loss": 4985.65,
"step": 8190
},
{
"ce_loss_13": 2.8736020922660828,
"ce_loss_26": 2.4326390773057938,
"ce_loss_39": 1.9687805682420731,
"ce_loss_52": 1.4114506781101226,
"ce_loss_7": 3.1362832963466643,
"epoch": 0.82,
"grad_norm": 19.348124098647066,
"kl_loss_13": 3039.6,
"kl_loss_26": 2118.4,
"kl_loss_39": 1131.5,
"kl_loss_7": 3592.4,
"learning_rate": 7.937323358440934e-05,
"loss": 4999.05,
"step": 8200
},
{
"ce_loss_13": 2.9405028223991394,
"ce_loss_26": 2.5053980708122254,
"ce_loss_39": 2.031425711512566,
"ce_loss_52": 1.4602935075759889,
"ce_loss_7": 3.1905817687511444,
"epoch": 0.821,
"grad_norm": 19.308929525960306,
"kl_loss_13": 3053.6,
"kl_loss_26": 2142.8,
"kl_loss_39": 1164.5,
"kl_loss_7": 3580.4,
"learning_rate": 7.851753811978923e-05,
"loss": 5013.6,
"step": 8210
},
{
"ce_loss_13": 2.8261436820030212,
"ce_loss_26": 2.396569001674652,
"ce_loss_39": 1.9376911997795105,
"ce_loss_52": 1.3843101486563683,
"ce_loss_7": 3.08916922211647,
"epoch": 0.822,
"grad_norm": 18.760581120890944,
"kl_loss_13": 2979.6,
"kl_loss_26": 2084.0,
"kl_loss_39": 1118.1,
"kl_loss_7": 3526.4,
"learning_rate": 7.766608697888095e-05,
"loss": 4996.05,
"step": 8220
},
{
"ce_loss_13": 2.895614618062973,
"ce_loss_26": 2.4538383156061174,
"ce_loss_39": 1.9913074195384979,
"ce_loss_52": 1.4129166051745414,
"ce_loss_7": 3.156418579816818,
"epoch": 0.823,
"grad_norm": 19.448137234461008,
"kl_loss_13": 3094.8,
"kl_loss_26": 2171.8,
"kl_loss_39": 1179.4,
"kl_loss_7": 3644.0,
"learning_rate": 7.681888873578785e-05,
"loss": 5010.15,
"step": 8230
},
{
"ce_loss_13": 2.8753804206848144,
"ce_loss_26": 2.448589825630188,
"ce_loss_39": 1.9881916165351867,
"ce_loss_52": 1.433535772562027,
"ce_loss_7": 3.133152514696121,
"epoch": 0.824,
"grad_norm": 19.780732761004185,
"kl_loss_13": 2998.8,
"kl_loss_26": 2105.2,
"kl_loss_39": 1137.2,
"kl_loss_7": 3529.6,
"learning_rate": 7.597595192178702e-05,
"loss": 4951.6,
"step": 8240
},
{
"ce_loss_13": 2.874552935361862,
"ce_loss_26": 2.426975393295288,
"ce_loss_39": 1.969171154499054,
"ce_loss_52": 1.4067743465304374,
"ce_loss_7": 3.1344926774501802,
"epoch": 0.825,
"grad_norm": 19.081018400834456,
"kl_loss_13": 3050.0,
"kl_loss_26": 2129.2,
"kl_loss_39": 1153.6,
"kl_loss_7": 3590.8,
"learning_rate": 7.513728502524286e-05,
"loss": 4924.6,
"step": 8250
},
{
"ce_loss_13": 2.8894054651260377,
"ce_loss_26": 2.4550040304660796,
"ce_loss_39": 1.9873575389385223,
"ce_loss_52": 1.417596697807312,
"ce_loss_7": 3.152091747522354,
"epoch": 0.826,
"grad_norm": 19.8137268983455,
"kl_loss_13": 3043.6,
"kl_loss_26": 2138.6,
"kl_loss_39": 1152.9,
"kl_loss_7": 3588.0,
"learning_rate": 7.430289649152156e-05,
"loss": 5032.7,
"step": 8260
},
{
"ce_loss_13": 2.9286361813545225,
"ce_loss_26": 2.500110092759132,
"ce_loss_39": 2.0343170315027237,
"ce_loss_52": 1.4783238634467124,
"ce_loss_7": 3.186429667472839,
"epoch": 0.827,
"grad_norm": 19.162288533319998,
"kl_loss_13": 3004.0,
"kl_loss_26": 2105.8,
"kl_loss_39": 1135.2,
"kl_loss_7": 3537.2,
"learning_rate": 7.347279472290646e-05,
"loss": 4997.95,
"step": 8270
},
{
"ce_loss_13": 2.8661180198192597,
"ce_loss_26": 2.4267468631267546,
"ce_loss_39": 1.9584647029638291,
"ce_loss_52": 1.3965127140283584,
"ce_loss_7": 3.1264285147190094,
"epoch": 0.828,
"grad_norm": 18.88865775629702,
"kl_loss_13": 3042.4,
"kl_loss_26": 2132.2,
"kl_loss_39": 1143.9,
"kl_loss_7": 3592.0,
"learning_rate": 7.264698807851328e-05,
"loss": 4945.25,
"step": 8280
},
{
"ce_loss_13": 2.963280272483826,
"ce_loss_26": 2.5102931052446364,
"ce_loss_39": 2.034750634431839,
"ce_loss_52": 1.4600775420665741,
"ce_loss_7": 3.2234760582447053,
"epoch": 0.829,
"grad_norm": 18.537963317549014,
"kl_loss_13": 3144.4,
"kl_loss_26": 2206.8,
"kl_loss_39": 1191.9,
"kl_loss_7": 3687.6,
"learning_rate": 7.182548487420554e-05,
"loss": 5033.4,
"step": 8290
},
{
"ce_loss_13": 2.9992467045783995,
"ce_loss_26": 2.55620219707489,
"ce_loss_39": 2.0874054759740828,
"ce_loss_52": 1.4909266605973244,
"ce_loss_7": 3.263571548461914,
"epoch": 0.83,
"grad_norm": 18.891053458181357,
"kl_loss_13": 3132.0,
"kl_loss_26": 2207.0,
"kl_loss_39": 1207.0,
"kl_loss_7": 3681.2,
"learning_rate": 7.100829338251146e-05,
"loss": 5053.05,
"step": 8300
},
{
"ce_loss_13": 2.9434437096118926,
"ce_loss_26": 2.5084387719631196,
"ce_loss_39": 2.0438412368297576,
"ce_loss_52": 1.455627703666687,
"ce_loss_7": 3.203293949365616,
"epoch": 0.831,
"grad_norm": 18.63506341583337,
"kl_loss_13": 3080.0,
"kl_loss_26": 2173.6,
"kl_loss_39": 1196.0,
"kl_loss_7": 3632.8,
"learning_rate": 7.019542183254046e-05,
"loss": 5020.25,
"step": 8310
},
{
"ce_loss_13": 2.9150700867176056,
"ce_loss_26": 2.4821571350097655,
"ce_loss_39": 2.011009243130684,
"ce_loss_52": 1.4295171827077866,
"ce_loss_7": 3.1743992388248445,
"epoch": 0.832,
"grad_norm": 19.224604457055225,
"kl_loss_13": 3061.6,
"kl_loss_26": 2156.4,
"kl_loss_39": 1178.0,
"kl_loss_7": 3603.6,
"learning_rate": 6.938687840989971e-05,
"loss": 4998.8,
"step": 8320
},
{
"ce_loss_13": 2.9248284220695497,
"ce_loss_26": 2.4842200338840486,
"ce_loss_39": 2.016203221678734,
"ce_loss_52": 1.4306745767593383,
"ce_loss_7": 3.1828024327754973,
"epoch": 0.833,
"grad_norm": 21.39912114954264,
"kl_loss_13": 3084.0,
"kl_loss_26": 2177.0,
"kl_loss_39": 1194.5,
"kl_loss_7": 3630.0,
"learning_rate": 6.858267125661271e-05,
"loss": 5022.85,
"step": 8330
},
{
"ce_loss_13": 2.8756704360246657,
"ce_loss_26": 2.447014120221138,
"ce_loss_39": 1.9950514793395997,
"ce_loss_52": 1.4190126568078996,
"ce_loss_7": 3.135387209057808,
"epoch": 0.834,
"grad_norm": 19.16865484866817,
"kl_loss_13": 3030.8,
"kl_loss_26": 2137.8,
"kl_loss_39": 1174.0,
"kl_loss_7": 3584.4,
"learning_rate": 6.778280847103668e-05,
"loss": 5009.55,
"step": 8340
},
{
"ce_loss_13": 2.8521511554718018,
"ce_loss_26": 2.415205565094948,
"ce_loss_39": 1.9484322667121887,
"ce_loss_52": 1.3919154837727548,
"ce_loss_7": 3.11352881193161,
"epoch": 0.835,
"grad_norm": 19.565513612829772,
"kl_loss_13": 3035.6,
"kl_loss_26": 2128.8,
"kl_loss_39": 1129.1,
"kl_loss_7": 3584.4,
"learning_rate": 6.698729810778065e-05,
"loss": 4997.8,
"step": 8350
},
{
"ce_loss_13": 2.9550336360931397,
"ce_loss_26": 2.516478735208511,
"ce_loss_39": 2.0486765056848526,
"ce_loss_52": 1.4607697233557702,
"ce_loss_7": 3.2147393763065337,
"epoch": 0.836,
"grad_norm": 19.2117514895061,
"kl_loss_13": 3091.2,
"kl_loss_26": 2181.2,
"kl_loss_39": 1192.7,
"kl_loss_7": 3642.8,
"learning_rate": 6.619614817762538e-05,
"loss": 4980.45,
"step": 8360
},
{
"ce_loss_13": 2.8862642347812653,
"ce_loss_26": 2.453189605474472,
"ce_loss_39": 1.9996996372938156,
"ce_loss_52": 1.42118998169899,
"ce_loss_7": 3.1397067666053773,
"epoch": 0.837,
"grad_norm": 19.74025196121118,
"kl_loss_13": 3027.2,
"kl_loss_26": 2130.4,
"kl_loss_39": 1163.7,
"kl_loss_7": 3555.6,
"learning_rate": 6.540936664744196e-05,
"loss": 5003.55,
"step": 8370
},
{
"ce_loss_13": 2.8884230494499206,
"ce_loss_26": 2.461115485429764,
"ce_loss_39": 1.996484610438347,
"ce_loss_52": 1.431185284256935,
"ce_loss_7": 3.1595689237117766,
"epoch": 0.838,
"grad_norm": 18.91687230970295,
"kl_loss_13": 3028.0,
"kl_loss_26": 2134.2,
"kl_loss_39": 1162.1,
"kl_loss_7": 3585.2,
"learning_rate": 6.462696144011149e-05,
"loss": 4978.95,
"step": 8380
},
{
"ce_loss_13": 2.9092856884002685,
"ce_loss_26": 2.477367341518402,
"ce_loss_39": 2.0064304888248445,
"ce_loss_52": 1.4392234981060028,
"ce_loss_7": 3.166864866018295,
"epoch": 0.839,
"grad_norm": 18.987849334165656,
"kl_loss_13": 3037.2,
"kl_loss_26": 2130.0,
"kl_loss_39": 1148.3,
"kl_loss_7": 3579.6,
"learning_rate": 6.384894043444567e-05,
"loss": 4978.1,
"step": 8390
},
{
"ce_loss_13": 2.926213449239731,
"ce_loss_26": 2.4814498484134675,
"ce_loss_39": 2.011722648143768,
"ce_loss_52": 1.4253545701503754,
"ce_loss_7": 3.1928284585475923,
"epoch": 0.84,
"grad_norm": 18.67190019337169,
"kl_loss_13": 3118.4,
"kl_loss_26": 2194.4,
"kl_loss_39": 1203.2,
"kl_loss_7": 3673.2,
"learning_rate": 6.307531146510753e-05,
"loss": 4975.3,
"step": 8400
},
{
"ce_loss_13": 2.9474796772003176,
"ce_loss_26": 2.513693606853485,
"ce_loss_39": 2.039980337023735,
"ce_loss_52": 1.4588691473007203,
"ce_loss_7": 3.208155167102814,
"epoch": 0.841,
"grad_norm": 19.23898472020216,
"kl_loss_13": 3091.2,
"kl_loss_26": 2176.0,
"kl_loss_39": 1186.3,
"kl_loss_7": 3638.8,
"learning_rate": 6.230608232253226e-05,
"loss": 4972.8,
"step": 8410
},
{
"ce_loss_13": 2.98299777507782,
"ce_loss_26": 2.546931451559067,
"ce_loss_39": 2.066228356957436,
"ce_loss_52": 1.4528164565563202,
"ce_loss_7": 3.2408275246620177,
"epoch": 0.842,
"grad_norm": 19.590505544042042,
"kl_loss_13": 3177.6,
"kl_loss_26": 2260.2,
"kl_loss_39": 1246.2,
"kl_loss_7": 3722.8,
"learning_rate": 6.154126075284855e-05,
"loss": 5025.9,
"step": 8420
},
{
"ce_loss_13": 2.8128190338611603,
"ce_loss_26": 2.3779567658901213,
"ce_loss_39": 1.922488284111023,
"ce_loss_52": 1.3714163228869438,
"ce_loss_7": 3.0708466947078703,
"epoch": 0.843,
"grad_norm": 19.244313422130332,
"kl_loss_13": 3014.4,
"kl_loss_26": 2105.4,
"kl_loss_39": 1130.8,
"kl_loss_7": 3552.8,
"learning_rate": 6.078085445780129e-05,
"loss": 5004.75,
"step": 8430
},
{
"ce_loss_13": 2.938475805521011,
"ce_loss_26": 2.497309777140617,
"ce_loss_39": 2.027024504542351,
"ce_loss_52": 1.4460827559232712,
"ce_loss_7": 3.194866645336151,
"epoch": 0.844,
"grad_norm": 18.715084502177618,
"kl_loss_13": 3086.4,
"kl_loss_26": 2168.0,
"kl_loss_39": 1180.4,
"kl_loss_7": 3630.0,
"learning_rate": 6.002487109467347e-05,
"loss": 5005.05,
"step": 8440
},
{
"ce_loss_13": 2.940459841489792,
"ce_loss_26": 2.5169106662273406,
"ce_loss_39": 2.0435358375310897,
"ce_loss_52": 1.4713785827159882,
"ce_loss_7": 3.199807566404343,
"epoch": 0.845,
"grad_norm": 20.662636581778713,
"kl_loss_13": 3050.8,
"kl_loss_26": 2152.2,
"kl_loss_39": 1166.1,
"kl_loss_7": 3588.8,
"learning_rate": 5.927331827620902e-05,
"loss": 5015.45,
"step": 8450
},
{
"ce_loss_13": 2.8635286152362824,
"ce_loss_26": 2.426770511269569,
"ce_loss_39": 1.9639566600322724,
"ce_loss_52": 1.4056006461381911,
"ce_loss_7": 3.126782363653183,
"epoch": 0.846,
"grad_norm": 19.43955116705752,
"kl_loss_13": 3026.8,
"kl_loss_26": 2123.4,
"kl_loss_39": 1139.5,
"kl_loss_7": 3581.2,
"learning_rate": 5.852620357053651e-05,
"loss": 4930.5,
"step": 8460
},
{
"ce_loss_13": 2.967438644170761,
"ce_loss_26": 2.5302784025669096,
"ce_loss_39": 2.0548608988523482,
"ce_loss_52": 1.456875516474247,
"ce_loss_7": 3.232648569345474,
"epoch": 0.847,
"grad_norm": 18.917933547815785,
"kl_loss_13": 3141.6,
"kl_loss_26": 2229.0,
"kl_loss_39": 1221.3,
"kl_loss_7": 3698.0,
"learning_rate": 5.778353450109286e-05,
"loss": 5049.2,
"step": 8470
},
{
"ce_loss_13": 2.8490840077400206,
"ce_loss_26": 2.4299704492092133,
"ce_loss_39": 1.9720161318778993,
"ce_loss_52": 1.428696632385254,
"ce_loss_7": 3.1050261557102203,
"epoch": 0.848,
"grad_norm": 19.028513845565772,
"kl_loss_13": 2946.4,
"kl_loss_26": 2077.0,
"kl_loss_39": 1109.1,
"kl_loss_7": 3480.0,
"learning_rate": 5.7045318546547206e-05,
"loss": 4964.25,
"step": 8480
},
{
"ce_loss_13": 2.91025772690773,
"ce_loss_26": 2.470404103398323,
"ce_loss_39": 1.9989687472581863,
"ce_loss_52": 1.4330752968788147,
"ce_loss_7": 3.1751048266887665,
"epoch": 0.849,
"grad_norm": 18.63333546431516,
"kl_loss_13": 3053.6,
"kl_loss_26": 2137.8,
"kl_loss_39": 1151.7,
"kl_loss_7": 3603.6,
"learning_rate": 5.631156314072605e-05,
"loss": 4997.6,
"step": 8490
},
{
"ce_loss_13": 2.969731491804123,
"ce_loss_26": 2.5224834442138673,
"ce_loss_39": 2.043315088748932,
"ce_loss_52": 1.4606564939022064,
"ce_loss_7": 3.235535615682602,
"epoch": 0.85,
"grad_norm": 19.13874159874534,
"kl_loss_13": 3116.0,
"kl_loss_26": 2192.8,
"kl_loss_39": 1187.7,
"kl_loss_7": 3671.6,
"learning_rate": 5.5582275672538315e-05,
"loss": 4973.7,
"step": 8500
},
{
"ce_loss_13": 2.9457097470760347,
"ce_loss_26": 2.5094266653060915,
"ce_loss_39": 2.037208506464958,
"ce_loss_52": 1.4461605846881866,
"ce_loss_7": 3.2044992685317992,
"epoch": 0.851,
"grad_norm": 19.084085332754032,
"kl_loss_13": 3088.0,
"kl_loss_26": 2174.4,
"kl_loss_39": 1186.9,
"kl_loss_7": 3629.2,
"learning_rate": 5.4857463485900484e-05,
"loss": 4979.7,
"step": 8510
},
{
"ce_loss_13": 2.932442033290863,
"ce_loss_26": 2.4945150196552275,
"ce_loss_39": 2.024565789103508,
"ce_loss_52": 1.4535668522119523,
"ce_loss_7": 3.1925411999225615,
"epoch": 0.852,
"grad_norm": 18.598717511449827,
"kl_loss_13": 3049.2,
"kl_loss_26": 2138.2,
"kl_loss_39": 1163.5,
"kl_loss_7": 3596.0,
"learning_rate": 5.413713387966329e-05,
"loss": 4976.35,
"step": 8520
},
{
"ce_loss_13": 2.850284093618393,
"ce_loss_26": 2.4304546415805817,
"ce_loss_39": 1.9769367069005965,
"ce_loss_52": 1.4221994251012802,
"ce_loss_7": 3.106715601682663,
"epoch": 0.853,
"grad_norm": 19.800943553162387,
"kl_loss_13": 2971.2,
"kl_loss_26": 2092.8,
"kl_loss_39": 1135.0,
"kl_loss_7": 3509.2,
"learning_rate": 5.34212941075381e-05,
"loss": 4969.3,
"step": 8530
},
{
"ce_loss_13": 2.8934908270835877,
"ce_loss_26": 2.4596606254577638,
"ce_loss_39": 2.00773600935936,
"ce_loss_52": 1.4613569289445878,
"ce_loss_7": 3.150895756483078,
"epoch": 0.854,
"grad_norm": 18.83825612066199,
"kl_loss_13": 3003.6,
"kl_loss_26": 2097.4,
"kl_loss_39": 1124.7,
"kl_loss_7": 3544.0,
"learning_rate": 5.270995137802315e-05,
"loss": 4942.95,
"step": 8540
},
{
"ce_loss_13": 2.9037817120552063,
"ce_loss_26": 2.462430712580681,
"ce_loss_39": 1.9962077885866165,
"ce_loss_52": 1.425583516061306,
"ce_loss_7": 3.173860079050064,
"epoch": 0.855,
"grad_norm": 18.951683103960118,
"kl_loss_13": 3081.6,
"kl_loss_26": 2158.2,
"kl_loss_39": 1162.8,
"kl_loss_7": 3640.8,
"learning_rate": 5.2003112854332125e-05,
"loss": 4931.9,
"step": 8550
},
{
"ce_loss_13": 2.945174980163574,
"ce_loss_26": 2.504639369249344,
"ce_loss_39": 2.038501372933388,
"ce_loss_52": 1.4634816706180573,
"ce_loss_7": 3.206812459230423,
"epoch": 0.856,
"grad_norm": 19.71064196024924,
"kl_loss_13": 3095.2,
"kl_loss_26": 2169.8,
"kl_loss_39": 1182.5,
"kl_loss_7": 3640.4,
"learning_rate": 5.130078565432089e-05,
"loss": 5022.2,
"step": 8560
},
{
"ce_loss_13": 2.916484522819519,
"ce_loss_26": 2.4740715622901917,
"ce_loss_39": 2.0083792597055434,
"ce_loss_52": 1.4411062002182007,
"ce_loss_7": 3.1799224853515624,
"epoch": 0.857,
"grad_norm": 18.476735822226924,
"kl_loss_13": 3088.4,
"kl_loss_26": 2169.8,
"kl_loss_39": 1171.0,
"kl_loss_7": 3638.0,
"learning_rate": 5.060297685041659e-05,
"loss": 4959.95,
"step": 8570
},
{
"ce_loss_13": 2.9657407224178316,
"ce_loss_26": 2.5159328460693358,
"ce_loss_39": 2.0297839671373366,
"ce_loss_52": 1.4374482572078704,
"ce_loss_7": 3.2368306040763857,
"epoch": 0.858,
"grad_norm": 18.666614846165707,
"kl_loss_13": 3174.0,
"kl_loss_26": 2243.8,
"kl_loss_39": 1214.4,
"kl_loss_7": 3739.2,
"learning_rate": 4.99096934695461e-05,
"loss": 4973.6,
"step": 8580
},
{
"ce_loss_13": 2.9122063517570496,
"ce_loss_26": 2.4818194091320036,
"ce_loss_39": 2.0184349328279496,
"ce_loss_52": 1.4382916703820228,
"ce_loss_7": 3.17412588596344,
"epoch": 0.859,
"grad_norm": 19.2367820447764,
"kl_loss_13": 3051.6,
"kl_loss_26": 2150.2,
"kl_loss_39": 1170.3,
"kl_loss_7": 3600.8,
"learning_rate": 4.922094249306558e-05,
"loss": 4986.8,
"step": 8590
},
{
"ce_loss_13": 2.856007623672485,
"ce_loss_26": 2.4249674677848816,
"ce_loss_39": 1.9603979021310807,
"ce_loss_52": 1.3998382538557053,
"ce_loss_7": 3.120637094974518,
"epoch": 0.86,
"grad_norm": 19.45927110525327,
"kl_loss_13": 3034.0,
"kl_loss_26": 2133.8,
"kl_loss_39": 1154.5,
"kl_loss_7": 3588.0,
"learning_rate": 4.853673085668947e-05,
"loss": 5020.0,
"step": 8600
},
{
"ce_loss_13": 2.8736523926258086,
"ce_loss_26": 2.4339311927556992,
"ce_loss_39": 1.9728148251771926,
"ce_loss_52": 1.4134869635105134,
"ce_loss_7": 3.132260227203369,
"epoch": 0.861,
"grad_norm": 18.566757154178084,
"kl_loss_13": 3035.6,
"kl_loss_26": 2121.8,
"kl_loss_39": 1143.1,
"kl_loss_7": 3595.6,
"learning_rate": 4.78570654504214e-05,
"loss": 5002.4,
"step": 8610
},
{
"ce_loss_13": 2.918911075592041,
"ce_loss_26": 2.4760118186473847,
"ce_loss_39": 2.0035496681928633,
"ce_loss_52": 1.4298115074634552,
"ce_loss_7": 3.1867982387542724,
"epoch": 0.862,
"grad_norm": 19.07407083640675,
"kl_loss_13": 3104.8,
"kl_loss_26": 2178.0,
"kl_loss_39": 1171.7,
"kl_loss_7": 3661.6,
"learning_rate": 4.7181953118484556e-05,
"loss": 4962.1,
"step": 8620
},
{
"ce_loss_13": 2.90929571390152,
"ce_loss_26": 2.4703837007284166,
"ce_loss_39": 2.004773771762848,
"ce_loss_52": 1.4389754503965377,
"ce_loss_7": 3.1703392446041105,
"epoch": 0.863,
"grad_norm": 19.469181472497027,
"kl_loss_13": 3027.6,
"kl_loss_26": 2134.8,
"kl_loss_39": 1156.0,
"kl_loss_7": 3579.2,
"learning_rate": 4.651140065925269e-05,
"loss": 4937.2,
"step": 8630
},
{
"ce_loss_13": 2.9975267946720123,
"ce_loss_26": 2.5536694526672363,
"ce_loss_39": 2.07455490231514,
"ce_loss_52": 1.4786568373441695,
"ce_loss_7": 3.265315741300583,
"epoch": 0.864,
"grad_norm": 19.328625062361652,
"kl_loss_13": 3149.6,
"kl_loss_26": 2221.4,
"kl_loss_39": 1210.4,
"kl_loss_7": 3701.6,
"learning_rate": 4.58454148251814e-05,
"loss": 4974.2,
"step": 8640
},
{
"ce_loss_13": 2.914253044128418,
"ce_loss_26": 2.4742087960243224,
"ce_loss_39": 2.0090451925992965,
"ce_loss_52": 1.4428718268871308,
"ce_loss_7": 3.174784082174301,
"epoch": 0.865,
"grad_norm": 18.948427807011946,
"kl_loss_13": 3048.4,
"kl_loss_26": 2129.2,
"kl_loss_39": 1142.4,
"kl_loss_7": 3588.0,
"learning_rate": 4.518400232274078e-05,
"loss": 4950.8,
"step": 8650
},
{
"ce_loss_13": 2.895064812898636,
"ce_loss_26": 2.452637565135956,
"ce_loss_39": 1.9877680152654649,
"ce_loss_52": 1.419823595881462,
"ce_loss_7": 3.151270192861557,
"epoch": 0.866,
"grad_norm": 18.81553972523,
"kl_loss_13": 3058.0,
"kl_loss_26": 2145.0,
"kl_loss_39": 1169.4,
"kl_loss_7": 3595.6,
"learning_rate": 4.452716981234745e-05,
"loss": 5007.2,
"step": 8660
},
{
"ce_loss_13": 2.929095983505249,
"ce_loss_26": 2.480497121810913,
"ce_loss_39": 2.006428611278534,
"ce_loss_52": 1.4315154731273652,
"ce_loss_7": 3.2007214546203615,
"epoch": 0.867,
"grad_norm": 18.95890107682501,
"kl_loss_13": 3086.0,
"kl_loss_26": 2159.8,
"kl_loss_39": 1163.2,
"kl_loss_7": 3645.6,
"learning_rate": 4.3874923908297335e-05,
"loss": 4988.0,
"step": 8670
},
{
"ce_loss_13": 2.917868083715439,
"ce_loss_26": 2.466423386335373,
"ce_loss_39": 1.9825115293264388,
"ce_loss_52": 1.4014427214860916,
"ce_loss_7": 3.187401866912842,
"epoch": 0.868,
"grad_norm": 18.57272436866935,
"kl_loss_13": 3128.0,
"kl_loss_26": 2194.4,
"kl_loss_39": 1181.8,
"kl_loss_7": 3698.4,
"learning_rate": 4.322727117869951e-05,
"loss": 4966.1,
"step": 8680
},
{
"ce_loss_13": 2.864998000860214,
"ce_loss_26": 2.4295243114233016,
"ce_loss_39": 1.9672368943691254,
"ce_loss_52": 1.4193324148654938,
"ce_loss_7": 3.128977674245834,
"epoch": 0.869,
"grad_norm": 18.701108919653294,
"kl_loss_13": 3002.4,
"kl_loss_26": 2088.8,
"kl_loss_39": 1117.8,
"kl_loss_7": 3558.4,
"learning_rate": 4.2584218145409916e-05,
"loss": 4955.6,
"step": 8690
},
{
"ce_loss_13": 2.866414725780487,
"ce_loss_26": 2.4338418275117872,
"ce_loss_39": 1.961911031603813,
"ce_loss_52": 1.3979329317808151,
"ce_loss_7": 3.1249643862247467,
"epoch": 0.87,
"grad_norm": 19.695244279115233,
"kl_loss_13": 3026.4,
"kl_loss_26": 2123.4,
"kl_loss_39": 1140.7,
"kl_loss_7": 3568.4,
"learning_rate": 4.194577128396521e-05,
"loss": 4954.85,
"step": 8700
},
{
"ce_loss_13": 2.9441056907176972,
"ce_loss_26": 2.5146015286445618,
"ce_loss_39": 2.0488742887973785,
"ce_loss_52": 1.4835967749357224,
"ce_loss_7": 3.202489811182022,
"epoch": 0.871,
"grad_norm": 18.533833127334756,
"kl_loss_13": 3031.2,
"kl_loss_26": 2129.4,
"kl_loss_39": 1154.3,
"kl_loss_7": 3572.4,
"learning_rate": 4.1311937023518264e-05,
"loss": 4983.55,
"step": 8710
},
{
"ce_loss_13": 2.927008146047592,
"ce_loss_26": 2.4801330626010896,
"ce_loss_39": 2.0113667100667953,
"ce_loss_52": 1.4392758041620255,
"ce_loss_7": 3.1931580364704133,
"epoch": 0.872,
"grad_norm": 19.604731242944656,
"kl_loss_13": 3073.2,
"kl_loss_26": 2148.4,
"kl_loss_39": 1161.3,
"kl_loss_7": 3631.6,
"learning_rate": 4.0682721746773344e-05,
"loss": 4966.3,
"step": 8720
},
{
"ce_loss_13": 2.875244301557541,
"ce_loss_26": 2.4395941644906998,
"ce_loss_39": 1.9802403211593629,
"ce_loss_52": 1.425346952676773,
"ce_loss_7": 3.1323861300945284,
"epoch": 0.873,
"grad_norm": 19.084321030186736,
"kl_loss_13": 3020.0,
"kl_loss_26": 2114.6,
"kl_loss_39": 1135.6,
"kl_loss_7": 3558.4,
"learning_rate": 4.0058131789920904e-05,
"loss": 4966.9,
"step": 8730
},
{
"ce_loss_13": 2.91582133769989,
"ce_loss_26": 2.469817638397217,
"ce_loss_39": 1.9961349010467528,
"ce_loss_52": 1.4161934450268745,
"ce_loss_7": 3.1843641221523287,
"epoch": 0.874,
"grad_norm": 19.701093771188038,
"kl_loss_13": 3130.0,
"kl_loss_26": 2205.0,
"kl_loss_39": 1187.7,
"kl_loss_7": 3684.8,
"learning_rate": 3.9438173442575e-05,
"loss": 4920.3,
"step": 8740
},
{
"ce_loss_13": 2.931247502565384,
"ce_loss_26": 2.499329847097397,
"ce_loss_39": 2.038593566417694,
"ce_loss_52": 1.4660022050142287,
"ce_loss_7": 3.1989371538162232,
"epoch": 0.875,
"grad_norm": 19.499459693858824,
"kl_loss_13": 3039.6,
"kl_loss_26": 2133.8,
"kl_loss_39": 1152.2,
"kl_loss_7": 3588.0,
"learning_rate": 3.882285294770937e-05,
"loss": 4984.7,
"step": 8750
},
{
"ce_loss_13": 2.903587061166763,
"ce_loss_26": 2.4606124222278596,
"ce_loss_39": 1.9871950060129167,
"ce_loss_52": 1.3980468481779098,
"ce_loss_7": 3.17354930639267,
"epoch": 0.876,
"grad_norm": 19.253090654259744,
"kl_loss_13": 3078.4,
"kl_loss_26": 2163.8,
"kl_loss_39": 1181.0,
"kl_loss_7": 3644.8,
"learning_rate": 3.821217650159453e-05,
"loss": 4982.75,
"step": 8760
},
{
"ce_loss_13": 2.820340207219124,
"ce_loss_26": 2.3951667070388796,
"ce_loss_39": 1.9451639890670775,
"ce_loss_52": 1.4168721199035645,
"ce_loss_7": 3.074656307697296,
"epoch": 0.877,
"grad_norm": 19.190959579311908,
"kl_loss_13": 2910.8,
"kl_loss_26": 2031.2,
"kl_loss_39": 1086.2,
"kl_loss_7": 3448.8,
"learning_rate": 3.760615025373543e-05,
"loss": 4941.25,
"step": 8770
},
{
"ce_loss_13": 2.9391641199588774,
"ce_loss_26": 2.5044194877147676,
"ce_loss_39": 2.0340330809354783,
"ce_loss_52": 1.4550057530403138,
"ce_loss_7": 3.200497591495514,
"epoch": 0.878,
"grad_norm": 19.218011314135488,
"kl_loss_13": 3063.6,
"kl_loss_26": 2155.8,
"kl_loss_39": 1164.4,
"kl_loss_7": 3604.8,
"learning_rate": 3.700478030680987e-05,
"loss": 4989.0,
"step": 8780
},
{
"ce_loss_13": 2.9181353628635405,
"ce_loss_26": 2.484974616765976,
"ce_loss_39": 2.0213694095611574,
"ce_loss_52": 1.4473600834608078,
"ce_loss_7": 3.1764037668704987,
"epoch": 0.879,
"grad_norm": 18.881217884773644,
"kl_loss_13": 3048.4,
"kl_loss_26": 2145.6,
"kl_loss_39": 1162.1,
"kl_loss_7": 3594.8,
"learning_rate": 3.6408072716606344e-05,
"loss": 4996.95,
"step": 8790
},
{
"ce_loss_13": 2.878078305721283,
"ce_loss_26": 2.445318901538849,
"ce_loss_39": 1.9787369549274445,
"ce_loss_52": 1.4154168665409088,
"ce_loss_7": 3.13910374045372,
"epoch": 0.88,
"grad_norm": 19.232153237296814,
"kl_loss_13": 3023.6,
"kl_loss_26": 2120.2,
"kl_loss_39": 1138.8,
"kl_loss_7": 3570.8,
"learning_rate": 3.5816033491963716e-05,
"loss": 4957.2,
"step": 8800
},
{
"ce_loss_13": 2.8982558727264403,
"ce_loss_26": 2.4716543793678283,
"ce_loss_39": 2.0096321552991867,
"ce_loss_52": 1.4367865800857544,
"ce_loss_7": 3.162083399295807,
"epoch": 0.881,
"grad_norm": 19.92275213516334,
"kl_loss_13": 3003.6,
"kl_loss_26": 2113.0,
"kl_loss_39": 1151.0,
"kl_loss_7": 3555.2,
"learning_rate": 3.522866859471047e-05,
"loss": 4925.7,
"step": 8810
},
{
"ce_loss_13": 2.9355869591236115,
"ce_loss_26": 2.495421326160431,
"ce_loss_39": 2.0324677735567094,
"ce_loss_52": 1.4521033734083175,
"ce_loss_7": 3.20084969997406,
"epoch": 0.882,
"grad_norm": 18.63804720981334,
"kl_loss_13": 3093.2,
"kl_loss_26": 2164.2,
"kl_loss_39": 1179.6,
"kl_loss_7": 3645.6,
"learning_rate": 3.46459839396045e-05,
"loss": 5007.2,
"step": 8820
},
{
"ce_loss_13": 2.9325197875499724,
"ce_loss_26": 2.4879075407981874,
"ce_loss_39": 2.0197067618370057,
"ce_loss_52": 1.4270877152681352,
"ce_loss_7": 3.198145306110382,
"epoch": 0.883,
"grad_norm": 18.241829484742485,
"kl_loss_13": 3112.0,
"kl_loss_26": 2188.6,
"kl_loss_39": 1188.1,
"kl_loss_7": 3676.4,
"learning_rate": 3.406798539427386e-05,
"loss": 4970.75,
"step": 8830
},
{
"ce_loss_13": 2.9144038438796995,
"ce_loss_26": 2.490026795864105,
"ce_loss_39": 2.0250850170850754,
"ce_loss_52": 1.4695867449045181,
"ce_loss_7": 3.171018958091736,
"epoch": 0.884,
"grad_norm": 19.295732101275807,
"kl_loss_13": 3016.4,
"kl_loss_26": 2109.6,
"kl_loss_39": 1133.6,
"kl_loss_7": 3556.0,
"learning_rate": 3.349467877915746e-05,
"loss": 4929.15,
"step": 8840
},
{
"ce_loss_13": 2.9367240130901338,
"ce_loss_26": 2.5029721915721894,
"ce_loss_39": 2.038401874899864,
"ce_loss_52": 1.4602129399776458,
"ce_loss_7": 3.1973277926445007,
"epoch": 0.885,
"grad_norm": 18.483553110198095,
"kl_loss_13": 3071.2,
"kl_loss_26": 2165.0,
"kl_loss_39": 1179.3,
"kl_loss_7": 3608.0,
"learning_rate": 3.292606986744667e-05,
"loss": 4997.15,
"step": 8850
},
{
"ce_loss_13": 2.962205785512924,
"ce_loss_26": 2.5182169795036318,
"ce_loss_39": 2.0426361471414567,
"ce_loss_52": 1.4736472845077515,
"ce_loss_7": 3.2304830133914946,
"epoch": 0.886,
"grad_norm": 19.501438521011444,
"kl_loss_13": 3089.6,
"kl_loss_26": 2164.0,
"kl_loss_39": 1168.6,
"kl_loss_7": 3642.4,
"learning_rate": 3.23621643850267e-05,
"loss": 4957.35,
"step": 8860
},
{
"ce_loss_13": 2.8543384969234467,
"ce_loss_26": 2.4200983941555023,
"ce_loss_39": 1.9575607985258103,
"ce_loss_52": 1.3949070930480958,
"ce_loss_7": 3.1160971879959107,
"epoch": 0.887,
"grad_norm": 19.320586944244614,
"kl_loss_13": 3009.6,
"kl_loss_26": 2108.8,
"kl_loss_39": 1139.1,
"kl_loss_7": 3548.0,
"learning_rate": 3.180296801041971e-05,
"loss": 4940.1,
"step": 8870
},
{
"ce_loss_13": 2.8845052778720857,
"ce_loss_26": 2.456773716211319,
"ce_loss_39": 1.9930354177951812,
"ce_loss_52": 1.4307963967323303,
"ce_loss_7": 3.144515538215637,
"epoch": 0.888,
"grad_norm": 19.659033917356428,
"kl_loss_13": 2998.8,
"kl_loss_26": 2105.2,
"kl_loss_39": 1137.6,
"kl_loss_7": 3543.2,
"learning_rate": 3.124848637472688e-05,
"loss": 4952.5,
"step": 8880
},
{
"ce_loss_13": 2.8944738626480104,
"ce_loss_26": 2.4629203975200653,
"ce_loss_39": 1.9913650721311569,
"ce_loss_52": 1.4339269563555717,
"ce_loss_7": 3.1572672605514525,
"epoch": 0.889,
"grad_norm": 18.88843635803768,
"kl_loss_13": 3032.4,
"kl_loss_26": 2127.8,
"kl_loss_39": 1135.0,
"kl_loss_7": 3579.2,
"learning_rate": 3.069872506157212e-05,
"loss": 4974.35,
"step": 8890
},
{
"ce_loss_13": 2.8339054346084596,
"ce_loss_26": 2.394126781821251,
"ce_loss_39": 1.9339392215013504,
"ce_loss_52": 1.3971292108297348,
"ce_loss_7": 3.0929621160030365,
"epoch": 0.89,
"grad_norm": 18.978687158228045,
"kl_loss_13": 2982.0,
"kl_loss_26": 2066.8,
"kl_loss_39": 1100.8,
"kl_loss_7": 3526.4,
"learning_rate": 3.0153689607045842e-05,
"loss": 4941.5,
"step": 8900
},
{
"ce_loss_13": 2.8811903417110445,
"ce_loss_26": 2.440036287903786,
"ce_loss_39": 1.975409933924675,
"ce_loss_52": 1.416017021238804,
"ce_loss_7": 3.1414348661899565,
"epoch": 0.891,
"grad_norm": 19.560639422793763,
"kl_loss_13": 3040.4,
"kl_loss_26": 2123.2,
"kl_loss_39": 1133.6,
"kl_loss_7": 3586.4,
"learning_rate": 2.9613385499648926e-05,
"loss": 4965.6,
"step": 8910
},
{
"ce_loss_13": 2.8596278965473174,
"ce_loss_26": 2.4295035183429716,
"ce_loss_39": 1.9728092432022095,
"ce_loss_52": 1.4283979684114456,
"ce_loss_7": 3.1134236633777617,
"epoch": 0.892,
"grad_norm": 18.908746612036932,
"kl_loss_13": 2974.8,
"kl_loss_26": 2082.0,
"kl_loss_39": 1115.7,
"kl_loss_7": 3505.2,
"learning_rate": 2.9077818180237692e-05,
"loss": 5007.95,
"step": 8920
},
{
"ce_loss_13": 2.8744696974754333,
"ce_loss_26": 2.4505746215581894,
"ce_loss_39": 1.9992403596639634,
"ce_loss_52": 1.4506706580519677,
"ce_loss_7": 3.13488364815712,
"epoch": 0.893,
"grad_norm": 18.957478200982788,
"kl_loss_13": 2985.6,
"kl_loss_26": 2094.6,
"kl_loss_39": 1129.4,
"kl_loss_7": 3523.6,
"learning_rate": 2.8546993041969172e-05,
"loss": 4940.15,
"step": 8930
},
{
"ce_loss_13": 2.8975345969200133,
"ce_loss_26": 2.4653249740600587,
"ce_loss_39": 2.0007053166627884,
"ce_loss_52": 1.4314887911081313,
"ce_loss_7": 3.155570811033249,
"epoch": 0.894,
"grad_norm": 18.739265695771255,
"kl_loss_13": 3012.4,
"kl_loss_26": 2117.2,
"kl_loss_39": 1151.3,
"kl_loss_7": 3551.6,
"learning_rate": 2.802091543024671e-05,
"loss": 4940.05,
"step": 8940
},
{
"ce_loss_13": 2.9072438359260557,
"ce_loss_26": 2.471187961101532,
"ce_loss_39": 2.0144962787628176,
"ce_loss_52": 1.42624132335186,
"ce_loss_7": 3.1694943487644194,
"epoch": 0.895,
"grad_norm": 19.21731826372691,
"kl_loss_13": 3062.0,
"kl_loss_26": 2168.4,
"kl_loss_39": 1196.7,
"kl_loss_7": 3618.8,
"learning_rate": 2.7499590642665774e-05,
"loss": 4979.0,
"step": 8950
},
{
"ce_loss_13": 2.893015044927597,
"ce_loss_26": 2.4596860975027086,
"ce_loss_39": 2.0018325716257097,
"ce_loss_52": 1.4285719782114028,
"ce_loss_7": 3.154858148097992,
"epoch": 0.896,
"grad_norm": 18.801842424478984,
"kl_loss_13": 3027.6,
"kl_loss_26": 2117.0,
"kl_loss_39": 1153.1,
"kl_loss_7": 3566.0,
"learning_rate": 2.6983023928961405e-05,
"loss": 4959.6,
"step": 8960
},
{
"ce_loss_13": 2.8546026587486266,
"ce_loss_26": 2.4183767944574357,
"ce_loss_39": 1.956614688038826,
"ce_loss_52": 1.3939528629183768,
"ce_loss_7": 3.1231652200222015,
"epoch": 0.897,
"grad_norm": 19.808391264092982,
"kl_loss_13": 3029.2,
"kl_loss_26": 2118.8,
"kl_loss_39": 1139.0,
"kl_loss_7": 3584.4,
"learning_rate": 2.6471220490954628e-05,
"loss": 4973.5,
"step": 8970
},
{
"ce_loss_13": 2.9009016394615172,
"ce_loss_26": 2.467138040065765,
"ce_loss_39": 2.0069568186998366,
"ce_loss_52": 1.4654083251953125,
"ce_loss_7": 3.155901938676834,
"epoch": 0.898,
"grad_norm": 19.041913132666583,
"kl_loss_13": 2967.2,
"kl_loss_26": 2069.0,
"kl_loss_39": 1105.9,
"kl_loss_7": 3506.0,
"learning_rate": 2.596418548250029e-05,
"loss": 4886.7,
"step": 8980
},
{
"ce_loss_13": 2.8516535699367522,
"ce_loss_26": 2.426672577857971,
"ce_loss_39": 1.9635347902774811,
"ce_loss_52": 1.419031423330307,
"ce_loss_7": 3.112126684188843,
"epoch": 0.899,
"grad_norm": 19.026364280537617,
"kl_loss_13": 2990.8,
"kl_loss_26": 2103.8,
"kl_loss_39": 1123.3,
"kl_loss_7": 3530.0,
"learning_rate": 2.5461924009435368e-05,
"loss": 4885.15,
"step": 8990
},
{
"ce_loss_13": 2.870776003599167,
"ce_loss_26": 2.4326407968997956,
"ce_loss_39": 1.9669345051050187,
"ce_loss_52": 1.420183390378952,
"ce_loss_7": 3.127993369102478,
"epoch": 0.9,
"grad_norm": 18.943171008960356,
"kl_loss_13": 3002.4,
"kl_loss_26": 2090.0,
"kl_loss_39": 1108.9,
"kl_loss_7": 3537.6,
"learning_rate": 2.4964441129527336e-05,
"loss": 4949.55,
"step": 9000
},
{
"ce_loss_13": 2.91824157834053,
"ce_loss_26": 2.4817179054021836,
"ce_loss_39": 2.015528929233551,
"ce_loss_52": 1.434774386882782,
"ce_loss_7": 3.181524306535721,
"epoch": 0.901,
"grad_norm": 19.717113587964203,
"kl_loss_13": 3071.6,
"kl_loss_26": 2176.4,
"kl_loss_39": 1186.6,
"kl_loss_7": 3618.4,
"learning_rate": 2.4471741852423235e-05,
"loss": 4970.45,
"step": 9010
},
{
"ce_loss_13": 2.8203544914722443,
"ce_loss_26": 2.394831323623657,
"ce_loss_39": 1.9447649121284485,
"ce_loss_52": 1.3895054385066032,
"ce_loss_7": 3.0815266370773315,
"epoch": 0.902,
"grad_norm": 19.201709482987514,
"kl_loss_13": 2963.6,
"kl_loss_26": 2077.6,
"kl_loss_39": 1127.3,
"kl_loss_7": 3508.4,
"learning_rate": 2.3983831139599287e-05,
"loss": 4939.65,
"step": 9020
},
{
"ce_loss_13": 2.8914650082588196,
"ce_loss_26": 2.46552118062973,
"ce_loss_39": 1.9927147597074508,
"ce_loss_52": 1.4259023681282996,
"ce_loss_7": 3.1594585537910462,
"epoch": 0.903,
"grad_norm": 18.327777842458563,
"kl_loss_13": 3054.4,
"kl_loss_26": 2152.4,
"kl_loss_39": 1157.6,
"kl_loss_7": 3607.6,
"learning_rate": 2.3500713904311022e-05,
"loss": 4963.8,
"step": 9030
},
{
"ce_loss_13": 2.8807626605033874,
"ce_loss_26": 2.432952329516411,
"ce_loss_39": 1.9567799299955368,
"ce_loss_52": 1.4073475629091263,
"ce_loss_7": 3.1469713926315306,
"epoch": 0.904,
"grad_norm": 20.180916839551323,
"kl_loss_13": 3056.0,
"kl_loss_26": 2131.4,
"kl_loss_39": 1132.8,
"kl_loss_7": 3607.6,
"learning_rate": 2.3022395011543685e-05,
"loss": 4930.8,
"step": 9040
},
{
"ce_loss_13": 2.895359253883362,
"ce_loss_26": 2.4620601534843445,
"ce_loss_39": 1.9888764083385468,
"ce_loss_52": 1.4224872916936875,
"ce_loss_7": 3.1553323328495027,
"epoch": 0.905,
"grad_norm": 19.515330390455173,
"kl_loss_13": 3061.2,
"kl_loss_26": 2157.8,
"kl_loss_39": 1165.6,
"kl_loss_7": 3605.6,
"learning_rate": 2.2548879277963063e-05,
"loss": 4965.65,
"step": 9050
},
{
"ce_loss_13": 2.913999766111374,
"ce_loss_26": 2.4826299071311952,
"ce_loss_39": 2.025396314263344,
"ce_loss_52": 1.4647169053554534,
"ce_loss_7": 3.1696211397647858,
"epoch": 0.906,
"grad_norm": 19.958942164442686,
"kl_loss_13": 2998.4,
"kl_loss_26": 2104.6,
"kl_loss_39": 1143.0,
"kl_loss_7": 3532.8,
"learning_rate": 2.208017147186736e-05,
"loss": 4953.35,
"step": 9060
},
{
"ce_loss_13": 2.931579887866974,
"ce_loss_26": 2.497947371006012,
"ce_loss_39": 2.0416529774665833,
"ce_loss_52": 1.4668794304132462,
"ce_loss_7": 3.1927560210227965,
"epoch": 0.907,
"grad_norm": 18.915099363205265,
"kl_loss_13": 3050.8,
"kl_loss_26": 2150.2,
"kl_loss_39": 1170.4,
"kl_loss_7": 3596.4,
"learning_rate": 2.1616276313139227e-05,
"loss": 4967.35,
"step": 9070
},
{
"ce_loss_13": 2.8379551649093626,
"ce_loss_26": 2.4058648884296416,
"ce_loss_39": 1.9522909700870514,
"ce_loss_52": 1.401316450536251,
"ce_loss_7": 3.096262776851654,
"epoch": 0.908,
"grad_norm": 17.939476679877323,
"kl_loss_13": 2996.4,
"kl_loss_26": 2103.4,
"kl_loss_39": 1130.6,
"kl_loss_7": 3545.6,
"learning_rate": 2.1157198473197415e-05,
"loss": 4983.65,
"step": 9080
},
{
"ce_loss_13": 2.9229146242141724,
"ce_loss_26": 2.4857192397117616,
"ce_loss_39": 2.014718788862228,
"ce_loss_52": 1.4371613681316375,
"ce_loss_7": 3.192232495546341,
"epoch": 0.909,
"grad_norm": 19.195813577565207,
"kl_loss_13": 3088.8,
"kl_loss_26": 2172.6,
"kl_loss_39": 1177.9,
"kl_loss_7": 3649.2,
"learning_rate": 2.0702942574950812e-05,
"loss": 4961.5,
"step": 9090
},
{
"ce_loss_13": 2.9034866452217103,
"ce_loss_26": 2.4769316017627716,
"ce_loss_39": 2.006492680311203,
"ce_loss_52": 1.4311101764440537,
"ce_loss_7": 3.164641487598419,
"epoch": 0.91,
"grad_norm": 18.881356963654913,
"kl_loss_13": 3059.6,
"kl_loss_26": 2172.6,
"kl_loss_39": 1174.0,
"kl_loss_7": 3598.4,
"learning_rate": 2.025351319275137e-05,
"loss": 4952.9,
"step": 9100
},
{
"ce_loss_13": 2.915982037782669,
"ce_loss_26": 2.4739193379879,
"ce_loss_39": 2.002902591228485,
"ce_loss_52": 1.4319583177566528,
"ce_loss_7": 3.17461501955986,
"epoch": 0.911,
"grad_norm": 18.844750958897702,
"kl_loss_13": 3071.6,
"kl_loss_26": 2158.2,
"kl_loss_39": 1169.6,
"kl_loss_7": 3613.6,
"learning_rate": 1.9808914852347816e-05,
"loss": 4969.1,
"step": 9110
},
{
"ce_loss_13": 2.9571076393127442,
"ce_loss_26": 2.5076956033706663,
"ce_loss_39": 2.0347861379384993,
"ce_loss_52": 1.4524286478757857,
"ce_loss_7": 3.2245921969413756,
"epoch": 0.912,
"grad_norm": 18.516316183492112,
"kl_loss_13": 3096.4,
"kl_loss_26": 2170.0,
"kl_loss_39": 1183.2,
"kl_loss_7": 3656.0,
"learning_rate": 1.9369152030840554e-05,
"loss": 4969.7,
"step": 9120
},
{
"ce_loss_13": 2.8493692874908447,
"ce_loss_26": 2.4184423595666886,
"ce_loss_39": 1.9636689513921737,
"ce_loss_52": 1.4161925345659256,
"ce_loss_7": 3.1067621290683745,
"epoch": 0.913,
"grad_norm": 19.557541417790066,
"kl_loss_13": 2969.6,
"kl_loss_26": 2073.0,
"kl_loss_39": 1110.7,
"kl_loss_7": 3508.8,
"learning_rate": 1.893422915663645e-05,
"loss": 4967.05,
"step": 9130
},
{
"ce_loss_13": 2.950014758110046,
"ce_loss_26": 2.5281428694725037,
"ce_loss_39": 2.0640300661325455,
"ce_loss_52": 1.4911428451538087,
"ce_loss_7": 3.20717169046402,
"epoch": 0.914,
"grad_norm": 19.126297962103095,
"kl_loss_13": 3062.4,
"kl_loss_26": 2166.6,
"kl_loss_39": 1184.2,
"kl_loss_7": 3597.2,
"learning_rate": 1.850415060940386e-05,
"loss": 4910.65,
"step": 9140
},
{
"ce_loss_13": 2.9050391018390656,
"ce_loss_26": 2.4736091554164887,
"ce_loss_39": 2.017327818274498,
"ce_loss_52": 1.4512428998947144,
"ce_loss_7": 3.162110447883606,
"epoch": 0.915,
"grad_norm": 18.914613896159423,
"kl_loss_13": 3018.0,
"kl_loss_26": 2122.6,
"kl_loss_39": 1153.5,
"kl_loss_7": 3547.6,
"learning_rate": 1.8078920720028978e-05,
"loss": 4898.6,
"step": 9150
},
{
"ce_loss_13": 2.8679368257522584,
"ce_loss_26": 2.4424335032701494,
"ce_loss_39": 1.9891292452812195,
"ce_loss_52": 1.4451990023255348,
"ce_loss_7": 3.127324694395065,
"epoch": 0.916,
"grad_norm": 20.019541856401887,
"kl_loss_13": 2959.2,
"kl_loss_26": 2064.4,
"kl_loss_39": 1098.8,
"kl_loss_7": 3496.8,
"learning_rate": 1.765854377057219e-05,
"loss": 4940.15,
"step": 9160
},
{
"ce_loss_13": 2.873015010356903,
"ce_loss_26": 2.433028203248978,
"ce_loss_39": 1.9746823519468308,
"ce_loss_52": 1.407148177921772,
"ce_loss_7": 3.1320842862129212,
"epoch": 0.917,
"grad_norm": 18.323389911529343,
"kl_loss_13": 3046.8,
"kl_loss_26": 2143.6,
"kl_loss_39": 1161.3,
"kl_loss_7": 3586.0,
"learning_rate": 1.724302399422456e-05,
"loss": 4937.75,
"step": 9170
},
{
"ce_loss_13": 2.864003378152847,
"ce_loss_26": 2.4368703365325928,
"ce_loss_39": 1.978201287984848,
"ce_loss_52": 1.4284173011779786,
"ce_loss_7": 3.117653822898865,
"epoch": 0.918,
"grad_norm": 19.851903614612837,
"kl_loss_13": 2960.4,
"kl_loss_26": 2069.6,
"kl_loss_39": 1110.5,
"kl_loss_7": 3494.8,
"learning_rate": 1.683236557526574e-05,
"loss": 4948.85,
"step": 9180
},
{
"ce_loss_13": 2.841529107093811,
"ce_loss_26": 2.4092674642801284,
"ce_loss_39": 1.9591031044721603,
"ce_loss_52": 1.4043558463454247,
"ce_loss_7": 3.099020904302597,
"epoch": 0.919,
"grad_norm": 18.98858399937095,
"kl_loss_13": 2971.2,
"kl_loss_26": 2076.4,
"kl_loss_39": 1122.0,
"kl_loss_7": 3508.0,
"learning_rate": 1.6426572649021475e-05,
"loss": 4944.1,
"step": 9190
},
{
"ce_loss_13": 2.902883565425873,
"ce_loss_26": 2.4624376207590104,
"ce_loss_39": 2.0033867925405504,
"ce_loss_52": 1.4482421904802323,
"ce_loss_7": 3.1540717780590057,
"epoch": 0.92,
"grad_norm": 19.524164167367193,
"kl_loss_13": 3016.4,
"kl_loss_26": 2114.8,
"kl_loss_39": 1136.7,
"kl_loss_7": 3542.4,
"learning_rate": 1.6025649301821876e-05,
"loss": 4936.95,
"step": 9200
},
{
"ce_loss_13": 2.962231194972992,
"ce_loss_26": 2.5141273856163027,
"ce_loss_39": 2.038061347603798,
"ce_loss_52": 1.4573458433151245,
"ce_loss_7": 3.22137930393219,
"epoch": 0.921,
"grad_norm": 19.07781770056793,
"kl_loss_13": 3091.6,
"kl_loss_26": 2174.2,
"kl_loss_39": 1179.6,
"kl_loss_7": 3638.0,
"learning_rate": 1.5629599570960716e-05,
"loss": 4931.05,
"step": 9210
},
{
"ce_loss_13": 2.828860414028168,
"ce_loss_26": 2.394576147198677,
"ce_loss_39": 1.940834417939186,
"ce_loss_52": 1.396960550546646,
"ce_loss_7": 3.0943815410137177,
"epoch": 0.922,
"grad_norm": 18.68562598066032,
"kl_loss_13": 2986.4,
"kl_loss_26": 2085.0,
"kl_loss_39": 1113.6,
"kl_loss_7": 3535.2,
"learning_rate": 1.5238427444654367e-05,
"loss": 4919.35,
"step": 9220
},
{
"ce_loss_13": 2.854993385076523,
"ce_loss_26": 2.4067456245422365,
"ce_loss_39": 1.9481880724430085,
"ce_loss_52": 1.392129084467888,
"ce_loss_7": 3.119863528013229,
"epoch": 0.923,
"grad_norm": 19.56628173058375,
"kl_loss_13": 3048.8,
"kl_loss_26": 2126.2,
"kl_loss_39": 1141.3,
"kl_loss_7": 3609.6,
"learning_rate": 1.4852136862001764e-05,
"loss": 4956.25,
"step": 9230
},
{
"ce_loss_13": 2.8672266066074372,
"ce_loss_26": 2.428171756863594,
"ce_loss_39": 1.967655423283577,
"ce_loss_52": 1.4228445023298264,
"ce_loss_7": 3.12925271987915,
"epoch": 0.924,
"grad_norm": 18.655136558750065,
"kl_loss_13": 3020.8,
"kl_loss_26": 2114.0,
"kl_loss_39": 1134.6,
"kl_loss_7": 3562.4,
"learning_rate": 1.4470731712944884e-05,
"loss": 4914.5,
"step": 9240
},
{
"ce_loss_13": 2.967056131362915,
"ce_loss_26": 2.527100908756256,
"ce_loss_39": 2.0592952966690063,
"ce_loss_52": 1.466832235455513,
"ce_loss_7": 3.2317879140377044,
"epoch": 0.925,
"grad_norm": 18.755830587214348,
"kl_loss_13": 3074.0,
"kl_loss_26": 2172.6,
"kl_loss_39": 1194.2,
"kl_loss_7": 3633.2,
"learning_rate": 1.4094215838229174e-05,
"loss": 4941.0,
"step": 9250
},
{
"ce_loss_13": 2.8956347942352294,
"ce_loss_26": 2.4609649628400803,
"ce_loss_39": 1.998116421699524,
"ce_loss_52": 1.4327284812927246,
"ce_loss_7": 3.1544252693653108,
"epoch": 0.926,
"grad_norm": 19.440875104184542,
"kl_loss_13": 3037.6,
"kl_loss_26": 2133.0,
"kl_loss_39": 1149.6,
"kl_loss_7": 3582.4,
"learning_rate": 1.372259302936546e-05,
"loss": 4929.25,
"step": 9260
},
{
"ce_loss_13": 2.818482467532158,
"ce_loss_26": 2.3888671875,
"ce_loss_39": 1.9417572438716888,
"ce_loss_52": 1.3873827829957008,
"ce_loss_7": 3.0732292413711546,
"epoch": 0.927,
"grad_norm": 19.09848340283336,
"kl_loss_13": 2988.4,
"kl_loss_26": 2096.8,
"kl_loss_39": 1136.6,
"kl_loss_7": 3519.2,
"learning_rate": 1.3355867028591206e-05,
"loss": 4917.85,
"step": 9270
},
{
"ce_loss_13": 2.8812867999076843,
"ce_loss_26": 2.445907565951347,
"ce_loss_39": 1.9824917227029801,
"ce_loss_52": 1.4204061418771743,
"ce_loss_7": 3.1463906168937683,
"epoch": 0.928,
"grad_norm": 19.73371377973639,
"kl_loss_13": 3015.6,
"kl_loss_26": 2109.8,
"kl_loss_39": 1132.3,
"kl_loss_7": 3565.2,
"learning_rate": 1.2994041528833267e-05,
"loss": 4914.15,
"step": 9280
},
{
"ce_loss_13": 2.989528793096542,
"ce_loss_26": 2.545223152637482,
"ce_loss_39": 2.0668440997600555,
"ce_loss_52": 1.4640702456235886,
"ce_loss_7": 3.2542518198490145,
"epoch": 0.929,
"grad_norm": 18.497071159749588,
"kl_loss_13": 3146.0,
"kl_loss_26": 2234.4,
"kl_loss_39": 1221.2,
"kl_loss_7": 3704.8,
"learning_rate": 1.2637120173670358e-05,
"loss": 4971.25,
"step": 9290
},
{
"ce_loss_13": 2.9433493435382845,
"ce_loss_26": 2.503718575835228,
"ce_loss_39": 2.029115191102028,
"ce_loss_52": 1.4293665170669556,
"ce_loss_7": 3.21596360206604,
"epoch": 0.93,
"grad_norm": 19.233646690177977,
"kl_loss_13": 3119.2,
"kl_loss_26": 2210.0,
"kl_loss_39": 1209.9,
"kl_loss_7": 3688.8,
"learning_rate": 1.2285106557296478e-05,
"loss": 4970.8,
"step": 9300
},
{
"ce_loss_13": 2.8525869846343994,
"ce_loss_26": 2.4185830265283585,
"ce_loss_39": 1.9529170453548432,
"ce_loss_52": 1.4022331610321999,
"ce_loss_7": 3.112831687927246,
"epoch": 0.931,
"grad_norm": 19.01919083076588,
"kl_loss_13": 3012.8,
"kl_loss_26": 2100.6,
"kl_loss_39": 1120.5,
"kl_loss_7": 3542.0,
"learning_rate": 1.1938004224484989e-05,
"loss": 4934.7,
"step": 9310
},
{
"ce_loss_13": 2.9074361979961396,
"ce_loss_26": 2.477645492553711,
"ce_loss_39": 2.010756382346153,
"ce_loss_52": 1.4430534109473228,
"ce_loss_7": 3.1718304812908173,
"epoch": 0.932,
"grad_norm": 18.572431056907458,
"kl_loss_13": 3020.0,
"kl_loss_26": 2116.0,
"kl_loss_39": 1144.2,
"kl_loss_7": 3575.6,
"learning_rate": 1.1595816670552429e-05,
"loss": 4913.95,
"step": 9320
},
{
"ce_loss_13": 2.8636857986450197,
"ce_loss_26": 2.424144572019577,
"ce_loss_39": 1.9600117355585098,
"ce_loss_52": 1.402983972430229,
"ce_loss_7": 3.1251452922821046,
"epoch": 0.933,
"grad_norm": 18.288942605726792,
"kl_loss_13": 3044.0,
"kl_loss_26": 2129.6,
"kl_loss_39": 1139.2,
"kl_loss_7": 3581.6,
"learning_rate": 1.1258547341323699e-05,
"loss": 4937.25,
"step": 9330
},
{
"ce_loss_13": 2.893140608072281,
"ce_loss_26": 2.457077306509018,
"ce_loss_39": 2.003238731622696,
"ce_loss_52": 1.4437968581914902,
"ce_loss_7": 3.1531366109848022,
"epoch": 0.934,
"grad_norm": 18.739319955640763,
"kl_loss_13": 3019.6,
"kl_loss_26": 2115.0,
"kl_loss_39": 1131.3,
"kl_loss_7": 3558.4,
"learning_rate": 1.0926199633097156e-05,
"loss": 4899.9,
"step": 9340
},
{
"ce_loss_13": 2.9001421511173247,
"ce_loss_26": 2.4687224984169007,
"ce_loss_39": 2.0006180971860887,
"ce_loss_52": 1.4220335900783538,
"ce_loss_7": 3.1691121637821196,
"epoch": 0.935,
"grad_norm": 19.392869535691936,
"kl_loss_13": 3054.0,
"kl_loss_26": 2149.8,
"kl_loss_39": 1176.1,
"kl_loss_7": 3619.2,
"learning_rate": 1.0598776892610684e-05,
"loss": 4922.25,
"step": 9350
},
{
"ce_loss_13": 2.953709363937378,
"ce_loss_26": 2.5250791788101195,
"ce_loss_39": 2.0616777926683425,
"ce_loss_52": 1.5004188895225525,
"ce_loss_7": 3.2059156119823458,
"epoch": 0.936,
"grad_norm": 18.98607482508187,
"kl_loss_13": 3007.2,
"kl_loss_26": 2107.0,
"kl_loss_39": 1138.6,
"kl_loss_7": 3536.0,
"learning_rate": 1.0276282417007399e-05,
"loss": 4935.75,
"step": 9360
},
{
"ce_loss_13": 2.902603155374527,
"ce_loss_26": 2.464896833896637,
"ce_loss_39": 2.0031634330749513,
"ce_loss_52": 1.4478828191757203,
"ce_loss_7": 3.166324245929718,
"epoch": 0.937,
"grad_norm": 18.72231921789515,
"kl_loss_13": 3025.6,
"kl_loss_26": 2120.4,
"kl_loss_39": 1137.7,
"kl_loss_7": 3574.4,
"learning_rate": 9.958719453803277e-06,
"loss": 4933.2,
"step": 9370
},
{
"ce_loss_13": 2.878055286407471,
"ce_loss_26": 2.4367445170879365,
"ce_loss_39": 1.9698922991752625,
"ce_loss_52": 1.40206458568573,
"ce_loss_7": 3.1407361745834352,
"epoch": 0.938,
"grad_norm": 19.520797823561637,
"kl_loss_13": 3045.6,
"kl_loss_26": 2130.6,
"kl_loss_39": 1145.7,
"kl_loss_7": 3591.6,
"learning_rate": 9.646091200853802e-06,
"loss": 4932.45,
"step": 9380
},
{
"ce_loss_13": 2.8573631644248962,
"ce_loss_26": 2.429997554421425,
"ce_loss_39": 1.9779304087162017,
"ce_loss_52": 1.4321624323725701,
"ce_loss_7": 3.119151920080185,
"epoch": 0.939,
"grad_norm": 18.61104788500602,
"kl_loss_13": 2968.4,
"kl_loss_26": 2075.6,
"kl_loss_39": 1113.7,
"kl_loss_7": 3509.6,
"learning_rate": 9.338400806321978e-06,
"loss": 4899.9,
"step": 9390
},
{
"ce_loss_13": 2.8828431129455567,
"ce_loss_26": 2.4453956365585325,
"ce_loss_39": 1.986677783727646,
"ce_loss_52": 1.4324709355831147,
"ce_loss_7": 3.1462887287139893,
"epoch": 0.94,
"grad_norm": 18.660409146960177,
"kl_loss_13": 3006.4,
"kl_loss_26": 2102.2,
"kl_loss_39": 1130.9,
"kl_loss_7": 3551.2,
"learning_rate": 9.035651368646646e-06,
"loss": 4963.1,
"step": 9400
},
{
"ce_loss_13": 2.856483778357506,
"ce_loss_26": 2.426860272884369,
"ce_loss_39": 1.9708759590983391,
"ce_loss_52": 1.4115710154175758,
"ce_loss_7": 3.114782178401947,
"epoch": 0.941,
"grad_norm": 19.55117077640538,
"kl_loss_13": 2986.0,
"kl_loss_26": 2096.2,
"kl_loss_39": 1131.2,
"kl_loss_7": 3526.4,
"learning_rate": 8.737845936511335e-06,
"loss": 4960.75,
"step": 9410
},
{
"ce_loss_13": 2.894274836778641,
"ce_loss_26": 2.454681032896042,
"ce_loss_39": 1.9826824754476546,
"ce_loss_52": 1.4298861980438233,
"ce_loss_7": 3.15040722489357,
"epoch": 0.942,
"grad_norm": 19.039583654377346,
"kl_loss_13": 3067.6,
"kl_loss_26": 2152.8,
"kl_loss_39": 1152.4,
"kl_loss_7": 3608.4,
"learning_rate": 8.444987508813451e-06,
"loss": 4899.6,
"step": 9420
},
{
"ce_loss_13": 2.9001412212848665,
"ce_loss_26": 2.4617854237556456,
"ce_loss_39": 1.999165838956833,
"ce_loss_52": 1.4294554442167282,
"ce_loss_7": 3.165439170598984,
"epoch": 0.943,
"grad_norm": 18.564983933864266,
"kl_loss_13": 3046.0,
"kl_loss_26": 2136.6,
"kl_loss_39": 1159.1,
"kl_loss_7": 3592.8,
"learning_rate": 8.157079034633974e-06,
"loss": 4920.3,
"step": 9430
},
{
"ce_loss_13": 2.863955610990524,
"ce_loss_26": 2.435519364476204,
"ce_loss_39": 1.9886516004800796,
"ce_loss_52": 1.4344154298305511,
"ce_loss_7": 3.1265052914619447,
"epoch": 0.944,
"grad_norm": 17.82647647549486,
"kl_loss_13": 2962.4,
"kl_loss_26": 2073.6,
"kl_loss_39": 1123.1,
"kl_loss_7": 3508.4,
"learning_rate": 7.874123413208145e-06,
"loss": 4921.7,
"step": 9440
},
{
"ce_loss_13": 2.8527204990386963,
"ce_loss_26": 2.418528434634209,
"ce_loss_39": 1.960913023352623,
"ce_loss_52": 1.4082367643713951,
"ce_loss_7": 3.118280106782913,
"epoch": 0.945,
"grad_norm": 17.642678200140654,
"kl_loss_13": 3000.8,
"kl_loss_26": 2093.6,
"kl_loss_39": 1127.9,
"kl_loss_7": 3547.2,
"learning_rate": 7.59612349389599e-06,
"loss": 4941.9,
"step": 9450
},
{
"ce_loss_13": 2.8983235955238342,
"ce_loss_26": 2.4708085656166077,
"ce_loss_39": 2.01363542675972,
"ce_loss_52": 1.4459212511777877,
"ce_loss_7": 3.1573162257671354,
"epoch": 0.946,
"grad_norm": 18.21137845155402,
"kl_loss_13": 3012.8,
"kl_loss_26": 2129.0,
"kl_loss_39": 1159.6,
"kl_loss_7": 3550.0,
"learning_rate": 7.323082076153509e-06,
"loss": 4932.45,
"step": 9460
},
{
"ce_loss_13": 2.8793884813785553,
"ce_loss_26": 2.444310560822487,
"ce_loss_39": 1.9878242909908295,
"ce_loss_52": 1.4219153225421906,
"ce_loss_7": 3.1359946370124816,
"epoch": 0.947,
"grad_norm": 19.11147526952516,
"kl_loss_13": 3000.4,
"kl_loss_26": 2106.2,
"kl_loss_39": 1141.4,
"kl_loss_7": 3539.6,
"learning_rate": 7.055001909504755e-06,
"loss": 4932.95,
"step": 9470
},
{
"ce_loss_13": 2.8483738005161285,
"ce_loss_26": 2.4169380724430085,
"ce_loss_39": 1.9552814781665802,
"ce_loss_52": 1.4036450207233429,
"ce_loss_7": 3.1039236187934875,
"epoch": 0.948,
"grad_norm": 19.227610169601164,
"kl_loss_13": 3000.4,
"kl_loss_26": 2102.0,
"kl_loss_39": 1125.6,
"kl_loss_7": 3530.0,
"learning_rate": 6.791885693514133e-06,
"loss": 4941.55,
"step": 9480
},
{
"ce_loss_13": 2.8693545699119567,
"ce_loss_26": 2.4362709283828736,
"ce_loss_39": 1.9619301795959472,
"ce_loss_52": 1.400461108982563,
"ce_loss_7": 3.133023035526276,
"epoch": 0.949,
"grad_norm": 19.323995399615697,
"kl_loss_13": 3058.8,
"kl_loss_26": 2146.6,
"kl_loss_39": 1149.1,
"kl_loss_7": 3608.8,
"learning_rate": 6.533736077758867e-06,
"loss": 4986.35,
"step": 9490
},
{
"ce_loss_13": 2.8667274117469788,
"ce_loss_26": 2.4240807622671126,
"ce_loss_39": 1.9586560875177383,
"ce_loss_52": 1.3980020493268968,
"ce_loss_7": 3.127706527709961,
"epoch": 0.95,
"grad_norm": 18.253118734633716,
"kl_loss_13": 3033.6,
"kl_loss_26": 2126.0,
"kl_loss_39": 1140.7,
"kl_loss_7": 3581.6,
"learning_rate": 6.2805556618028556e-06,
"loss": 4971.65,
"step": 9500
},
{
"ce_loss_13": 2.9265355467796326,
"ce_loss_26": 2.4994624704122543,
"ce_loss_39": 2.03882916867733,
"ce_loss_52": 1.4773303151130677,
"ce_loss_7": 3.1838342785835265,
"epoch": 0.951,
"grad_norm": 19.482478782354722,
"kl_loss_13": 2998.4,
"kl_loss_26": 2103.6,
"kl_loss_39": 1144.8,
"kl_loss_7": 3540.4,
"learning_rate": 6.032346995169968e-06,
"loss": 4951.7,
"step": 9510
},
{
"ce_loss_13": 2.9545272469520567,
"ce_loss_26": 2.5311076641082764,
"ce_loss_39": 2.068070963025093,
"ce_loss_52": 1.4843237161636353,
"ce_loss_7": 3.2110206544399262,
"epoch": 0.952,
"grad_norm": 19.225083219290383,
"kl_loss_13": 3055.2,
"kl_loss_26": 2164.8,
"kl_loss_39": 1188.4,
"kl_loss_7": 3590.4,
"learning_rate": 5.789112577318789e-06,
"loss": 4961.65,
"step": 9520
},
{
"ce_loss_13": 2.852985817193985,
"ce_loss_26": 2.4146564304828644,
"ce_loss_39": 1.961009207367897,
"ce_loss_52": 1.3947103202342988,
"ce_loss_7": 3.1259153723716735,
"epoch": 0.953,
"grad_norm": 18.155555980380427,
"kl_loss_13": 3021.6,
"kl_loss_26": 2118.2,
"kl_loss_39": 1157.4,
"kl_loss_7": 3575.6,
"learning_rate": 5.550854857617194e-06,
"loss": 4909.2,
"step": 9530
},
{
"ce_loss_13": 2.8418005287647246,
"ce_loss_26": 2.4128061681985855,
"ce_loss_39": 1.9482584029436112,
"ce_loss_52": 1.3925445035099984,
"ce_loss_7": 3.1012724101543427,
"epoch": 0.954,
"grad_norm": 18.797933923537936,
"kl_loss_13": 3018.8,
"kl_loss_26": 2116.8,
"kl_loss_39": 1135.1,
"kl_loss_7": 3566.4,
"learning_rate": 5.317576235317756e-06,
"loss": 4951.35,
"step": 9540
},
{
"ce_loss_13": 2.9137533485889433,
"ce_loss_26": 2.479102221131325,
"ce_loss_39": 2.011527943611145,
"ce_loss_52": 1.4640387833118438,
"ce_loss_7": 3.16554337143898,
"epoch": 0.955,
"grad_norm": 18.308431062302134,
"kl_loss_13": 3004.0,
"kl_loss_26": 2104.0,
"kl_loss_39": 1124.5,
"kl_loss_7": 3534.4,
"learning_rate": 5.089279059533658e-06,
"loss": 4893.9,
"step": 9550
},
{
"ce_loss_13": 2.9561933636665345,
"ce_loss_26": 2.520014223456383,
"ce_loss_39": 2.044199249148369,
"ce_loss_52": 1.4634439080953598,
"ce_loss_7": 3.2146646201610567,
"epoch": 0.956,
"grad_norm": 19.011558845630333,
"kl_loss_13": 3107.6,
"kl_loss_26": 2205.4,
"kl_loss_39": 1192.1,
"kl_loss_7": 3652.4,
"learning_rate": 4.865965629214819e-06,
"loss": 4928.6,
"step": 9560
},
{
"ce_loss_13": 2.9264722049236296,
"ce_loss_26": 2.500628116726875,
"ce_loss_39": 2.0424467980861665,
"ce_loss_52": 1.4656882539391518,
"ce_loss_7": 3.1882854044437408,
"epoch": 0.957,
"grad_norm": 19.491005801371585,
"kl_loss_13": 3028.0,
"kl_loss_26": 2138.6,
"kl_loss_39": 1174.2,
"kl_loss_7": 3566.0,
"learning_rate": 4.6476381931251366e-06,
"loss": 4947.75,
"step": 9570
},
{
"ce_loss_13": 2.8677931249141695,
"ce_loss_26": 2.432820278406143,
"ce_loss_39": 1.970087245106697,
"ce_loss_52": 1.4129542678594589,
"ce_loss_7": 3.129103422164917,
"epoch": 0.958,
"grad_norm": 18.903925554756427,
"kl_loss_13": 2994.0,
"kl_loss_26": 2096.0,
"kl_loss_39": 1120.2,
"kl_loss_7": 3546.4,
"learning_rate": 4.434298949819449e-06,
"loss": 4918.2,
"step": 9580
},
{
"ce_loss_13": 2.894206315279007,
"ce_loss_26": 2.4608440458774568,
"ce_loss_39": 2.00639765560627,
"ce_loss_52": 1.4453970074653626,
"ce_loss_7": 3.151961898803711,
"epoch": 0.959,
"grad_norm": 17.742534881377313,
"kl_loss_13": 2993.6,
"kl_loss_26": 2091.6,
"kl_loss_39": 1135.9,
"kl_loss_7": 3542.0,
"learning_rate": 4.2259500476214406e-06,
"loss": 4904.1,
"step": 9590
},
{
"ce_loss_13": 2.907763344049454,
"ce_loss_26": 2.4721481442451476,
"ce_loss_39": 2.013452297449112,
"ce_loss_52": 1.4463645279407502,
"ce_loss_7": 3.1700760960578918,
"epoch": 0.96,
"grad_norm": 18.762064601939382,
"kl_loss_13": 3033.2,
"kl_loss_26": 2127.6,
"kl_loss_39": 1154.1,
"kl_loss_7": 3576.8,
"learning_rate": 4.02259358460233e-06,
"loss": 4944.15,
"step": 9600
},
{
"ce_loss_13": 2.9404530614614486,
"ce_loss_26": 2.5118053376674654,
"ce_loss_39": 2.045960560441017,
"ce_loss_52": 1.4736278399825096,
"ce_loss_7": 3.199742293357849,
"epoch": 0.961,
"grad_norm": 19.091693827270714,
"kl_loss_13": 3049.6,
"kl_loss_26": 2150.4,
"kl_loss_39": 1165.5,
"kl_loss_7": 3590.8,
"learning_rate": 3.8242316085594916e-06,
"loss": 4931.75,
"step": 9610
},
{
"ce_loss_13": 2.8952401757240294,
"ce_loss_26": 2.4520116090774535,
"ce_loss_39": 1.9771205306053161,
"ce_loss_52": 1.3961644172668457,
"ce_loss_7": 3.1528802454471587,
"epoch": 0.962,
"grad_norm": 18.822596918413492,
"kl_loss_13": 3097.6,
"kl_loss_26": 2171.4,
"kl_loss_39": 1176.6,
"kl_loss_7": 3631.6,
"learning_rate": 3.630866116995757e-06,
"loss": 4991.65,
"step": 9620
},
{
"ce_loss_13": 2.848733913898468,
"ce_loss_26": 2.4173508852720262,
"ce_loss_39": 1.9629988223314285,
"ce_loss_52": 1.4227147445082664,
"ce_loss_7": 3.105393874645233,
"epoch": 0.963,
"grad_norm": 18.772124078001035,
"kl_loss_13": 2951.2,
"kl_loss_26": 2061.0,
"kl_loss_39": 1101.5,
"kl_loss_7": 3491.2,
"learning_rate": 3.4424990570994797e-06,
"loss": 4903.15,
"step": 9630
},
{
"ce_loss_13": 2.9066348552703856,
"ce_loss_26": 2.469021773338318,
"ce_loss_39": 2.010333400964737,
"ce_loss_52": 1.4492767244577407,
"ce_loss_7": 3.1746467888355254,
"epoch": 0.964,
"grad_norm": 19.32775364197132,
"kl_loss_13": 3013.6,
"kl_loss_26": 2110.8,
"kl_loss_39": 1135.7,
"kl_loss_7": 3570.0,
"learning_rate": 3.2591323257248896e-06,
"loss": 4939.25,
"step": 9640
},
{
"ce_loss_13": 2.9041188657283783,
"ce_loss_26": 2.471466612815857,
"ce_loss_39": 2.0076118439435957,
"ce_loss_52": 1.4575997084379195,
"ce_loss_7": 3.156245505809784,
"epoch": 0.965,
"grad_norm": 18.772007331370325,
"kl_loss_13": 3016.0,
"kl_loss_26": 2111.8,
"kl_loss_39": 1138.9,
"kl_loss_7": 3551.6,
"learning_rate": 3.0807677693729385e-06,
"loss": 4953.0,
"step": 9650
},
{
"ce_loss_13": 2.9185379564762117,
"ce_loss_26": 2.4852662444114686,
"ce_loss_39": 2.0213693618774413,
"ce_loss_52": 1.4567248612642287,
"ce_loss_7": 3.1788457691669465,
"epoch": 0.966,
"grad_norm": 19.301754151350856,
"kl_loss_13": 3049.2,
"kl_loss_26": 2142.4,
"kl_loss_39": 1165.5,
"kl_loss_7": 3589.2,
"learning_rate": 2.9074071841727055e-06,
"loss": 4966.3,
"step": 9660
},
{
"ce_loss_13": 2.856596076488495,
"ce_loss_26": 2.4228154510259627,
"ce_loss_39": 1.966923463344574,
"ce_loss_52": 1.3987573131918907,
"ce_loss_7": 3.112848150730133,
"epoch": 0.967,
"grad_norm": 18.636169987835014,
"kl_loss_13": 3015.2,
"kl_loss_26": 2124.8,
"kl_loss_39": 1150.1,
"kl_loss_7": 3558.0,
"learning_rate": 2.739052315863355e-06,
"loss": 4944.85,
"step": 9670
},
{
"ce_loss_13": 2.946157419681549,
"ce_loss_26": 2.506874307990074,
"ce_loss_39": 2.0385408878326414,
"ce_loss_52": 1.4472137212753295,
"ce_loss_7": 3.2151435017585754,
"epoch": 0.968,
"grad_norm": 19.361220180181423,
"kl_loss_13": 3105.6,
"kl_loss_26": 2193.0,
"kl_loss_39": 1200.9,
"kl_loss_7": 3669.2,
"learning_rate": 2.5757048597765396e-06,
"loss": 4938.1,
"step": 9680
},
{
"ce_loss_13": 2.838688534498215,
"ce_loss_26": 2.408904367685318,
"ce_loss_39": 1.9514323592185974,
"ce_loss_52": 1.4050966590642928,
"ce_loss_7": 3.102074921131134,
"epoch": 0.969,
"grad_norm": 18.982791691406838,
"kl_loss_13": 2984.8,
"kl_loss_26": 2086.6,
"kl_loss_39": 1106.6,
"kl_loss_7": 3533.2,
"learning_rate": 2.417366460819359e-06,
"loss": 4918.15,
"step": 9690
},
{
"ce_loss_13": 2.880851173400879,
"ce_loss_26": 2.447329577803612,
"ce_loss_39": 1.989661106467247,
"ce_loss_52": 1.4333824023604393,
"ce_loss_7": 3.137517309188843,
"epoch": 0.97,
"grad_norm": 19.196819142959395,
"kl_loss_13": 2988.0,
"kl_loss_26": 2092.4,
"kl_loss_39": 1128.9,
"kl_loss_7": 3524.4,
"learning_rate": 2.2640387134577057e-06,
"loss": 4938.15,
"step": 9700
},
{
"ce_loss_13": 2.8625703275203707,
"ce_loss_26": 2.4340526342391966,
"ce_loss_39": 1.968525806069374,
"ce_loss_52": 1.4236899584531784,
"ce_loss_7": 3.112631046772003,
"epoch": 0.971,
"grad_norm": 19.493522870524444,
"kl_loss_13": 2981.2,
"kl_loss_26": 2091.6,
"kl_loss_39": 1109.1,
"kl_loss_7": 3508.4,
"learning_rate": 2.115723161700278e-06,
"loss": 4978.3,
"step": 9710
},
{
"ce_loss_13": 2.930490869283676,
"ce_loss_26": 2.495130881667137,
"ce_loss_39": 2.030032703280449,
"ce_loss_52": 1.4442616790533065,
"ce_loss_7": 3.190455746650696,
"epoch": 0.972,
"grad_norm": 18.231386237261873,
"kl_loss_13": 3083.2,
"kl_loss_26": 2182.4,
"kl_loss_39": 1200.2,
"kl_loss_7": 3630.4,
"learning_rate": 1.9724212990830937e-06,
"loss": 4917.25,
"step": 9720
},
{
"ce_loss_13": 2.8745281517505648,
"ce_loss_26": 2.450575265288353,
"ce_loss_39": 1.9936909019947051,
"ce_loss_52": 1.4346210777759552,
"ce_loss_7": 3.1411093890666963,
"epoch": 0.973,
"grad_norm": 17.9155258115958,
"kl_loss_13": 2990.4,
"kl_loss_26": 2104.0,
"kl_loss_39": 1135.2,
"kl_loss_7": 3546.0,
"learning_rate": 1.8341345686543331e-06,
"loss": 4907.2,
"step": 9730
},
{
"ce_loss_13": 2.950432300567627,
"ce_loss_26": 2.5173233568668367,
"ce_loss_39": 2.05553839802742,
"ce_loss_52": 1.5059631228446961,
"ce_loss_7": 3.203335565328598,
"epoch": 0.974,
"grad_norm": 18.692267522295538,
"kl_loss_13": 2994.0,
"kl_loss_26": 2099.8,
"kl_loss_39": 1118.3,
"kl_loss_7": 3526.0,
"learning_rate": 1.7008643629596864e-06,
"loss": 4975.3,
"step": 9740
},
{
"ce_loss_13": 2.938932454586029,
"ce_loss_26": 2.4943090945482256,
"ce_loss_39": 2.030773627758026,
"ce_loss_52": 1.4614870190620421,
"ce_loss_7": 3.203068423271179,
"epoch": 0.975,
"grad_norm": 19.21025690602488,
"kl_loss_13": 3068.4,
"kl_loss_26": 2141.2,
"kl_loss_39": 1151.8,
"kl_loss_7": 3617.6,
"learning_rate": 1.5726120240288633e-06,
"loss": 4916.8,
"step": 9750
},
{
"ce_loss_13": 2.9820500314235687,
"ce_loss_26": 2.5365146696567535,
"ce_loss_39": 2.0661711603403092,
"ce_loss_52": 1.471569898724556,
"ce_loss_7": 3.2498775362968444,
"epoch": 0.976,
"grad_norm": 18.6163174066225,
"kl_loss_13": 3112.0,
"kl_loss_26": 2196.6,
"kl_loss_39": 1202.4,
"kl_loss_7": 3666.4,
"learning_rate": 1.4493788433612708e-06,
"loss": 4925.3,
"step": 9760
},
{
"ce_loss_13": 2.8684714436531067,
"ce_loss_26": 2.433712217211723,
"ce_loss_39": 1.979764473438263,
"ce_loss_52": 1.4259307652711868,
"ce_loss_7": 3.1243775844573975,
"epoch": 0.977,
"grad_norm": 18.645711029415455,
"kl_loss_13": 2991.6,
"kl_loss_26": 2090.8,
"kl_loss_39": 1122.5,
"kl_loss_7": 3526.8,
"learning_rate": 1.3311660619138578e-06,
"loss": 4899.9,
"step": 9770
},
{
"ce_loss_13": 2.875075614452362,
"ce_loss_26": 2.4274426341056823,
"ce_loss_39": 1.9597632795572282,
"ce_loss_52": 1.3981771111488341,
"ce_loss_7": 3.132591074705124,
"epoch": 0.978,
"grad_norm": 19.101397556379275,
"kl_loss_13": 3053.6,
"kl_loss_26": 2133.6,
"kl_loss_39": 1148.0,
"kl_loss_7": 3594.4,
"learning_rate": 1.2179748700879012e-06,
"loss": 4922.55,
"step": 9780
},
{
"ce_loss_13": 2.8309387296438215,
"ce_loss_26": 2.4053177654743196,
"ce_loss_39": 1.9509627014398574,
"ce_loss_52": 1.397429385781288,
"ce_loss_7": 3.098381590843201,
"epoch": 0.979,
"grad_norm": 18.730022448398557,
"kl_loss_13": 2994.8,
"kl_loss_26": 2100.6,
"kl_loss_39": 1129.3,
"kl_loss_7": 3549.6,
"learning_rate": 1.1098064077174619e-06,
"loss": 4943.05,
"step": 9790
},
{
"ce_loss_13": 2.939675289392471,
"ce_loss_26": 2.5042629301548005,
"ce_loss_39": 2.036014449596405,
"ce_loss_52": 1.4478511959314346,
"ce_loss_7": 3.2001714766025544,
"epoch": 0.98,
"grad_norm": 18.76259776094823,
"kl_loss_13": 3078.4,
"kl_loss_26": 2174.8,
"kl_loss_39": 1199.8,
"kl_loss_7": 3615.6,
"learning_rate": 1.006661764057837e-06,
"loss": 4908.35,
"step": 9800
},
{
"ce_loss_13": 2.871473455429077,
"ce_loss_26": 2.4314837962388993,
"ce_loss_39": 1.95380699634552,
"ce_loss_52": 1.3844006016850472,
"ce_loss_7": 3.1342472076416015,
"epoch": 0.981,
"grad_norm": 19.274903724206773,
"kl_loss_13": 3093.6,
"kl_loss_26": 2172.4,
"kl_loss_39": 1165.3,
"kl_loss_7": 3637.6,
"learning_rate": 9.085419777743465e-07,
"loss": 4984.5,
"step": 9810
},
{
"ce_loss_13": 2.895174187421799,
"ce_loss_26": 2.465178096294403,
"ce_loss_39": 2.006316193938255,
"ce_loss_52": 1.441744513809681,
"ce_loss_7": 3.1597203612327576,
"epoch": 0.982,
"grad_norm": 18.123510539706626,
"kl_loss_13": 3039.2,
"kl_loss_26": 2136.8,
"kl_loss_39": 1165.1,
"kl_loss_7": 3592.0,
"learning_rate": 8.15448036932176e-07,
"loss": 4978.7,
"step": 9820
},
{
"ce_loss_13": 2.9061976075172424,
"ce_loss_26": 2.477484393119812,
"ce_loss_39": 2.0169315338134766,
"ce_loss_52": 1.4464313685894012,
"ce_loss_7": 3.1681883454322817,
"epoch": 0.983,
"grad_norm": 18.434840579065046,
"kl_loss_13": 3067.2,
"kl_loss_26": 2157.8,
"kl_loss_39": 1175.9,
"kl_loss_7": 3616.8,
"learning_rate": 7.273808789862724e-07,
"loss": 4921.0,
"step": 9830
},
{
"ce_loss_13": 2.91153547167778,
"ce_loss_26": 2.473715308308601,
"ce_loss_39": 2.007182112336159,
"ce_loss_52": 1.4433553382754325,
"ce_loss_7": 3.170134776830673,
"epoch": 0.984,
"grad_norm": 19.402855155704938,
"kl_loss_13": 3056.0,
"kl_loss_26": 2142.6,
"kl_loss_39": 1154.2,
"kl_loss_7": 3589.6,
"learning_rate": 6.443413907720186e-07,
"loss": 4900.3,
"step": 9840
},
{
"ce_loss_13": 2.812727469205856,
"ce_loss_26": 2.3864874839782715,
"ce_loss_39": 1.9436532348394393,
"ce_loss_52": 1.3948013991117478,
"ce_loss_7": 3.0747777581214906,
"epoch": 0.985,
"grad_norm": 18.7509272372939,
"kl_loss_13": 2956.4,
"kl_loss_26": 2066.6,
"kl_loss_39": 1115.4,
"kl_loss_7": 3495.2,
"learning_rate": 5.663304084960185e-07,
"loss": 4941.5,
"step": 9850
},
{
"ce_loss_13": 2.8474230617284775,
"ce_loss_26": 2.4196896702051163,
"ce_loss_39": 1.958586323261261,
"ce_loss_52": 1.4132703453302384,
"ce_loss_7": 3.112215679883957,
"epoch": 0.986,
"grad_norm": 19.13033176723296,
"kl_loss_13": 2947.6,
"kl_loss_26": 2054.6,
"kl_loss_39": 1096.6,
"kl_loss_7": 3496.0,
"learning_rate": 4.933487177280482e-07,
"loss": 4900.7,
"step": 9860
},
{
"ce_loss_13": 2.914989507198334,
"ce_loss_26": 2.4912798583507536,
"ce_loss_39": 2.020476207137108,
"ce_loss_52": 1.45462586581707,
"ce_loss_7": 3.169612795114517,
"epoch": 0.987,
"grad_norm": 18.808140949859265,
"kl_loss_13": 3018.8,
"kl_loss_26": 2134.0,
"kl_loss_39": 1159.4,
"kl_loss_7": 3551.6,
"learning_rate": 4.2539705339295075e-07,
"loss": 4908.55,
"step": 9870
},
{
"ce_loss_13": 2.8734777927398683,
"ce_loss_26": 2.437858074903488,
"ce_loss_39": 1.9720120638608933,
"ce_loss_52": 1.4267651215195656,
"ce_loss_7": 3.13208429813385,
"epoch": 0.988,
"grad_norm": 18.962161399510684,
"kl_loss_13": 2984.8,
"kl_loss_26": 2080.8,
"kl_loss_39": 1110.4,
"kl_loss_7": 3520.0,
"learning_rate": 3.6247609976319816e-07,
"loss": 4944.0,
"step": 9880
},
{
"ce_loss_13": 2.941332721710205,
"ce_loss_26": 2.5040529906749724,
"ce_loss_39": 2.0378583818674088,
"ce_loss_52": 1.468481183052063,
"ce_loss_7": 3.198000502586365,
"epoch": 0.989,
"grad_norm": 18.392580217010416,
"kl_loss_13": 3027.6,
"kl_loss_26": 2127.6,
"kl_loss_39": 1152.6,
"kl_loss_7": 3568.8,
"learning_rate": 3.0458649045211895e-07,
"loss": 4940.25,
"step": 9890
},
{
"ce_loss_13": 2.7879110276699066,
"ce_loss_26": 2.361061328649521,
"ce_loss_39": 1.9103228181600571,
"ce_loss_52": 1.3702009424567223,
"ce_loss_7": 3.0456897139549257,
"epoch": 0.99,
"grad_norm": 18.728359427979246,
"kl_loss_13": 2962.0,
"kl_loss_26": 2068.0,
"kl_loss_39": 1101.6,
"kl_loss_7": 3497.6,
"learning_rate": 2.517288084074587e-07,
"loss": 4930.1,
"step": 9900
},
{
"ce_loss_13": 2.8971258997917175,
"ce_loss_26": 2.5054025918245317,
"ce_loss_39": 2.0191519230604174,
"ce_loss_52": 1.4615912348031999,
"ce_loss_7": 3.1491506710648536,
"epoch": 0.991,
"grad_norm": 18.31923607379438,
"kl_loss_13": 3021.4,
"kl_loss_26": 2147.8,
"kl_loss_39": 1158.2,
"kl_loss_7": 3569.6,
"learning_rate": 2.0390358590538505e-07,
"loss": 4961.35,
"step": 9910
},
{
"ce_loss_13": 2.8903492599725724,
"ce_loss_26": 2.463494861125946,
"ce_loss_39": 1.9990645915269851,
"ce_loss_52": 1.4176891192793846,
"ce_loss_7": 3.156185895204544,
"epoch": 0.992,
"grad_norm": 18.63207057019639,
"kl_loss_13": 3059.2,
"kl_loss_26": 2160.8,
"kl_loss_39": 1181.7,
"kl_loss_7": 3612.8,
"learning_rate": 1.61111304545436e-07,
"loss": 4924.65,
"step": 9920
},
{
"ce_loss_13": 2.9144785940647124,
"ce_loss_26": 2.4774809032678604,
"ce_loss_39": 2.0119458585977554,
"ce_loss_52": 1.4417672097682952,
"ce_loss_7": 3.1794375479221344,
"epoch": 0.993,
"grad_norm": 19.670611142271607,
"kl_loss_13": 3065.6,
"kl_loss_26": 2158.2,
"kl_loss_39": 1171.1,
"kl_loss_7": 3620.0,
"learning_rate": 1.2335239524541298e-07,
"loss": 4934.7,
"step": 9930
},
{
"ce_loss_13": 2.8820480942726134,
"ce_loss_26": 2.4476457953453066,
"ce_loss_39": 1.9870627135038377,
"ce_loss_52": 1.418778820335865,
"ce_loss_7": 3.140765738487244,
"epoch": 0.994,
"grad_norm": 18.855641981755237,
"kl_loss_13": 3036.0,
"kl_loss_26": 2135.8,
"kl_loss_39": 1164.2,
"kl_loss_7": 3583.2,
"learning_rate": 9.06272382371065e-08,
"loss": 4938.8,
"step": 9940
},
{
"ce_loss_13": 2.8076956808567046,
"ce_loss_26": 2.3849500566720963,
"ce_loss_39": 1.944134348630905,
"ce_loss_52": 1.4005259275436401,
"ce_loss_7": 3.060946136713028,
"epoch": 0.995,
"grad_norm": 18.321398390501436,
"kl_loss_13": 2907.2,
"kl_loss_26": 2027.0,
"kl_loss_39": 1087.6,
"kl_loss_7": 3442.4,
"learning_rate": 6.293616306246586e-08,
"loss": 4950.05,
"step": 9950
},
{
"ce_loss_13": 2.879979431629181,
"ce_loss_26": 2.4471361935138702,
"ce_loss_39": 1.9933580070734025,
"ce_loss_52": 1.4384188532829285,
"ce_loss_7": 3.137728548049927,
"epoch": 0.996,
"grad_norm": 18.715841313903894,
"kl_loss_13": 3004.0,
"kl_loss_26": 2101.4,
"kl_loss_39": 1139.8,
"kl_loss_7": 3537.6,
"learning_rate": 4.027944857032395e-08,
"loss": 4943.4,
"step": 9960
},
{
"ce_loss_13": 2.873315241932869,
"ce_loss_26": 2.4401324480772018,
"ce_loss_39": 1.9843392819166183,
"ce_loss_52": 1.4195681273937226,
"ce_loss_7": 3.131341791152954,
"epoch": 0.997,
"grad_norm": 18.684401038028668,
"kl_loss_13": 3010.0,
"kl_loss_26": 2109.2,
"kl_loss_39": 1143.6,
"kl_loss_7": 3546.8,
"learning_rate": 2.265732291356626e-08,
"loss": 4916.9,
"step": 9970
},
{
"ce_loss_13": 2.806668055057526,
"ce_loss_26": 2.379783111810684,
"ce_loss_39": 1.9266512155532838,
"ce_loss_52": 1.397612212598324,
"ce_loss_7": 3.066510772705078,
"epoch": 0.998,
"grad_norm": 18.616508974623724,
"kl_loss_13": 2953.6,
"kl_loss_26": 2053.8,
"kl_loss_39": 1093.4,
"kl_loss_7": 3497.6,
"learning_rate": 1.0069963546743833e-08,
"loss": 4905.25,
"step": 9980
},
{
"ce_loss_13": 2.8469128251075744,
"ce_loss_26": 2.4174416065216064,
"ce_loss_39": 1.9600837975740433,
"ce_loss_52": 1.4190092101693152,
"ce_loss_7": 3.1044517934322355,
"epoch": 0.999,
"grad_norm": 18.871308722121288,
"kl_loss_13": 2960.0,
"kl_loss_26": 2064.8,
"kl_loss_39": 1104.4,
"kl_loss_7": 3488.8,
"learning_rate": 2.517497224463483e-09,
"loss": 4901.2,
"step": 9990
},
{
"ce_loss_13": 2.895157891511917,
"ce_loss_26": 2.452625501155853,
"ce_loss_39": 1.9889036536216735,
"ce_loss_52": 1.4127988710999488,
"ce_loss_7": 3.167273908853531,
"epoch": 1.0,
"grad_norm": 19.02740690165538,
"kl_loss_13": 3067.6,
"kl_loss_26": 2157.2,
"kl_loss_39": 1167.3,
"kl_loss_7": 3628.8,
"learning_rate": 0.0,
"loss": 4933.3,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0167830278176768e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}