diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_loss_13": 6.261108160018921, + "ce_loss_17": 3.2123656272888184, + "ce_loss_2": 8.975998401641846, + "ce_loss_4": 10.630545616149902, + "ce_loss_9": 7.892497777938843, + "epoch": 0.0001, + "grad_norm": 46080.0, + "kl_loss_13": 7493.62744140625, + "kl_loss_2": 11818.61083984375, + "kl_loss_4": 15299.1640625, + "kl_loss_9": 10258.54052734375, + "learning_rate": 1e-05, + "loss": 11616.1748, + "step": 1 + }, + { + "ce_loss_13": 5.229107724295722, + "ce_loss_17": 3.2694726387659707, + "ce_loss_2": 7.908167017830743, + "ce_loss_4": 8.510330465104845, + "ce_loss_9": 6.480849477979872, + "epoch": 0.001, + "grad_norm": 16192.0, + "kl_loss_13": 4690.5024007161455, + "kl_loss_2": 9165.25279405382, + "kl_loss_4": 10523.28955078125, + "kl_loss_9": 6728.469645182292, + "learning_rate": 0.0001, + "loss": 7788.046, + "step": 10 + }, + { + "ce_loss_13": 4.059880614280701, + "ce_loss_17": 3.2766793847084044, + "ce_loss_2": 6.427331566810608, + "ce_loss_4": 6.320853996276855, + "ce_loss_9": 5.012065768241882, + "epoch": 0.002, + "grad_norm": 2960.0, + "kl_loss_13": 1498.4480590820312, + "kl_loss_2": 5931.1205078125, + "kl_loss_4": 5700.908862304687, + "kl_loss_9": 3248.3931274414062, + "learning_rate": 0.0002, + "loss": 4141.4953, + "step": 20 + }, + { + "ce_loss_13": 3.482624328136444, + "ce_loss_17": 3.0720787167549135, + "ce_loss_2": 5.807352304458618, + "ce_loss_4": 5.4482827425003055, + "ce_loss_9": 4.268812489509583, + "epoch": 0.003, + "grad_norm": 2544.0, + "kl_loss_13": 771.2804718017578, + "kl_loss_2": 5205.060034179687, + "kl_loss_4": 4521.033422851562, + "kl_loss_9": 2273.452490234375, + "learning_rate": 0.0003, + "loss": 3136.2984, + "step": 30 + }, + { + "ce_loss_13": 3.542610156536102, + "ce_loss_17": 3.24235805273056, + "ce_loss_2": 5.604139733314514, + "ce_loss_4": 5.206647753715515, + "ce_loss_9": 4.202983009815216, + "epoch": 0.004, + "grad_norm": 1896.0, + "kl_loss_13": 571.3098617553711, + "kl_loss_2": 4525.145690917969, + "kl_loss_4": 3775.388525390625, + "kl_loss_9": 1843.7200561523437, + "learning_rate": 0.0004, + "loss": 2693.4086, + "step": 40 + }, + { + "ce_loss_13": 3.478264403343201, + "ce_loss_17": 3.2130247592926025, + "ce_loss_2": 5.45984570980072, + "ce_loss_4": 5.021239829063416, + "ce_loss_9": 4.0570685267448425, + "epoch": 0.005, + "grad_norm": 2544.0, + "kl_loss_13": 486.09989776611326, + "kl_loss_2": 4336.090563964844, + "kl_loss_4": 3511.476416015625, + "kl_loss_9": 1638.4347045898437, + "learning_rate": 0.0005, + "loss": 2481.9402, + "step": 50 + }, + { + "ce_loss_13": 3.4491629719734194, + "ce_loss_17": 3.2274078488349915, + "ce_loss_2": 5.326788687705994, + "ce_loss_4": 4.886805820465088, + "ce_loss_9": 4.006088042259217, + "epoch": 0.006, + "grad_norm": 1976.0, + "kl_loss_13": 409.9218521118164, + "kl_loss_2": 4072.911962890625, + "kl_loss_4": 3242.224365234375, + "kl_loss_9": 1502.4994689941407, + "learning_rate": 0.0006, + "loss": 2313.3283, + "step": 60 + }, + { + "ce_loss_13": 3.3460643768310545, + "ce_loss_17": 3.145425373315811, + "ce_loss_2": 5.251087355613708, + "ce_loss_4": 4.785633516311646, + "ce_loss_9": 3.8736647248268126, + "epoch": 0.007, + "grad_norm": 3040.0, + "kl_loss_13": 390.72980346679685, + "kl_loss_2": 4095.1318969726562, + "kl_loss_4": 3204.57490234375, + "kl_loss_9": 1435.42216796875, + "learning_rate": 0.0007, + "loss": 2270.2215, + "step": 70 + }, + { + "ce_loss_13": 3.4079737186431887, + "ce_loss_17": 3.146404445171356, + "ce_loss_2": 5.31060528755188, + "ce_loss_4": 4.817984127998352, + "ce_loss_9": 3.856471037864685, + "epoch": 0.008, + "grad_norm": 3120.0, + "kl_loss_13": 513.9969299316406, + "kl_loss_2": 4205.247351074218, + "kl_loss_4": 3256.803430175781, + "kl_loss_9": 1371.8184814453125, + "learning_rate": 0.0008, + "loss": 2348.7998, + "step": 80 + }, + { + "ce_loss_13": 3.3797936916351317, + "ce_loss_17": 3.1046133518218992, + "ce_loss_2": 5.22902545928955, + "ce_loss_4": 4.679562568664551, + "ce_loss_9": 3.78316969871521, + "epoch": 0.009, + "grad_norm": 2320.0, + "kl_loss_13": 514.5743957519531, + "kl_loss_2": 4132.310363769531, + "kl_loss_4": 3100.52490234375, + "kl_loss_9": 1329.9578063964843, + "learning_rate": 0.0009000000000000001, + "loss": 2255.0043, + "step": 90 + }, + { + "ce_loss_13": 3.4396926045417784, + "ce_loss_17": 3.2249288082122805, + "ce_loss_2": 5.2579874992370605, + "ce_loss_4": 4.769978952407837, + "ce_loss_9": 3.909707498550415, + "epoch": 0.01, + "grad_norm": 1880.0, + "kl_loss_13": 417.27407989501955, + "kl_loss_2": 3978.06357421875, + "kl_loss_4": 3047.667761230469, + "kl_loss_9": 1345.2632690429687, + "learning_rate": 0.001, + "loss": 2198.0012, + "step": 100 + }, + { + "ce_loss_13": 3.3686514616012575, + "ce_loss_17": 3.1747238278388976, + "ce_loss_2": 5.211386060714721, + "ce_loss_4": 4.760608530044555, + "ce_loss_9": 3.8562410354614256, + "epoch": 0.011, + "grad_norm": 1512.0, + "kl_loss_13": 370.25237426757815, + "kl_loss_2": 3972.5962768554687, + "kl_loss_4": 3119.8548095703127, + "kl_loss_9": 1331.0780883789062, + "learning_rate": 0.0009999974825027757, + "loss": 2188.252, + "step": 110 + }, + { + "ce_loss_13": 3.4155537605285646, + "ce_loss_17": 3.239515316486359, + "ce_loss_2": 5.191804099082947, + "ce_loss_4": 4.703694176673889, + "ce_loss_9": 3.8796786427497865, + "epoch": 0.012, + "grad_norm": 1208.0, + "kl_loss_13": 334.42599334716795, + "kl_loss_2": 3844.4222045898437, + "kl_loss_4": 2917.6344604492188, + "kl_loss_9": 1275.1363037109375, + "learning_rate": 0.0009999899300364532, + "loss": 2065.4391, + "step": 120 + }, + { + "ce_loss_13": 3.3851757526397703, + "ce_loss_17": 3.208660531044006, + "ce_loss_2": 5.187322640419007, + "ce_loss_4": 4.707761931419372, + "ce_loss_9": 3.8623048305511474, + "epoch": 0.013, + "grad_norm": 1464.0, + "kl_loss_13": 348.1716064453125, + "kl_loss_2": 3871.487927246094, + "kl_loss_4": 2947.180615234375, + "kl_loss_9": 1284.474591064453, + "learning_rate": 0.0009999773426770863, + "loss": 2137.5457, + "step": 130 + }, + { + "ce_loss_13": 3.4333951473236084, + "ce_loss_17": 3.247041952610016, + "ce_loss_2": 5.17234206199646, + "ce_loss_4": 4.659626841545105, + "ce_loss_9": 3.8604306101799013, + "epoch": 0.014, + "grad_norm": 1560.0, + "kl_loss_13": 372.6525421142578, + "kl_loss_2": 3781.9054443359373, + "kl_loss_4": 2810.7845825195313, + "kl_loss_9": 1222.908135986328, + "learning_rate": 0.0009999597205514296, + "loss": 2064.3145, + "step": 140 + }, + { + "ce_loss_13": 3.398501455783844, + "ce_loss_17": 3.201441490650177, + "ce_loss_2": 5.101144433021545, + "ce_loss_4": 4.589037704467773, + "ce_loss_9": 3.80131276845932, + "epoch": 0.015, + "grad_norm": 1096.0, + "kl_loss_13": 378.6312545776367, + "kl_loss_2": 3725.099365234375, + "kl_loss_4": 2742.1082153320312, + "kl_loss_9": 1179.0215209960938, + "learning_rate": 0.0009999370638369377, + "loss": 2019.5482, + "step": 150 + }, + { + "ce_loss_13": 3.4211544513702394, + "ce_loss_17": 3.2419292449951174, + "ce_loss_2": 5.1445982456207275, + "ce_loss_4": 4.637296891212463, + "ce_loss_9": 3.820467162132263, + "epoch": 0.016, + "grad_norm": 1744.0, + "kl_loss_13": 356.128923034668, + "kl_loss_2": 3739.355676269531, + "kl_loss_4": 2769.3091430664062, + "kl_loss_9": 1165.9014099121093, + "learning_rate": 0.000999909372761763, + "loss": 2011.9781, + "step": 160 + }, + { + "ce_loss_13": 3.3497793793678285, + "ce_loss_17": 3.1803732752799987, + "ce_loss_2": 5.0965896368026735, + "ce_loss_4": 4.569389176368714, + "ce_loss_9": 3.750406265258789, + "epoch": 0.017, + "grad_norm": 1080.0, + "kl_loss_13": 337.91043548583986, + "kl_loss_2": 3792.000451660156, + "kl_loss_4": 2794.6717529296875, + "kl_loss_9": 1165.9158325195312, + "learning_rate": 0.0009998766476047546, + "loss": 2035.0443, + "step": 170 + }, + { + "ce_loss_13": 3.3844147086143495, + "ce_loss_17": 3.2172260522842406, + "ce_loss_2": 5.131505370140076, + "ce_loss_4": 4.606155300140381, + "ce_loss_9": 3.8108633518218995, + "epoch": 0.018, + "grad_norm": 1248.0, + "kl_loss_13": 316.96852722167966, + "kl_loss_2": 3780.761364746094, + "kl_loss_4": 2771.714880371094, + "kl_loss_9": 1190.5228759765625, + "learning_rate": 0.0009998388886954545, + "loss": 2026.8406, + "step": 180 + }, + { + "ce_loss_13": 3.3414482712745666, + "ce_loss_17": 3.183154559135437, + "ce_loss_2": 5.078153777122497, + "ce_loss_4": 4.566149878501892, + "ce_loss_9": 3.7500080823898316, + "epoch": 0.019, + "grad_norm": 1232.0, + "kl_loss_13": 305.7339660644531, + "kl_loss_2": 3747.8099731445313, + "kl_loss_4": 2767.91455078125, + "kl_loss_9": 1140.842953491211, + "learning_rate": 0.0009997960964140947, + "loss": 1978.3309, + "step": 190 + }, + { + "ce_loss_13": 3.341668939590454, + "ce_loss_17": 3.1812687516212463, + "ce_loss_2": 5.067327260971069, + "ce_loss_4": 4.56243736743927, + "ce_loss_9": 3.7384706020355223, + "epoch": 0.02, + "grad_norm": 1020.0, + "kl_loss_13": 314.30238647460936, + "kl_loss_2": 3744.44990234375, + "kl_loss_4": 2772.7313842773438, + "kl_loss_9": 1114.3156524658202, + "learning_rate": 0.0009997482711915926, + "loss": 1972.7639, + "step": 200 + }, + { + "ce_loss_13": 3.2981292963027955, + "ce_loss_17": 3.1554569721221926, + "ce_loss_2": 5.020105624198914, + "ce_loss_4": 4.48168580532074, + "ce_loss_9": 3.6662776470184326, + "epoch": 0.021, + "grad_norm": 1272.0, + "kl_loss_13": 281.57157974243165, + "kl_loss_2": 3702.302490234375, + "kl_loss_4": 2660.3906372070314, + "kl_loss_9": 1047.2737518310546, + "learning_rate": 0.0009996954135095479, + "loss": 1914.5709, + "step": 210 + }, + { + "ce_loss_13": 3.373715043067932, + "ce_loss_17": 3.2301499366760256, + "ce_loss_2": 5.048925876617432, + "ce_loss_4": 4.48554573059082, + "ce_loss_9": 3.76205735206604, + "epoch": 0.022, + "grad_norm": 1216.0, + "kl_loss_13": 272.89011001586914, + "kl_loss_2": 3584.8785766601563, + "kl_loss_4": 2513.867822265625, + "kl_loss_9": 1066.2011169433595, + "learning_rate": 0.0009996375239002368, + "loss": 1857.4035, + "step": 220 + }, + { + "ce_loss_13": 3.451137161254883, + "ce_loss_17": 3.2963778614997863, + "ce_loss_2": 5.041608762741089, + "ce_loss_4": 4.506736373901367, + "ce_loss_9": 3.8228611826896666, + "epoch": 0.023, + "grad_norm": 1176.0, + "kl_loss_13": 300.3985954284668, + "kl_loss_2": 3461.8039794921874, + "kl_loss_4": 2438.723596191406, + "kl_loss_9": 1050.631265258789, + "learning_rate": 0.0009995746029466072, + "loss": 1818.5695, + "step": 230 + }, + { + "ce_loss_13": 3.2396350622177126, + "ce_loss_17": 3.0916191935539246, + "ce_loss_2": 4.955073380470276, + "ce_loss_4": 4.399210965633392, + "ce_loss_9": 3.6209271430969237, + "epoch": 0.024, + "grad_norm": 1072.0, + "kl_loss_13": 297.95966186523435, + "kl_loss_2": 3711.3630249023436, + "kl_loss_4": 2642.3200439453126, + "kl_loss_9": 1081.5263305664062, + "learning_rate": 0.0009995066512822719, + "loss": 1864.6961, + "step": 240 + }, + { + "ce_loss_13": 3.3319664120674135, + "ce_loss_17": 3.1891480684280396, + "ce_loss_2": 5.081031465530396, + "ce_loss_4": 4.5508263349533085, + "ce_loss_9": 3.7102392435073854, + "epoch": 0.025, + "grad_norm": 1152.0, + "kl_loss_13": 280.87394485473635, + "kl_loss_2": 3740.41455078125, + "kl_loss_4": 2714.971337890625, + "kl_loss_9": 1046.2191192626954, + "learning_rate": 0.000999433669591504, + "loss": 1866.7016, + "step": 250 + }, + { + "ce_loss_13": 3.242056429386139, + "ce_loss_17": 3.0937989592552184, + "ce_loss_2": 4.961329817771912, + "ce_loss_4": 4.409448575973511, + "ce_loss_9": 3.624157178401947, + "epoch": 0.026, + "grad_norm": 1048.0, + "kl_loss_13": 278.72955856323244, + "kl_loss_2": 3711.456396484375, + "kl_loss_4": 2647.7971801757812, + "kl_loss_9": 1060.2467346191406, + "learning_rate": 0.000999355658609228, + "loss": 1875.1668, + "step": 260 + }, + { + "ce_loss_13": 3.2744328379631042, + "ce_loss_17": 3.1282419323921205, + "ce_loss_2": 5.044838619232178, + "ce_loss_4": 4.461510038375854, + "ce_loss_9": 3.659050393104553, + "epoch": 0.027, + "grad_norm": 908.0, + "kl_loss_13": 274.6287528991699, + "kl_loss_2": 3774.6158813476563, + "kl_loss_4": 2653.425439453125, + "kl_loss_9": 1039.7476715087892, + "learning_rate": 0.0009992726191210138, + "loss": 1905.7691, + "step": 270 + }, + { + "ce_loss_13": 3.305083930492401, + "ce_loss_17": 3.1646654486656187, + "ce_loss_2": 4.980152988433838, + "ce_loss_4": 4.446230244636536, + "ce_loss_9": 3.6853484392166136, + "epoch": 0.028, + "grad_norm": 1064.0, + "kl_loss_13": 279.7167221069336, + "kl_loss_2": 3585.838916015625, + "kl_loss_4": 2562.940771484375, + "kl_loss_9": 1043.0598510742188, + "learning_rate": 0.0009991845519630679, + "loss": 1848.1752, + "step": 280 + }, + { + "ce_loss_13": 3.2109835386276244, + "ce_loss_17": 3.054629158973694, + "ce_loss_2": 4.875472736358643, + "ce_loss_4": 4.323333823680878, + "ce_loss_9": 3.56309974193573, + "epoch": 0.029, + "grad_norm": 844.0, + "kl_loss_13": 296.7427352905273, + "kl_loss_2": 3597.7573486328124, + "kl_loss_4": 2552.564587402344, + "kl_loss_9": 1027.3577239990234, + "learning_rate": 0.0009990914580222257, + "loss": 1860.0078, + "step": 290 + }, + { + "ce_loss_13": 3.335800087451935, + "ce_loss_17": 3.196966767311096, + "ce_loss_2": 4.934219813346862, + "ce_loss_4": 4.37315182685852, + "ce_loss_9": 3.6732271552085876, + "epoch": 0.03, + "grad_norm": 888.0, + "kl_loss_13": 279.8048011779785, + "kl_loss_2": 3475.6844482421875, + "kl_loss_4": 2397.2181396484375, + "kl_loss_9": 976.2124542236328, + "learning_rate": 0.0009989933382359422, + "loss": 1823.3582, + "step": 300 + }, + { + "ce_loss_13": 3.3346471786499023, + "ce_loss_17": 3.2023627877235414, + "ce_loss_2": 4.930377960205078, + "ce_loss_4": 4.38904504776001, + "ce_loss_9": 3.67050461769104, + "epoch": 0.031, + "grad_norm": 920.0, + "kl_loss_13": 269.11675415039065, + "kl_loss_2": 3432.012890625, + "kl_loss_4": 2408.4751953125, + "kl_loss_9": 946.1452331542969, + "learning_rate": 0.0009988901935922825, + "loss": 1780.2604, + "step": 310 + }, + { + "ce_loss_13": 3.192901241779327, + "ce_loss_17": 3.0488417744636536, + "ce_loss_2": 4.862772393226623, + "ce_loss_4": 4.343963694572449, + "ce_loss_9": 3.553617537021637, + "epoch": 0.032, + "grad_norm": 1072.0, + "kl_loss_13": 273.71946868896487, + "kl_loss_2": 3621.1454467773438, + "kl_loss_4": 2612.590441894531, + "kl_loss_9": 1008.4138580322266, + "learning_rate": 0.0009987820251299122, + "loss": 1824.2422, + "step": 320 + }, + { + "ce_loss_13": 3.312174892425537, + "ce_loss_17": 3.1781514286994934, + "ce_loss_2": 4.877210545539856, + "ce_loss_4": 4.358963871002198, + "ce_loss_9": 3.658026921749115, + "epoch": 0.033, + "grad_norm": 1000.0, + "kl_loss_13": 257.35581665039064, + "kl_loss_2": 3396.768103027344, + "kl_loss_4": 2406.3000610351564, + "kl_loss_9": 963.8726593017578, + "learning_rate": 0.0009986688339380862, + "loss": 1755.6203, + "step": 330 + }, + { + "ce_loss_13": 3.2596199750900268, + "ce_loss_17": 3.1342209696769716, + "ce_loss_2": 4.812474393844605, + "ce_loss_4": 4.287576389312744, + "ce_loss_9": 3.575206446647644, + "epoch": 0.034, + "grad_norm": 812.0, + "kl_loss_13": 246.57801818847656, + "kl_loss_2": 3333.1323486328124, + "kl_loss_4": 2335.852227783203, + "kl_loss_9": 914.6424011230469, + "learning_rate": 0.0009985506211566387, + "loss": 1725.0723, + "step": 340 + }, + { + "ce_loss_13": 3.280947911739349, + "ce_loss_17": 3.1588114142417907, + "ce_loss_2": 4.8139854907989506, + "ce_loss_4": 4.290620255470276, + "ce_loss_9": 3.6064313530921934, + "epoch": 0.035, + "grad_norm": 904.0, + "kl_loss_13": 243.38776550292968, + "kl_loss_2": 3307.4294189453126, + "kl_loss_4": 2290.8899108886717, + "kl_loss_9": 913.196875, + "learning_rate": 0.0009984273879759713, + "loss": 1695.1201, + "step": 350 + }, + { + "ce_loss_13": 3.3265373706817627, + "ce_loss_17": 3.1861754298210143, + "ce_loss_2": 4.8873292207717896, + "ce_loss_4": 4.357598650455475, + "ce_loss_9": 3.659719228744507, + "epoch": 0.036, + "grad_norm": 824.0, + "kl_loss_13": 266.27539443969727, + "kl_loss_2": 3374.9498779296873, + "kl_loss_4": 2360.150671386719, + "kl_loss_9": 942.069482421875, + "learning_rate": 0.0009982991356370402, + "loss": 1760.5016, + "step": 360 + }, + { + "ce_loss_13": 3.2904040098190306, + "ce_loss_17": 3.164570677280426, + "ce_loss_2": 4.853806781768799, + "ce_loss_4": 4.328417825698852, + "ce_loss_9": 3.6372653245925903, + "epoch": 0.037, + "grad_norm": 908.0, + "kl_loss_13": 251.49842071533203, + "kl_loss_2": 3357.7119262695314, + "kl_loss_4": 2352.0833435058594, + "kl_loss_9": 957.9045379638671, + "learning_rate": 0.0009981658654313456, + "loss": 1739.8387, + "step": 370 + }, + { + "ce_loss_13": 3.3607516407966616, + "ce_loss_17": 3.2385677337646483, + "ce_loss_2": 4.871717548370361, + "ce_loss_4": 4.349547493457794, + "ce_loss_9": 3.6877449631690977, + "epoch": 0.038, + "grad_norm": 792.0, + "kl_loss_13": 234.3559371948242, + "kl_loss_2": 3248.2672607421873, + "kl_loss_4": 2256.107794189453, + "kl_loss_9": 911.4440856933594, + "learning_rate": 0.000998027578700917, + "loss": 1690.8547, + "step": 380 + }, + { + "ce_loss_13": 3.3052435636520388, + "ce_loss_17": 3.182045602798462, + "ce_loss_2": 4.843131709098816, + "ce_loss_4": 4.324041199684143, + "ce_loss_9": 3.6335140585899355, + "epoch": 0.039, + "grad_norm": 808.0, + "kl_loss_13": 238.2157127380371, + "kl_loss_2": 3307.6914672851562, + "kl_loss_4": 2316.087762451172, + "kl_loss_9": 914.0661926269531, + "learning_rate": 0.0009978842768382998, + "loss": 1703.8633, + "step": 390 + }, + { + "ce_loss_13": 3.317215847969055, + "ce_loss_17": 3.2024751543998717, + "ce_loss_2": 4.808174395561219, + "ce_loss_4": 4.288105857372284, + "ce_loss_9": 3.6276572585105895, + "epoch": 0.04, + "grad_norm": 884.0, + "kl_loss_13": 226.09404296875, + "kl_loss_2": 3198.2519409179686, + "kl_loss_4": 2213.0399536132813, + "kl_loss_9": 870.5073211669921, + "learning_rate": 0.0009977359612865424, + "loss": 1649.0453, + "step": 400 + }, + { + "ce_loss_13": 3.3236431241035462, + "ce_loss_17": 3.198979449272156, + "ce_loss_2": 4.822295427322388, + "ce_loss_4": 4.313423693180084, + "ce_loss_9": 3.65645147562027, + "epoch": 0.041, + "grad_norm": 820.0, + "kl_loss_13": 239.1367645263672, + "kl_loss_2": 3236.166125488281, + "kl_loss_4": 2261.5610900878905, + "kl_loss_9": 924.2959411621093, + "learning_rate": 0.0009975826335391806, + "loss": 1652.9313, + "step": 410 + }, + { + "ce_loss_13": 3.35085951089859, + "ce_loss_17": 3.2257604241371154, + "ce_loss_2": 4.809059381484985, + "ce_loss_4": 4.317882597446442, + "ce_loss_9": 3.6480381369590758, + "epoch": 0.042, + "grad_norm": 812.0, + "kl_loss_13": 241.73236083984375, + "kl_loss_2": 3174.3310791015624, + "kl_loss_4": 2223.89658203125, + "kl_loss_9": 868.0359283447266, + "learning_rate": 0.0009974242951402235, + "loss": 1636.1242, + "step": 420 + }, + { + "ce_loss_13": 3.358625030517578, + "ce_loss_17": 3.232568895816803, + "ce_loss_2": 4.8505696058273315, + "ce_loss_4": 4.325848865509033, + "ce_loss_9": 3.660508942604065, + "epoch": 0.043, + "grad_norm": 1048.0, + "kl_loss_13": 245.52830200195314, + "kl_loss_2": 3251.2514526367186, + "kl_loss_4": 2242.529821777344, + "kl_loss_9": 896.2119293212891, + "learning_rate": 0.0009972609476841367, + "loss": 1635.2551, + "step": 430 + }, + { + "ce_loss_13": 3.2741202831268312, + "ce_loss_17": 3.142079508304596, + "ce_loss_2": 4.7986610412597654, + "ce_loss_4": 4.274619221687317, + "ce_loss_9": 3.5810211658477784, + "epoch": 0.044, + "grad_norm": 788.0, + "kl_loss_13": 252.2869758605957, + "kl_loss_2": 3288.5843017578127, + "kl_loss_4": 2285.492303466797, + "kl_loss_9": 884.1446136474609, + "learning_rate": 0.0009970925928158272, + "loss": 1674.193, + "step": 440 + }, + { + "ce_loss_13": 3.237841033935547, + "ce_loss_17": 3.087252652645111, + "ce_loss_2": 4.73412435054779, + "ce_loss_4": 4.227073168754577, + "ce_loss_9": 3.5232383966445924, + "epoch": 0.045, + "grad_norm": 852.0, + "kl_loss_13": 281.4284843444824, + "kl_loss_2": 3322.437707519531, + "kl_loss_4": 2332.561962890625, + "kl_loss_9": 888.7865325927735, + "learning_rate": 0.000996919232230627, + "loss": 1686.0275, + "step": 450 + }, + { + "ce_loss_13": 3.3005584836006165, + "ce_loss_17": 3.178881347179413, + "ce_loss_2": 4.758726978302002, + "ce_loss_4": 4.248190772533417, + "ce_loss_9": 3.586152768135071, + "epoch": 0.046, + "grad_norm": 920.0, + "kl_loss_13": 239.50137252807616, + "kl_loss_2": 3174.419714355469, + "kl_loss_4": 2196.733538818359, + "kl_loss_9": 837.9545532226563, + "learning_rate": 0.0009967408676742752, + "loss": 1580.9902, + "step": 460 + }, + { + "ce_loss_13": 3.4333751440048217, + "ce_loss_17": 3.3133748054504393, + "ce_loss_2": 4.876122450828552, + "ce_loss_4": 4.352652883529663, + "ce_loss_9": 3.7203984379768373, + "epoch": 0.047, + "grad_norm": 980.0, + "kl_loss_13": 236.0327392578125, + "kl_loss_2": 3139.813134765625, + "kl_loss_4": 2139.4392578125, + "kl_loss_9": 849.7753234863281, + "learning_rate": 0.0009965575009429006, + "loss": 1640.5717, + "step": 470 + }, + { + "ce_loss_13": 3.2189228653907778, + "ce_loss_17": 3.1018933296203612, + "ce_loss_2": 4.741197824478149, + "ce_loss_4": 4.202152931690216, + "ce_loss_9": 3.51752450466156, + "epoch": 0.048, + "grad_norm": 852.0, + "kl_loss_13": 229.07066268920897, + "kl_loss_2": 3285.421838378906, + "kl_loss_4": 2253.768994140625, + "kl_loss_9": 860.4444122314453, + "learning_rate": 0.0009963691338830043, + "loss": 1635.4795, + "step": 480 + }, + { + "ce_loss_13": 3.3028653502464294, + "ce_loss_17": 3.195988178253174, + "ce_loss_2": 4.772507858276367, + "ce_loss_4": 4.2517603754997255, + "ce_loss_9": 3.593648087978363, + "epoch": 0.049, + "grad_norm": 776.0, + "kl_loss_13": 216.6580467224121, + "kl_loss_2": 3199.962438964844, + "kl_loss_4": 2188.4291931152343, + "kl_loss_9": 826.735205078125, + "learning_rate": 0.0009961757683914405, + "loss": 1592.0387, + "step": 490 + }, + { + "ce_loss_13": 3.299613666534424, + "ce_loss_17": 3.184020471572876, + "ce_loss_2": 4.728046226501465, + "ce_loss_4": 4.217325842380523, + "ce_loss_9": 3.58817081451416, + "epoch": 0.05, + "grad_norm": 896.0, + "kl_loss_13": 223.86487884521483, + "kl_loss_2": 3117.516735839844, + "kl_loss_4": 2141.9565612792967, + "kl_loss_9": 833.4541717529297, + "learning_rate": 0.0009959774064153978, + "loss": 1601.4448, + "step": 500 + }, + { + "ce_loss_13": 3.2994203925132752, + "ce_loss_17": 3.1923436522483826, + "ce_loss_2": 4.710976791381836, + "ce_loss_4": 4.207711231708527, + "ce_loss_9": 3.5841498017311095, + "epoch": 0.051, + "grad_norm": 888.0, + "kl_loss_13": 213.7725730895996, + "kl_loss_2": 3068.8955078125, + "kl_loss_4": 2091.95087890625, + "kl_loss_9": 815.0173492431641, + "learning_rate": 0.0009957740499523787, + "loss": 1574.0013, + "step": 510 + }, + { + "ce_loss_13": 3.320161998271942, + "ce_loss_17": 3.210383927822113, + "ce_loss_2": 4.74263710975647, + "ce_loss_4": 4.23639748096466, + "ce_loss_9": 3.606817162036896, + "epoch": 0.052, + "grad_norm": 852.0, + "kl_loss_13": 211.8182815551758, + "kl_loss_2": 3054.875537109375, + "kl_loss_4": 2104.1990478515627, + "kl_loss_9": 819.871826171875, + "learning_rate": 0.0009955657010501807, + "loss": 1556.8715, + "step": 520 + }, + { + "ce_loss_13": 3.2835038781166075, + "ce_loss_17": 3.1703851938247682, + "ce_loss_2": 4.721176028251648, + "ce_loss_4": 4.219398236274719, + "ce_loss_9": 3.574008357524872, + "epoch": 0.053, + "grad_norm": 920.0, + "kl_loss_13": 215.8219902038574, + "kl_loss_2": 3113.4391235351563, + "kl_loss_4": 2141.0289306640625, + "kl_loss_9": 823.7013916015625, + "learning_rate": 0.000995352361806875, + "loss": 1563.4373, + "step": 530 + }, + { + "ce_loss_13": 3.3192780733108522, + "ce_loss_17": 3.209794056415558, + "ce_loss_2": 4.748934602737426, + "ce_loss_4": 4.23925244808197, + "ce_loss_9": 3.6144827246665954, + "epoch": 0.054, + "grad_norm": 1040.0, + "kl_loss_13": 216.51064758300782, + "kl_loss_2": 3116.839221191406, + "kl_loss_4": 2139.485247802734, + "kl_loss_9": 836.2906158447265, + "learning_rate": 0.0009951340343707852, + "loss": 1594.1735, + "step": 540 + }, + { + "ce_loss_13": 3.3704386234283445, + "ce_loss_17": 3.2618794322013853, + "ce_loss_2": 4.810385036468506, + "ce_loss_4": 4.29154908657074, + "ce_loss_9": 3.6570982336997986, + "epoch": 0.055, + "grad_norm": 864.0, + "kl_loss_13": 205.82294235229492, + "kl_loss_2": 3093.702746582031, + "kl_loss_4": 2105.367791748047, + "kl_loss_9": 807.5787536621094, + "learning_rate": 0.0009949107209404665, + "loss": 1569.8831, + "step": 550 + }, + { + "ce_loss_13": 3.2878594994544983, + "ce_loss_17": 3.180399990081787, + "ce_loss_2": 4.684529542922974, + "ce_loss_4": 4.193032765388489, + "ce_loss_9": 3.5761741995811462, + "epoch": 0.056, + "grad_norm": 844.0, + "kl_loss_13": 209.75063247680663, + "kl_loss_2": 3052.921826171875, + "kl_loss_4": 2102.746240234375, + "kl_loss_9": 817.0229125976563, + "learning_rate": 0.0009946824237646824, + "loss": 1552.7129, + "step": 560 + }, + { + "ce_loss_13": 3.2481284856796266, + "ce_loss_17": 3.130042612552643, + "ce_loss_2": 4.676752805709839, + "ce_loss_4": 4.158942246437073, + "ce_loss_9": 3.5407792210578917, + "epoch": 0.057, + "grad_norm": 892.0, + "kl_loss_13": 227.67123107910157, + "kl_loss_2": 3122.7850952148438, + "kl_loss_4": 2140.1810546875, + "kl_loss_9": 853.0984161376953, + "learning_rate": 0.0009944491451423828, + "loss": 1607.5732, + "step": 570 + }, + { + "ce_loss_13": 3.2379589796066286, + "ce_loss_17": 3.1253902673721314, + "ce_loss_2": 4.688625645637512, + "ce_loss_4": 4.172986876964569, + "ce_loss_9": 3.5358773946762083, + "epoch": 0.058, + "grad_norm": 980.0, + "kl_loss_13": 220.05194091796875, + "kl_loss_2": 3150.2963256835938, + "kl_loss_4": 2158.8337890625, + "kl_loss_9": 845.115853881836, + "learning_rate": 0.0009942108874226813, + "loss": 1565.5949, + "step": 580 + }, + { + "ce_loss_13": 3.3432974576950074, + "ce_loss_17": 3.237903428077698, + "ce_loss_2": 4.718752670288086, + "ce_loss_4": 4.2182901978492735, + "ce_loss_9": 3.6234296202659606, + "epoch": 0.059, + "grad_norm": 840.0, + "kl_loss_13": 208.56099853515624, + "kl_loss_2": 2984.700390625, + "kl_loss_4": 2032.6259399414062, + "kl_loss_9": 806.5859741210937, + "learning_rate": 0.00099396765300483, + "loss": 1500.7738, + "step": 590 + }, + { + "ce_loss_13": 3.328046131134033, + "ce_loss_17": 3.218497109413147, + "ce_loss_2": 4.708283472061157, + "ce_loss_4": 4.201330256462097, + "ce_loss_9": 3.6069679260253906, + "epoch": 0.06, + "grad_norm": 888.0, + "kl_loss_13": 208.2171417236328, + "kl_loss_2": 3003.5567260742187, + "kl_loss_4": 2035.9237670898438, + "kl_loss_9": 814.7355163574218, + "learning_rate": 0.0009937194443381972, + "loss": 1517.4422, + "step": 600 + }, + { + "ce_loss_13": 3.351373827457428, + "ce_loss_17": 3.248007595539093, + "ce_loss_2": 4.700898742675781, + "ce_loss_4": 4.1996043682098385, + "ce_loss_9": 3.6269415378570558, + "epoch": 0.061, + "grad_norm": 852.0, + "kl_loss_13": 213.1246765136719, + "kl_loss_2": 2948.2832763671877, + "kl_loss_4": 1977.8431701660156, + "kl_loss_9": 805.3146514892578, + "learning_rate": 0.0009934662639222412, + "loss": 1527.7855, + "step": 610 + }, + { + "ce_loss_13": 3.306493330001831, + "ce_loss_17": 3.197219491004944, + "ce_loss_2": 4.722518062591552, + "ce_loss_4": 4.20321216583252, + "ce_loss_9": 3.5900039792060854, + "epoch": 0.062, + "grad_norm": 828.0, + "kl_loss_13": 211.4220764160156, + "kl_loss_2": 3096.3796997070312, + "kl_loss_4": 2093.686730957031, + "kl_loss_9": 821.0024108886719, + "learning_rate": 0.000993208114306486, + "loss": 1540.724, + "step": 620 + }, + { + "ce_loss_13": 3.2282010436058046, + "ce_loss_17": 3.122623550891876, + "ce_loss_2": 4.667038011550903, + "ce_loss_4": 4.138805234432221, + "ce_loss_9": 3.523139810562134, + "epoch": 0.063, + "grad_norm": 832.0, + "kl_loss_13": 204.72729873657227, + "kl_loss_2": 3109.895373535156, + "kl_loss_4": 2091.053857421875, + "kl_loss_9": 830.4272247314453, + "learning_rate": 0.0009929449980904952, + "loss": 1513.9823, + "step": 630 + }, + { + "ce_loss_13": 3.280583643913269, + "ce_loss_17": 3.1798858761787416, + "ce_loss_2": 4.680929517745971, + "ce_loss_4": 4.179859507083893, + "ce_loss_9": 3.560952889919281, + "epoch": 0.064, + "grad_norm": 824.0, + "kl_loss_13": 203.02983779907225, + "kl_loss_2": 3051.500378417969, + "kl_loss_4": 2077.208544921875, + "kl_loss_9": 792.8849395751953, + "learning_rate": 0.0009926769179238466, + "loss": 1517.5954, + "step": 640 + }, + { + "ce_loss_13": 3.3260207295417787, + "ce_loss_17": 3.215930688381195, + "ce_loss_2": 4.718131160736084, + "ce_loss_4": 4.217852258682251, + "ce_loss_9": 3.611370527744293, + "epoch": 0.065, + "grad_norm": 852.0, + "kl_loss_13": 217.96212615966797, + "kl_loss_2": 3034.152966308594, + "kl_loss_4": 2066.0351013183595, + "kl_loss_9": 825.8590057373046, + "learning_rate": 0.000992403876506104, + "loss": 1528.566, + "step": 650 + }, + { + "ce_loss_13": 3.2665056943893434, + "ce_loss_17": 3.1604172229766845, + "ce_loss_2": 4.665199565887451, + "ce_loss_4": 4.154677987098694, + "ce_loss_9": 3.5465357542037963, + "epoch": 0.066, + "grad_norm": 788.0, + "kl_loss_13": 211.45065841674804, + "kl_loss_2": 3030.143273925781, + "kl_loss_4": 2047.1982666015624, + "kl_loss_9": 792.4408630371094, + "learning_rate": 0.0009921258765867918, + "loss": 1520.4655, + "step": 660 + }, + { + "ce_loss_13": 3.2385209918022158, + "ce_loss_17": 3.1311469674110413, + "ce_loss_2": 4.666577696800232, + "ce_loss_4": 4.152556109428406, + "ce_loss_9": 3.5041945457458494, + "epoch": 0.067, + "grad_norm": 980.0, + "kl_loss_13": 220.62510452270507, + "kl_loss_2": 3123.2228271484373, + "kl_loss_4": 2120.7830322265627, + "kl_loss_9": 787.7964691162109, + "learning_rate": 0.0009918429209653662, + "loss": 1535.3805, + "step": 670 + }, + { + "ce_loss_13": 3.2878018260002135, + "ce_loss_17": 3.185037171840668, + "ce_loss_2": 4.68726658821106, + "ce_loss_4": 4.178025591373443, + "ce_loss_9": 3.545906662940979, + "epoch": 0.068, + "grad_norm": 924.0, + "kl_loss_13": 212.08704223632813, + "kl_loss_2": 3069.2210205078127, + "kl_loss_4": 2080.496862792969, + "kl_loss_9": 769.9676300048828, + "learning_rate": 0.0009915550124911866, + "loss": 1496.4549, + "step": 680 + }, + { + "ce_loss_13": 3.2975847840309145, + "ce_loss_17": 3.191459906101227, + "ce_loss_2": 4.661965942382812, + "ce_loss_4": 4.168090236186981, + "ce_loss_9": 3.557912361621857, + "epoch": 0.069, + "grad_norm": 984.0, + "kl_loss_13": 202.267578125, + "kl_loss_2": 2969.9846069335936, + "kl_loss_4": 2024.4958984375, + "kl_loss_9": 759.2939453125, + "learning_rate": 0.0009912621540634887, + "loss": 1494.6292, + "step": 690 + }, + { + "ce_loss_13": 3.3303091049194338, + "ce_loss_17": 3.2297561883926393, + "ce_loss_2": 4.657060122489929, + "ce_loss_4": 4.15217102766037, + "ce_loss_9": 3.575939106941223, + "epoch": 0.07, + "grad_norm": 848.0, + "kl_loss_13": 197.1610595703125, + "kl_loss_2": 2921.2212890625, + "kl_loss_4": 1953.4702392578124, + "kl_loss_9": 743.5868896484375, + "learning_rate": 0.0009909643486313534, + "loss": 1470.627, + "step": 700 + }, + { + "ce_loss_13": 3.216227877140045, + "ce_loss_17": 3.1118495225906373, + "ce_loss_2": 4.621934008598328, + "ce_loss_4": 4.103268921375275, + "ce_loss_9": 3.484371030330658, + "epoch": 0.071, + "grad_norm": 944.0, + "kl_loss_13": 207.796826171875, + "kl_loss_2": 3059.703479003906, + "kl_loss_4": 2075.1166564941404, + "kl_loss_9": 775.9051879882812, + "learning_rate": 0.000990661599193678, + "loss": 1548.0862, + "step": 710 + }, + { + "ce_loss_13": 3.334531307220459, + "ce_loss_17": 3.2338029742240906, + "ce_loss_2": 4.672219610214233, + "ce_loss_4": 4.196356952190399, + "ce_loss_9": 3.5990506172180177, + "epoch": 0.072, + "grad_norm": 868.0, + "kl_loss_13": 198.27711029052733, + "kl_loss_2": 2917.584912109375, + "kl_loss_4": 1994.476416015625, + "kl_loss_9": 753.3980407714844, + "learning_rate": 0.0009903539087991462, + "loss": 1471.3186, + "step": 720 + }, + { + "ce_loss_13": 3.306708645820618, + "ce_loss_17": 3.211997056007385, + "ce_loss_2": 4.664749217033386, + "ce_loss_4": 4.165421199798584, + "ce_loss_9": 3.5773617148399355, + "epoch": 0.073, + "grad_norm": 924.0, + "kl_loss_13": 193.02234420776367, + "kl_loss_2": 2960.2150146484373, + "kl_loss_4": 1988.4537780761718, + "kl_loss_9": 766.4886810302735, + "learning_rate": 0.0009900412805461966, + "loss": 1486.0689, + "step": 730 + }, + { + "ce_loss_13": 3.380020833015442, + "ce_loss_17": 3.2837207198143004, + "ce_loss_2": 4.69500732421875, + "ce_loss_4": 4.193862104415894, + "ce_loss_9": 3.642804491519928, + "epoch": 0.074, + "grad_norm": 884.0, + "kl_loss_13": 191.6598693847656, + "kl_loss_2": 2885.8241455078123, + "kl_loss_4": 1912.3060546875, + "kl_loss_9": 760.4866882324219, + "learning_rate": 0.0009897237175829927, + "loss": 1459.9974, + "step": 740 + }, + { + "ce_loss_13": 3.2747390151023863, + "ce_loss_17": 3.172705125808716, + "ce_loss_2": 4.631512951850891, + "ce_loss_4": 4.128652667999267, + "ce_loss_9": 3.5400785207748413, + "epoch": 0.075, + "grad_norm": 816.0, + "kl_loss_13": 206.5009735107422, + "kl_loss_2": 2969.1276000976563, + "kl_loss_4": 1994.787353515625, + "kl_loss_9": 773.9206848144531, + "learning_rate": 0.0009894012231073895, + "loss": 1478.0417, + "step": 750 + }, + { + "ce_loss_13": 3.3099666357040407, + "ce_loss_17": 3.213700032234192, + "ce_loss_2": 4.657934308052063, + "ce_loss_4": 4.151252472400666, + "ce_loss_9": 3.5727837681770325, + "epoch": 0.076, + "grad_norm": 816.0, + "kl_loss_13": 192.21796875, + "kl_loss_2": 2935.162072753906, + "kl_loss_4": 1964.804754638672, + "kl_loss_9": 751.2708740234375, + "learning_rate": 0.0009890738003669028, + "loss": 1473.6896, + "step": 760 + }, + { + "ce_loss_13": 3.286187970638275, + "ce_loss_17": 3.1884525060653686, + "ce_loss_2": 4.675263500213623, + "ce_loss_4": 4.157026648521423, + "ce_loss_9": 3.5568581104278563, + "epoch": 0.077, + "grad_norm": 916.0, + "kl_loss_13": 195.28812713623046, + "kl_loss_2": 3032.9372436523436, + "kl_loss_4": 2032.5546325683595, + "kl_loss_9": 768.0154083251953, + "learning_rate": 0.0009887414526586764, + "loss": 1463.2814, + "step": 770 + }, + { + "ce_loss_13": 3.3352948307991026, + "ce_loss_17": 3.244133722782135, + "ce_loss_2": 4.680701231956482, + "ce_loss_4": 4.167735481262207, + "ce_loss_9": 3.5890052795410154, + "epoch": 0.078, + "grad_norm": 864.0, + "kl_loss_13": 184.67481536865233, + "kl_loss_2": 2936.3667846679687, + "kl_loss_4": 1940.8009765625, + "kl_loss_9": 733.9982391357422, + "learning_rate": 0.0009884041833294476, + "loss": 1426.1463, + "step": 780 + }, + { + "ce_loss_13": 3.339913558959961, + "ce_loss_17": 3.246951973438263, + "ce_loss_2": 4.6608011484146115, + "ce_loss_4": 4.160940778255463, + "ce_loss_9": 3.5881938099861146, + "epoch": 0.079, + "grad_norm": 940.0, + "kl_loss_13": 182.87334671020508, + "kl_loss_2": 2884.050256347656, + "kl_loss_4": 1906.6745239257812, + "kl_loss_9": 716.9421234130859, + "learning_rate": 0.000988061995775515, + "loss": 1458.4076, + "step": 790 + }, + { + "ce_loss_13": 3.2691999435424806, + "ce_loss_17": 3.1761279344558715, + "ce_loss_2": 4.578247976303101, + "ce_loss_4": 4.096813499927521, + "ce_loss_9": 3.5238449573516846, + "epoch": 0.08, + "grad_norm": 808.0, + "kl_loss_13": 185.4369689941406, + "kl_loss_2": 2891.0528198242187, + "kl_loss_4": 1946.8839965820312, + "kl_loss_9": 743.9428344726563, + "learning_rate": 0.0009877148934427035, + "loss": 1441.4863, + "step": 800 + }, + { + "ce_loss_13": 3.3132549285888673, + "ce_loss_17": 3.2210619926452635, + "ce_loss_2": 4.656325578689575, + "ce_loss_4": 4.150240135192871, + "ce_loss_9": 3.563402760028839, + "epoch": 0.081, + "grad_norm": 896.0, + "kl_loss_13": 186.15495758056642, + "kl_loss_2": 2933.0266357421874, + "kl_loss_4": 1955.7246215820312, + "kl_loss_9": 735.9151153564453, + "learning_rate": 0.0009873628798263297, + "loss": 1431.6289, + "step": 810 + }, + { + "ce_loss_13": 3.2747130036354064, + "ce_loss_17": 3.175995099544525, + "ce_loss_2": 4.574577975273132, + "ce_loss_4": 4.0853879570961, + "ce_loss_9": 3.5176785469055174, + "epoch": 0.082, + "grad_norm": 1048.0, + "kl_loss_13": 197.83297729492188, + "kl_loss_2": 2859.0701293945312, + "kl_loss_4": 1907.5581665039062, + "kl_loss_9": 720.6616302490235, + "learning_rate": 0.0009870059584711668, + "loss": 1462.2594, + "step": 820 + }, + { + "ce_loss_13": 3.2835761308670044, + "ce_loss_17": 3.188172221183777, + "ce_loss_2": 4.596423959732055, + "ce_loss_4": 4.103586983680725, + "ce_loss_9": 3.53554652929306, + "epoch": 0.083, + "grad_norm": 812.0, + "kl_loss_13": 186.16600036621094, + "kl_loss_2": 2884.51796875, + "kl_loss_4": 1926.0746459960938, + "kl_loss_9": 729.4662231445312, + "learning_rate": 0.000986644132971409, + "loss": 1430.7047, + "step": 830 + }, + { + "ce_loss_13": 3.2714996099472047, + "ce_loss_17": 3.176540768146515, + "ce_loss_2": 4.61619758605957, + "ce_loss_4": 4.12186279296875, + "ce_loss_9": 3.5301250576972962, + "epoch": 0.084, + "grad_norm": 828.0, + "kl_loss_13": 185.5873245239258, + "kl_loss_2": 2920.5774047851564, + "kl_loss_4": 1954.5056579589843, + "kl_loss_9": 741.8969207763672, + "learning_rate": 0.0009862774069706345, + "loss": 1437.839, + "step": 840 + }, + { + "ce_loss_13": 3.3890750765800477, + "ce_loss_17": 3.3003832340240478, + "ce_loss_2": 4.662535715103149, + "ce_loss_4": 4.182697641849518, + "ce_loss_9": 3.63095440864563, + "epoch": 0.085, + "grad_norm": 852.0, + "kl_loss_13": 180.29074478149414, + "kl_loss_2": 2812.5008056640627, + "kl_loss_4": 1887.4568115234374, + "kl_loss_9": 717.6001190185547, + "learning_rate": 0.000985905784161771, + "loss": 1416.2434, + "step": 850 + }, + { + "ce_loss_13": 3.3158096432685853, + "ce_loss_17": 3.228053939342499, + "ce_loss_2": 4.619459867477417, + "ce_loss_4": 4.117097306251526, + "ce_loss_9": 3.562677466869354, + "epoch": 0.086, + "grad_norm": 900.0, + "kl_loss_13": 174.8151641845703, + "kl_loss_2": 2844.3873901367188, + "kl_loss_4": 1881.8870666503906, + "kl_loss_9": 708.1405639648438, + "learning_rate": 0.000985529268287055, + "loss": 1398.2658, + "step": 860 + }, + { + "ce_loss_13": 3.248007822036743, + "ce_loss_17": 3.156341755390167, + "ce_loss_2": 4.600780200958252, + "ce_loss_4": 4.094212400913238, + "ce_loss_9": 3.5029677152633667, + "epoch": 0.087, + "grad_norm": 1040.0, + "kl_loss_13": 178.51637496948243, + "kl_loss_2": 2930.3122314453126, + "kl_loss_4": 1948.3072021484375, + "kl_loss_9": 719.6942810058594, + "learning_rate": 0.0009851478631379982, + "loss": 1437.3121, + "step": 870 + }, + { + "ce_loss_13": 3.31154705286026, + "ce_loss_17": 3.21948881149292, + "ce_loss_2": 4.628391528129578, + "ce_loss_4": 4.124966013431549, + "ce_loss_9": 3.5654338121414186, + "epoch": 0.088, + "grad_norm": 888.0, + "kl_loss_13": 175.02695007324218, + "kl_loss_2": 2871.5215698242187, + "kl_loss_4": 1896.9290771484375, + "kl_loss_9": 717.7419036865234, + "learning_rate": 0.0009847615725553456, + "loss": 1412.3423, + "step": 880 + }, + { + "ce_loss_13": 3.3594752073287966, + "ce_loss_17": 3.2758359909057617, + "ce_loss_2": 4.607688045501709, + "ce_loss_4": 4.12935242652893, + "ce_loss_9": 3.589799475669861, + "epoch": 0.089, + "grad_norm": 828.0, + "kl_loss_13": 167.3798500061035, + "kl_loss_2": 2722.2293212890627, + "kl_loss_4": 1806.6529846191406, + "kl_loss_9": 676.9883209228516, + "learning_rate": 0.0009843704004290394, + "loss": 1398.4903, + "step": 890 + }, + { + "ce_loss_13": 3.2744773983955384, + "ce_loss_17": 3.1850821495056154, + "ce_loss_2": 4.5737762451171875, + "ce_loss_4": 4.084364438056946, + "ce_loss_9": 3.51882985830307, + "epoch": 0.09, + "grad_norm": 860.0, + "kl_loss_13": 180.4751724243164, + "kl_loss_2": 2863.1496704101564, + "kl_loss_4": 1913.1431701660156, + "kl_loss_9": 722.023648071289, + "learning_rate": 0.0009839743506981783, + "loss": 1417.8385, + "step": 900 + }, + { + "ce_loss_13": 3.1950653553009034, + "ce_loss_17": 3.1013781309127806, + "ce_loss_2": 4.567150330543518, + "ce_loss_4": 4.052671599388122, + "ce_loss_9": 3.4548081278800966, + "epoch": 0.091, + "grad_norm": 920.0, + "kl_loss_13": 183.05032653808593, + "kl_loss_2": 3009.1415161132813, + "kl_loss_4": 2006.8298583984374, + "kl_loss_9": 744.9382751464843, + "learning_rate": 0.0009835734273509786, + "loss": 1449.6328, + "step": 910 + }, + { + "ce_loss_13": 3.291651165485382, + "ce_loss_17": 3.199280619621277, + "ce_loss_2": 4.614070200920105, + "ce_loss_4": 4.107669317722321, + "ce_loss_9": 3.5364214301109316, + "epoch": 0.092, + "grad_norm": 892.0, + "kl_loss_13": 181.22847747802734, + "kl_loss_2": 2855.7881591796877, + "kl_loss_4": 1885.0700744628907, + "kl_loss_9": 705.8559112548828, + "learning_rate": 0.0009831676344247342, + "loss": 1411.9321, + "step": 920 + }, + { + "ce_loss_13": 3.30694317817688, + "ce_loss_17": 3.219570589065552, + "ce_loss_2": 4.576501417160034, + "ce_loss_4": 4.094702243804932, + "ce_loss_9": 3.544420289993286, + "epoch": 0.093, + "grad_norm": 908.0, + "kl_loss_13": 174.2581703186035, + "kl_loss_2": 2793.7328247070313, + "kl_loss_4": 1848.8707702636718, + "kl_loss_9": 691.9741424560547, + "learning_rate": 0.0009827569760057755, + "loss": 1400.5642, + "step": 930 + }, + { + "ce_loss_13": 3.224755549430847, + "ce_loss_17": 3.1328879237174987, + "ce_loss_2": 4.5977825164794925, + "ce_loss_4": 4.091392683982849, + "ce_loss_9": 3.4865678668022158, + "epoch": 0.094, + "grad_norm": 1208.0, + "kl_loss_13": 182.91203765869142, + "kl_loss_2": 2979.7569946289063, + "kl_loss_4": 1993.7948974609376, + "kl_loss_9": 740.7369689941406, + "learning_rate": 0.000982341456229428, + "loss": 1426.21, + "step": 940 + }, + { + "ce_loss_13": 3.3133137345314028, + "ce_loss_17": 3.2248087882995606, + "ce_loss_2": 4.633661770820618, + "ce_loss_4": 4.145521080493927, + "ce_loss_9": 3.5662601947784425, + "epoch": 0.095, + "grad_norm": 988.0, + "kl_loss_13": 177.18944854736327, + "kl_loss_2": 2895.0067504882813, + "kl_loss_4": 1942.5609558105468, + "kl_loss_9": 731.1706604003906, + "learning_rate": 0.000981921079279971, + "loss": 1400.7357, + "step": 950 + }, + { + "ce_loss_13": 3.3292991876602174, + "ce_loss_17": 3.24774911403656, + "ce_loss_2": 4.5529484987258915, + "ce_loss_4": 4.088865315914154, + "ce_loss_9": 3.557070314884186, + "epoch": 0.096, + "grad_norm": 824.0, + "kl_loss_13": 168.73668365478517, + "kl_loss_2": 2716.8586181640626, + "kl_loss_4": 1811.3070129394532, + "kl_loss_9": 684.36279296875, + "learning_rate": 0.0009814958493905962, + "loss": 1365.1486, + "step": 960 + }, + { + "ce_loss_13": 3.2863628387451174, + "ce_loss_17": 3.1977596521377563, + "ce_loss_2": 4.602845215797425, + "ce_loss_4": 4.111985778808593, + "ce_loss_9": 3.5324569940567017, + "epoch": 0.097, + "grad_norm": 952.0, + "kl_loss_13": 173.95999755859376, + "kl_loss_2": 2862.782275390625, + "kl_loss_4": 1912.7567749023438, + "kl_loss_9": 709.2404663085938, + "learning_rate": 0.0009810657708433637, + "loss": 1434.0191, + "step": 970 + }, + { + "ce_loss_13": 3.3615291357040404, + "ce_loss_17": 3.2735565185546873, + "ce_loss_2": 4.593872308731079, + "ce_loss_4": 4.110198199748993, + "ce_loss_9": 3.5845670461654664, + "epoch": 0.098, + "grad_norm": 1104.0, + "kl_loss_13": 170.09545974731446, + "kl_loss_2": 2700.237255859375, + "kl_loss_4": 1781.9847534179687, + "kl_loss_9": 674.5600952148437, + "learning_rate": 0.0009806308479691594, + "loss": 1350.6071, + "step": 980 + }, + { + "ce_loss_13": 3.365022373199463, + "ce_loss_17": 3.2740842938423156, + "ce_loss_2": 4.645147061347961, + "ce_loss_4": 4.163328278064728, + "ce_loss_9": 3.606741690635681, + "epoch": 0.099, + "grad_norm": 828.0, + "kl_loss_13": 182.7116500854492, + "kl_loss_2": 2796.923962402344, + "kl_loss_4": 1857.0793518066407, + "kl_loss_9": 704.7737335205078, + "learning_rate": 0.0009801910851476522, + "loss": 1380.1334, + "step": 990 + }, + { + "ce_loss_13": 3.2967852473258974, + "ce_loss_17": 3.200886142253876, + "ce_loss_2": 4.613013792037964, + "ce_loss_4": 4.1017282724380495, + "ce_loss_9": 3.5339526891708375, + "epoch": 0.1, + "grad_norm": 840.0, + "kl_loss_13": 186.91926651000978, + "kl_loss_2": 2923.3825073242188, + "kl_loss_4": 1924.8136657714845, + "kl_loss_9": 715.3273315429688, + "learning_rate": 0.0009797464868072487, + "loss": 1397.8391, + "step": 1000 + }, + { + "ce_loss_13": 3.2863819003105164, + "ce_loss_17": 3.1901052355766297, + "ce_loss_2": 4.579378271102906, + "ce_loss_4": 4.075677537918091, + "ce_loss_9": 3.5197189807891847, + "epoch": 0.101, + "grad_norm": 936.0, + "kl_loss_13": 193.87194137573243, + "kl_loss_2": 2853.961389160156, + "kl_loss_4": 1882.4125427246095, + "kl_loss_9": 714.1756072998047, + "learning_rate": 0.0009792970574250492, + "loss": 1403.802, + "step": 1010 + }, + { + "ce_loss_13": 3.3069553136825562, + "ce_loss_17": 3.2100618958473204, + "ce_loss_2": 4.577626061439514, + "ce_loss_4": 4.090611159801483, + "ce_loss_9": 3.545800769329071, + "epoch": 0.102, + "grad_norm": 896.0, + "kl_loss_13": 201.89414901733397, + "kl_loss_2": 2804.7287475585936, + "kl_loss_4": 1858.0650390625, + "kl_loss_9": 712.4587890625, + "learning_rate": 0.0009788428015268028, + "loss": 1374.8735, + "step": 1020 + }, + { + "ce_loss_13": 3.306472420692444, + "ce_loss_17": 3.211907708644867, + "ce_loss_2": 4.55790421962738, + "ce_loss_4": 4.068548572063446, + "ce_loss_9": 3.5357192039489744, + "epoch": 0.103, + "grad_norm": 780.0, + "kl_loss_13": 189.52980728149413, + "kl_loss_2": 2759.1054931640624, + "kl_loss_4": 1814.9620727539063, + "kl_loss_9": 690.7964782714844, + "learning_rate": 0.0009783837236868609, + "loss": 1360.2857, + "step": 1030 + }, + { + "ce_loss_13": 3.2745940327644347, + "ce_loss_17": 3.175362932682037, + "ce_loss_2": 4.527231812477112, + "ce_loss_4": 4.049796831607819, + "ce_loss_9": 3.500406301021576, + "epoch": 0.104, + "grad_norm": 824.0, + "kl_loss_13": 188.25073165893554, + "kl_loss_2": 2740.2807373046876, + "kl_loss_4": 1825.733056640625, + "kl_loss_9": 685.0412841796875, + "learning_rate": 0.0009779198285281327, + "loss": 1355.6325, + "step": 1040 + }, + { + "ce_loss_13": 3.2650335311889647, + "ce_loss_17": 3.1757237792015074, + "ce_loss_2": 4.554905295372009, + "ce_loss_4": 4.055523097515106, + "ce_loss_9": 3.501880872249603, + "epoch": 0.105, + "grad_norm": 768.0, + "kl_loss_13": 179.175390625, + "kl_loss_2": 2826.79638671875, + "kl_loss_4": 1854.6113342285157, + "kl_loss_9": 692.5723236083984, + "learning_rate": 0.0009774511207220368, + "loss": 1383.286, + "step": 1050 + }, + { + "ce_loss_13": 3.3126551866531373, + "ce_loss_17": 3.220487451553345, + "ce_loss_2": 4.602885723114014, + "ce_loss_4": 4.097618770599365, + "ce_loss_9": 3.53983451128006, + "epoch": 0.106, + "grad_norm": 832.0, + "kl_loss_13": 180.63662490844726, + "kl_loss_2": 2827.6142822265624, + "kl_loss_4": 1844.4667907714843, + "kl_loss_9": 683.4518646240234, + "learning_rate": 0.0009769776049884564, + "loss": 1374.1806, + "step": 1060 + }, + { + "ce_loss_13": 3.2170669198036195, + "ce_loss_17": 3.130579960346222, + "ce_loss_2": 4.54125816822052, + "ce_loss_4": 4.028694069385528, + "ce_loss_9": 3.4636984348297117, + "epoch": 0.107, + "grad_norm": 1128.0, + "kl_loss_13": 176.6777114868164, + "kl_loss_2": 2897.8383056640623, + "kl_loss_4": 1896.3052307128905, + "kl_loss_9": 716.9917236328125, + "learning_rate": 0.0009764992860956889, + "loss": 1428.724, + "step": 1070 + }, + { + "ce_loss_13": 3.3626615524291994, + "ce_loss_17": 3.2824303030967714, + "ce_loss_2": 4.564817667007446, + "ce_loss_4": 4.107381689548492, + "ce_loss_9": 3.590293490886688, + "epoch": 0.108, + "grad_norm": 752.0, + "kl_loss_13": 162.8722915649414, + "kl_loss_2": 2650.8736206054687, + "kl_loss_4": 1751.8924377441406, + "kl_loss_9": 671.3090393066407, + "learning_rate": 0.0009760161688604008, + "loss": 1334.0248, + "step": 1080 + }, + { + "ce_loss_13": 3.3611219763755797, + "ce_loss_17": 3.279569685459137, + "ce_loss_2": 4.617689728736877, + "ce_loss_4": 4.137126886844635, + "ce_loss_9": 3.5966805458068847, + "epoch": 0.109, + "grad_norm": 1024.0, + "kl_loss_13": 164.7625717163086, + "kl_loss_2": 2726.881628417969, + "kl_loss_4": 1808.677264404297, + "kl_loss_9": 687.6751068115234, + "learning_rate": 0.0009755282581475768, + "loss": 1365.8348, + "step": 1090 + }, + { + "ce_loss_13": 3.4059563994407656, + "ce_loss_17": 3.322954511642456, + "ce_loss_2": 4.643022751808166, + "ce_loss_4": 4.158383297920227, + "ce_loss_9": 3.6426517486572267, + "epoch": 0.11, + "grad_norm": 748.0, + "kl_loss_13": 168.04851303100585, + "kl_loss_2": 2703.0779052734374, + "kl_loss_4": 1772.2031127929688, + "kl_loss_9": 690.7918273925782, + "learning_rate": 0.0009750355588704727, + "loss": 1325.6271, + "step": 1100 + }, + { + "ce_loss_13": 3.249567782878876, + "ce_loss_17": 3.167778730392456, + "ce_loss_2": 4.522468519210816, + "ce_loss_4": 4.022468602657318, + "ce_loss_9": 3.4856932878494264, + "epoch": 0.111, + "grad_norm": 884.0, + "kl_loss_13": 165.72621231079103, + "kl_loss_2": 2760.001416015625, + "kl_loss_4": 1802.3526733398437, + "kl_loss_9": 680.4075592041015, + "learning_rate": 0.0009745380759905647, + "loss": 1378.9523, + "step": 1110 + }, + { + "ce_loss_13": 3.203279495239258, + "ce_loss_17": 3.1249512672424316, + "ce_loss_2": 4.488472032546997, + "ce_loss_4": 3.998249518871307, + "ce_loss_9": 3.443999207019806, + "epoch": 0.112, + "grad_norm": 824.0, + "kl_loss_13": 165.66986923217775, + "kl_loss_2": 2810.6823120117188, + "kl_loss_4": 1844.29384765625, + "kl_loss_9": 683.6999298095703, + "learning_rate": 0.0009740358145174998, + "loss": 1404.8479, + "step": 1120 + }, + { + "ce_loss_13": 3.3505819439888, + "ce_loss_17": 3.2721446633338926, + "ce_loss_2": 4.559712290763855, + "ce_loss_4": 4.0938373565673825, + "ce_loss_9": 3.5750083208084105, + "epoch": 0.113, + "grad_norm": 840.0, + "kl_loss_13": 164.54634780883788, + "kl_loss_2": 2677.1094970703125, + "kl_loss_4": 1776.9581970214845, + "kl_loss_9": 664.6269165039063, + "learning_rate": 0.0009735287795090455, + "loss": 1339.4387, + "step": 1130 + }, + { + "ce_loss_13": 3.245984637737274, + "ce_loss_17": 3.163719344139099, + "ce_loss_2": 4.511726307868957, + "ce_loss_4": 4.028553128242493, + "ce_loss_9": 3.48677716255188, + "epoch": 0.114, + "grad_norm": 1152.0, + "kl_loss_13": 164.15415267944337, + "kl_loss_2": 2762.4733764648436, + "kl_loss_4": 1828.445751953125, + "kl_loss_9": 686.725439453125, + "learning_rate": 0.0009730169760710386, + "loss": 1351.7522, + "step": 1140 + }, + { + "ce_loss_13": 3.3219132304191588, + "ce_loss_17": 3.23946076631546, + "ce_loss_2": 4.5721259117126465, + "ce_loss_4": 4.086527943611145, + "ce_loss_9": 3.570422077178955, + "epoch": 0.115, + "grad_norm": 980.0, + "kl_loss_13": 172.59002685546875, + "kl_loss_2": 2722.2961303710936, + "kl_loss_4": 1786.2447204589844, + "kl_loss_9": 706.603970336914, + "learning_rate": 0.0009725004093573342, + "loss": 1360.8584, + "step": 1150 + }, + { + "ce_loss_13": 3.271444249153137, + "ce_loss_17": 3.18392835855484, + "ce_loss_2": 4.5310950756073, + "ce_loss_4": 4.044306802749634, + "ce_loss_9": 3.5186336159706117, + "epoch": 0.116, + "grad_norm": 1144.0, + "kl_loss_13": 174.4172332763672, + "kl_loss_2": 2716.383581542969, + "kl_loss_4": 1774.4743957519531, + "kl_loss_9": 696.9432922363281, + "learning_rate": 0.0009719790845697534, + "loss": 1338.9479, + "step": 1160 + }, + { + "ce_loss_13": 3.2252618074417114, + "ce_loss_17": 3.1453272104263306, + "ce_loss_2": 4.431479716300965, + "ce_loss_4": 3.9704400300979614, + "ce_loss_9": 3.4573128461837768, + "epoch": 0.117, + "grad_norm": 828.0, + "kl_loss_13": 165.33498992919922, + "kl_loss_2": 2661.2419677734374, + "kl_loss_4": 1766.5945068359374, + "kl_loss_9": 676.1386291503907, + "learning_rate": 0.0009714530069580309, + "loss": 1322.6236, + "step": 1170 + }, + { + "ce_loss_13": 3.3173227190971373, + "ce_loss_17": 3.2320764780044557, + "ce_loss_2": 4.566565918922424, + "ce_loss_4": 4.085048425197601, + "ce_loss_9": 3.5555443286895754, + "epoch": 0.118, + "grad_norm": 932.0, + "kl_loss_13": 173.66377639770508, + "kl_loss_2": 2721.5857788085937, + "kl_loss_4": 1805.4396362304688, + "kl_loss_9": 691.9438415527344, + "learning_rate": 0.0009709221818197624, + "loss": 1341.7314, + "step": 1180 + }, + { + "ce_loss_13": 3.3532497763633726, + "ce_loss_17": 3.2697224617004395, + "ce_loss_2": 4.613708448410034, + "ce_loss_4": 4.130217206478119, + "ce_loss_9": 3.5915398240089416, + "epoch": 0.119, + "grad_norm": 972.0, + "kl_loss_13": 166.0831672668457, + "kl_loss_2": 2753.8083984375, + "kl_loss_4": 1816.0059326171875, + "kl_loss_9": 684.0197967529297, + "learning_rate": 0.0009703866145003512, + "loss": 1352.1104, + "step": 1190 + }, + { + "ce_loss_13": 3.324125325679779, + "ce_loss_17": 3.243944191932678, + "ce_loss_2": 4.555847883224487, + "ce_loss_4": 4.080176639556885, + "ce_loss_9": 3.5500615358352663, + "epoch": 0.12, + "grad_norm": 976.0, + "kl_loss_13": 157.88376083374024, + "kl_loss_2": 2723.2943725585938, + "kl_loss_4": 1783.8689331054688, + "kl_loss_9": 672.6443267822266, + "learning_rate": 0.0009698463103929542, + "loss": 1358.7401, + "step": 1200 + }, + { + "ce_loss_13": 3.2819153308868407, + "ce_loss_17": 3.2019794940948487, + "ce_loss_2": 4.548604822158813, + "ce_loss_4": 4.0686638951301575, + "ce_loss_9": 3.527483081817627, + "epoch": 0.121, + "grad_norm": 836.0, + "kl_loss_13": 160.8688537597656, + "kl_loss_2": 2733.4061889648438, + "kl_loss_4": 1807.9644775390625, + "kl_loss_9": 682.4588287353515, + "learning_rate": 0.0009693012749384279, + "loss": 1358.3497, + "step": 1210 + }, + { + "ce_loss_13": 3.2975502490997313, + "ce_loss_17": 3.216256296634674, + "ce_loss_2": 4.530639266967773, + "ce_loss_4": 4.056679689884186, + "ce_loss_9": 3.5306329250335695, + "epoch": 0.122, + "grad_norm": 824.0, + "kl_loss_13": 161.37709732055663, + "kl_loss_2": 2705.5943969726563, + "kl_loss_4": 1780.7206604003907, + "kl_loss_9": 677.3824829101562, + "learning_rate": 0.0009687515136252732, + "loss": 1326.2443, + "step": 1220 + }, + { + "ce_loss_13": 3.258976364135742, + "ce_loss_17": 3.176534342765808, + "ce_loss_2": 4.556908535957336, + "ce_loss_4": 4.05363359451294, + "ce_loss_9": 3.5072725892066954, + "epoch": 0.123, + "grad_norm": 1104.0, + "kl_loss_13": 165.05678100585936, + "kl_loss_2": 2846.551965332031, + "kl_loss_4": 1866.6253051757812, + "kl_loss_9": 720.0403137207031, + "learning_rate": 0.0009681970319895803, + "loss": 1415.01, + "step": 1230 + }, + { + "ce_loss_13": 3.3342087149620054, + "ce_loss_17": 3.2581273674964906, + "ce_loss_2": 4.563027334213257, + "ce_loss_4": 4.0809567213058475, + "ce_loss_9": 3.5791317224502563, + "epoch": 0.124, + "grad_norm": 792.0, + "kl_loss_13": 164.79975357055665, + "kl_loss_2": 2695.2207641601562, + "kl_loss_4": 1762.9876098632812, + "kl_loss_9": 691.129638671875, + "learning_rate": 0.0009676378356149733, + "loss": 1331.3775, + "step": 1240 + }, + { + "ce_loss_13": 3.3004573583602905, + "ce_loss_17": 3.223739218711853, + "ce_loss_2": 4.515417742729187, + "ce_loss_4": 4.037710547447205, + "ce_loss_9": 3.527457320690155, + "epoch": 0.125, + "grad_norm": 1096.0, + "kl_loss_13": 154.71745758056642, + "kl_loss_2": 2666.19541015625, + "kl_loss_4": 1728.707696533203, + "kl_loss_9": 661.0793060302734, + "learning_rate": 0.0009670739301325534, + "loss": 1313.0102, + "step": 1250 + }, + { + "ce_loss_13": 3.2672061204910277, + "ce_loss_17": 3.1891830563545227, + "ce_loss_2": 4.484476375579834, + "ce_loss_4": 4.007357966899872, + "ce_loss_9": 3.4989005446434023, + "epoch": 0.126, + "grad_norm": 960.0, + "kl_loss_13": 158.11394271850585, + "kl_loss_2": 2653.94892578125, + "kl_loss_4": 1748.0544006347657, + "kl_loss_9": 671.8057220458984, + "learning_rate": 0.0009665053212208426, + "loss": 1329.394, + "step": 1260 + }, + { + "ce_loss_13": 3.3069382429122927, + "ce_loss_17": 3.2263609528541566, + "ce_loss_2": 4.549402809143066, + "ce_loss_4": 4.061746668815613, + "ce_loss_9": 3.5399792313575746, + "epoch": 0.127, + "grad_norm": 924.0, + "kl_loss_13": 160.6331443786621, + "kl_loss_2": 2713.835803222656, + "kl_loss_4": 1769.5395935058593, + "kl_loss_9": 671.4346527099609, + "learning_rate": 0.0009659320146057262, + "loss": 1331.5624, + "step": 1270 + }, + { + "ce_loss_13": 3.314958393573761, + "ce_loss_17": 3.234841597080231, + "ce_loss_2": 4.527111458778381, + "ce_loss_4": 4.0574010014534, + "ce_loss_9": 3.541261637210846, + "epoch": 0.128, + "grad_norm": 996.0, + "kl_loss_13": 156.1156005859375, + "kl_loss_2": 2662.183142089844, + "kl_loss_4": 1742.0220031738281, + "kl_loss_9": 651.1027404785157, + "learning_rate": 0.0009653540160603955, + "loss": 1308.4551, + "step": 1280 + }, + { + "ce_loss_13": 3.314372646808624, + "ce_loss_17": 3.2411246299743652, + "ce_loss_2": 4.520395636558533, + "ce_loss_4": 4.049313855171204, + "ce_loss_9": 3.535482621192932, + "epoch": 0.129, + "grad_norm": 1120.0, + "kl_loss_13": 157.87554626464845, + "kl_loss_2": 2660.3510864257814, + "kl_loss_4": 1747.1935180664063, + "kl_loss_9": 655.3367767333984, + "learning_rate": 0.0009647713314052896, + "loss": 1297.9022, + "step": 1290 + }, + { + "ce_loss_13": 3.2681477665901184, + "ce_loss_17": 3.185636782646179, + "ce_loss_2": 4.550763845443726, + "ce_loss_4": 4.05369781255722, + "ce_loss_9": 3.5122714757919313, + "epoch": 0.13, + "grad_norm": 824.0, + "kl_loss_13": 159.5022834777832, + "kl_loss_2": 2801.4686279296875, + "kl_loss_4": 1818.6080932617188, + "kl_loss_9": 685.4676544189454, + "learning_rate": 0.0009641839665080363, + "loss": 1352.5339, + "step": 1300 + }, + { + "ce_loss_13": 3.236352801322937, + "ce_loss_17": 3.159665215015411, + "ce_loss_2": 4.4887169122695925, + "ce_loss_4": 3.984550249576569, + "ce_loss_9": 3.4696427822113036, + "epoch": 0.131, + "grad_norm": 904.0, + "kl_loss_13": 154.50375671386718, + "kl_loss_2": 2721.4118774414064, + "kl_loss_4": 1756.6728759765624, + "kl_loss_9": 662.9141052246093, + "learning_rate": 0.0009635919272833937, + "loss": 1308.5488, + "step": 1310 + }, + { + "ce_loss_13": 3.2685189843177795, + "ce_loss_17": 3.184953343868256, + "ce_loss_2": 4.51920166015625, + "ce_loss_4": 4.023778927326203, + "ce_loss_9": 3.5157355427742005, + "epoch": 0.132, + "grad_norm": 1032.0, + "kl_loss_13": 159.60601959228515, + "kl_loss_2": 2706.0281494140627, + "kl_loss_4": 1751.5549682617188, + "kl_loss_9": 676.8560546875, + "learning_rate": 0.0009629952196931902, + "loss": 1299.0363, + "step": 1320 + }, + { + "ce_loss_13": 3.2563462615013123, + "ce_loss_17": 3.176827883720398, + "ce_loss_2": 4.495701432228088, + "ce_loss_4": 3.996892488002777, + "ce_loss_9": 3.489270102977753, + "epoch": 0.133, + "grad_norm": 876.0, + "kl_loss_13": 161.70468292236328, + "kl_loss_2": 2721.5709838867188, + "kl_loss_4": 1755.954852294922, + "kl_loss_9": 673.1398193359375, + "learning_rate": 0.0009623938497462645, + "loss": 1317.0791, + "step": 1330 + }, + { + "ce_loss_13": 3.248842144012451, + "ce_loss_17": 3.1695939660072328, + "ce_loss_2": 4.477500224113465, + "ce_loss_4": 3.9987229704856873, + "ce_loss_9": 3.475855028629303, + "epoch": 0.134, + "grad_norm": 956.0, + "kl_loss_13": 162.60150070190429, + "kl_loss_2": 2691.308349609375, + "kl_loss_4": 1755.7607482910157, + "kl_loss_9": 663.9021514892578, + "learning_rate": 0.0009617878234984055, + "loss": 1325.2939, + "step": 1340 + }, + { + "ce_loss_13": 3.334787106513977, + "ce_loss_17": 3.2601780772209166, + "ce_loss_2": 4.515818619728089, + "ce_loss_4": 4.041862988471985, + "ce_loss_9": 3.5549457550048826, + "epoch": 0.135, + "grad_norm": 884.0, + "kl_loss_13": 153.6476921081543, + "kl_loss_2": 2586.7006225585938, + "kl_loss_4": 1671.9819458007812, + "kl_loss_9": 630.8726684570313, + "learning_rate": 0.0009611771470522907, + "loss": 1284.6025, + "step": 1350 + }, + { + "ce_loss_13": 3.268973970413208, + "ce_loss_17": 3.1878760933876036, + "ce_loss_2": 4.501631140708923, + "ce_loss_4": 4.00111049413681, + "ce_loss_9": 3.4961073637008666, + "epoch": 0.136, + "grad_norm": 892.0, + "kl_loss_13": 155.69283447265624, + "kl_loss_2": 2670.2950927734373, + "kl_loss_4": 1709.4631469726562, + "kl_loss_9": 640.3249725341797, + "learning_rate": 0.0009605618265574251, + "loss": 1284.2874, + "step": 1360 + }, + { + "ce_loss_13": 3.239262568950653, + "ce_loss_17": 3.156244766712189, + "ce_loss_2": 4.492224717140198, + "ce_loss_4": 4.01040803194046, + "ce_loss_9": 3.468993043899536, + "epoch": 0.137, + "grad_norm": 892.0, + "kl_loss_13": 169.14001235961913, + "kl_loss_2": 2755.6871948242188, + "kl_loss_4": 1800.2430358886718, + "kl_loss_9": 674.57099609375, + "learning_rate": 0.0009599418682100792, + "loss": 1327.1396, + "step": 1370 + }, + { + "ce_loss_13": 3.2737831711769103, + "ce_loss_17": 3.195842683315277, + "ce_loss_2": 4.506568765640258, + "ce_loss_4": 4.0143644452095035, + "ce_loss_9": 3.4978524327278135, + "epoch": 0.138, + "grad_norm": 1144.0, + "kl_loss_13": 155.1969223022461, + "kl_loss_2": 2706.093786621094, + "kl_loss_4": 1750.3873718261718, + "kl_loss_9": 649.1048980712891, + "learning_rate": 0.0009593172782532268, + "loss": 1312.8533, + "step": 1380 + }, + { + "ce_loss_13": 3.3074227452278135, + "ce_loss_17": 3.2301692008972167, + "ce_loss_2": 4.5188212394714355, + "ce_loss_4": 4.035277736186981, + "ce_loss_9": 3.528612470626831, + "epoch": 0.139, + "grad_norm": 1160.0, + "kl_loss_13": 160.52422866821288, + "kl_loss_2": 2634.7707641601564, + "kl_loss_4": 1711.2152038574218, + "kl_loss_9": 640.6424102783203, + "learning_rate": 0.0009586880629764817, + "loss": 1290.3635, + "step": 1390 + }, + { + "ce_loss_13": 3.25186802148819, + "ce_loss_17": 3.1639440059661865, + "ce_loss_2": 4.4671852588653564, + "ce_loss_4": 3.9952181696891786, + "ce_loss_9": 3.47138888835907, + "epoch": 0.14, + "grad_norm": 884.0, + "kl_loss_13": 182.30699768066407, + "kl_loss_2": 2673.157568359375, + "kl_loss_4": 1751.9988037109374, + "kl_loss_9": 648.4319122314453, + "learning_rate": 0.0009580542287160348, + "loss": 1292.1773, + "step": 1400 + }, + { + "ce_loss_13": 3.2298731088638304, + "ce_loss_17": 3.129335343837738, + "ce_loss_2": 4.433214735984802, + "ce_loss_4": 3.94100421667099, + "ce_loss_9": 3.430233919620514, + "epoch": 0.141, + "grad_norm": 904.0, + "kl_loss_13": 192.17306213378907, + "kl_loss_2": 2684.3740966796877, + "kl_loss_4": 1718.7549194335938, + "kl_loss_9": 643.3223846435546, + "learning_rate": 0.0009574157818545901, + "loss": 1293.1211, + "step": 1410 + }, + { + "ce_loss_13": 3.2841809034347533, + "ce_loss_17": 3.2023167252540587, + "ce_loss_2": 4.470522499084472, + "ce_loss_4": 3.9958879351615906, + "ce_loss_9": 3.4895517945289614, + "epoch": 0.142, + "grad_norm": 804.0, + "kl_loss_13": 161.15379180908204, + "kl_loss_2": 2598.9580078125, + "kl_loss_4": 1681.3669860839843, + "kl_loss_9": 619.222900390625, + "learning_rate": 0.0009567727288213005, + "loss": 1294.5631, + "step": 1420 + }, + { + "ce_loss_13": 3.259111058712006, + "ce_loss_17": 3.1785897850990295, + "ce_loss_2": 4.492042422294617, + "ce_loss_4": 3.989784133434296, + "ce_loss_9": 3.4896228432655336, + "epoch": 0.143, + "grad_norm": 828.0, + "kl_loss_13": 163.7379463195801, + "kl_loss_2": 2714.502978515625, + "kl_loss_4": 1737.2164306640625, + "kl_loss_9": 669.247412109375, + "learning_rate": 0.0009561250760917027, + "loss": 1299.8157, + "step": 1430 + }, + { + "ce_loss_13": 3.2723585844039915, + "ce_loss_17": 3.1944003820419313, + "ce_loss_2": 4.4894095182418825, + "ce_loss_4": 3.9981072783470153, + "ce_loss_9": 3.4940529227256776, + "epoch": 0.144, + "grad_norm": 912.0, + "kl_loss_13": 161.85647048950196, + "kl_loss_2": 2689.2710693359377, + "kl_loss_4": 1732.9358032226562, + "kl_loss_9": 660.5617492675781, + "learning_rate": 0.0009554728301876525, + "loss": 1280.2966, + "step": 1440 + }, + { + "ce_loss_13": 3.31703804731369, + "ce_loss_17": 3.238239276409149, + "ce_loss_2": 4.515483808517456, + "ce_loss_4": 4.032298040390015, + "ce_loss_9": 3.5482900619506834, + "epoch": 0.145, + "grad_norm": 840.0, + "kl_loss_13": 159.5889747619629, + "kl_loss_2": 2633.941540527344, + "kl_loss_4": 1696.529229736328, + "kl_loss_9": 666.495004272461, + "learning_rate": 0.0009548159976772592, + "loss": 1325.2045, + "step": 1450 + }, + { + "ce_loss_13": 3.2723333716392515, + "ce_loss_17": 3.192622888088226, + "ce_loss_2": 4.509625649452209, + "ce_loss_4": 4.016653835773468, + "ce_loss_9": 3.502726447582245, + "epoch": 0.146, + "grad_norm": 860.0, + "kl_loss_13": 158.18171234130858, + "kl_loss_2": 2716.143273925781, + "kl_loss_4": 1762.5269836425782, + "kl_loss_9": 668.3198791503906, + "learning_rate": 0.0009541545851748186, + "loss": 1307.9629, + "step": 1460 + }, + { + "ce_loss_13": 3.1400001287460326, + "ce_loss_17": 3.0636285066604616, + "ce_loss_2": 4.407558798789978, + "ce_loss_4": 3.9112504363059997, + "ce_loss_9": 3.3685781836509703, + "epoch": 0.147, + "grad_norm": 892.0, + "kl_loss_13": 153.8387252807617, + "kl_loss_2": 2744.586047363281, + "kl_loss_4": 1781.9750915527343, + "kl_loss_9": 653.3158966064453, + "learning_rate": 0.0009534885993407473, + "loss": 1313.7582, + "step": 1470 + }, + { + "ce_loss_13": 3.2966676592826842, + "ce_loss_17": 3.2204318284988402, + "ce_loss_2": 4.519659852981567, + "ce_loss_4": 4.05392040014267, + "ce_loss_9": 3.5160779476165773, + "epoch": 0.148, + "grad_norm": 1016.0, + "kl_loss_13": 150.34495544433594, + "kl_loss_2": 2669.7378662109377, + "kl_loss_4": 1767.374462890625, + "kl_loss_9": 633.6015106201172, + "learning_rate": 0.0009528180468815154, + "loss": 1304.4729, + "step": 1480 + }, + { + "ce_loss_13": 3.350235092639923, + "ce_loss_17": 3.2761547923088075, + "ce_loss_2": 4.5362279415130615, + "ce_loss_4": 4.072121036052704, + "ce_loss_9": 3.5690738201141357, + "epoch": 0.149, + "grad_norm": 828.0, + "kl_loss_13": 151.07743339538575, + "kl_loss_2": 2612.098779296875, + "kl_loss_4": 1699.6807739257813, + "kl_loss_9": 637.027377319336, + "learning_rate": 0.0009521429345495787, + "loss": 1281.2876, + "step": 1490 + }, + { + "ce_loss_13": 3.324703276157379, + "ce_loss_17": 3.252811884880066, + "ce_loss_2": 4.505655813217163, + "ce_loss_4": 4.033749973773956, + "ce_loss_9": 3.538119316101074, + "epoch": 0.15, + "grad_norm": 984.0, + "kl_loss_13": 145.84724960327148, + "kl_loss_2": 2598.4200805664063, + "kl_loss_4": 1695.347430419922, + "kl_loss_9": 625.0196838378906, + "learning_rate": 0.0009514632691433108, + "loss": 1280.8635, + "step": 1500 + }, + { + "ce_loss_13": 3.2936044573783874, + "ce_loss_17": 3.2173104524612426, + "ce_loss_2": 4.505963945388794, + "ce_loss_4": 4.024936449527741, + "ce_loss_9": 3.522586441040039, + "epoch": 0.151, + "grad_norm": 1208.0, + "kl_loss_13": 151.73327865600587, + "kl_loss_2": 2654.6450927734377, + "kl_loss_4": 1714.04609375, + "kl_loss_9": 654.2632720947265, + "learning_rate": 0.0009507790575069346, + "loss": 1291.8244, + "step": 1510 + }, + { + "ce_loss_13": 3.261947202682495, + "ce_loss_17": 3.1803197503089904, + "ce_loss_2": 4.497062540054321, + "ce_loss_4": 4.009624814987182, + "ce_loss_9": 3.4927632212638855, + "epoch": 0.152, + "grad_norm": 988.0, + "kl_loss_13": 155.47834243774415, + "kl_loss_2": 2693.4311279296876, + "kl_loss_4": 1737.5451049804688, + "kl_loss_9": 667.654345703125, + "learning_rate": 0.0009500903065304539, + "loss": 1319.2768, + "step": 1520 + }, + { + "ce_loss_13": 3.301459753513336, + "ce_loss_17": 3.2291717171669005, + "ce_loss_2": 4.479698586463928, + "ce_loss_4": 4.004973948001862, + "ce_loss_9": 3.514641726016998, + "epoch": 0.153, + "grad_norm": 924.0, + "kl_loss_13": 146.84607620239257, + "kl_loss_2": 2573.41953125, + "kl_loss_4": 1653.3306945800782, + "kl_loss_9": 617.8248107910156, + "learning_rate": 0.0009493970231495835, + "loss": 1272.3006, + "step": 1530 + }, + { + "ce_loss_13": 3.2442415833473204, + "ce_loss_17": 3.1767807841300963, + "ce_loss_2": 4.408775794506073, + "ce_loss_4": 3.94643098115921, + "ce_loss_9": 3.4519525527954102, + "epoch": 0.154, + "grad_norm": 964.0, + "kl_loss_13": 144.3672275543213, + "kl_loss_2": 2583.994738769531, + "kl_loss_4": 1672.2036499023438, + "kl_loss_9": 617.1100189208985, + "learning_rate": 0.0009486992143456792, + "loss": 1255.7061, + "step": 1540 + }, + { + "ce_loss_13": 3.2666288018226624, + "ce_loss_17": 3.1871628880500795, + "ce_loss_2": 4.5429561853408815, + "ce_loss_4": 4.044189441204071, + "ce_loss_9": 3.5003819704055785, + "epoch": 0.155, + "grad_norm": 824.0, + "kl_loss_13": 156.8553424835205, + "kl_loss_2": 2785.3518310546874, + "kl_loss_4": 1804.0568359375, + "kl_loss_9": 668.8144226074219, + "learning_rate": 0.0009479968871456679, + "loss": 1310.7047, + "step": 1550 + }, + { + "ce_loss_13": 3.2396668672561644, + "ce_loss_17": 3.1666937947273253, + "ce_loss_2": 4.470914816856384, + "ce_loss_4": 3.9746352672576903, + "ce_loss_9": 3.4597315788269043, + "epoch": 0.156, + "grad_norm": 868.0, + "kl_loss_13": 149.41605339050292, + "kl_loss_2": 2710.2673828125, + "kl_loss_4": 1731.6584838867188, + "kl_loss_9": 639.4033935546875, + "learning_rate": 0.0009472900486219768, + "loss": 1277.1369, + "step": 1560 + }, + { + "ce_loss_13": 3.2267670154571535, + "ce_loss_17": 3.154045653343201, + "ce_loss_2": 4.423221015930176, + "ce_loss_4": 3.9451833248138426, + "ce_loss_9": 3.446901488304138, + "epoch": 0.157, + "grad_norm": 940.0, + "kl_loss_13": 146.73355407714843, + "kl_loss_2": 2626.4529663085937, + "kl_loss_4": 1697.2255493164062, + "kl_loss_9": 627.6230621337891, + "learning_rate": 0.000946578705892462, + "loss": 1273.7516, + "step": 1570 + }, + { + "ce_loss_13": 3.2652224779129027, + "ce_loss_17": 3.190036964416504, + "ce_loss_2": 4.441090035438537, + "ce_loss_4": 3.9603715419769285, + "ce_loss_9": 3.47905410528183, + "epoch": 0.158, + "grad_norm": 1184.0, + "kl_loss_13": 144.6514778137207, + "kl_loss_2": 2569.5556396484376, + "kl_loss_4": 1636.7543090820313, + "kl_loss_9": 615.1616607666016, + "learning_rate": 0.0009458628661203367, + "loss": 1264.8164, + "step": 1580 + }, + { + "ce_loss_13": 3.269646632671356, + "ce_loss_17": 3.196610224246979, + "ce_loss_2": 4.511783123016357, + "ce_loss_4": 4.012009418010711, + "ce_loss_9": 3.4952786922454835, + "epoch": 0.159, + "grad_norm": 928.0, + "kl_loss_13": 148.32306594848632, + "kl_loss_2": 2726.9982299804688, + "kl_loss_4": 1744.6489196777343, + "kl_loss_9": 641.6262603759766, + "learning_rate": 0.0009451425365140996, + "loss": 1267.4883, + "step": 1590 + }, + { + "ce_loss_13": 3.3440261363983153, + "ce_loss_17": 3.267223620414734, + "ce_loss_2": 4.50254716873169, + "ce_loss_4": 4.032317209243774, + "ce_loss_9": 3.554012417793274, + "epoch": 0.16, + "grad_norm": 988.0, + "kl_loss_13": 154.36162834167482, + "kl_loss_2": 2557.0782104492187, + "kl_loss_4": 1631.2450927734376, + "kl_loss_9": 615.0782852172852, + "learning_rate": 0.0009444177243274617, + "loss": 1240.2352, + "step": 1600 + }, + { + "ce_loss_13": 3.2070243835449217, + "ce_loss_17": 3.127614879608154, + "ce_loss_2": 4.428898012638092, + "ce_loss_4": 3.9259282469749452, + "ce_loss_9": 3.4343663930892943, + "epoch": 0.161, + "grad_norm": 1024.0, + "kl_loss_13": 160.67943954467773, + "kl_loss_2": 2679.0925048828126, + "kl_loss_4": 1709.7305786132813, + "kl_loss_9": 656.5942565917969, + "learning_rate": 0.0009436884368592739, + "loss": 1287.4846, + "step": 1610 + }, + { + "ce_loss_13": 3.2549012541770934, + "ce_loss_17": 3.178916025161743, + "ce_loss_2": 4.430709767341614, + "ce_loss_4": 3.9556885719299317, + "ce_loss_9": 3.478045654296875, + "epoch": 0.162, + "grad_norm": 840.0, + "kl_loss_13": 156.13519668579102, + "kl_loss_2": 2589.6864379882813, + "kl_loss_4": 1662.0882995605468, + "kl_loss_9": 635.3385131835937, + "learning_rate": 0.0009429546814534529, + "loss": 1284.9152, + "step": 1620 + }, + { + "ce_loss_13": 3.2557201266288756, + "ce_loss_17": 3.185758590698242, + "ce_loss_2": 4.449909734725952, + "ce_loss_4": 3.96008243560791, + "ce_loss_9": 3.465940499305725, + "epoch": 0.163, + "grad_norm": 884.0, + "kl_loss_13": 152.8242042541504, + "kl_loss_2": 2580.221484375, + "kl_loss_4": 1644.5145141601563, + "kl_loss_9": 623.5310699462891, + "learning_rate": 0.0009422164654989072, + "loss": 1239.4715, + "step": 1630 + }, + { + "ce_loss_13": 3.375183272361755, + "ce_loss_17": 3.296482276916504, + "ce_loss_2": 4.52595956325531, + "ce_loss_4": 4.056667053699494, + "ce_loss_9": 3.5789090275764464, + "epoch": 0.164, + "grad_norm": 880.0, + "kl_loss_13": 158.7288345336914, + "kl_loss_2": 2557.338903808594, + "kl_loss_4": 1642.3708923339843, + "kl_loss_9": 621.2283813476563, + "learning_rate": 0.0009414737964294635, + "loss": 1252.1454, + "step": 1640 + }, + { + "ce_loss_13": 3.3032506585121153, + "ce_loss_17": 3.234329044818878, + "ce_loss_2": 4.434148597717285, + "ce_loss_4": 3.977667534351349, + "ce_loss_9": 3.5031858801841738, + "epoch": 0.165, + "grad_norm": 892.0, + "kl_loss_13": 150.3079933166504, + "kl_loss_2": 2484.353466796875, + "kl_loss_4": 1595.0523559570313, + "kl_loss_9": 596.2776641845703, + "learning_rate": 0.000940726681723791, + "loss": 1246.2515, + "step": 1650 + }, + { + "ce_loss_13": 3.1487066388130187, + "ce_loss_17": 3.074619722366333, + "ce_loss_2": 4.387822198867798, + "ce_loss_4": 3.8858991622924806, + "ce_loss_9": 3.374624800682068, + "epoch": 0.166, + "grad_norm": 944.0, + "kl_loss_13": 150.79924697875975, + "kl_loss_2": 2704.8510620117186, + "kl_loss_4": 1723.6966552734375, + "kl_loss_9": 640.962646484375, + "learning_rate": 0.0009399751289053266, + "loss": 1252.8116, + "step": 1660 + }, + { + "ce_loss_13": 3.359143912792206, + "ce_loss_17": 3.284330594539642, + "ce_loss_2": 4.515512728691101, + "ce_loss_4": 4.049841487407685, + "ce_loss_9": 3.5694025635719298, + "epoch": 0.167, + "grad_norm": 1032.0, + "kl_loss_13": 151.4821464538574, + "kl_loss_2": 2562.1604125976564, + "kl_loss_4": 1653.0664184570312, + "kl_loss_9": 618.9189239501953, + "learning_rate": 0.0009392191455421988, + "loss": 1265.362, + "step": 1670 + }, + { + "ce_loss_13": 3.3327678442001343, + "ce_loss_17": 3.2565733790397644, + "ce_loss_2": 4.489892220497131, + "ce_loss_4": 4.014864361286163, + "ce_loss_9": 3.5410717844963076, + "epoch": 0.168, + "grad_norm": 1008.0, + "kl_loss_13": 153.88434982299805, + "kl_loss_2": 2578.5636596679688, + "kl_loss_4": 1655.9082092285157, + "kl_loss_9": 633.3927032470704, + "learning_rate": 0.0009384587392471515, + "loss": 1230.7436, + "step": 1680 + }, + { + "ce_loss_13": 3.316524589061737, + "ce_loss_17": 3.248715603351593, + "ce_loss_2": 4.4522337198257445, + "ce_loss_4": 3.999102163314819, + "ce_loss_9": 3.5262418508529665, + "epoch": 0.169, + "grad_norm": 924.0, + "kl_loss_13": 143.4655334472656, + "kl_loss_2": 2500.4628295898438, + "kl_loss_4": 1616.7253845214843, + "kl_loss_9": 605.6902252197266, + "learning_rate": 0.0009376939176774678, + "loss": 1222.1738, + "step": 1690 + }, + { + "ce_loss_13": 3.2950225949287413, + "ce_loss_17": 3.2230908036231996, + "ce_loss_2": 4.459612393379212, + "ce_loss_4": 3.997720181941986, + "ce_loss_9": 3.5059776425361635, + "epoch": 0.17, + "grad_norm": 744.0, + "kl_loss_13": 143.40423965454102, + "kl_loss_2": 2556.2604370117188, + "kl_loss_4": 1654.5242248535155, + "kl_loss_9": 617.8040802001954, + "learning_rate": 0.0009369246885348925, + "loss": 1262.6093, + "step": 1700 + }, + { + "ce_loss_13": 3.279132306575775, + "ce_loss_17": 3.2082658648490905, + "ce_loss_2": 4.4909937381744385, + "ce_loss_4": 4.014976680278778, + "ce_loss_9": 3.5044809341430665, + "epoch": 0.171, + "grad_norm": 1104.0, + "kl_loss_13": 147.30735778808594, + "kl_loss_2": 2649.2698608398437, + "kl_loss_4": 1717.6260559082032, + "kl_loss_9": 631.2106109619141, + "learning_rate": 0.0009361510595655545, + "loss": 1272.481, + "step": 1710 + }, + { + "ce_loss_13": 3.2431801676750185, + "ce_loss_17": 3.165341079235077, + "ce_loss_2": 4.419277501106262, + "ce_loss_4": 3.960156488418579, + "ce_loss_9": 3.4647254824638365, + "epoch": 0.172, + "grad_norm": 940.0, + "kl_loss_13": 151.63495941162108, + "kl_loss_2": 2606.44619140625, + "kl_loss_4": 1705.5336364746095, + "kl_loss_9": 636.0044860839844, + "learning_rate": 0.0009353730385598887, + "loss": 1266.2547, + "step": 1720 + }, + { + "ce_loss_13": 3.1767956018447876, + "ce_loss_17": 3.1028839588165282, + "ce_loss_2": 4.408117437362671, + "ce_loss_4": 3.9155510663986206, + "ce_loss_9": 3.3929005742073057, + "epoch": 0.173, + "grad_norm": 988.0, + "kl_loss_13": 147.00774688720702, + "kl_loss_2": 2672.864794921875, + "kl_loss_4": 1727.2372497558595, + "kl_loss_9": 630.8589263916016, + "learning_rate": 0.0009345906333525581, + "loss": 1291.5501, + "step": 1730 + }, + { + "ce_loss_13": 3.214318132400513, + "ce_loss_17": 3.1406612753868104, + "ce_loss_2": 4.412691330909729, + "ce_loss_4": 3.9299787878990173, + "ce_loss_9": 3.426638603210449, + "epoch": 0.174, + "grad_norm": 1472.0, + "kl_loss_13": 151.24997787475587, + "kl_loss_2": 2639.600634765625, + "kl_loss_4": 1701.865478515625, + "kl_loss_9": 628.9751953125, + "learning_rate": 0.0009338038518223745, + "loss": 1259.1267, + "step": 1740 + }, + { + "ce_loss_13": 3.2820828437805174, + "ce_loss_17": 3.2032444000244142, + "ce_loss_2": 4.473877859115601, + "ce_loss_4": 4.006321442127228, + "ce_loss_9": 3.512243223190308, + "epoch": 0.175, + "grad_norm": 944.0, + "kl_loss_13": 153.81817169189452, + "kl_loss_2": 2639.3710205078123, + "kl_loss_4": 1714.692694091797, + "kl_loss_9": 659.9329132080078, + "learning_rate": 0.0009330127018922195, + "loss": 1303.7902, + "step": 1750 + }, + { + "ce_loss_13": 3.23981317281723, + "ce_loss_17": 3.163079798221588, + "ce_loss_2": 4.431985330581665, + "ce_loss_4": 3.9461536049842834, + "ce_loss_9": 3.4676100730896, + "epoch": 0.176, + "grad_norm": 1112.0, + "kl_loss_13": 152.84890441894532, + "kl_loss_2": 2631.400524902344, + "kl_loss_4": 1687.3205871582031, + "kl_loss_9": 653.9106567382812, + "learning_rate": 0.0009322171915289634, + "loss": 1274.0447, + "step": 1760 + }, + { + "ce_loss_13": 3.270676612854004, + "ce_loss_17": 3.2016090750694275, + "ce_loss_2": 4.420488786697388, + "ce_loss_4": 3.955690586566925, + "ce_loss_9": 3.485394608974457, + "epoch": 0.177, + "grad_norm": 972.0, + "kl_loss_13": 151.00484085083008, + "kl_loss_2": 2558.2109252929686, + "kl_loss_4": 1654.1532043457032, + "kl_loss_9": 646.6074737548828, + "learning_rate": 0.0009314173287433873, + "loss": 1237.9533, + "step": 1770 + }, + { + "ce_loss_13": 3.260296106338501, + "ce_loss_17": 3.1839903831481933, + "ce_loss_2": 4.421822357177734, + "ce_loss_4": 3.9567521691322325, + "ce_loss_9": 3.47633159160614, + "epoch": 0.178, + "grad_norm": 992.0, + "kl_loss_13": 153.52974052429198, + "kl_loss_2": 2578.7755493164063, + "kl_loss_4": 1671.700665283203, + "kl_loss_9": 637.5532928466797, + "learning_rate": 0.0009306131215901003, + "loss": 1240.1828, + "step": 1780 + }, + { + "ce_loss_13": 3.2812577724456786, + "ce_loss_17": 3.209964430332184, + "ce_loss_2": 4.448306345939637, + "ce_loss_4": 3.9763675212860106, + "ce_loss_9": 3.500848615169525, + "epoch": 0.179, + "grad_norm": 1144.0, + "kl_loss_13": 147.72571258544923, + "kl_loss_2": 2553.325158691406, + "kl_loss_4": 1639.2069213867187, + "kl_loss_9": 640.635400390625, + "learning_rate": 0.0009298045781674596, + "loss": 1223.2294, + "step": 1790 + }, + { + "ce_loss_13": 3.270671546459198, + "ce_loss_17": 3.1968597292900087, + "ce_loss_2": 4.410528564453125, + "ce_loss_4": 3.9516660451889036, + "ce_loss_9": 3.488964307308197, + "epoch": 0.18, + "grad_norm": 1072.0, + "kl_loss_13": 142.7226249694824, + "kl_loss_2": 2515.3703857421874, + "kl_loss_4": 1620.968682861328, + "kl_loss_9": 621.6914978027344, + "learning_rate": 0.0009289917066174886, + "loss": 1239.7604, + "step": 1800 + }, + { + "ce_loss_13": 3.2641263127326967, + "ce_loss_17": 3.198021447658539, + "ce_loss_2": 4.380536890029907, + "ce_loss_4": 3.9220643997192384, + "ce_loss_9": 3.4712405681610106, + "epoch": 0.181, + "grad_norm": 868.0, + "kl_loss_13": 136.7609718322754, + "kl_loss_2": 2474.2447143554687, + "kl_loss_4": 1572.1568298339844, + "kl_loss_9": 597.8594604492188, + "learning_rate": 0.0009281745151257945, + "loss": 1208.2396, + "step": 1810 + }, + { + "ce_loss_13": 3.2798272252082823, + "ce_loss_17": 3.210568928718567, + "ce_loss_2": 4.454569697380066, + "ce_loss_4": 3.9785515785217287, + "ce_loss_9": 3.4902180314064024, + "epoch": 0.182, + "grad_norm": 1012.0, + "kl_loss_13": 141.46335830688477, + "kl_loss_2": 2562.048779296875, + "kl_loss_4": 1640.0156555175781, + "kl_loss_9": 610.2751922607422, + "learning_rate": 0.0009273530119214868, + "loss": 1243.6753, + "step": 1820 + }, + { + "ce_loss_13": 3.379050946235657, + "ce_loss_17": 3.310153138637543, + "ce_loss_2": 4.524109482765198, + "ce_loss_4": 4.065473437309265, + "ce_loss_9": 3.5805925130844116, + "epoch": 0.183, + "grad_norm": 904.0, + "kl_loss_13": 143.37997245788574, + "kl_loss_2": 2527.283251953125, + "kl_loss_4": 1631.6089599609375, + "kl_loss_9": 599.2062957763671, + "learning_rate": 0.0009265272052770935, + "loss": 1211.5092, + "step": 1830 + }, + { + "ce_loss_13": 3.2130213618278503, + "ce_loss_17": 3.1356865882873537, + "ce_loss_2": 4.401892566680909, + "ce_loss_4": 3.909571850299835, + "ce_loss_9": 3.422063076496124, + "epoch": 0.184, + "grad_norm": 1216.0, + "kl_loss_13": 149.91977462768554, + "kl_loss_2": 2594.2158081054686, + "kl_loss_4": 1647.5353393554688, + "kl_loss_9": 609.2092956542969, + "learning_rate": 0.0009256971035084784, + "loss": 1247.5318, + "step": 1840 + }, + { + "ce_loss_13": 3.151016342639923, + "ce_loss_17": 3.069221758842468, + "ce_loss_2": 4.369063758850098, + "ce_loss_4": 3.890806245803833, + "ce_loss_9": 3.3755425930023195, + "epoch": 0.185, + "grad_norm": 1192.0, + "kl_loss_13": 166.76636657714843, + "kl_loss_2": 2660.570690917969, + "kl_loss_4": 1727.7162719726562, + "kl_loss_9": 638.5365356445312, + "learning_rate": 0.0009248627149747573, + "loss": 1270.7006, + "step": 1850 + }, + { + "ce_loss_13": 3.3493673324584963, + "ce_loss_17": 3.274794840812683, + "ce_loss_2": 4.484880018234253, + "ce_loss_4": 4.026814413070679, + "ce_loss_9": 3.5509119510650633, + "epoch": 0.186, + "grad_norm": 1136.0, + "kl_loss_13": 167.58827209472656, + "kl_loss_2": 2532.6851440429687, + "kl_loss_4": 1635.8290649414062, + "kl_loss_9": 611.3373352050781, + "learning_rate": 0.0009240240480782129, + "loss": 1236.7268, + "step": 1860 + }, + { + "ce_loss_13": 3.264767789840698, + "ce_loss_17": 3.1734949231147764, + "ce_loss_2": 4.425526809692383, + "ce_loss_4": 3.9566202402114867, + "ce_loss_9": 3.4576178908348085, + "epoch": 0.187, + "grad_norm": 1072.0, + "kl_loss_13": 173.45805969238282, + "kl_loss_2": 2588.659460449219, + "kl_loss_4": 1679.1102478027344, + "kl_loss_9": 615.0454467773437, + "learning_rate": 0.0009231811112642122, + "loss": 1241.8299, + "step": 1870 + }, + { + "ce_loss_13": 3.297986900806427, + "ce_loss_17": 3.218840992450714, + "ce_loss_2": 4.413665437698365, + "ce_loss_4": 3.9500675559043885, + "ce_loss_9": 3.489453911781311, + "epoch": 0.188, + "grad_norm": 948.0, + "kl_loss_13": 171.95526275634765, + "kl_loss_2": 2479.3824951171873, + "kl_loss_4": 1586.8702880859375, + "kl_loss_9": 596.3064392089843, + "learning_rate": 0.0009223339130211192, + "loss": 1214.5687, + "step": 1880 + }, + { + "ce_loss_13": 3.165955650806427, + "ce_loss_17": 3.084008884429932, + "ce_loss_2": 4.335282790660858, + "ce_loss_4": 3.8536491274833677, + "ce_loss_9": 3.3592833638191224, + "epoch": 0.189, + "grad_norm": 936.0, + "kl_loss_13": 162.82597732543945, + "kl_loss_2": 2602.5996948242187, + "kl_loss_4": 1661.4218444824219, + "kl_loss_9": 592.1233367919922, + "learning_rate": 0.0009214824618802108, + "loss": 1245.9766, + "step": 1890 + }, + { + "ce_loss_13": 3.3376641511917113, + "ce_loss_17": 3.2568175435066222, + "ce_loss_2": 4.481165957450867, + "ce_loss_4": 4.0071622967720035, + "ce_loss_9": 3.530212700366974, + "epoch": 0.19, + "grad_norm": 852.0, + "kl_loss_13": 159.12362823486328, + "kl_loss_2": 2518.6216674804687, + "kl_loss_4": 1602.5659118652343, + "kl_loss_9": 596.4594696044921, + "learning_rate": 0.0009206267664155906, + "loss": 1251.5332, + "step": 1900 + }, + { + "ce_loss_13": 3.2547051429748537, + "ce_loss_17": 3.179506766796112, + "ce_loss_2": 4.42237286567688, + "ce_loss_4": 3.9463531970977783, + "ce_loss_9": 3.4603018283843996, + "epoch": 0.191, + "grad_norm": 1160.0, + "kl_loss_13": 153.24628219604492, + "kl_loss_2": 2553.2120849609373, + "kl_loss_4": 1626.7147033691406, + "kl_loss_9": 593.5009826660156, + "learning_rate": 0.0009197668352441024, + "loss": 1234.8078, + "step": 1910 + }, + { + "ce_loss_13": 3.3072059988975524, + "ce_loss_17": 3.2319939017295836, + "ce_loss_2": 4.4455688238143924, + "ce_loss_4": 3.980021631717682, + "ce_loss_9": 3.498464751243591, + "epoch": 0.192, + "grad_norm": 924.0, + "kl_loss_13": 147.35777130126954, + "kl_loss_2": 2514.983435058594, + "kl_loss_4": 1612.3255859375, + "kl_loss_9": 592.7270309448243, + "learning_rate": 0.0009189026770252437, + "loss": 1227.5839, + "step": 1920 + }, + { + "ce_loss_13": 3.328125202655792, + "ce_loss_17": 3.255943310260773, + "ce_loss_2": 4.472360634803772, + "ce_loss_4": 4.001090836524964, + "ce_loss_9": 3.5319607257843018, + "epoch": 0.193, + "grad_norm": 1168.0, + "kl_loss_13": 145.3461006164551, + "kl_loss_2": 2519.930877685547, + "kl_loss_4": 1600.8451416015625, + "kl_loss_9": 600.2095947265625, + "learning_rate": 0.000918034300461078, + "loss": 1262.618, + "step": 1930 + }, + { + "ce_loss_13": 3.3509397506713867, + "ce_loss_17": 3.281086301803589, + "ce_loss_2": 4.482430100440979, + "ce_loss_4": 4.022635567188263, + "ce_loss_9": 3.5541141390800477, + "epoch": 0.194, + "grad_norm": 1128.0, + "kl_loss_13": 145.15637893676757, + "kl_loss_2": 2490.844104003906, + "kl_loss_4": 1599.6368896484375, + "kl_loss_9": 593.7297210693359, + "learning_rate": 0.0009171617142961477, + "loss": 1217.3812, + "step": 1940 + }, + { + "ce_loss_13": 3.3146053671836855, + "ce_loss_17": 3.246134078502655, + "ce_loss_2": 4.455230975151062, + "ce_loss_4": 3.991355764865875, + "ce_loss_9": 3.522234392166138, + "epoch": 0.195, + "grad_norm": 1104.0, + "kl_loss_13": 141.99313468933104, + "kl_loss_2": 2512.3952026367188, + "kl_loss_4": 1600.815899658203, + "kl_loss_9": 601.8870651245118, + "learning_rate": 0.0009162849273173857, + "loss": 1219.1488, + "step": 1950 + }, + { + "ce_loss_13": 3.257956290245056, + "ce_loss_17": 3.1889537930488587, + "ce_loss_2": 4.404689359664917, + "ce_loss_4": 3.9324918746948243, + "ce_loss_9": 3.467221164703369, + "epoch": 0.196, + "grad_norm": 856.0, + "kl_loss_13": 139.4066120147705, + "kl_loss_2": 2505.5807495117188, + "kl_loss_4": 1597.2408508300782, + "kl_loss_9": 605.9314147949219, + "learning_rate": 0.0009154039483540273, + "loss": 1223.8816, + "step": 1960 + }, + { + "ce_loss_13": 3.241231632232666, + "ce_loss_17": 3.167975401878357, + "ce_loss_2": 4.3952877759933475, + "ce_loss_4": 3.915211498737335, + "ce_loss_9": 3.444790947437286, + "epoch": 0.197, + "grad_norm": 1192.0, + "kl_loss_13": 143.28952445983887, + "kl_loss_2": 2549.246545410156, + "kl_loss_4": 1615.576397705078, + "kl_loss_9": 618.2861907958984, + "learning_rate": 0.0009145187862775209, + "loss": 1227.8156, + "step": 1970 + }, + { + "ce_loss_13": 3.269066858291626, + "ce_loss_17": 3.200450074672699, + "ce_loss_2": 4.408350932598114, + "ce_loss_4": 3.9542667508125304, + "ce_loss_9": 3.4886820316314697, + "epoch": 0.198, + "grad_norm": 1020.0, + "kl_loss_13": 140.2093235015869, + "kl_loss_2": 2509.5830017089843, + "kl_loss_4": 1631.0263305664062, + "kl_loss_9": 641.3100524902344, + "learning_rate": 0.0009136294500014386, + "loss": 1221.0155, + "step": 1980 + }, + { + "ce_loss_13": 3.2199265122413636, + "ce_loss_17": 3.153044319152832, + "ce_loss_2": 4.4184041023254395, + "ce_loss_4": 3.9345576047897337, + "ce_loss_9": 3.46233047246933, + "epoch": 0.199, + "grad_norm": 1224.0, + "kl_loss_13": 140.62947616577148, + "kl_loss_2": 2600.795263671875, + "kl_loss_4": 1660.025408935547, + "kl_loss_9": 656.784765625, + "learning_rate": 0.000912735948481387, + "loss": 1256.957, + "step": 1990 + }, + { + "ce_loss_13": 3.2496906518936157, + "ce_loss_17": 3.18221800327301, + "ce_loss_2": 4.392653834819794, + "ce_loss_4": 3.9263997673988342, + "ce_loss_9": 3.4921125411987304, + "epoch": 0.2, + "grad_norm": 1176.0, + "kl_loss_13": 141.75217552185057, + "kl_loss_2": 2529.630029296875, + "kl_loss_4": 1621.6295349121094, + "kl_loss_9": 681.4832122802734, + "learning_rate": 0.0009118382907149164, + "loss": 1223.4629, + "step": 2000 + }, + { + "ce_loss_13": 3.2776938676834106, + "ce_loss_17": 3.2082446813583374, + "ce_loss_2": 4.412687301635742, + "ce_loss_4": 3.9448086619377136, + "ce_loss_9": 3.4980456709861754, + "epoch": 0.201, + "grad_norm": 1072.0, + "kl_loss_13": 140.21098251342772, + "kl_loss_2": 2482.6479248046876, + "kl_loss_4": 1586.4408630371095, + "kl_loss_9": 646.6394714355469, + "learning_rate": 0.0009109364857414306, + "loss": 1209.3566, + "step": 2010 + }, + { + "ce_loss_13": 3.2410451889038088, + "ce_loss_17": 3.1722410321235657, + "ce_loss_2": 4.38575508594513, + "ce_loss_4": 3.906363785266876, + "ce_loss_9": 3.4609405279159544, + "epoch": 0.202, + "grad_norm": 1080.0, + "kl_loss_13": 140.61730346679687, + "kl_loss_2": 2527.1364990234374, + "kl_loss_4": 1596.0163818359374, + "kl_loss_9": 624.1560577392578, + "learning_rate": 0.0009100305426420956, + "loss": 1245.0102, + "step": 2020 + }, + { + "ce_loss_13": 3.209837460517883, + "ce_loss_17": 3.143941330909729, + "ce_loss_2": 4.409683322906494, + "ce_loss_4": 3.9142362475395203, + "ce_loss_9": 3.4257458448410034, + "epoch": 0.203, + "grad_norm": 1184.0, + "kl_loss_13": 140.07508544921876, + "kl_loss_2": 2628.2286010742187, + "kl_loss_4": 1659.7788818359375, + "kl_loss_9": 621.1918701171875, + "learning_rate": 0.0009091204705397484, + "loss": 1233.0754, + "step": 2030 + }, + { + "ce_loss_13": 3.198666572570801, + "ce_loss_17": 3.129299855232239, + "ce_loss_2": 4.389280295372009, + "ce_loss_4": 3.9088759541511537, + "ce_loss_9": 3.417533981800079, + "epoch": 0.204, + "grad_norm": 1120.0, + "kl_loss_13": 144.9146884918213, + "kl_loss_2": 2623.4737548828125, + "kl_loss_4": 1681.3126037597656, + "kl_loss_9": 624.0909301757813, + "learning_rate": 0.0009082062785988049, + "loss": 1247.2415, + "step": 2040 + }, + { + "ce_loss_13": 3.3349857687950135, + "ce_loss_17": 3.2643967032432557, + "ce_loss_2": 4.436873483657837, + "ce_loss_4": 3.985110604763031, + "ce_loss_9": 3.53349769115448, + "epoch": 0.205, + "grad_norm": 960.0, + "kl_loss_13": 147.40721588134767, + "kl_loss_2": 2469.269909667969, + "kl_loss_4": 1584.0609619140625, + "kl_loss_9": 606.1272705078125, + "learning_rate": 0.0009072879760251679, + "loss": 1214.3277, + "step": 2050 + }, + { + "ce_loss_13": 3.286811077594757, + "ce_loss_17": 3.2115185260772705, + "ce_loss_2": 4.449003720283509, + "ce_loss_4": 3.9762887239456175, + "ce_loss_9": 3.4917128682136536, + "epoch": 0.206, + "grad_norm": 1272.0, + "kl_loss_13": 154.22257385253906, + "kl_loss_2": 2584.590954589844, + "kl_loss_4": 1650.4645263671875, + "kl_loss_9": 606.7064636230468, + "learning_rate": 0.0009063655720661341, + "loss": 1231.1944, + "step": 2060 + }, + { + "ce_loss_13": 3.3282309651374815, + "ce_loss_17": 3.2498881340026857, + "ce_loss_2": 4.434006929397583, + "ce_loss_4": 3.9711023330688477, + "ce_loss_9": 3.5193397760391236, + "epoch": 0.207, + "grad_norm": 860.0, + "kl_loss_13": 157.21303634643556, + "kl_loss_2": 2459.270617675781, + "kl_loss_4": 1560.081494140625, + "kl_loss_9": 589.4546905517578, + "learning_rate": 0.000905439076010301, + "loss": 1204.0477, + "step": 2070 + }, + { + "ce_loss_13": 3.285494565963745, + "ce_loss_17": 3.208006715774536, + "ce_loss_2": 4.432467269897461, + "ce_loss_4": 3.9654005408287047, + "ce_loss_9": 3.4814179062843325, + "epoch": 0.208, + "grad_norm": 820.0, + "kl_loss_13": 160.8441955566406, + "kl_loss_2": 2530.497900390625, + "kl_loss_4": 1613.6675842285156, + "kl_loss_9": 598.4929595947266, + "learning_rate": 0.0009045084971874737, + "loss": 1201.1896, + "step": 2080 + }, + { + "ce_loss_13": 3.2653284668922424, + "ce_loss_17": 3.18992565870285, + "ce_loss_2": 4.398504960536957, + "ce_loss_4": 3.935706281661987, + "ce_loss_9": 3.4602928161621094, + "epoch": 0.209, + "grad_norm": 964.0, + "kl_loss_13": 152.57452011108398, + "kl_loss_2": 2503.6657470703126, + "kl_loss_4": 1603.4722412109375, + "kl_loss_9": 593.3274749755859, + "learning_rate": 0.0009035738449685707, + "loss": 1230.3663, + "step": 2090 + }, + { + "ce_loss_13": 3.1988245964050295, + "ce_loss_17": 3.127147340774536, + "ce_loss_2": 4.399014830589294, + "ce_loss_4": 3.9075266480445863, + "ce_loss_9": 3.413822150230408, + "epoch": 0.21, + "grad_norm": 916.0, + "kl_loss_13": 148.84494552612304, + "kl_loss_2": 2621.2723876953123, + "kl_loss_4": 1667.533056640625, + "kl_loss_9": 614.9874298095704, + "learning_rate": 0.0009026351287655293, + "loss": 1225.7484, + "step": 2100 + }, + { + "ce_loss_13": 3.3957253456115724, + "ce_loss_17": 3.3281114220619203, + "ce_loss_2": 4.453698325157165, + "ce_loss_4": 4.012873351573944, + "ce_loss_9": 3.5795400142669678, + "epoch": 0.211, + "grad_norm": 1012.0, + "kl_loss_13": 140.5195255279541, + "kl_loss_2": 2368.7103637695313, + "kl_loss_4": 1508.3682067871093, + "kl_loss_9": 568.7417984008789, + "learning_rate": 0.0009016923580312113, + "loss": 1157.5861, + "step": 2110 + }, + { + "ce_loss_13": 3.2586395263671877, + "ce_loss_17": 3.186605167388916, + "ce_loss_2": 4.379410004615783, + "ce_loss_4": 3.9112540006637575, + "ce_loss_9": 3.448279118537903, + "epoch": 0.212, + "grad_norm": 1012.0, + "kl_loss_13": 154.1221652984619, + "kl_loss_2": 2477.111474609375, + "kl_loss_4": 1570.415576171875, + "kl_loss_9": 582.9285919189454, + "learning_rate": 0.0009007455422593077, + "loss": 1220.6526, + "step": 2120 + }, + { + "ce_loss_13": 3.2766650199890135, + "ce_loss_17": 3.196649408340454, + "ce_loss_2": 4.43287091255188, + "ce_loss_4": 3.954304039478302, + "ce_loss_9": 3.472559940814972, + "epoch": 0.213, + "grad_norm": 1184.0, + "kl_loss_13": 163.31368865966797, + "kl_loss_2": 2570.570166015625, + "kl_loss_4": 1636.387518310547, + "kl_loss_9": 606.7230697631836, + "learning_rate": 0.0008997946909842425, + "loss": 1233.015, + "step": 2130 + }, + { + "ce_loss_13": 3.3035802364349367, + "ce_loss_17": 3.2092464685440065, + "ce_loss_2": 4.486771178245545, + "ce_loss_4": 3.9918869614601133, + "ce_loss_9": 3.4921900033950806, + "epoch": 0.214, + "grad_norm": 1128.0, + "kl_loss_13": 185.57998046875, + "kl_loss_2": 2637.256787109375, + "kl_loss_4": 1685.0900451660157, + "kl_loss_9": 610.6810531616211, + "learning_rate": 0.0008988398137810777, + "loss": 1231.4818, + "step": 2140 + }, + { + "ce_loss_13": 3.3379700899124147, + "ce_loss_17": 3.2492310643196105, + "ce_loss_2": 4.428948855400085, + "ce_loss_4": 3.974040997028351, + "ce_loss_9": 3.512007546424866, + "epoch": 0.215, + "grad_norm": 1192.0, + "kl_loss_13": 190.50006713867188, + "kl_loss_2": 2450.3519653320313, + "kl_loss_4": 1565.778125, + "kl_loss_9": 579.5673553466797, + "learning_rate": 0.0008978809202654162, + "loss": 1191.3629, + "step": 2150 + }, + { + "ce_loss_13": 3.3076714396476747, + "ce_loss_17": 3.2240748047828673, + "ce_loss_2": 4.41434075832367, + "ce_loss_4": 3.9549734950065614, + "ce_loss_9": 3.492073678970337, + "epoch": 0.216, + "grad_norm": 956.0, + "kl_loss_13": 170.8991584777832, + "kl_loss_2": 2459.4666137695312, + "kl_loss_4": 1562.9693237304687, + "kl_loss_9": 582.1671112060546, + "learning_rate": 0.0008969180200933046, + "loss": 1210.172, + "step": 2160 + }, + { + "ce_loss_13": 3.268995833396912, + "ce_loss_17": 3.1834474444389342, + "ce_loss_2": 4.429706573486328, + "ce_loss_4": 3.954736924171448, + "ce_loss_9": 3.4683465003967284, + "epoch": 0.217, + "grad_norm": 1112.0, + "kl_loss_13": 163.76161499023436, + "kl_loss_2": 2545.57666015625, + "kl_loss_4": 1630.1313415527343, + "kl_loss_9": 603.1894729614257, + "learning_rate": 0.0008959511229611376, + "loss": 1235.0549, + "step": 2170 + }, + { + "ce_loss_13": 3.326635754108429, + "ce_loss_17": 3.256324803829193, + "ce_loss_2": 4.460869574546814, + "ce_loss_4": 3.9991163849830627, + "ce_loss_9": 3.525161361694336, + "epoch": 0.218, + "grad_norm": 1080.0, + "kl_loss_13": 148.4689163208008, + "kl_loss_2": 2504.493054199219, + "kl_loss_4": 1595.0758544921875, + "kl_loss_9": 596.0176544189453, + "learning_rate": 0.0008949802386055581, + "loss": 1208.4097, + "step": 2180 + }, + { + "ce_loss_13": 3.20139422416687, + "ce_loss_17": 3.1285878658294677, + "ce_loss_2": 4.337322962284088, + "ce_loss_4": 3.864535081386566, + "ce_loss_9": 3.3996766924858095, + "epoch": 0.219, + "grad_norm": 1416.0, + "kl_loss_13": 145.49535522460937, + "kl_loss_2": 2490.6876220703125, + "kl_loss_4": 1570.988262939453, + "kl_loss_9": 585.5628662109375, + "learning_rate": 0.0008940053768033609, + "loss": 1226.1381, + "step": 2190 + }, + { + "ce_loss_13": 3.280579400062561, + "ce_loss_17": 3.212918746471405, + "ce_loss_2": 4.398000907897949, + "ce_loss_4": 3.9295204520225524, + "ce_loss_9": 3.47022488117218, + "epoch": 0.22, + "grad_norm": 908.0, + "kl_loss_13": 140.25034561157227, + "kl_loss_2": 2488.719580078125, + "kl_loss_4": 1579.2732360839843, + "kl_loss_9": 585.6130462646485, + "learning_rate": 0.0008930265473713938, + "loss": 1196.576, + "step": 2200 + }, + { + "ce_loss_13": 3.2431564569473266, + "ce_loss_17": 3.1729213118553163, + "ce_loss_2": 4.3770240306854244, + "ce_loss_4": 3.90290253162384, + "ce_loss_9": 3.4383442282676695, + "epoch": 0.221, + "grad_norm": 984.0, + "kl_loss_13": 140.4324893951416, + "kl_loss_2": 2491.2975830078126, + "kl_loss_4": 1576.57021484375, + "kl_loss_9": 580.3924621582031, + "learning_rate": 0.0008920437601664579, + "loss": 1177.101, + "step": 2210 + }, + { + "ce_loss_13": 3.2332396388053892, + "ce_loss_17": 3.1653484225273134, + "ce_loss_2": 4.3728479743003845, + "ce_loss_4": 3.904981243610382, + "ce_loss_9": 3.436262035369873, + "epoch": 0.222, + "grad_norm": 896.0, + "kl_loss_13": 141.83710021972655, + "kl_loss_2": 2508.085656738281, + "kl_loss_4": 1598.729022216797, + "kl_loss_9": 592.4270904541015, + "learning_rate": 0.0008910570250852097, + "loss": 1186.8498, + "step": 2220 + }, + { + "ce_loss_13": 3.3349429249763487, + "ce_loss_17": 3.2675106525421143, + "ce_loss_2": 4.423606061935425, + "ce_loss_4": 3.9550350427627565, + "ce_loss_9": 3.5237053275108337, + "epoch": 0.223, + "grad_norm": 1440.0, + "kl_loss_13": 134.6342559814453, + "kl_loss_2": 2419.2567138671875, + "kl_loss_4": 1513.9413269042968, + "kl_loss_9": 566.5887939453125, + "learning_rate": 0.0008900663520640604, + "loss": 1162.1932, + "step": 2230 + }, + { + "ce_loss_13": 3.2831790685653686, + "ce_loss_17": 3.2146912574768067, + "ce_loss_2": 4.419791650772095, + "ce_loss_4": 3.940990316867828, + "ce_loss_9": 3.4771832704544066, + "epoch": 0.224, + "grad_norm": 976.0, + "kl_loss_13": 138.53964538574218, + "kl_loss_2": 2504.874865722656, + "kl_loss_4": 1576.1966918945313, + "kl_loss_9": 578.0481307983398, + "learning_rate": 0.0008890717510790764, + "loss": 1197.7674, + "step": 2240 + }, + { + "ce_loss_13": 3.2418623566627502, + "ce_loss_17": 3.1793636083602905, + "ce_loss_2": 4.392471671104431, + "ce_loss_4": 3.9172643303871153, + "ce_loss_9": 3.4362416744232176, + "epoch": 0.225, + "grad_norm": 932.0, + "kl_loss_13": 135.30275535583496, + "kl_loss_2": 2528.780310058594, + "kl_loss_4": 1594.2550170898437, + "kl_loss_9": 578.043684387207, + "learning_rate": 0.0008880732321458784, + "loss": 1212.4878, + "step": 2250 + }, + { + "ce_loss_13": 3.274475073814392, + "ce_loss_17": 3.2092744588851927, + "ce_loss_2": 4.398310565948487, + "ce_loss_4": 3.9279236435890197, + "ce_loss_9": 3.473265254497528, + "epoch": 0.226, + "grad_norm": 1200.0, + "kl_loss_13": 135.65610885620117, + "kl_loss_2": 2467.481652832031, + "kl_loss_4": 1556.7044067382812, + "kl_loss_9": 575.7236206054688, + "learning_rate": 0.0008870708053195413, + "loss": 1202.7795, + "step": 2260 + }, + { + "ce_loss_13": 3.2998771905899047, + "ce_loss_17": 3.234087347984314, + "ce_loss_2": 4.401855528354645, + "ce_loss_4": 3.9331700205802917, + "ce_loss_9": 3.4851217746734617, + "epoch": 0.227, + "grad_norm": 1200.0, + "kl_loss_13": 131.30461769104005, + "kl_loss_2": 2441.0129150390626, + "kl_loss_4": 1538.52021484375, + "kl_loss_9": 559.587986755371, + "learning_rate": 0.0008860644806944918, + "loss": 1175.3111, + "step": 2270 + }, + { + "ce_loss_13": 3.2397942543029785, + "ce_loss_17": 3.172461485862732, + "ce_loss_2": 4.388543558120728, + "ce_loss_4": 3.917333114147186, + "ce_loss_9": 3.439756453037262, + "epoch": 0.228, + "grad_norm": 1016.0, + "kl_loss_13": 137.34641036987304, + "kl_loss_2": 2513.0678100585938, + "kl_loss_4": 1591.2624389648438, + "kl_loss_9": 586.0183334350586, + "learning_rate": 0.0008850542684044079, + "loss": 1175.4074, + "step": 2280 + }, + { + "ce_loss_13": 3.20133718252182, + "ce_loss_17": 3.131630563735962, + "ce_loss_2": 4.396698522567749, + "ce_loss_4": 3.90509717464447, + "ce_loss_9": 3.4141934633255007, + "epoch": 0.229, + "grad_norm": 952.0, + "kl_loss_13": 140.61213226318358, + "kl_loss_2": 2615.365441894531, + "kl_loss_4": 1657.7905517578124, + "kl_loss_9": 603.1532775878907, + "learning_rate": 0.0008840401786221159, + "loss": 1213.8922, + "step": 2290 + }, + { + "ce_loss_13": 3.3449569225311278, + "ce_loss_17": 3.2826855182647705, + "ce_loss_2": 4.4417311429977415, + "ce_loss_4": 3.9899259328842165, + "ce_loss_9": 3.5327144861221313, + "epoch": 0.23, + "grad_norm": 864.0, + "kl_loss_13": 129.97677192687988, + "kl_loss_2": 2409.9514038085936, + "kl_loss_4": 1534.512384033203, + "kl_loss_9": 560.1477752685547, + "learning_rate": 0.000883022221559489, + "loss": 1158.9904, + "step": 2300 + }, + { + "ce_loss_13": 3.3053571224212646, + "ce_loss_17": 3.24102623462677, + "ce_loss_2": 4.431423902511597, + "ce_loss_4": 3.965811789035797, + "ce_loss_9": 3.4955753922462462, + "epoch": 0.231, + "grad_norm": 1120.0, + "kl_loss_13": 135.00216674804688, + "kl_loss_2": 2491.0085327148436, + "kl_loss_4": 1596.5063415527343, + "kl_loss_9": 576.8563293457031, + "learning_rate": 0.0008820004074673434, + "loss": 1227.5027, + "step": 2310 + }, + { + "ce_loss_13": 3.2144625067710875, + "ce_loss_17": 3.1500751256942747, + "ce_loss_2": 4.342784404754639, + "ce_loss_4": 3.8802725195884706, + "ce_loss_9": 3.4125901579856874, + "epoch": 0.232, + "grad_norm": 860.0, + "kl_loss_13": 131.8394027709961, + "kl_loss_2": 2496.114782714844, + "kl_loss_4": 1592.5800720214843, + "kl_loss_9": 576.9932022094727, + "learning_rate": 0.0008809747466353355, + "loss": 1179.7933, + "step": 2320 + }, + { + "ce_loss_13": 3.213177800178528, + "ce_loss_17": 3.1493353247642517, + "ce_loss_2": 4.356963562965393, + "ce_loss_4": 3.890561556816101, + "ce_loss_9": 3.4122796773910524, + "epoch": 0.233, + "grad_norm": 808.0, + "kl_loss_13": 131.68353385925292, + "kl_loss_2": 2507.774658203125, + "kl_loss_4": 1589.7752380371094, + "kl_loss_9": 567.858529663086, + "learning_rate": 0.0008799452493918585, + "loss": 1198.9062, + "step": 2330 + }, + { + "ce_loss_13": 3.2997725367546082, + "ce_loss_17": 3.2361501812934876, + "ce_loss_2": 4.406631648540497, + "ce_loss_4": 3.960523009300232, + "ce_loss_9": 3.49422527551651, + "epoch": 0.234, + "grad_norm": 1032.0, + "kl_loss_13": 131.5958095550537, + "kl_loss_2": 2451.2851684570314, + "kl_loss_4": 1578.8598388671876, + "kl_loss_9": 574.4567260742188, + "learning_rate": 0.0008789119261039385, + "loss": 1218.0458, + "step": 2340 + }, + { + "ce_loss_13": 3.2091623902320863, + "ce_loss_17": 3.141668677330017, + "ce_loss_2": 4.340531837940216, + "ce_loss_4": 3.8796759128570555, + "ce_loss_9": 3.408684492111206, + "epoch": 0.235, + "grad_norm": 972.0, + "kl_loss_13": 131.84058380126953, + "kl_loss_2": 2481.1201782226562, + "kl_loss_4": 1581.7587524414062, + "kl_loss_9": 576.1520477294922, + "learning_rate": 0.0008778747871771292, + "loss": 1176.685, + "step": 2350 + }, + { + "ce_loss_13": 3.263255071640015, + "ce_loss_17": 3.2017985820770263, + "ce_loss_2": 4.3496256351470945, + "ce_loss_4": 3.9020894527435304, + "ce_loss_9": 3.457408106327057, + "epoch": 0.236, + "grad_norm": 1088.0, + "kl_loss_13": 128.3655590057373, + "kl_loss_2": 2410.7779296875, + "kl_loss_4": 1538.0171142578124, + "kl_loss_9": 559.2697998046875, + "learning_rate": 0.0008768338430554083, + "loss": 1155.0783, + "step": 2360 + }, + { + "ce_loss_13": 3.268634593486786, + "ce_loss_17": 3.2049444556236266, + "ce_loss_2": 4.389615321159363, + "ce_loss_4": 3.930896818637848, + "ce_loss_9": 3.464911186695099, + "epoch": 0.237, + "grad_norm": 1144.0, + "kl_loss_13": 135.14081954956055, + "kl_loss_2": 2443.4658203125, + "kl_loss_4": 1548.9470458984374, + "kl_loss_9": 571.5383850097656, + "learning_rate": 0.0008757891042210713, + "loss": 1186.7279, + "step": 2370 + }, + { + "ce_loss_13": 3.2885490536689757, + "ce_loss_17": 3.2233977794647215, + "ce_loss_2": 4.395193243026734, + "ce_loss_4": 3.9391024827957155, + "ce_loss_9": 3.479727268218994, + "epoch": 0.238, + "grad_norm": 984.0, + "kl_loss_13": 133.54755859375, + "kl_loss_2": 2429.9510131835937, + "kl_loss_4": 1538.015069580078, + "kl_loss_9": 565.2484375, + "learning_rate": 0.0008747405811946271, + "loss": 1172.1912, + "step": 2380 + }, + { + "ce_loss_13": 3.190498685836792, + "ce_loss_17": 3.124911105632782, + "ce_loss_2": 4.366696166992187, + "ce_loss_4": 3.8726316332817077, + "ce_loss_9": 3.3945070028305055, + "epoch": 0.239, + "grad_norm": 916.0, + "kl_loss_13": 134.80825805664062, + "kl_loss_2": 2572.027197265625, + "kl_loss_4": 1606.4017944335938, + "kl_loss_9": 586.3279479980469, + "learning_rate": 0.0008736882845346905, + "loss": 1178.1937, + "step": 2390 + }, + { + "ce_loss_13": 3.280126166343689, + "ce_loss_17": 3.2125991106033327, + "ce_loss_2": 4.4094849824905396, + "ce_loss_4": 3.928516459465027, + "ce_loss_9": 3.4774782538414, + "epoch": 0.24, + "grad_norm": 1072.0, + "kl_loss_13": 137.51787872314452, + "kl_loss_2": 2463.948455810547, + "kl_loss_4": 1531.1963623046875, + "kl_loss_9": 574.06279296875, + "learning_rate": 0.0008726322248378774, + "loss": 1171.1249, + "step": 2400 + }, + { + "ce_loss_13": 3.2805550336837768, + "ce_loss_17": 3.2155802249908447, + "ce_loss_2": 4.433675289154053, + "ce_loss_4": 3.9535003542900085, + "ce_loss_9": 3.4735806703567507, + "epoch": 0.241, + "grad_norm": 904.0, + "kl_loss_13": 130.11072120666503, + "kl_loss_2": 2547.6666625976563, + "kl_loss_4": 1603.959765625, + "kl_loss_9": 572.6860931396484, + "learning_rate": 0.0008715724127386971, + "loss": 1217.5129, + "step": 2410 + }, + { + "ce_loss_13": 3.3479207158088684, + "ce_loss_17": 3.2853101015090944, + "ce_loss_2": 4.442001628875732, + "ce_loss_4": 3.976998782157898, + "ce_loss_9": 3.539716875553131, + "epoch": 0.242, + "grad_norm": 1032.0, + "kl_loss_13": 129.5887435913086, + "kl_loss_2": 2428.3344970703124, + "kl_loss_4": 1519.6899719238281, + "kl_loss_9": 562.5454528808593, + "learning_rate": 0.0008705088589094458, + "loss": 1175.8918, + "step": 2420 + }, + { + "ce_loss_13": 3.3562265396118165, + "ce_loss_17": 3.296085810661316, + "ce_loss_2": 4.469344305992126, + "ce_loss_4": 4.004327380657196, + "ce_loss_9": 3.554501843452454, + "epoch": 0.243, + "grad_norm": 940.0, + "kl_loss_13": 128.9242343902588, + "kl_loss_2": 2458.7217651367187, + "kl_loss_4": 1540.0459411621093, + "kl_loss_9": 581.9041610717774, + "learning_rate": 0.0008694415740600988, + "loss": 1187.2236, + "step": 2430 + }, + { + "ce_loss_13": 3.218381130695343, + "ce_loss_17": 3.154261577129364, + "ce_loss_2": 4.3708965420722965, + "ce_loss_4": 3.900407862663269, + "ce_loss_9": 3.421273422241211, + "epoch": 0.244, + "grad_norm": 1672.0, + "kl_loss_13": 132.84064598083495, + "kl_loss_2": 2530.9083740234373, + "kl_loss_4": 1613.1113159179688, + "kl_loss_9": 590.0507904052735, + "learning_rate": 0.0008683705689382025, + "loss": 1195.7924, + "step": 2440 + }, + { + "ce_loss_13": 3.3007038712501524, + "ce_loss_17": 3.2386343359947203, + "ce_loss_2": 4.3927103042602536, + "ce_loss_4": 3.928513526916504, + "ce_loss_9": 3.4920523405075072, + "epoch": 0.245, + "grad_norm": 1080.0, + "kl_loss_13": 130.68333930969237, + "kl_loss_2": 2419.5972900390625, + "kl_loss_4": 1511.9669555664063, + "kl_loss_9": 565.5081161499023, + "learning_rate": 0.0008672958543287666, + "loss": 1185.5471, + "step": 2450 + }, + { + "ce_loss_13": 3.308526337146759, + "ce_loss_17": 3.245667350292206, + "ce_loss_2": 4.384062385559082, + "ce_loss_4": 3.9374301552772524, + "ce_loss_9": 3.5022360563278196, + "epoch": 0.246, + "grad_norm": 1080.0, + "kl_loss_13": 130.61483840942384, + "kl_loss_2": 2387.4382934570312, + "kl_loss_4": 1513.6611328125, + "kl_loss_9": 565.4847808837891, + "learning_rate": 0.0008662174410541554, + "loss": 1154.4808, + "step": 2460 + }, + { + "ce_loss_13": 3.2732586145401, + "ce_loss_17": 3.2107366919517517, + "ce_loss_2": 4.360881757736206, + "ce_loss_4": 3.897175967693329, + "ce_loss_9": 3.461659276485443, + "epoch": 0.247, + "grad_norm": 1072.0, + "kl_loss_13": 129.1229190826416, + "kl_loss_2": 2402.814074707031, + "kl_loss_4": 1502.0176147460938, + "kl_loss_9": 559.5021774291993, + "learning_rate": 0.0008651353399739787, + "loss": 1182.4203, + "step": 2470 + }, + { + "ce_loss_13": 3.303418982028961, + "ce_loss_17": 3.2402849435806274, + "ce_loss_2": 4.404477429389954, + "ce_loss_4": 3.9484421014785767, + "ce_loss_9": 3.491190695762634, + "epoch": 0.248, + "grad_norm": 1000.0, + "kl_loss_13": 133.1383140563965, + "kl_loss_2": 2427.0049560546877, + "kl_loss_4": 1530.7884887695313, + "kl_loss_9": 565.21064453125, + "learning_rate": 0.0008640495619849821, + "loss": 1168.6572, + "step": 2480 + }, + { + "ce_loss_13": 3.29063161611557, + "ce_loss_17": 3.1960522294044496, + "ce_loss_2": 4.357745480537415, + "ce_loss_4": 3.886666476726532, + "ce_loss_9": 3.4485520005226133, + "epoch": 0.249, + "grad_norm": 1000.0, + "kl_loss_13": 186.1656692504883, + "kl_loss_2": 2435.9569580078123, + "kl_loss_4": 1512.9344177246094, + "kl_loss_9": 563.9576126098633, + "learning_rate": 0.0008629601180209381, + "loss": 1173.1512, + "step": 2490 + }, + { + "ce_loss_13": 3.2811018109321592, + "ce_loss_17": 3.1954628348350527, + "ce_loss_2": 4.351963686943054, + "ce_loss_4": 3.888296663761139, + "ce_loss_9": 3.4419282197952272, + "epoch": 0.25, + "grad_norm": 988.0, + "kl_loss_13": 184.60716857910157, + "kl_loss_2": 2395.45419921875, + "kl_loss_4": 1499.74111328125, + "kl_loss_9": 551.6833145141602, + "learning_rate": 0.000861867019052535, + "loss": 1182.3609, + "step": 2500 + }, + { + "ce_loss_13": 3.1939276576042177, + "ce_loss_17": 3.112457644939423, + "ce_loss_2": 4.327351713180542, + "ce_loss_4": 3.841133236885071, + "ce_loss_9": 3.373430919647217, + "epoch": 0.251, + "grad_norm": 1176.0, + "kl_loss_13": 165.7787857055664, + "kl_loss_2": 2511.038366699219, + "kl_loss_4": 1565.4980102539062, + "kl_loss_9": 567.4776931762696, + "learning_rate": 0.0008607702760872678, + "loss": 1202.1095, + "step": 2510 + }, + { + "ce_loss_13": 3.290536093711853, + "ce_loss_17": 3.2185993432998656, + "ce_loss_2": 4.359192597866058, + "ce_loss_4": 3.9113946914672852, + "ce_loss_9": 3.4659884929656983, + "epoch": 0.252, + "grad_norm": 1656.0, + "kl_loss_13": 149.08386306762696, + "kl_loss_2": 2361.8217895507814, + "kl_loss_4": 1496.411474609375, + "kl_loss_9": 545.7954162597656, + "learning_rate": 0.0008596699001693256, + "loss": 1173.9983, + "step": 2520 + }, + { + "ce_loss_13": 3.305788576602936, + "ce_loss_17": 3.2403656125068663, + "ce_loss_2": 4.376373386383056, + "ce_loss_4": 3.911648893356323, + "ce_loss_9": 3.476252090930939, + "epoch": 0.253, + "grad_norm": 1008.0, + "kl_loss_13": 145.78314056396485, + "kl_loss_2": 2393.4189331054686, + "kl_loss_4": 1483.4752868652345, + "kl_loss_9": 541.9092956542969, + "learning_rate": 0.0008585659023794818, + "loss": 1176.1121, + "step": 2530 + }, + { + "ce_loss_13": 3.267721617221832, + "ce_loss_17": 3.1962388873100283, + "ce_loss_2": 4.404138731956482, + "ce_loss_4": 3.9276219844818114, + "ce_loss_9": 3.4520254611968992, + "epoch": 0.254, + "grad_norm": 1024.0, + "kl_loss_13": 144.80935554504396, + "kl_loss_2": 2499.2675537109376, + "kl_loss_4": 1579.2716918945312, + "kl_loss_9": 561.6414367675782, + "learning_rate": 0.0008574582938349817, + "loss": 1181.5203, + "step": 2540 + }, + { + "ce_loss_13": 3.246894025802612, + "ce_loss_17": 3.1744077086448668, + "ce_loss_2": 4.385828351974487, + "ce_loss_4": 3.9168326258659363, + "ce_loss_9": 3.4482709646224974, + "epoch": 0.255, + "grad_norm": 880.0, + "kl_loss_13": 146.2490036010742, + "kl_loss_2": 2498.342529296875, + "kl_loss_4": 1589.2333557128907, + "kl_loss_9": 583.5234130859375, + "learning_rate": 0.0008563470856894315, + "loss": 1163.9626, + "step": 2550 + }, + { + "ce_loss_13": 3.244091844558716, + "ce_loss_17": 3.174080765247345, + "ce_loss_2": 4.363129913806915, + "ce_loss_4": 3.891741466522217, + "ce_loss_9": 3.43358553647995, + "epoch": 0.256, + "grad_norm": 1464.0, + "kl_loss_13": 135.09313850402833, + "kl_loss_2": 2452.0255615234373, + "kl_loss_4": 1544.8579895019532, + "kl_loss_9": 563.4564407348632, + "learning_rate": 0.0008552322891326845, + "loss": 1168.568, + "step": 2560 + }, + { + "ce_loss_13": 3.213199257850647, + "ce_loss_17": 3.1463199853897095, + "ce_loss_2": 4.341533541679382, + "ce_loss_4": 3.86978040933609, + "ce_loss_9": 3.40135577917099, + "epoch": 0.257, + "grad_norm": 1080.0, + "kl_loss_13": 133.4509418487549, + "kl_loss_2": 2477.9323486328126, + "kl_loss_4": 1555.6577575683593, + "kl_loss_9": 560.8576431274414, + "learning_rate": 0.0008541139153907296, + "loss": 1158.5441, + "step": 2570 + }, + { + "ce_loss_13": 3.173140287399292, + "ce_loss_17": 3.107233667373657, + "ce_loss_2": 4.288096070289612, + "ce_loss_4": 3.8135863900184632, + "ce_loss_9": 3.3572820901870726, + "epoch": 0.258, + "grad_norm": 944.0, + "kl_loss_13": 127.60382804870605, + "kl_loss_2": 2460.6232055664063, + "kl_loss_4": 1542.7626159667968, + "kl_loss_9": 549.3803268432617, + "learning_rate": 0.0008529919757255782, + "loss": 1177.1785, + "step": 2580 + }, + { + "ce_loss_13": 3.2075952768325804, + "ce_loss_17": 3.1483571887016297, + "ce_loss_2": 4.273063981533051, + "ce_loss_4": 3.8110056638717653, + "ce_loss_9": 3.38339558839798, + "epoch": 0.259, + "grad_norm": 1096.0, + "kl_loss_13": 126.33610877990722, + "kl_loss_2": 2358.6348388671877, + "kl_loss_4": 1469.3066711425781, + "kl_loss_9": 535.2224411010742, + "learning_rate": 0.0008518664814351503, + "loss": 1134.108, + "step": 2590 + }, + { + "ce_loss_13": 3.176634979248047, + "ce_loss_17": 3.1104158759117126, + "ce_loss_2": 4.3050748109817505, + "ce_loss_4": 3.831589663028717, + "ce_loss_9": 3.3672611474990846, + "epoch": 0.26, + "grad_norm": 1176.0, + "kl_loss_13": 132.34806365966796, + "kl_loss_2": 2491.8506469726562, + "kl_loss_4": 1569.1078125, + "kl_loss_9": 569.5062820434571, + "learning_rate": 0.0008507374438531607, + "loss": 1216.7088, + "step": 2600 + }, + { + "ce_loss_13": 3.1536267042160033, + "ce_loss_17": 3.0930786609649656, + "ce_loss_2": 4.266261148452759, + "ce_loss_4": 3.80001562833786, + "ce_loss_9": 3.344437062740326, + "epoch": 0.261, + "grad_norm": 1012.0, + "kl_loss_13": 126.88237686157227, + "kl_loss_2": 2426.9488159179687, + "kl_loss_4": 1531.4029235839844, + "kl_loss_9": 550.4254531860352, + "learning_rate": 0.0008496048743490053, + "loss": 1155.3569, + "step": 2610 + }, + { + "ce_loss_13": 3.300561821460724, + "ce_loss_17": 3.2384546637535094, + "ce_loss_2": 4.371083045005799, + "ce_loss_4": 3.9259945511817933, + "ce_loss_9": 3.486074697971344, + "epoch": 0.262, + "grad_norm": 936.0, + "kl_loss_13": 128.5718738555908, + "kl_loss_2": 2371.6972534179686, + "kl_loss_4": 1491.8759399414062, + "kl_loss_9": 549.4777709960938, + "learning_rate": 0.0008484687843276469, + "loss": 1148.3349, + "step": 2620 + }, + { + "ce_loss_13": 3.2328709840774534, + "ce_loss_17": 3.1683553814888, + "ce_loss_2": 4.331315362453461, + "ce_loss_4": 3.8682646989822387, + "ce_loss_9": 3.421520781517029, + "epoch": 0.263, + "grad_norm": 1152.0, + "kl_loss_13": 129.1306957244873, + "kl_loss_2": 2416.7307250976564, + "kl_loss_4": 1517.7003540039063, + "kl_loss_9": 553.895539855957, + "learning_rate": 0.0008473291852294987, + "loss": 1172.2359, + "step": 2630 + }, + { + "ce_loss_13": 3.2404208421707152, + "ce_loss_17": 3.175671112537384, + "ce_loss_2": 4.338552522659302, + "ce_loss_4": 3.8864893555641173, + "ce_loss_9": 3.4317582845687866, + "epoch": 0.264, + "grad_norm": 1112.0, + "kl_loss_13": 129.83010864257812, + "kl_loss_2": 2437.0772399902344, + "kl_loss_4": 1546.6776794433595, + "kl_loss_9": 556.6582641601562, + "learning_rate": 0.0008461860885303114, + "loss": 1157.79, + "step": 2640 + }, + { + "ce_loss_13": 3.2729005813598633, + "ce_loss_17": 3.211615431308746, + "ce_loss_2": 4.347352123260498, + "ce_loss_4": 3.8879849672317506, + "ce_loss_9": 3.4564772725105284, + "epoch": 0.265, + "grad_norm": 1136.0, + "kl_loss_13": 126.11852188110352, + "kl_loss_2": 2372.3445739746094, + "kl_loss_4": 1480.6177062988281, + "kl_loss_9": 541.128044128418, + "learning_rate": 0.000845039505741056, + "loss": 1152.9016, + "step": 2650 + }, + { + "ce_loss_13": 3.252891516685486, + "ce_loss_17": 3.186697447299957, + "ce_loss_2": 4.357656836509705, + "ce_loss_4": 3.899804937839508, + "ce_loss_9": 3.440484857559204, + "epoch": 0.266, + "grad_norm": 1440.0, + "kl_loss_13": 131.520027923584, + "kl_loss_2": 2464.13671875, + "kl_loss_4": 1561.834979248047, + "kl_loss_9": 568.615559387207, + "learning_rate": 0.0008438894484078086, + "loss": 1205.4416, + "step": 2660 + }, + { + "ce_loss_13": 3.26278018951416, + "ce_loss_17": 3.2027502775192263, + "ce_loss_2": 4.346376514434814, + "ce_loss_4": 3.8866035461425783, + "ce_loss_9": 3.4419350743293764, + "epoch": 0.267, + "grad_norm": 1128.0, + "kl_loss_13": 126.62418212890626, + "kl_loss_2": 2401.970983886719, + "kl_loss_4": 1511.085791015625, + "kl_loss_9": 548.1463500976563, + "learning_rate": 0.0008427359281116334, + "loss": 1155.4372, + "step": 2670 + }, + { + "ce_loss_13": 3.1619449257850647, + "ce_loss_17": 3.098335528373718, + "ce_loss_2": 4.294455564022064, + "ce_loss_4": 3.821459674835205, + "ce_loss_9": 3.355985927581787, + "epoch": 0.268, + "grad_norm": 1032.0, + "kl_loss_13": 127.92219352722168, + "kl_loss_2": 2485.9741088867186, + "kl_loss_4": 1556.1580688476563, + "kl_loss_9": 558.9973770141602, + "learning_rate": 0.0008415789564684673, + "loss": 1170.8918, + "step": 2680 + }, + { + "ce_loss_13": 3.4031387090682985, + "ce_loss_17": 3.3418530941009523, + "ce_loss_2": 4.4715519666671755, + "ce_loss_4": 4.017423462867737, + "ce_loss_9": 3.588889503479004, + "epoch": 0.269, + "grad_norm": 916.0, + "kl_loss_13": 130.35048828125, + "kl_loss_2": 2340.5950622558594, + "kl_loss_4": 1470.4480102539062, + "kl_loss_9": 550.8828277587891, + "learning_rate": 0.0008404185451290017, + "loss": 1128.8609, + "step": 2690 + }, + { + "ce_loss_13": 3.2721012234687805, + "ce_loss_17": 3.2122812986373903, + "ce_loss_2": 4.366690754890442, + "ce_loss_4": 3.900901746749878, + "ce_loss_9": 3.458920109272003, + "epoch": 0.27, + "grad_norm": 876.0, + "kl_loss_13": 126.24711227416992, + "kl_loss_2": 2402.3101196289062, + "kl_loss_4": 1500.8953063964843, + "kl_loss_9": 544.9368743896484, + "learning_rate": 0.0008392547057785661, + "loss": 1144.8202, + "step": 2700 + }, + { + "ce_loss_13": 3.20634663105011, + "ce_loss_17": 3.1404446959495544, + "ce_loss_2": 4.34354875087738, + "ce_loss_4": 3.8613749861717226, + "ce_loss_9": 3.397957968711853, + "epoch": 0.271, + "grad_norm": 1024.0, + "kl_loss_13": 138.9049976348877, + "kl_loss_2": 2541.2163696289062, + "kl_loss_4": 1584.1801452636719, + "kl_loss_9": 569.9162368774414, + "learning_rate": 0.0008380874501370098, + "loss": 1160.3137, + "step": 2710 + }, + { + "ce_loss_13": 3.1958078384399413, + "ce_loss_17": 3.1317479968070985, + "ce_loss_2": 4.334646201133728, + "ce_loss_4": 3.8549742460250855, + "ce_loss_9": 3.3933589339256285, + "epoch": 0.272, + "grad_norm": 1216.0, + "kl_loss_13": 139.6747169494629, + "kl_loss_2": 2506.9614440917967, + "kl_loss_4": 1564.9778137207031, + "kl_loss_9": 575.2636154174804, + "learning_rate": 0.0008369167899585841, + "loss": 1180.8441, + "step": 2720 + }, + { + "ce_loss_13": 3.3191107273101808, + "ce_loss_17": 3.2548878669738768, + "ce_loss_2": 4.367061805725098, + "ce_loss_4": 3.9256667375564573, + "ce_loss_9": 3.499273753166199, + "epoch": 0.273, + "grad_norm": 1192.0, + "kl_loss_13": 133.40234298706054, + "kl_loss_2": 2337.853271484375, + "kl_loss_4": 1467.8627563476562, + "kl_loss_9": 540.1153335571289, + "learning_rate": 0.0008357427370318238, + "loss": 1162.1727, + "step": 2730 + }, + { + "ce_loss_13": 3.2740294098854066, + "ce_loss_17": 3.2103097677230834, + "ce_loss_2": 4.379172682762146, + "ce_loss_4": 3.9060402512550354, + "ce_loss_9": 3.4591461062431335, + "epoch": 0.274, + "grad_norm": 1112.0, + "kl_loss_13": 133.08239250183107, + "kl_loss_2": 2446.6520874023436, + "kl_loss_4": 1520.9078430175782, + "kl_loss_9": 550.8165802001953, + "learning_rate": 0.0008345653031794292, + "loss": 1169.485, + "step": 2740 + }, + { + "ce_loss_13": 3.268154036998749, + "ce_loss_17": 3.206729364395142, + "ce_loss_2": 4.366567754745484, + "ce_loss_4": 3.9016868472099304, + "ce_loss_9": 3.4582218289375306, + "epoch": 0.275, + "grad_norm": 1040.0, + "kl_loss_13": 130.93044738769532, + "kl_loss_2": 2412.4147521972654, + "kl_loss_4": 1501.049560546875, + "kl_loss_9": 552.4580657958984, + "learning_rate": 0.0008333845002581458, + "loss": 1154.1961, + "step": 2750 + }, + { + "ce_loss_13": 3.198913943767548, + "ce_loss_17": 3.136397862434387, + "ce_loss_2": 4.321640872955323, + "ce_loss_4": 3.8584501266479494, + "ce_loss_9": 3.388032150268555, + "epoch": 0.276, + "grad_norm": 1064.0, + "kl_loss_13": 132.80465927124024, + "kl_loss_2": 2504.070166015625, + "kl_loss_4": 1592.6373046875, + "kl_loss_9": 568.819091796875, + "learning_rate": 0.0008322003401586462, + "loss": 1188.9615, + "step": 2760 + }, + { + "ce_loss_13": 3.230616331100464, + "ce_loss_17": 3.169395887851715, + "ce_loss_2": 4.30545289516449, + "ce_loss_4": 3.846277284622192, + "ce_loss_9": 3.4128074049949646, + "epoch": 0.277, + "grad_norm": 888.0, + "kl_loss_13": 124.23615074157715, + "kl_loss_2": 2369.2466674804687, + "kl_loss_4": 1485.9005249023437, + "kl_loss_9": 536.0483352661133, + "learning_rate": 0.0008310128348054094, + "loss": 1116.8891, + "step": 2770 + }, + { + "ce_loss_13": 3.2008282423019407, + "ce_loss_17": 3.1417774081230165, + "ce_loss_2": 4.298097097873688, + "ce_loss_4": 3.8336638689041136, + "ce_loss_9": 3.3848743557929994, + "epoch": 0.278, + "grad_norm": 1200.0, + "kl_loss_13": 126.43611068725586, + "kl_loss_2": 2416.030029296875, + "kl_loss_4": 1506.5792419433594, + "kl_loss_9": 548.2768005371094, + "learning_rate": 0.0008298219961566008, + "loss": 1146.8084, + "step": 2780 + }, + { + "ce_loss_13": 3.1657528281211853, + "ce_loss_17": 3.1060426354408266, + "ce_loss_2": 4.303262197971344, + "ce_loss_4": 3.8290231585502625, + "ce_loss_9": 3.358876872062683, + "epoch": 0.279, + "grad_norm": 1064.0, + "kl_loss_13": 128.27086067199707, + "kl_loss_2": 2513.7822265625, + "kl_loss_4": 1581.432928466797, + "kl_loss_9": 569.5649337768555, + "learning_rate": 0.0008286278362039527, + "loss": 1163.257, + "step": 2790 + }, + { + "ce_loss_13": 3.1953029155731203, + "ce_loss_17": 3.133466362953186, + "ce_loss_2": 4.343358755111694, + "ce_loss_4": 3.859322023391724, + "ce_loss_9": 3.3885730862617494, + "epoch": 0.28, + "grad_norm": 984.0, + "kl_loss_13": 127.40050735473633, + "kl_loss_2": 2516.710729980469, + "kl_loss_4": 1575.1679443359376, + "kl_loss_9": 557.6445678710937, + "learning_rate": 0.0008274303669726426, + "loss": 1154.4794, + "step": 2800 + }, + { + "ce_loss_13": 3.0999977946281434, + "ce_loss_17": 3.0372838020324706, + "ce_loss_2": 4.267980313301086, + "ce_loss_4": 3.7696927547454835, + "ce_loss_9": 3.2945470094680784, + "epoch": 0.281, + "grad_norm": 1128.0, + "kl_loss_13": 127.5879337310791, + "kl_loss_2": 2556.379846191406, + "kl_loss_4": 1576.2443237304688, + "kl_loss_9": 558.8732559204102, + "learning_rate": 0.0008262296005211721, + "loss": 1157.9153, + "step": 2810 + }, + { + "ce_loss_13": 3.2296096801757814, + "ce_loss_17": 3.166466403007507, + "ce_loss_2": 4.3552744626998905, + "ce_loss_4": 3.8800341367721556, + "ce_loss_9": 3.4214464902877806, + "epoch": 0.282, + "grad_norm": 984.0, + "kl_loss_13": 126.5991382598877, + "kl_loss_2": 2460.500811767578, + "kl_loss_4": 1541.0466186523438, + "kl_loss_9": 561.6442016601562, + "learning_rate": 0.0008250255489412463, + "loss": 1156.171, + "step": 2820 + }, + { + "ce_loss_13": 3.3276114225387574, + "ce_loss_17": 3.262240135669708, + "ce_loss_2": 4.421131682395935, + "ce_loss_4": 3.9639588236808776, + "ce_loss_9": 3.5064687371253966, + "epoch": 0.283, + "grad_norm": 1280.0, + "kl_loss_13": 127.70831680297852, + "kl_loss_2": 2416.2974609375, + "kl_loss_4": 1513.4972229003906, + "kl_loss_9": 546.3396697998047, + "learning_rate": 0.0008238182243576511, + "loss": 1155.1063, + "step": 2830 + }, + { + "ce_loss_13": 3.2904643654823302, + "ce_loss_17": 3.232466924190521, + "ce_loss_2": 4.3121960997581485, + "ce_loss_4": 3.878585159778595, + "ce_loss_9": 3.4648091554641725, + "epoch": 0.284, + "grad_norm": 1012.0, + "kl_loss_13": 124.42939529418945, + "kl_loss_2": 2280.30302734375, + "kl_loss_4": 1428.722509765625, + "kl_loss_9": 524.9333801269531, + "learning_rate": 0.0008226076389281315, + "loss": 1114.7313, + "step": 2840 + }, + { + "ce_loss_13": 3.330171585083008, + "ce_loss_17": 3.272111511230469, + "ce_loss_2": 4.392184138298035, + "ce_loss_4": 3.942944276332855, + "ce_loss_9": 3.5083374857902525, + "epoch": 0.285, + "grad_norm": 960.0, + "kl_loss_13": 124.86542510986328, + "kl_loss_2": 2382.9051025390627, + "kl_loss_4": 1487.2024658203125, + "kl_loss_9": 538.8542251586914, + "learning_rate": 0.0008213938048432696, + "loss": 1119.9718, + "step": 2850 + }, + { + "ce_loss_13": 3.2606813669204713, + "ce_loss_17": 3.197364699840546, + "ce_loss_2": 4.339873337745667, + "ce_loss_4": 3.8752093434333803, + "ce_loss_9": 3.4461276054382326, + "epoch": 0.286, + "grad_norm": 1224.0, + "kl_loss_13": 128.25371818542482, + "kl_loss_2": 2371.3980712890625, + "kl_loss_4": 1469.12548828125, + "kl_loss_9": 540.4666763305664, + "learning_rate": 0.0008201767343263612, + "loss": 1143.8422, + "step": 2860 + }, + { + "ce_loss_13": 3.2037661194801332, + "ce_loss_17": 3.14375239610672, + "ce_loss_2": 4.314858210086823, + "ce_loss_4": 3.8487162590026855, + "ce_loss_9": 3.390323507785797, + "epoch": 0.287, + "grad_norm": 1192.0, + "kl_loss_13": 126.11991844177246, + "kl_loss_2": 2443.959765625, + "kl_loss_4": 1534.53837890625, + "kl_loss_9": 551.5285293579102, + "learning_rate": 0.0008189564396332927, + "loss": 1120.3805, + "step": 2870 + }, + { + "ce_loss_13": 3.187635588645935, + "ce_loss_17": 3.1275291562080385, + "ce_loss_2": 4.304873895645142, + "ce_loss_4": 3.834199237823486, + "ce_loss_9": 3.376280093193054, + "epoch": 0.288, + "grad_norm": 1232.0, + "kl_loss_13": 123.50970878601075, + "kl_loss_2": 2436.457794189453, + "kl_loss_4": 1516.4190856933594, + "kl_loss_9": 545.2881454467773, + "learning_rate": 0.0008177329330524181, + "loss": 1153.2082, + "step": 2880 + }, + { + "ce_loss_13": 3.2426687598228456, + "ce_loss_17": 3.1831744432449343, + "ce_loss_2": 4.3146365761756895, + "ce_loss_4": 3.8643888235092163, + "ce_loss_9": 3.425844359397888, + "epoch": 0.289, + "grad_norm": 1472.0, + "kl_loss_13": 123.43602142333984, + "kl_loss_2": 2352.9763366699217, + "kl_loss_4": 1475.5585388183595, + "kl_loss_9": 533.2192260742188, + "learning_rate": 0.0008165062269044352, + "loss": 1126.9439, + "step": 2890 + }, + { + "ce_loss_13": 3.1997581720352173, + "ce_loss_17": 3.1352108001708983, + "ce_loss_2": 4.308706760406494, + "ce_loss_4": 3.828529250621796, + "ce_loss_9": 3.385022830963135, + "epoch": 0.29, + "grad_norm": 1000.0, + "kl_loss_13": 128.70101165771484, + "kl_loss_2": 2453.2666625976562, + "kl_loss_4": 1514.585675048828, + "kl_loss_9": 551.541455078125, + "learning_rate": 0.0008152763335422613, + "loss": 1160.5094, + "step": 2900 + }, + { + "ce_loss_13": 3.185084581375122, + "ce_loss_17": 3.121827554702759, + "ce_loss_2": 4.2843485355377195, + "ce_loss_4": 3.8238794922828676, + "ce_loss_9": 3.365398812294006, + "epoch": 0.291, + "grad_norm": 1152.0, + "kl_loss_13": 129.12664299011232, + "kl_loss_2": 2431.9979248046875, + "kl_loss_4": 1525.8602478027344, + "kl_loss_9": 549.915316772461, + "learning_rate": 0.0008140432653509088, + "loss": 1140.2348, + "step": 2910 + }, + { + "ce_loss_13": 3.2434255599975588, + "ce_loss_17": 3.1809029936790467, + "ce_loss_2": 4.31175925731659, + "ce_loss_4": 3.85903000831604, + "ce_loss_9": 3.4250482439994814, + "epoch": 0.292, + "grad_norm": 980.0, + "kl_loss_13": 127.83625221252441, + "kl_loss_2": 2388.2356201171874, + "kl_loss_4": 1493.0842651367188, + "kl_loss_9": 548.6456085205078, + "learning_rate": 0.0008128070347473608, + "loss": 1133.1981, + "step": 2920 + }, + { + "ce_loss_13": 3.249503719806671, + "ce_loss_17": 3.1868587136268616, + "ce_loss_2": 4.371308970451355, + "ce_loss_4": 3.895190346240997, + "ce_loss_9": 3.4365643858909607, + "epoch": 0.293, + "grad_norm": 1048.0, + "kl_loss_13": 128.10081634521484, + "kl_loss_2": 2475.247509765625, + "kl_loss_4": 1537.1791748046876, + "kl_loss_9": 556.0408630371094, + "learning_rate": 0.0008115676541804455, + "loss": 1155.5307, + "step": 2930 + }, + { + "ce_loss_13": 3.2453121662139894, + "ce_loss_17": 3.1846156001091, + "ce_loss_2": 4.327973961830139, + "ce_loss_4": 3.8633278012275696, + "ce_loss_9": 3.4255148649215696, + "epoch": 0.294, + "grad_norm": 924.0, + "kl_loss_13": 126.01223869323731, + "kl_loss_2": 2394.019909667969, + "kl_loss_4": 1483.6732482910156, + "kl_loss_9": 538.9107009887696, + "learning_rate": 0.0008103251361307119, + "loss": 1153.0051, + "step": 2940 + }, + { + "ce_loss_13": 3.279121255874634, + "ce_loss_17": 3.216415858268738, + "ce_loss_2": 4.364791202545166, + "ce_loss_4": 3.9064201712608337, + "ce_loss_9": 3.466567671298981, + "epoch": 0.295, + "grad_norm": 1168.0, + "kl_loss_13": 127.26075477600098, + "kl_loss_2": 2399.258447265625, + "kl_loss_4": 1509.8536437988282, + "kl_loss_9": 548.4006149291993, + "learning_rate": 0.0008090794931103026, + "loss": 1133.9059, + "step": 2950 + }, + { + "ce_loss_13": 3.260691452026367, + "ce_loss_17": 3.200836515426636, + "ce_loss_2": 4.327987408638, + "ce_loss_4": 3.879507374763489, + "ce_loss_9": 3.439251124858856, + "epoch": 0.296, + "grad_norm": 1272.0, + "kl_loss_13": 122.98451042175293, + "kl_loss_2": 2352.14873046875, + "kl_loss_4": 1473.5071899414063, + "kl_loss_9": 533.3884475708007, + "learning_rate": 0.0008078307376628291, + "loss": 1130.6416, + "step": 2960 + }, + { + "ce_loss_13": 3.3194911003112795, + "ce_loss_17": 3.263146662712097, + "ce_loss_2": 4.35304057598114, + "ce_loss_4": 3.9175407528877257, + "ce_loss_9": 3.4988057613372803, + "epoch": 0.297, + "grad_norm": 1072.0, + "kl_loss_13": 120.59720306396484, + "kl_loss_2": 2272.6192626953125, + "kl_loss_4": 1425.6671081542968, + "kl_loss_9": 520.9113632202149, + "learning_rate": 0.000806578882363245, + "loss": 1091.8103, + "step": 2970 + }, + { + "ce_loss_13": 3.2369943022727967, + "ce_loss_17": 3.178285872936249, + "ce_loss_2": 4.299906623363495, + "ce_loss_4": 3.846910905838013, + "ce_loss_9": 3.414962613582611, + "epoch": 0.298, + "grad_norm": 1240.0, + "kl_loss_13": 122.8391300201416, + "kl_loss_2": 2351.7813110351562, + "kl_loss_4": 1471.449267578125, + "kl_loss_9": 536.3119079589844, + "learning_rate": 0.0008053239398177191, + "loss": 1148.6521, + "step": 2980 + }, + { + "ce_loss_13": 3.2252198576927187, + "ce_loss_17": 3.1645880460739138, + "ce_loss_2": 4.316426849365234, + "ce_loss_4": 3.8593194007873537, + "ce_loss_9": 3.406945192813873, + "epoch": 0.299, + "grad_norm": 1160.0, + "kl_loss_13": 123.87558555603027, + "kl_loss_2": 2393.850305175781, + "kl_loss_4": 1496.8007019042968, + "kl_loss_9": 535.2160018920898, + "learning_rate": 0.0008040659226635089, + "loss": 1161.0637, + "step": 2990 + }, + { + "ce_loss_13": 3.3492501974105835, + "ce_loss_17": 3.2833282947540283, + "ce_loss_2": 4.415388989448547, + "ce_loss_4": 3.9594863414764405, + "ce_loss_9": 3.5338119864463806, + "epoch": 0.3, + "grad_norm": 948.0, + "kl_loss_13": 130.1931293487549, + "kl_loss_2": 2363.914898681641, + "kl_loss_4": 1477.0997985839845, + "kl_loss_9": 548.5758621215821, + "learning_rate": 0.0008028048435688333, + "loss": 1125.8326, + "step": 3000 + }, + { + "ce_loss_13": 3.221744477748871, + "ce_loss_17": 3.160210704803467, + "ce_loss_2": 4.324938941001892, + "ce_loss_4": 3.8589487433433534, + "ce_loss_9": 3.4083585500717164, + "epoch": 0.301, + "grad_norm": 1376.0, + "kl_loss_13": 125.73593139648438, + "kl_loss_2": 2438.4499633789064, + "kl_loss_4": 1523.3824584960937, + "kl_loss_9": 548.8932312011718, + "learning_rate": 0.0008015407152327448, + "loss": 1150.1362, + "step": 3010 + }, + { + "ce_loss_13": 3.264834225177765, + "ce_loss_17": 3.203164303302765, + "ce_loss_2": 4.352232456207275, + "ce_loss_4": 3.8845499038696287, + "ce_loss_9": 3.447026801109314, + "epoch": 0.302, + "grad_norm": 1168.0, + "kl_loss_13": 128.2925884246826, + "kl_loss_2": 2426.3364868164062, + "kl_loss_4": 1498.4809509277343, + "kl_loss_9": 545.5912322998047, + "learning_rate": 0.0008002735503850016, + "loss": 1149.155, + "step": 3020 + }, + { + "ce_loss_13": 3.158675718307495, + "ce_loss_17": 3.096822273731232, + "ce_loss_2": 4.283450317382813, + "ce_loss_4": 3.8108823537826537, + "ce_loss_9": 3.347420847415924, + "epoch": 0.303, + "grad_norm": 852.0, + "kl_loss_13": 126.87766876220704, + "kl_loss_2": 2458.9157958984374, + "kl_loss_4": 1539.0448791503907, + "kl_loss_9": 554.783595275879, + "learning_rate": 0.0007990033617859396, + "loss": 1159.5053, + "step": 3030 + }, + { + "ce_loss_13": 3.2143150687217714, + "ce_loss_17": 3.153176248073578, + "ce_loss_2": 4.286831665039062, + "ce_loss_4": 3.8296972036361696, + "ce_loss_9": 3.3967381954193114, + "epoch": 0.304, + "grad_norm": 1120.0, + "kl_loss_13": 125.46068496704102, + "kl_loss_2": 2369.1813842773436, + "kl_loss_4": 1474.4321594238281, + "kl_loss_9": 536.6104309082032, + "learning_rate": 0.000797730162226344, + "loss": 1102.3193, + "step": 3040 + }, + { + "ce_loss_13": 3.2354076504707336, + "ce_loss_17": 3.1755610346794128, + "ce_loss_2": 4.315566229820251, + "ce_loss_4": 3.85812109708786, + "ce_loss_9": 3.4227928161621093, + "epoch": 0.305, + "grad_norm": 952.0, + "kl_loss_13": 125.55020713806152, + "kl_loss_2": 2382.56806640625, + "kl_loss_4": 1494.0224731445312, + "kl_loss_9": 539.874057006836, + "learning_rate": 0.0007964539645273203, + "loss": 1120.4904, + "step": 3050 + }, + { + "ce_loss_13": 3.2497880816459657, + "ce_loss_17": 3.19111887216568, + "ce_loss_2": 4.30394834280014, + "ce_loss_4": 3.853395998477936, + "ce_loss_9": 3.421898901462555, + "epoch": 0.306, + "grad_norm": 1320.0, + "kl_loss_13": 122.69227867126465, + "kl_loss_2": 2327.3169128417967, + "kl_loss_4": 1459.70341796875, + "kl_loss_9": 523.6691238403321, + "learning_rate": 0.000795174781540165, + "loss": 1126.0399, + "step": 3060 + }, + { + "ce_loss_13": 3.3224302649497988, + "ce_loss_17": 3.2659563302993773, + "ce_loss_2": 4.362799215316772, + "ce_loss_4": 3.9159056544303894, + "ce_loss_9": 3.4947072863578796, + "epoch": 0.307, + "grad_norm": 1008.0, + "kl_loss_13": 122.68161735534667, + "kl_loss_2": 2293.161346435547, + "kl_loss_4": 1432.2747436523437, + "kl_loss_9": 519.4392929077148, + "learning_rate": 0.0007938926261462366, + "loss": 1119.0232, + "step": 3070 + }, + { + "ce_loss_13": 3.2754910826683044, + "ce_loss_17": 3.2146644115448, + "ce_loss_2": 4.319262671470642, + "ce_loss_4": 3.8593416690826414, + "ce_loss_9": 3.4458608865737914, + "epoch": 0.308, + "grad_norm": 1296.0, + "kl_loss_13": 124.00109100341797, + "kl_loss_2": 2343.5400390625, + "kl_loss_4": 1448.5188537597655, + "kl_loss_9": 525.8113388061523, + "learning_rate": 0.0007926075112568258, + "loss": 1133.05, + "step": 3080 + }, + { + "ce_loss_13": 3.268499195575714, + "ce_loss_17": 3.2076771378517153, + "ce_loss_2": 4.333807015419007, + "ce_loss_4": 3.8755951166152953, + "ce_loss_9": 3.446335184574127, + "epoch": 0.309, + "grad_norm": 880.0, + "kl_loss_13": 122.68290519714355, + "kl_loss_2": 2372.733020019531, + "kl_loss_4": 1472.968035888672, + "kl_loss_9": 536.2051025390625, + "learning_rate": 0.0007913194498130252, + "loss": 1107.3402, + "step": 3090 + }, + { + "ce_loss_13": 3.192689394950867, + "ce_loss_17": 3.133338475227356, + "ce_loss_2": 4.2941539764404295, + "ce_loss_4": 3.8334529042243957, + "ce_loss_9": 3.381569480895996, + "epoch": 0.31, + "grad_norm": 880.0, + "kl_loss_13": 123.5139316558838, + "kl_loss_2": 2398.068713378906, + "kl_loss_4": 1499.0524963378907, + "kl_loss_9": 539.4936416625976, + "learning_rate": 0.0007900284547855992, + "loss": 1141.9871, + "step": 3100 + }, + { + "ce_loss_13": 3.2067798495292665, + "ce_loss_17": 3.149487042427063, + "ce_loss_2": 4.25969934463501, + "ce_loss_4": 3.8115997910499573, + "ce_loss_9": 3.385125386714935, + "epoch": 0.311, + "grad_norm": 932.0, + "kl_loss_13": 121.11445426940918, + "kl_loss_2": 2340.554052734375, + "kl_loss_4": 1465.2375, + "kl_loss_9": 536.8883056640625, + "learning_rate": 0.0007887345391748532, + "loss": 1135.4595, + "step": 3110 + }, + { + "ce_loss_13": 3.327114629745483, + "ce_loss_17": 3.2699966192245484, + "ce_loss_2": 4.3509868621826175, + "ce_loss_4": 3.9068296194076537, + "ce_loss_9": 3.4944436311721803, + "epoch": 0.312, + "grad_norm": 1032.0, + "kl_loss_13": 120.66176643371583, + "kl_loss_2": 2278.895184326172, + "kl_loss_4": 1423.1607238769532, + "kl_loss_9": 517.2439193725586, + "learning_rate": 0.0007874377160105036, + "loss": 1087.3555, + "step": 3120 + }, + { + "ce_loss_13": 3.234862804412842, + "ce_loss_17": 3.1718748211860657, + "ce_loss_2": 4.329183721542359, + "ce_loss_4": 3.8809149861335754, + "ce_loss_9": 3.403255820274353, + "epoch": 0.313, + "grad_norm": 988.0, + "kl_loss_13": 126.96331253051758, + "kl_loss_2": 2425.5166076660157, + "kl_loss_4": 1566.3126342773437, + "kl_loss_9": 522.1515899658203, + "learning_rate": 0.0007861379983515449, + "loss": 1174.6674, + "step": 3130 + }, + { + "ce_loss_13": 3.3131019949913023, + "ce_loss_17": 3.2474843502044677, + "ce_loss_2": 4.3685447692871096, + "ce_loss_4": 3.9155742168426513, + "ce_loss_9": 3.4827993631362917, + "epoch": 0.314, + "grad_norm": 1008.0, + "kl_loss_13": 141.7888641357422, + "kl_loss_2": 2370.0421508789063, + "kl_loss_4": 1480.9339538574218, + "kl_loss_9": 527.7933471679687, + "learning_rate": 0.0007848353992861195, + "loss": 1123.1941, + "step": 3140 + }, + { + "ce_loss_13": 3.395362210273743, + "ce_loss_17": 3.319834494590759, + "ce_loss_2": 4.446502614021301, + "ce_loss_4": 4.01169718503952, + "ce_loss_9": 3.5656288266181946, + "epoch": 0.315, + "grad_norm": 996.0, + "kl_loss_13": 167.28359603881836, + "kl_loss_2": 2332.403839111328, + "kl_loss_4": 1480.8144287109376, + "kl_loss_9": 539.3936553955078, + "learning_rate": 0.0007835299319313853, + "loss": 1138.2926, + "step": 3150 + }, + { + "ce_loss_13": 3.282095503807068, + "ce_loss_17": 3.2127177715301514, + "ce_loss_2": 4.314677000045776, + "ce_loss_4": 3.864980411529541, + "ce_loss_9": 3.4414833307266237, + "epoch": 0.316, + "grad_norm": 1504.0, + "kl_loss_13": 148.65917434692383, + "kl_loss_2": 2325.644873046875, + "kl_loss_4": 1442.3484619140625, + "kl_loss_9": 520.1078231811523, + "learning_rate": 0.0007822216094333848, + "loss": 1147.7855, + "step": 3160 + }, + { + "ce_loss_13": 3.2855281829833984, + "ce_loss_17": 3.217927122116089, + "ce_loss_2": 4.365339589118958, + "ce_loss_4": 3.8983690977096557, + "ce_loss_9": 3.456553637981415, + "epoch": 0.317, + "grad_norm": 988.0, + "kl_loss_13": 144.84049034118652, + "kl_loss_2": 2392.973602294922, + "kl_loss_4": 1478.5102661132812, + "kl_loss_9": 532.4666564941406, + "learning_rate": 0.0007809104449669101, + "loss": 1121.3748, + "step": 3170 + }, + { + "ce_loss_13": 3.2335867047309876, + "ce_loss_17": 3.1693909764289856, + "ce_loss_2": 4.279091870784759, + "ce_loss_4": 3.8261072874069213, + "ce_loss_9": 3.4042014837265016, + "epoch": 0.318, + "grad_norm": 924.0, + "kl_loss_13": 131.08937492370606, + "kl_loss_2": 2316.0208984375, + "kl_loss_4": 1439.3594787597656, + "kl_loss_9": 523.8772628784179, + "learning_rate": 0.0007795964517353734, + "loss": 1103.5115, + "step": 3180 + }, + { + "ce_loss_13": 3.223126709461212, + "ce_loss_17": 3.1633258938789366, + "ce_loss_2": 4.308388495445252, + "ce_loss_4": 3.83981716632843, + "ce_loss_9": 3.4026937007904055, + "epoch": 0.319, + "grad_norm": 1016.0, + "kl_loss_13": 131.66280212402344, + "kl_loss_2": 2417.4763671875, + "kl_loss_4": 1493.2319091796876, + "kl_loss_9": 540.7565292358398, + "learning_rate": 0.000778279642970672, + "loss": 1110.884, + "step": 3190 + }, + { + "ce_loss_13": 3.2283523082733154, + "ce_loss_17": 3.168657290935516, + "ce_loss_2": 4.277379822731018, + "ce_loss_4": 3.8321051239967345, + "ce_loss_9": 3.404974806308746, + "epoch": 0.32, + "grad_norm": 1048.0, + "kl_loss_13": 125.85943069458008, + "kl_loss_2": 2334.446435546875, + "kl_loss_4": 1457.393292236328, + "kl_loss_9": 528.3299591064454, + "learning_rate": 0.0007769600319330552, + "loss": 1096.533, + "step": 3200 + }, + { + "ce_loss_13": 3.2530168294906616, + "ce_loss_17": 3.193608820438385, + "ce_loss_2": 4.370899343490601, + "ce_loss_4": 3.891812968254089, + "ce_loss_9": 3.440317177772522, + "epoch": 0.321, + "grad_norm": 1424.0, + "kl_loss_13": 124.41287879943847, + "kl_loss_2": 2444.187109375, + "kl_loss_4": 1511.061834716797, + "kl_loss_9": 535.4306365966797, + "learning_rate": 0.0007756376319109917, + "loss": 1133.1059, + "step": 3210 + }, + { + "ce_loss_13": 3.30334130525589, + "ce_loss_17": 3.245970273017883, + "ce_loss_2": 4.349112510681152, + "ce_loss_4": 3.902366375923157, + "ce_loss_9": 3.4771732091903687, + "epoch": 0.322, + "grad_norm": 1240.0, + "kl_loss_13": 123.08681030273438, + "kl_loss_2": 2321.476184082031, + "kl_loss_4": 1447.8593383789062, + "kl_loss_9": 527.7904403686523, + "learning_rate": 0.0007743124562210351, + "loss": 1086.287, + "step": 3220 + }, + { + "ce_loss_13": 3.3142515659332275, + "ce_loss_17": 3.2553324818611147, + "ce_loss_2": 4.3503148555755615, + "ce_loss_4": 3.9107661724090574, + "ce_loss_9": 3.487497019767761, + "epoch": 0.323, + "grad_norm": 1080.0, + "kl_loss_13": 122.52211647033691, + "kl_loss_2": 2315.1113159179686, + "kl_loss_4": 1458.0398010253907, + "kl_loss_9": 524.8260650634766, + "learning_rate": 0.0007729845182076895, + "loss": 1113.227, + "step": 3230 + }, + { + "ce_loss_13": 3.244875729084015, + "ce_loss_17": 3.1864152431488035, + "ce_loss_2": 4.278808498382569, + "ce_loss_4": 3.8322069525718687, + "ce_loss_9": 3.416013467311859, + "epoch": 0.324, + "grad_norm": 1136.0, + "kl_loss_13": 121.2698974609375, + "kl_loss_2": 2293.5723510742187, + "kl_loss_4": 1425.4594177246095, + "kl_loss_9": 521.6138046264648, + "learning_rate": 0.0007716538312432765, + "loss": 1123.0413, + "step": 3240 + }, + { + "ce_loss_13": 3.2065953850746154, + "ce_loss_17": 3.147258126735687, + "ce_loss_2": 4.304112792015076, + "ce_loss_4": 3.831463062763214, + "ce_loss_9": 3.3861777424812316, + "epoch": 0.325, + "grad_norm": 948.0, + "kl_loss_13": 124.9113094329834, + "kl_loss_2": 2404.3642150878904, + "kl_loss_4": 1492.1071228027345, + "kl_loss_9": 541.2806915283203, + "learning_rate": 0.0007703204087277988, + "loss": 1132.8419, + "step": 3250 + }, + { + "ce_loss_13": 3.3009021639823914, + "ce_loss_17": 3.2454873442649843, + "ce_loss_2": 4.328222846984863, + "ce_loss_4": 3.8818641304969788, + "ce_loss_9": 3.47253897190094, + "epoch": 0.326, + "grad_norm": 1064.0, + "kl_loss_13": 119.24455947875977, + "kl_loss_2": 2265.62802734375, + "kl_loss_4": 1401.0557189941405, + "kl_loss_9": 512.3205108642578, + "learning_rate": 0.0007689842640888063, + "loss": 1087.1602, + "step": 3260 + }, + { + "ce_loss_13": 3.295928394794464, + "ce_loss_17": 3.237764537334442, + "ce_loss_2": 4.336790418624878, + "ce_loss_4": 3.888652467727661, + "ce_loss_9": 3.4689743518829346, + "epoch": 0.327, + "grad_norm": 984.0, + "kl_loss_13": 122.70118522644043, + "kl_loss_2": 2287.208117675781, + "kl_loss_4": 1414.8956970214845, + "kl_loss_9": 523.5273834228516, + "learning_rate": 0.0007676454107812607, + "loss": 1099.1889, + "step": 3270 + }, + { + "ce_loss_13": 3.2376078844070433, + "ce_loss_17": 3.1816900610923766, + "ce_loss_2": 4.316426420211792, + "ce_loss_4": 3.847140574455261, + "ce_loss_9": 3.4150453925132753, + "epoch": 0.328, + "grad_norm": 1368.0, + "kl_loss_13": 123.1898422241211, + "kl_loss_2": 2387.416784667969, + "kl_loss_4": 1467.567724609375, + "kl_loss_9": 530.6652893066406, + "learning_rate": 0.0007663038622873999, + "loss": 1110.5736, + "step": 3280 + }, + { + "ce_loss_13": 3.281772184371948, + "ce_loss_17": 3.223153281211853, + "ce_loss_2": 4.337095534801483, + "ce_loss_4": 3.8741299629211428, + "ce_loss_9": 3.4543533086776734, + "epoch": 0.329, + "grad_norm": 1032.0, + "kl_loss_13": 121.96130714416503, + "kl_loss_2": 2350.160192871094, + "kl_loss_4": 1442.80205078125, + "kl_loss_9": 522.9670028686523, + "learning_rate": 0.0007649596321166025, + "loss": 1091.6693, + "step": 3290 + }, + { + "ce_loss_13": 3.184182012081146, + "ce_loss_17": 3.1267475962638853, + "ce_loss_2": 4.223506700992584, + "ce_loss_4": 3.7820595502853394, + "ce_loss_9": 3.3597326993942263, + "epoch": 0.33, + "grad_norm": 1048.0, + "kl_loss_13": 117.21873245239257, + "kl_loss_2": 2280.713970947266, + "kl_loss_4": 1418.3794250488281, + "kl_loss_9": 515.6194625854492, + "learning_rate": 0.0007636127338052513, + "loss": 1103.1809, + "step": 3300 + }, + { + "ce_loss_13": 3.2831894516944886, + "ce_loss_17": 3.2257657408714295, + "ce_loss_2": 4.367099642753601, + "ce_loss_4": 3.907731604576111, + "ce_loss_9": 3.464577794075012, + "epoch": 0.331, + "grad_norm": 984.0, + "kl_loss_13": 122.7444637298584, + "kl_loss_2": 2382.0689331054687, + "kl_loss_4": 1476.6230773925781, + "kl_loss_9": 530.2010208129883, + "learning_rate": 0.0007622631809165971, + "loss": 1106.733, + "step": 3310 + }, + { + "ce_loss_13": 3.2809274673461912, + "ce_loss_17": 3.2282126188278197, + "ce_loss_2": 4.2845458626747135, + "ce_loss_4": 3.853877055644989, + "ce_loss_9": 3.44593003988266, + "epoch": 0.332, + "grad_norm": 988.0, + "kl_loss_13": 114.16889762878418, + "kl_loss_2": 2206.9068969726563, + "kl_loss_4": 1372.0636596679688, + "kl_loss_9": 497.93410339355466, + "learning_rate": 0.000760910987040623, + "loss": 1075.1617, + "step": 3320 + }, + { + "ce_loss_13": 3.262575376033783, + "ce_loss_17": 3.2049320697784425, + "ce_loss_2": 4.351537084579467, + "ce_loss_4": 3.8797045946121216, + "ce_loss_9": 3.4463064789772035, + "epoch": 0.333, + "grad_norm": 992.0, + "kl_loss_13": 123.53824882507324, + "kl_loss_2": 2408.5938720703125, + "kl_loss_4": 1480.855694580078, + "kl_loss_9": 538.6513214111328, + "learning_rate": 0.000759556165793906, + "loss": 1106.4365, + "step": 3330 + }, + { + "ce_loss_13": 3.2815037846565245, + "ce_loss_17": 3.222372794151306, + "ce_loss_2": 4.340499067306519, + "ce_loss_4": 3.882282078266144, + "ce_loss_9": 3.456562674045563, + "epoch": 0.334, + "grad_norm": 896.0, + "kl_loss_13": 121.58780860900879, + "kl_loss_2": 2341.950634765625, + "kl_loss_4": 1441.2859130859374, + "kl_loss_9": 528.8720779418945, + "learning_rate": 0.000758198730819481, + "loss": 1119.0057, + "step": 3340 + }, + { + "ce_loss_13": 3.238608419895172, + "ce_loss_17": 3.180889129638672, + "ce_loss_2": 4.305194532871246, + "ce_loss_4": 3.8447780132293703, + "ce_loss_9": 3.4115110635757446, + "epoch": 0.335, + "grad_norm": 936.0, + "kl_loss_13": 119.46144599914551, + "kl_loss_2": 2366.570104980469, + "kl_loss_4": 1461.37890625, + "kl_loss_9": 518.0635528564453, + "learning_rate": 0.0007568386957867032, + "loss": 1114.3154, + "step": 3350 + }, + { + "ce_loss_13": 3.2940505504608155, + "ce_loss_17": 3.2357301712036133, + "ce_loss_2": 4.335721111297607, + "ce_loss_4": 3.8882152318954466, + "ce_loss_9": 3.4731027603149416, + "epoch": 0.336, + "grad_norm": 1096.0, + "kl_loss_13": 120.3353328704834, + "kl_loss_2": 2300.9728881835936, + "kl_loss_4": 1429.0256774902343, + "kl_loss_9": 527.7066268920898, + "learning_rate": 0.0007554760743911103, + "loss": 1113.5826, + "step": 3360 + }, + { + "ce_loss_13": 3.2137572407722472, + "ce_loss_17": 3.1595774412155153, + "ce_loss_2": 4.2538035869598385, + "ce_loss_4": 3.8056873083114624, + "ce_loss_9": 3.389769423007965, + "epoch": 0.337, + "grad_norm": 1224.0, + "kl_loss_13": 116.9436710357666, + "kl_loss_2": 2322.7385803222655, + "kl_loss_4": 1439.0205383300781, + "kl_loss_9": 510.78955078125, + "learning_rate": 0.0007541108803542846, + "loss": 1125.8256, + "step": 3370 + }, + { + "ce_loss_13": 3.251612687110901, + "ce_loss_17": 3.195606541633606, + "ce_loss_2": 4.306574952602387, + "ce_loss_4": 3.845045638084412, + "ce_loss_9": 3.4235295414924622, + "epoch": 0.338, + "grad_norm": 1064.0, + "kl_loss_13": 119.90342254638672, + "kl_loss_2": 2350.974481201172, + "kl_loss_4": 1440.0969604492188, + "kl_loss_9": 515.7942764282227, + "learning_rate": 0.0007527431274237149, + "loss": 1158.0889, + "step": 3380 + }, + { + "ce_loss_13": 3.229923164844513, + "ce_loss_17": 3.1746895551681518, + "ce_loss_2": 4.270837831497192, + "ce_loss_4": 3.816139316558838, + "ce_loss_9": 3.399337124824524, + "epoch": 0.339, + "grad_norm": 916.0, + "kl_loss_13": 118.58484725952148, + "kl_loss_2": 2314.47021484375, + "kl_loss_4": 1428.8500732421876, + "kl_loss_9": 515.3277740478516, + "learning_rate": 0.0007513728293726579, + "loss": 1103.5739, + "step": 3390 + }, + { + "ce_loss_13": 3.3369848012924193, + "ce_loss_17": 3.2785598993301392, + "ce_loss_2": 4.371959400177002, + "ce_loss_4": 3.922166907787323, + "ce_loss_9": 3.5100149869918824, + "epoch": 0.34, + "grad_norm": 968.0, + "kl_loss_13": 120.84526863098145, + "kl_loss_2": 2303.4162109375, + "kl_loss_4": 1425.5956481933595, + "kl_loss_9": 519.9938262939453, + "learning_rate": 0.00075, + "loss": 1091.0293, + "step": 3400 + }, + { + "ce_loss_13": 3.3281707644462584, + "ce_loss_17": 3.269998776912689, + "ce_loss_2": 4.3841801404953005, + "ce_loss_4": 3.928558886051178, + "ce_loss_9": 3.5081814527511597, + "epoch": 0.341, + "grad_norm": 1072.0, + "kl_loss_13": 123.31075057983398, + "kl_loss_2": 2354.8173828125, + "kl_loss_4": 1457.0600524902343, + "kl_loss_9": 534.3036361694336, + "learning_rate": 0.0007486246531301177, + "loss": 1100.7766, + "step": 3410 + }, + { + "ce_loss_13": 3.1443047404289244, + "ce_loss_17": 3.08646799325943, + "ce_loss_2": 4.208140635490418, + "ce_loss_4": 3.7499518632888793, + "ce_loss_9": 3.320631241798401, + "epoch": 0.342, + "grad_norm": 1040.0, + "kl_loss_13": 119.49196548461914, + "kl_loss_2": 2321.8042907714844, + "kl_loss_4": 1442.9562561035157, + "kl_loss_9": 513.6852355957031, + "learning_rate": 0.0007472468026127384, + "loss": 1085.7555, + "step": 3420 + }, + { + "ce_loss_13": 3.2794711589813232, + "ce_loss_17": 3.2171931505203246, + "ce_loss_2": 4.365655612945557, + "ce_loss_4": 3.898853766918182, + "ce_loss_9": 3.4626105666160583, + "epoch": 0.343, + "grad_norm": 1072.0, + "kl_loss_13": 127.92411308288574, + "kl_loss_2": 2425.393017578125, + "kl_loss_4": 1500.536883544922, + "kl_loss_9": 544.629751586914, + "learning_rate": 0.000745866462322802, + "loss": 1133.7564, + "step": 3430 + }, + { + "ce_loss_13": 3.2591280341148376, + "ce_loss_17": 3.2042083144187927, + "ce_loss_2": 4.29214516878128, + "ce_loss_4": 3.8416125178337097, + "ce_loss_9": 3.4306502461433412, + "epoch": 0.344, + "grad_norm": 1144.0, + "kl_loss_13": 119.0367645263672, + "kl_loss_2": 2273.144140625, + "kl_loss_4": 1396.9806396484375, + "kl_loss_9": 509.9261306762695, + "learning_rate": 0.0007444836461603195, + "loss": 1087.6735, + "step": 3440 + }, + { + "ce_loss_13": 3.321951377391815, + "ce_loss_17": 3.2649898409843443, + "ce_loss_2": 4.376961135864258, + "ce_loss_4": 3.9282360792160036, + "ce_loss_9": 3.5029739856719972, + "epoch": 0.345, + "grad_norm": 1168.0, + "kl_loss_13": 125.95065765380859, + "kl_loss_2": 2364.009930419922, + "kl_loss_4": 1474.0844116210938, + "kl_loss_9": 538.0685333251953, + "learning_rate": 0.0007430983680502344, + "loss": 1127.7188, + "step": 3450 + }, + { + "ce_loss_13": 3.1644829511642456, + "ce_loss_17": 3.109139585494995, + "ce_loss_2": 4.241729295253753, + "ce_loss_4": 3.7845134973526, + "ce_loss_9": 3.347025382518768, + "epoch": 0.346, + "grad_norm": 1368.0, + "kl_loss_13": 119.43791923522949, + "kl_loss_2": 2383.6324951171873, + "kl_loss_4": 1470.8812194824218, + "kl_loss_9": 536.7000152587891, + "learning_rate": 0.0007417106419422819, + "loss": 1119.0673, + "step": 3460 + }, + { + "ce_loss_13": 3.266258716583252, + "ce_loss_17": 3.2094192147254943, + "ce_loss_2": 4.316196548938751, + "ce_loss_4": 3.8623122572898865, + "ce_loss_9": 3.441911768913269, + "epoch": 0.347, + "grad_norm": 1056.0, + "kl_loss_13": 119.1989200592041, + "kl_loss_2": 2293.368475341797, + "kl_loss_4": 1418.2223266601563, + "kl_loss_9": 519.4162979125977, + "learning_rate": 0.0007403204818108486, + "loss": 1104.1301, + "step": 3470 + }, + { + "ce_loss_13": 3.2513155341148376, + "ce_loss_17": 3.1936648726463317, + "ce_loss_2": 4.303562068939209, + "ce_loss_4": 3.8448911786079405, + "ce_loss_9": 3.4258848428726196, + "epoch": 0.348, + "grad_norm": 960.0, + "kl_loss_13": 120.53968238830566, + "kl_loss_2": 2365.9833129882813, + "kl_loss_4": 1455.7239929199218, + "kl_loss_9": 530.2106063842773, + "learning_rate": 0.0007389279016548316, + "loss": 1080.6592, + "step": 3480 + }, + { + "ce_loss_13": 3.2518046855926515, + "ce_loss_17": 3.1916931867599487, + "ce_loss_2": 4.354296565055847, + "ce_loss_4": 3.874348223209381, + "ce_loss_9": 3.438166928291321, + "epoch": 0.349, + "grad_norm": 952.0, + "kl_loss_13": 123.02416687011718, + "kl_loss_2": 2429.16767578125, + "kl_loss_4": 1486.4276550292968, + "kl_loss_9": 538.3228988647461, + "learning_rate": 0.0007375329154974975, + "loss": 1127.644, + "step": 3490 + }, + { + "ce_loss_13": 3.2176830172538757, + "ce_loss_17": 3.161431574821472, + "ce_loss_2": 4.25177253484726, + "ce_loss_4": 3.795971405506134, + "ce_loss_9": 3.3863195180892944, + "epoch": 0.35, + "grad_norm": 1000.0, + "kl_loss_13": 121.72834739685058, + "kl_loss_2": 2295.3088439941407, + "kl_loss_4": 1416.5606079101562, + "kl_loss_9": 511.3838180541992, + "learning_rate": 0.0007361355373863414, + "loss": 1115.1531, + "step": 3500 + }, + { + "ce_loss_13": 3.2595064520835875, + "ce_loss_17": 3.2038070797920226, + "ce_loss_2": 4.295797657966614, + "ce_loss_4": 3.8433785438537598, + "ce_loss_9": 3.4298847317695618, + "epoch": 0.351, + "grad_norm": 960.0, + "kl_loss_13": 118.73634490966796, + "kl_loss_2": 2274.307568359375, + "kl_loss_4": 1391.078955078125, + "kl_loss_9": 504.7493423461914, + "learning_rate": 0.0007347357813929454, + "loss": 1108.4901, + "step": 3510 + }, + { + "ce_loss_13": 3.2150736331939695, + "ce_loss_17": 3.155968129634857, + "ce_loss_2": 4.241057026386261, + "ce_loss_4": 3.789379394054413, + "ce_loss_9": 3.377866244316101, + "epoch": 0.352, + "grad_norm": 1376.0, + "kl_loss_13": 118.2562816619873, + "kl_loss_2": 2264.3343872070313, + "kl_loss_4": 1390.2927185058593, + "kl_loss_9": 502.57317810058595, + "learning_rate": 0.0007333336616128369, + "loss": 1101.7781, + "step": 3520 + }, + { + "ce_loss_13": 3.186604917049408, + "ce_loss_17": 3.127040076255798, + "ce_loss_2": 4.26804370880127, + "ce_loss_4": 3.799088513851166, + "ce_loss_9": 3.366344165802002, + "epoch": 0.353, + "grad_norm": 1128.0, + "kl_loss_13": 122.21125411987305, + "kl_loss_2": 2367.472027587891, + "kl_loss_4": 1451.2210388183594, + "kl_loss_9": 527.5463684082031, + "learning_rate": 0.0007319291921653463, + "loss": 1112.7199, + "step": 3530 + }, + { + "ce_loss_13": 3.2716238379478453, + "ce_loss_17": 3.212361776828766, + "ce_loss_2": 4.336447095870971, + "ce_loss_4": 3.871511459350586, + "ce_loss_9": 3.4486935257911684, + "epoch": 0.354, + "grad_norm": 1064.0, + "kl_loss_13": 124.2909610748291, + "kl_loss_2": 2356.5185791015624, + "kl_loss_4": 1448.2679321289063, + "kl_loss_9": 529.4688507080078, + "learning_rate": 0.0007305223871934656, + "loss": 1093.6241, + "step": 3540 + }, + { + "ce_loss_13": 3.2383341431617736, + "ce_loss_17": 3.1798993587493896, + "ce_loss_2": 4.282925808429718, + "ce_loss_4": 3.828378677368164, + "ce_loss_9": 3.4098609566688536, + "epoch": 0.355, + "grad_norm": 1160.0, + "kl_loss_13": 120.9645767211914, + "kl_loss_2": 2301.982666015625, + "kl_loss_4": 1412.4159057617187, + "kl_loss_9": 512.0778991699219, + "learning_rate": 0.0007291132608637052, + "loss": 1096.1561, + "step": 3550 + }, + { + "ce_loss_13": 3.2011759638786317, + "ce_loss_17": 3.146040689945221, + "ce_loss_2": 4.323493075370789, + "ce_loss_4": 3.84235223531723, + "ce_loss_9": 3.37730211019516, + "epoch": 0.356, + "grad_norm": 984.0, + "kl_loss_13": 118.92344245910644, + "kl_loss_2": 2464.5512634277343, + "kl_loss_4": 1518.4718688964845, + "kl_loss_9": 515.1716766357422, + "learning_rate": 0.0007277018273659516, + "loss": 1142.6762, + "step": 3560 + }, + { + "ce_loss_13": 3.3223732113838196, + "ce_loss_17": 3.2644725322723387, + "ce_loss_2": 4.370389211177826, + "ce_loss_4": 3.9286834001541138, + "ce_loss_9": 3.5057213306427, + "epoch": 0.357, + "grad_norm": 1040.0, + "kl_loss_13": 124.14102096557617, + "kl_loss_2": 2336.9487915039062, + "kl_loss_4": 1465.2128784179688, + "kl_loss_9": 535.8851776123047, + "learning_rate": 0.0007262881009133242, + "loss": 1107.7307, + "step": 3570 + }, + { + "ce_loss_13": 3.246894121170044, + "ce_loss_17": 3.190457808971405, + "ce_loss_2": 4.286640787124634, + "ce_loss_4": 3.8347811460494996, + "ce_loss_9": 3.418200898170471, + "epoch": 0.358, + "grad_norm": 1032.0, + "kl_loss_13": 118.03323860168457, + "kl_loss_2": 2299.463232421875, + "kl_loss_4": 1414.3035888671875, + "kl_loss_9": 508.04096069335935, + "learning_rate": 0.0007248720957420329, + "loss": 1081.6021, + "step": 3580 + }, + { + "ce_loss_13": 3.242346262931824, + "ce_loss_17": 3.1887784600257874, + "ce_loss_2": 4.280523240566254, + "ce_loss_4": 3.8216065883636476, + "ce_loss_9": 3.410325264930725, + "epoch": 0.359, + "grad_norm": 1020.0, + "kl_loss_13": 118.04314804077148, + "kl_loss_2": 2282.009143066406, + "kl_loss_4": 1386.5883483886719, + "kl_loss_9": 506.6142013549805, + "learning_rate": 0.0007234538261112341, + "loss": 1107.6966, + "step": 3590 + }, + { + "ce_loss_13": 3.2862833976745605, + "ce_loss_17": 3.2279277563095095, + "ce_loss_2": 4.34386819601059, + "ce_loss_4": 3.880169320106506, + "ce_loss_9": 3.456702542304993, + "epoch": 0.36, + "grad_norm": 1192.0, + "kl_loss_13": 119.83821144104004, + "kl_loss_2": 2346.4058227539062, + "kl_loss_4": 1438.1818603515626, + "kl_loss_9": 520.6408508300781, + "learning_rate": 0.0007220333063028871, + "loss": 1093.3729, + "step": 3600 + }, + { + "ce_loss_13": 3.3232455968856813, + "ce_loss_17": 3.264686441421509, + "ce_loss_2": 4.418456983566284, + "ce_loss_4": 3.947876346111298, + "ce_loss_9": 3.4988825917243958, + "epoch": 0.361, + "grad_norm": 1024.0, + "kl_loss_13": 122.48561820983886, + "kl_loss_2": 2440.8555419921877, + "kl_loss_4": 1514.877392578125, + "kl_loss_9": 529.0708618164062, + "learning_rate": 0.0007206105506216106, + "loss": 1137.4742, + "step": 3610 + }, + { + "ce_loss_13": 3.1973274827003477, + "ce_loss_17": 3.142447865009308, + "ce_loss_2": 4.232101929187775, + "ce_loss_4": 3.7837777018547056, + "ce_loss_9": 3.3675485372543337, + "epoch": 0.362, + "grad_norm": 1040.0, + "kl_loss_13": 115.83602752685547, + "kl_loss_2": 2267.7968688964843, + "kl_loss_4": 1403.0463562011719, + "kl_loss_9": 501.7510787963867, + "learning_rate": 0.0007191855733945387, + "loss": 1067.9951, + "step": 3620 + }, + { + "ce_loss_13": 3.2900184869766234, + "ce_loss_17": 3.2331868052482604, + "ce_loss_2": 4.330365610122681, + "ce_loss_4": 3.875192940235138, + "ce_loss_9": 3.459042763710022, + "epoch": 0.363, + "grad_norm": 1064.0, + "kl_loss_13": 117.36633644104003, + "kl_loss_2": 2303.5316955566404, + "kl_loss_4": 1414.277587890625, + "kl_loss_9": 510.66974182128905, + "learning_rate": 0.0007177583889711762, + "loss": 1082.5898, + "step": 3630 + }, + { + "ce_loss_13": 3.2053954243659972, + "ce_loss_17": 3.1482120394706725, + "ce_loss_2": 4.264966559410095, + "ce_loss_4": 3.8012089490890504, + "ce_loss_9": 3.3806539297103884, + "epoch": 0.364, + "grad_norm": 1032.0, + "kl_loss_13": 118.35905075073242, + "kl_loss_2": 2340.270648193359, + "kl_loss_4": 1435.3213928222656, + "kl_loss_9": 519.4613372802735, + "learning_rate": 0.0007163290117232541, + "loss": 1100.2158, + "step": 3640 + }, + { + "ce_loss_13": 3.3174272060394285, + "ce_loss_17": 3.262504816055298, + "ce_loss_2": 4.317688310146332, + "ce_loss_4": 3.8793020248413086, + "ce_loss_9": 3.481636679172516, + "epoch": 0.365, + "grad_norm": 1096.0, + "kl_loss_13": 115.80275955200196, + "kl_loss_2": 2253.604669189453, + "kl_loss_4": 1377.8312255859375, + "kl_loss_9": 498.6133544921875, + "learning_rate": 0.0007148974560445859, + "loss": 1076.6742, + "step": 3650 + }, + { + "ce_loss_13": 3.2483646273612976, + "ce_loss_17": 3.1922641158103944, + "ce_loss_2": 4.255635070800781, + "ce_loss_4": 3.8187443375587464, + "ce_loss_9": 3.4140630960464478, + "epoch": 0.366, + "grad_norm": 1232.0, + "kl_loss_13": 116.36379852294922, + "kl_loss_2": 2238.6817321777344, + "kl_loss_4": 1393.1689453125, + "kl_loss_9": 505.644091796875, + "learning_rate": 0.0007134637363509209, + "loss": 1064.5636, + "step": 3660 + }, + { + "ce_loss_13": 3.3533724665641786, + "ce_loss_17": 3.299256956577301, + "ce_loss_2": 4.358953952789307, + "ce_loss_4": 3.918874216079712, + "ce_loss_9": 3.51888427734375, + "epoch": 0.367, + "grad_norm": 1040.0, + "kl_loss_13": 115.0433910369873, + "kl_loss_2": 2220.7624450683593, + "kl_loss_4": 1371.2409790039062, + "kl_loss_9": 499.35091094970704, + "learning_rate": 0.0007120278670798009, + "loss": 1078.0957, + "step": 3670 + }, + { + "ce_loss_13": 3.1595748543739317, + "ce_loss_17": 3.101496493816376, + "ce_loss_2": 4.2694832444190975, + "ce_loss_4": 3.795147728919983, + "ce_loss_9": 3.345221209526062, + "epoch": 0.368, + "grad_norm": 1352.0, + "kl_loss_13": 120.38405227661133, + "kl_loss_2": 2451.781280517578, + "kl_loss_4": 1509.0809997558595, + "kl_loss_9": 538.7034454345703, + "learning_rate": 0.0007105898626904133, + "loss": 1149.4721, + "step": 3680 + }, + { + "ce_loss_13": 3.256004345417023, + "ce_loss_17": 3.198394501209259, + "ce_loss_2": 4.312865948677063, + "ce_loss_4": 3.8444956302642823, + "ce_loss_9": 3.422794210910797, + "epoch": 0.369, + "grad_norm": 1128.0, + "kl_loss_13": 117.06986694335937, + "kl_loss_2": 2316.063824462891, + "kl_loss_4": 1412.9476623535156, + "kl_loss_9": 506.39573364257814, + "learning_rate": 0.0007091497376634463, + "loss": 1079.3659, + "step": 3690 + }, + { + "ce_loss_13": 3.203166663646698, + "ce_loss_17": 3.1482756972312926, + "ce_loss_2": 4.236350440979004, + "ce_loss_4": 3.786738908290863, + "ce_loss_9": 3.3716963291168214, + "epoch": 0.37, + "grad_norm": 1296.0, + "kl_loss_13": 119.29188232421875, + "kl_loss_2": 2294.298455810547, + "kl_loss_4": 1407.3860595703125, + "kl_loss_9": 505.8227905273437, + "learning_rate": 0.0007077075065009433, + "loss": 1099.6969, + "step": 3700 + }, + { + "ce_loss_13": 3.3031542897224426, + "ce_loss_17": 3.2442779779434203, + "ce_loss_2": 4.3587446212768555, + "ce_loss_4": 3.906403601169586, + "ce_loss_9": 3.479103207588196, + "epoch": 0.371, + "grad_norm": 1048.0, + "kl_loss_13": 123.7761360168457, + "kl_loss_2": 2332.160107421875, + "kl_loss_4": 1442.7122131347655, + "kl_loss_9": 521.059553527832, + "learning_rate": 0.0007062631837261557, + "loss": 1097.5053, + "step": 3710 + }, + { + "ce_loss_13": 3.1829145431518553, + "ce_loss_17": 3.1262371301651, + "ce_loss_2": 4.2195816278457645, + "ce_loss_4": 3.7679691195487974, + "ce_loss_9": 3.348852050304413, + "epoch": 0.372, + "grad_norm": 1216.0, + "kl_loss_13": 117.60966072082519, + "kl_loss_2": 2310.739880371094, + "kl_loss_4": 1421.925067138672, + "kl_loss_9": 511.9546661376953, + "learning_rate": 0.0007048167838833977, + "loss": 1114.1547, + "step": 3720 + }, + { + "ce_loss_13": 3.274474358558655, + "ce_loss_17": 3.214981162548065, + "ce_loss_2": 4.29020231962204, + "ce_loss_4": 3.8427693247795105, + "ce_loss_9": 3.4398711562156676, + "epoch": 0.373, + "grad_norm": 1048.0, + "kl_loss_13": 121.34806442260742, + "kl_loss_2": 2286.2930419921877, + "kl_loss_4": 1392.8391967773437, + "kl_loss_9": 508.8039886474609, + "learning_rate": 0.0007033683215379002, + "loss": 1078.0963, + "step": 3730 + }, + { + "ce_loss_13": 3.2551639437675477, + "ce_loss_17": 3.2005463004112245, + "ce_loss_2": 4.303338527679443, + "ce_loss_4": 3.840170180797577, + "ce_loss_9": 3.433464467525482, + "epoch": 0.374, + "grad_norm": 1200.0, + "kl_loss_13": 117.97501716613769, + "kl_loss_2": 2294.1265563964844, + "kl_loss_4": 1397.4256408691406, + "kl_loss_9": 515.2082077026367, + "learning_rate": 0.0007019178112756625, + "loss": 1095.4021, + "step": 3740 + }, + { + "ce_loss_13": 3.2217029452323915, + "ce_loss_17": 3.1650134205818174, + "ce_loss_2": 4.25470415353775, + "ce_loss_4": 3.8069305181503297, + "ce_loss_9": 3.3860450387001038, + "epoch": 0.375, + "grad_norm": 1032.0, + "kl_loss_13": 116.44257850646973, + "kl_loss_2": 2281.860736083984, + "kl_loss_4": 1401.6822204589844, + "kl_loss_9": 506.30464935302734, + "learning_rate": 0.0007004652677033068, + "loss": 1089.0237, + "step": 3750 + }, + { + "ce_loss_13": 3.2920990109443666, + "ce_loss_17": 3.241587746143341, + "ce_loss_2": 4.306069231033325, + "ce_loss_4": 3.8558060050010683, + "ce_loss_9": 3.4568776726722716, + "epoch": 0.376, + "grad_norm": 1088.0, + "kl_loss_13": 115.13447761535645, + "kl_loss_2": 2243.1771728515623, + "kl_loss_4": 1365.0895629882812, + "kl_loss_9": 493.81934509277346, + "learning_rate": 0.0006990107054479312, + "loss": 1072.3539, + "step": 3760 + }, + { + "ce_loss_13": 3.279723334312439, + "ce_loss_17": 3.220236301422119, + "ce_loss_2": 4.2950098514556885, + "ce_loss_4": 3.862891411781311, + "ce_loss_9": 3.4497161984443663, + "epoch": 0.377, + "grad_norm": 1020.0, + "kl_loss_13": 119.52506408691406, + "kl_loss_2": 2255.395788574219, + "kl_loss_4": 1405.876446533203, + "kl_loss_9": 510.06729888916016, + "learning_rate": 0.000697554139156961, + "loss": 1082.4869, + "step": 3770 + }, + { + "ce_loss_13": 3.267723226547241, + "ce_loss_17": 3.211534094810486, + "ce_loss_2": 4.31499674320221, + "ce_loss_4": 3.849847364425659, + "ce_loss_9": 3.4379292011260985, + "epoch": 0.378, + "grad_norm": 1152.0, + "kl_loss_13": 120.02754135131836, + "kl_loss_2": 2327.1294677734377, + "kl_loss_4": 1411.6473876953125, + "kl_loss_9": 517.193717956543, + "learning_rate": 0.0006960955834980027, + "loss": 1072.0203, + "step": 3780 + }, + { + "ce_loss_13": 3.2363647699356077, + "ce_loss_17": 3.181026244163513, + "ce_loss_2": 4.2651455163955685, + "ce_loss_4": 3.812887728214264, + "ce_loss_9": 3.4075511574745176, + "epoch": 0.379, + "grad_norm": 1504.0, + "kl_loss_13": 118.18362007141113, + "kl_loss_2": 2265.8506286621096, + "kl_loss_4": 1392.2117309570312, + "kl_loss_9": 509.61439819335936, + "learning_rate": 0.0006946350531586958, + "loss": 1080.6469, + "step": 3790 + }, + { + "ce_loss_13": 3.2648179173469543, + "ce_loss_17": 3.208346092700958, + "ce_loss_2": 4.296475219726562, + "ce_loss_4": 3.8459570169448853, + "ce_loss_9": 3.4356889367103576, + "epoch": 0.38, + "grad_norm": 1032.0, + "kl_loss_13": 116.07862281799316, + "kl_loss_2": 2294.108856201172, + "kl_loss_4": 1410.9663452148438, + "kl_loss_9": 512.0185394287109, + "learning_rate": 0.0006931725628465643, + "loss": 1104.2301, + "step": 3800 + }, + { + "ce_loss_13": 3.2749621868133545, + "ce_loss_17": 3.2189390063285828, + "ce_loss_2": 4.3122030854225155, + "ce_loss_4": 3.86287761926651, + "ce_loss_9": 3.4460429906845094, + "epoch": 0.381, + "grad_norm": 1064.0, + "kl_loss_13": 117.54122619628906, + "kl_loss_2": 2284.8057495117187, + "kl_loss_4": 1403.21865234375, + "kl_loss_9": 508.0739349365234, + "learning_rate": 0.0006917081272888696, + "loss": 1085.2492, + "step": 3810 + }, + { + "ce_loss_13": 3.1899675726890564, + "ce_loss_17": 3.133307635784149, + "ce_loss_2": 4.270281529426574, + "ce_loss_4": 3.8289338827133177, + "ce_loss_9": 3.3582560300827025, + "epoch": 0.382, + "grad_norm": 1168.0, + "kl_loss_13": 120.47982025146484, + "kl_loss_2": 2387.8220153808593, + "kl_loss_4": 1524.6073974609376, + "kl_loss_9": 515.2564239501953, + "learning_rate": 0.0006902417612324615, + "loss": 1101.9082, + "step": 3820 + }, + { + "ce_loss_13": 3.3163021087646483, + "ce_loss_17": 3.253460872173309, + "ce_loss_2": 4.3879059791564945, + "ce_loss_4": 3.9241119265556335, + "ce_loss_9": 3.4952972769737243, + "epoch": 0.383, + "grad_norm": 1240.0, + "kl_loss_13": 123.8676040649414, + "kl_loss_2": 2366.6651000976562, + "kl_loss_4": 1456.1027709960938, + "kl_loss_9": 529.3296096801757, + "learning_rate": 0.00068877347944363, + "loss": 1107.0277, + "step": 3830 + }, + { + "ce_loss_13": 3.3118074417114256, + "ce_loss_17": 3.2544734358787535, + "ce_loss_2": 4.324665665626526, + "ce_loss_4": 3.874320888519287, + "ce_loss_9": 3.475998568534851, + "epoch": 0.384, + "grad_norm": 1000.0, + "kl_loss_13": 121.64886360168457, + "kl_loss_2": 2258.939874267578, + "kl_loss_4": 1383.7299072265625, + "kl_loss_9": 504.94022369384766, + "learning_rate": 0.0006873032967079561, + "loss": 1090.5457, + "step": 3840 + }, + { + "ce_loss_13": 3.294573223590851, + "ce_loss_17": 3.243241775035858, + "ce_loss_2": 4.288003921508789, + "ce_loss_4": 3.85051885843277, + "ce_loss_9": 3.4562297821044923, + "epoch": 0.385, + "grad_norm": 1064.0, + "kl_loss_13": 117.66870727539063, + "kl_loss_2": 2226.2527282714846, + "kl_loss_4": 1369.209228515625, + "kl_loss_9": 496.37988128662107, + "learning_rate": 0.0006858312278301637, + "loss": 1058.9362, + "step": 3850 + }, + { + "ce_loss_13": 3.33297883272171, + "ce_loss_17": 3.2776344895362852, + "ce_loss_2": 4.321318292617798, + "ce_loss_4": 3.8812865734100344, + "ce_loss_9": 3.4917379021644592, + "epoch": 0.386, + "grad_norm": 1152.0, + "kl_loss_13": 119.04434738159179, + "kl_loss_2": 2224.3666931152343, + "kl_loss_4": 1368.8128601074218, + "kl_loss_9": 498.3236541748047, + "learning_rate": 0.0006843572876339704, + "loss": 1061.1072, + "step": 3860 + }, + { + "ce_loss_13": 3.247132360935211, + "ce_loss_17": 3.1932952880859373, + "ce_loss_2": 4.2274780631065365, + "ce_loss_4": 3.8059142231941223, + "ce_loss_9": 3.4110722303390504, + "epoch": 0.387, + "grad_norm": 1344.0, + "kl_loss_13": 113.06982345581055, + "kl_loss_2": 2178.4062561035157, + "kl_loss_4": 1340.2969116210938, + "kl_loss_9": 490.1000061035156, + "learning_rate": 0.0006828814909619373, + "loss": 1081.9537, + "step": 3870 + }, + { + "ce_loss_13": 3.370439875125885, + "ce_loss_17": 3.3122829675674437, + "ce_loss_2": 4.387281847000122, + "ce_loss_4": 3.9361196756362915, + "ce_loss_9": 3.538049840927124, + "epoch": 0.388, + "grad_norm": 888.0, + "kl_loss_13": 119.49054832458496, + "kl_loss_2": 2242.659228515625, + "kl_loss_4": 1372.8875793457032, + "kl_loss_9": 504.6261413574219, + "learning_rate": 0.0006814038526753205, + "loss": 1053.5878, + "step": 3880 + }, + { + "ce_loss_13": 3.2711361408233643, + "ce_loss_17": 3.2170068502426146, + "ce_loss_2": 4.288709592819214, + "ce_loss_4": 3.841650700569153, + "ce_loss_9": 3.4438189268112183, + "epoch": 0.389, + "grad_norm": 1096.0, + "kl_loss_13": 116.80678749084473, + "kl_loss_2": 2242.888543701172, + "kl_loss_4": 1373.6038513183594, + "kl_loss_9": 505.67105255126955, + "learning_rate": 0.0006799243876539213, + "loss": 1069.6137, + "step": 3890 + }, + { + "ce_loss_13": 3.1957311391830445, + "ce_loss_17": 3.140287661552429, + "ce_loss_2": 4.256408321857452, + "ce_loss_4": 3.78494508266449, + "ce_loss_9": 3.360948419570923, + "epoch": 0.39, + "grad_norm": 1216.0, + "kl_loss_13": 115.51587715148926, + "kl_loss_2": 2332.0825317382814, + "kl_loss_4": 1410.7244689941406, + "kl_loss_9": 504.37947540283204, + "learning_rate": 0.0006784431107959359, + "loss": 1093.4369, + "step": 3900 + }, + { + "ce_loss_13": 3.25591983795166, + "ce_loss_17": 3.1991868376731873, + "ce_loss_2": 4.323431754112244, + "ce_loss_4": 3.8537954926490783, + "ce_loss_9": 3.4328837156295777, + "epoch": 0.391, + "grad_norm": 1328.0, + "kl_loss_13": 118.43140678405761, + "kl_loss_2": 2364.4612426757812, + "kl_loss_4": 1438.1888061523437, + "kl_loss_9": 520.2050048828125, + "learning_rate": 0.0006769600370178059, + "loss": 1094.4535, + "step": 3910 + }, + { + "ce_loss_13": 3.2183175444602967, + "ce_loss_17": 3.1603541731834413, + "ce_loss_2": 4.256223905086517, + "ce_loss_4": 3.8103991031646727, + "ce_loss_9": 3.391024374961853, + "epoch": 0.392, + "grad_norm": 1048.0, + "kl_loss_13": 114.40931663513183, + "kl_loss_2": 2279.636865234375, + "kl_loss_4": 1410.5021606445312, + "kl_loss_9": 508.0836639404297, + "learning_rate": 0.0006754751812540679, + "loss": 1059.5244, + "step": 3920 + }, + { + "ce_loss_13": 3.270332455635071, + "ce_loss_17": 3.214082932472229, + "ce_loss_2": 4.3168336868286135, + "ce_loss_4": 3.8525742650032044, + "ce_loss_9": 3.441233456134796, + "epoch": 0.393, + "grad_norm": 1232.0, + "kl_loss_13": 118.80502738952637, + "kl_loss_2": 2327.346856689453, + "kl_loss_4": 1413.3339416503907, + "kl_loss_9": 513.6572372436524, + "learning_rate": 0.0006739885584572025, + "loss": 1094.8298, + "step": 3930 + }, + { + "ce_loss_13": 3.292375385761261, + "ce_loss_17": 3.2345564126968385, + "ce_loss_2": 4.351148295402527, + "ce_loss_4": 3.8803937673568725, + "ce_loss_9": 3.460458052158356, + "epoch": 0.394, + "grad_norm": 1192.0, + "kl_loss_13": 119.72496643066407, + "kl_loss_2": 2375.7529541015624, + "kl_loss_4": 1440.0219787597657, + "kl_loss_9": 516.5012908935547, + "learning_rate": 0.0006725001835974853, + "loss": 1086.5627, + "step": 3940 + }, + { + "ce_loss_13": 3.2847928404808044, + "ce_loss_17": 3.2268094420433044, + "ce_loss_2": 4.335441470146179, + "ce_loss_4": 3.871453082561493, + "ce_loss_9": 3.458202826976776, + "epoch": 0.395, + "grad_norm": 1216.0, + "kl_loss_13": 120.23314361572265, + "kl_loss_2": 2335.919299316406, + "kl_loss_4": 1420.2667175292968, + "kl_loss_9": 515.5993301391602, + "learning_rate": 0.0006710100716628344, + "loss": 1071.4093, + "step": 3950 + }, + { + "ce_loss_13": 3.2668710470199587, + "ce_loss_17": 3.2122414231300356, + "ce_loss_2": 4.300667762756348, + "ce_loss_4": 3.861303722858429, + "ce_loss_9": 3.4400934100151064, + "epoch": 0.396, + "grad_norm": 1088.0, + "kl_loss_13": 115.07725105285644, + "kl_loss_2": 2282.9452880859376, + "kl_loss_4": 1414.5107421875, + "kl_loss_9": 506.8136215209961, + "learning_rate": 0.0006695182376586602, + "loss": 1088.4627, + "step": 3960 + }, + { + "ce_loss_13": 3.299285662174225, + "ce_loss_17": 3.2437246084213256, + "ce_loss_2": 4.281932473182678, + "ce_loss_4": 3.8505242466926575, + "ce_loss_9": 3.4604560256004335, + "epoch": 0.397, + "grad_norm": 1004.0, + "kl_loss_13": 111.20931587219238, + "kl_loss_2": 2170.6823486328126, + "kl_loss_4": 1329.9519775390625, + "kl_loss_9": 482.34302673339846, + "learning_rate": 0.000668024696607715, + "loss": 1075.5578, + "step": 3970 + }, + { + "ce_loss_13": 3.257555305957794, + "ce_loss_17": 3.2037429809570312, + "ce_loss_2": 4.276872074604034, + "ce_loss_4": 3.8287230014801024, + "ce_loss_9": 3.420667016506195, + "epoch": 0.398, + "grad_norm": 940.0, + "kl_loss_13": 114.30792846679688, + "kl_loss_2": 2275.347900390625, + "kl_loss_4": 1395.9261352539063, + "kl_loss_9": 501.5082214355469, + "learning_rate": 0.0006665294635499404, + "loss": 1074.6923, + "step": 3980 + }, + { + "ce_loss_13": 3.265869653224945, + "ce_loss_17": 3.207436966896057, + "ce_loss_2": 4.337453627586365, + "ce_loss_4": 3.8671420335769655, + "ce_loss_9": 3.440145933628082, + "epoch": 0.399, + "grad_norm": 1232.0, + "kl_loss_13": 122.17486228942872, + "kl_loss_2": 2389.4196655273436, + "kl_loss_4": 1459.04248046875, + "kl_loss_9": 521.9432968139648, + "learning_rate": 0.0006650325535423167, + "loss": 1096.3436, + "step": 3990 + }, + { + "ce_loss_13": 3.2868151664733887, + "ce_loss_17": 3.233126497268677, + "ce_loss_2": 4.267296648025512, + "ce_loss_4": 3.846002721786499, + "ce_loss_9": 3.4503178358078004, + "epoch": 0.4, + "grad_norm": 984.0, + "kl_loss_13": 114.97598381042481, + "kl_loss_2": 2157.879669189453, + "kl_loss_4": 1333.1712707519532, + "kl_loss_9": 488.3334106445312, + "learning_rate": 0.0006635339816587109, + "loss": 1057.3236, + "step": 4000 + }, + { + "ce_loss_13": 3.2253784060478212, + "ce_loss_17": 3.1667786478996276, + "ce_loss_2": 4.279755222797394, + "ce_loss_4": 3.8059728860855104, + "ce_loss_9": 3.389483153820038, + "epoch": 0.401, + "grad_norm": 1024.0, + "kl_loss_13": 119.9583179473877, + "kl_loss_2": 2349.151300048828, + "kl_loss_4": 1421.7871398925781, + "kl_loss_9": 516.742317199707, + "learning_rate": 0.0006620337629897252, + "loss": 1083.5251, + "step": 4010 + }, + { + "ce_loss_13": 3.231654965877533, + "ce_loss_17": 3.1765261054039002, + "ce_loss_2": 4.2715056300163265, + "ce_loss_4": 3.8134943723678587, + "ce_loss_9": 3.4055351495742796, + "epoch": 0.402, + "grad_norm": 1040.0, + "kl_loss_13": 118.40730514526368, + "kl_loss_2": 2304.9548156738283, + "kl_loss_4": 1405.474853515625, + "kl_loss_9": 516.0563415527344, + "learning_rate": 0.0006605319126425454, + "loss": 1103.0711, + "step": 4020 + }, + { + "ce_loss_13": 3.144042181968689, + "ce_loss_17": 3.088515877723694, + "ce_loss_2": 4.206588113307953, + "ce_loss_4": 3.7421472311019897, + "ce_loss_9": 3.3153361558914183, + "epoch": 0.403, + "grad_norm": 1088.0, + "kl_loss_13": 116.6795555114746, + "kl_loss_2": 2366.950927734375, + "kl_loss_4": 1444.5437866210937, + "kl_loss_9": 514.3038314819336, + "learning_rate": 0.0006590284457407876, + "loss": 1098.408, + "step": 4030 + }, + { + "ce_loss_13": 3.2344555974006655, + "ce_loss_17": 3.1791993260383604, + "ce_loss_2": 4.271684312820435, + "ce_loss_4": 3.8213364243507386, + "ce_loss_9": 3.4080180764198302, + "epoch": 0.404, + "grad_norm": 1104.0, + "kl_loss_13": 116.74624099731446, + "kl_loss_2": 2284.5002380371093, + "kl_loss_4": 1394.9824829101562, + "kl_loss_9": 507.4809600830078, + "learning_rate": 0.0006575233774243465, + "loss": 1077.2402, + "step": 4040 + }, + { + "ce_loss_13": 3.230600190162659, + "ce_loss_17": 3.1754475712776182, + "ce_loss_2": 4.27406120300293, + "ce_loss_4": 3.8186389923095705, + "ce_loss_9": 3.3993021130561827, + "epoch": 0.405, + "grad_norm": 1096.0, + "kl_loss_13": 116.06656608581542, + "kl_loss_2": 2327.6747619628904, + "kl_loss_4": 1419.9807250976562, + "kl_loss_9": 511.68627166748047, + "learning_rate": 0.0006560167228492435, + "loss": 1087.238, + "step": 4050 + }, + { + "ce_loss_13": 3.2770464420318604, + "ce_loss_17": 3.224709188938141, + "ce_loss_2": 4.27225251197815, + "ce_loss_4": 3.8360234975814818, + "ce_loss_9": 3.44199081659317, + "epoch": 0.406, + "grad_norm": 984.0, + "kl_loss_13": 111.89021492004395, + "kl_loss_2": 2213.985980224609, + "kl_loss_4": 1357.6662109375, + "kl_loss_9": 495.77219848632814, + "learning_rate": 0.0006545084971874737, + "loss": 1072.8574, + "step": 4060 + }, + { + "ce_loss_13": 3.241572308540344, + "ce_loss_17": 3.182253873348236, + "ce_loss_2": 4.30549190044403, + "ce_loss_4": 3.8312055826187135, + "ce_loss_9": 3.4148097395896913, + "epoch": 0.407, + "grad_norm": 1232.0, + "kl_loss_13": 118.28398208618164, + "kl_loss_2": 2364.72275390625, + "kl_loss_4": 1427.7308227539063, + "kl_loss_9": 524.9188873291016, + "learning_rate": 0.0006529987156268526, + "loss": 1082.0934, + "step": 4070 + }, + { + "ce_loss_13": 3.154082989692688, + "ce_loss_17": 3.097145104408264, + "ce_loss_2": 4.214341485500336, + "ce_loss_4": 3.7496459007263185, + "ce_loss_9": 3.3264078974723814, + "epoch": 0.408, + "grad_norm": 896.0, + "kl_loss_13": 116.09186363220215, + "kl_loss_2": 2316.980822753906, + "kl_loss_4": 1419.9923217773437, + "kl_loss_9": 506.57225799560547, + "learning_rate": 0.0006514873933708637, + "loss": 1109.8646, + "step": 4080 + }, + { + "ce_loss_13": 3.269277000427246, + "ce_loss_17": 3.2127954006195067, + "ce_loss_2": 4.296722364425659, + "ce_loss_4": 3.8517523288726805, + "ce_loss_9": 3.4366657614707945, + "epoch": 0.409, + "grad_norm": 1120.0, + "kl_loss_13": 114.25712738037109, + "kl_loss_2": 2275.217352294922, + "kl_loss_4": 1396.4087524414062, + "kl_loss_9": 498.3484573364258, + "learning_rate": 0.0006499745456385053, + "loss": 1069.315, + "step": 4090 + }, + { + "ce_loss_13": 3.2353057265281677, + "ce_loss_17": 3.179414117336273, + "ce_loss_2": 4.272877788543701, + "ce_loss_4": 3.8157183051109316, + "ce_loss_9": 3.4051207184791563, + "epoch": 0.41, + "grad_norm": 1088.0, + "kl_loss_13": 115.13732872009277, + "kl_loss_2": 2273.2440795898438, + "kl_loss_4": 1391.163067626953, + "kl_loss_9": 503.09985198974607, + "learning_rate": 0.0006484601876641375, + "loss": 1084.3053, + "step": 4100 + }, + { + "ce_loss_13": 3.222550868988037, + "ce_loss_17": 3.1711755990982056, + "ce_loss_2": 4.215626072883606, + "ce_loss_4": 3.7862999200820924, + "ce_loss_9": 3.3787871956825257, + "epoch": 0.411, + "grad_norm": 996.0, + "kl_loss_13": 113.18559074401855, + "kl_loss_2": 2214.3840148925783, + "kl_loss_4": 1368.5880187988282, + "kl_loss_9": 489.7045639038086, + "learning_rate": 0.000646944334697328, + "loss": 1056.8592, + "step": 4110 + }, + { + "ce_loss_13": 3.332657074928284, + "ce_loss_17": 3.2808632493019103, + "ce_loss_2": 4.3141814231872555, + "ce_loss_4": 3.8839778661727906, + "ce_loss_9": 3.494704532623291, + "epoch": 0.412, + "grad_norm": 900.0, + "kl_loss_13": 113.22249984741211, + "kl_loss_2": 2172.733612060547, + "kl_loss_4": 1334.6379455566407, + "kl_loss_9": 489.5256011962891, + "learning_rate": 0.0006454270020026995, + "loss": 1037.7847, + "step": 4120 + }, + { + "ce_loss_13": 3.3000792622566224, + "ce_loss_17": 3.2495296359062196, + "ce_loss_2": 4.278862857818604, + "ce_loss_4": 3.852785313129425, + "ce_loss_9": 3.463261294364929, + "epoch": 0.413, + "grad_norm": 1000.0, + "kl_loss_13": 109.71647987365722, + "kl_loss_2": 2165.2211669921876, + "kl_loss_4": 1329.13447265625, + "kl_loss_9": 483.05108642578125, + "learning_rate": 0.0006439082048597755, + "loss": 1033.6895, + "step": 4130 + }, + { + "ce_loss_13": 3.2904861688613893, + "ce_loss_17": 3.2351158380508425, + "ce_loss_2": 4.311359119415283, + "ce_loss_4": 3.8588201880455015, + "ce_loss_9": 3.461822986602783, + "epoch": 0.414, + "grad_norm": 904.0, + "kl_loss_13": 114.71153526306152, + "kl_loss_2": 2272.288397216797, + "kl_loss_4": 1387.556622314453, + "kl_loss_9": 504.81707763671875, + "learning_rate": 0.0006423879585628261, + "loss": 1074.9437, + "step": 4140 + }, + { + "ce_loss_13": 3.2507503628730774, + "ce_loss_17": 3.1934019684791566, + "ce_loss_2": 4.302461767196656, + "ce_loss_4": 3.839321494102478, + "ce_loss_9": 3.424005317687988, + "epoch": 0.415, + "grad_norm": 1264.0, + "kl_loss_13": 116.64890022277832, + "kl_loss_2": 2321.1063842773438, + "kl_loss_4": 1419.0210021972657, + "kl_loss_9": 510.67321319580077, + "learning_rate": 0.0006408662784207149, + "loss": 1089.3537, + "step": 4150 + }, + { + "ce_loss_13": 3.218521749973297, + "ce_loss_17": 3.1660658597946165, + "ce_loss_2": 4.2414235949516295, + "ce_loss_4": 3.797908437252045, + "ce_loss_9": 3.383116602897644, + "epoch": 0.416, + "grad_norm": 1152.0, + "kl_loss_13": 112.22248916625976, + "kl_loss_2": 2269.583624267578, + "kl_loss_4": 1397.2362854003907, + "kl_loss_9": 499.2737350463867, + "learning_rate": 0.0006393431797567439, + "loss": 1073.0363, + "step": 4160 + }, + { + "ce_loss_13": 3.294830310344696, + "ce_loss_17": 3.2450266122817992, + "ce_loss_2": 4.265678524971008, + "ce_loss_4": 3.8339643836021424, + "ce_loss_9": 3.4542127370834352, + "epoch": 0.417, + "grad_norm": 1008.0, + "kl_loss_13": 112.24522476196289, + "kl_loss_2": 2184.2919067382813, + "kl_loss_4": 1335.4431579589843, + "kl_loss_9": 491.2783721923828, + "learning_rate": 0.0006378186779084996, + "loss": 1021.4752, + "step": 4170 + }, + { + "ce_loss_13": 3.131910705566406, + "ce_loss_17": 3.0777576208114623, + "ce_loss_2": 4.186444282531738, + "ce_loss_4": 3.732046830654144, + "ce_loss_9": 3.303534471988678, + "epoch": 0.418, + "grad_norm": 1224.0, + "kl_loss_13": 115.46459312438965, + "kl_loss_2": 2295.2138732910157, + "kl_loss_4": 1411.596142578125, + "kl_loss_9": 509.5405044555664, + "learning_rate": 0.0006362927882276989, + "loss": 1087.2432, + "step": 4180 + }, + { + "ce_loss_13": 3.3165557622909545, + "ce_loss_17": 3.2644819617271423, + "ce_loss_2": 4.310365915298462, + "ce_loss_4": 3.8637684106826784, + "ce_loss_9": 3.476090371608734, + "epoch": 0.419, + "grad_norm": 904.0, + "kl_loss_13": 111.56231918334962, + "kl_loss_2": 2203.7078552246094, + "kl_loss_4": 1333.4104858398437, + "kl_loss_9": 484.861669921875, + "learning_rate": 0.000634765526080034, + "loss": 1027.4697, + "step": 4190 + }, + { + "ce_loss_13": 3.3249414920806886, + "ce_loss_17": 3.2714288592338563, + "ce_loss_2": 4.316164779663086, + "ce_loss_4": 3.881926715373993, + "ce_loss_9": 3.492201507091522, + "epoch": 0.42, + "grad_norm": 932.0, + "kl_loss_13": 116.21826515197753, + "kl_loss_2": 2220.0864990234377, + "kl_loss_4": 1357.5955444335937, + "kl_loss_9": 498.2513122558594, + "learning_rate": 0.0006332369068450174, + "loss": 1046.0029, + "step": 4200 + }, + { + "ce_loss_13": 3.2582478284835816, + "ce_loss_17": 3.2052289605140687, + "ce_loss_2": 4.279117357730866, + "ce_loss_4": 3.833774411678314, + "ce_loss_9": 3.4272374272346497, + "epoch": 0.421, + "grad_norm": 964.0, + "kl_loss_13": 113.13218002319336, + "kl_loss_2": 2253.5891723632812, + "kl_loss_4": 1380.973614501953, + "kl_loss_9": 498.979557800293, + "learning_rate": 0.0006317069459158283, + "loss": 1054.4141, + "step": 4210 + }, + { + "ce_loss_13": 3.3661680936813356, + "ce_loss_17": 3.312276077270508, + "ce_loss_2": 4.336423325538635, + "ce_loss_4": 3.9053974509239198, + "ce_loss_9": 3.520712685585022, + "epoch": 0.422, + "grad_norm": 1000.0, + "kl_loss_13": 114.0796272277832, + "kl_loss_2": 2185.7806030273437, + "kl_loss_4": 1333.9905029296874, + "kl_loss_9": 486.2699005126953, + "learning_rate": 0.0006301756586991561, + "loss": 1043.7744, + "step": 4220 + }, + { + "ce_loss_13": 3.1512603402137755, + "ce_loss_17": 3.0970467567443847, + "ce_loss_2": 4.200273013114929, + "ce_loss_4": 3.742545855045319, + "ce_loss_9": 3.3217591643333435, + "epoch": 0.423, + "grad_norm": 1000.0, + "kl_loss_13": 114.35023880004883, + "kl_loss_2": 2335.140509033203, + "kl_loss_4": 1428.7702392578126, + "kl_loss_9": 507.99024658203126, + "learning_rate": 0.0006286430606150459, + "loss": 1081.4123, + "step": 4230 + }, + { + "ce_loss_13": 3.3513105750083922, + "ce_loss_17": 3.2976834893226625, + "ce_loss_2": 4.352766633033752, + "ce_loss_4": 3.9125161290168764, + "ce_loss_9": 3.5165541410446166, + "epoch": 0.424, + "grad_norm": 1064.0, + "kl_loss_13": 114.39515533447266, + "kl_loss_2": 2231.767742919922, + "kl_loss_4": 1367.0595275878907, + "kl_loss_9": 496.9046401977539, + "learning_rate": 0.0006271091670967436, + "loss": 1051.5317, + "step": 4240 + }, + { + "ce_loss_13": 3.260680139064789, + "ce_loss_17": 3.202347791194916, + "ce_loss_2": 4.313520789146423, + "ce_loss_4": 3.8584001898765563, + "ce_loss_9": 3.4388577222824095, + "epoch": 0.425, + "grad_norm": 976.0, + "kl_loss_13": 118.73695907592773, + "kl_loss_2": 2348.4839599609377, + "kl_loss_4": 1449.6987426757812, + "kl_loss_9": 527.9644729614258, + "learning_rate": 0.0006255739935905395, + "loss": 1080.7462, + "step": 4250 + }, + { + "ce_loss_13": 3.305410289764404, + "ce_loss_17": 3.2520323514938356, + "ce_loss_2": 4.29744565486908, + "ce_loss_4": 3.85259462594986, + "ce_loss_9": 3.466714012622833, + "epoch": 0.426, + "grad_norm": 920.0, + "kl_loss_13": 114.20407333374024, + "kl_loss_2": 2211.283172607422, + "kl_loss_4": 1337.8485900878907, + "kl_loss_9": 491.80822601318357, + "learning_rate": 0.0006240375555556145, + "loss": 1081.0334, + "step": 4260 + }, + { + "ce_loss_13": 3.298284411430359, + "ce_loss_17": 3.242466163635254, + "ce_loss_2": 4.358609008789062, + "ce_loss_4": 3.891164708137512, + "ce_loss_9": 3.477046036720276, + "epoch": 0.427, + "grad_norm": 880.0, + "kl_loss_13": 116.06230278015137, + "kl_loss_2": 2308.6298217773438, + "kl_loss_4": 1401.268798828125, + "kl_loss_9": 508.54779663085935, + "learning_rate": 0.000622499868463882, + "loss": 1074.5596, + "step": 4270 + }, + { + "ce_loss_13": 3.2727805733680726, + "ce_loss_17": 3.2200000047683717, + "ce_loss_2": 4.251509332656861, + "ce_loss_4": 3.809460389614105, + "ce_loss_9": 3.424841594696045, + "epoch": 0.428, + "grad_norm": 1512.0, + "kl_loss_13": 112.85384483337403, + "kl_loss_2": 2198.7832885742187, + "kl_loss_4": 1337.7629638671874, + "kl_loss_9": 484.31666412353513, + "learning_rate": 0.0006209609477998338, + "loss": 1049.8729, + "step": 4280 + }, + { + "ce_loss_13": 3.327023780345917, + "ce_loss_17": 3.272614669799805, + "ce_loss_2": 4.3319542646408085, + "ce_loss_4": 3.891036605834961, + "ce_loss_9": 3.4933693766593934, + "epoch": 0.429, + "grad_norm": 1200.0, + "kl_loss_13": 114.84309539794921, + "kl_loss_2": 2229.923229980469, + "kl_loss_4": 1374.1801208496095, + "kl_loss_9": 496.6692184448242, + "learning_rate": 0.0006194208090603844, + "loss": 1069.9532, + "step": 4290 + }, + { + "ce_loss_13": 3.2463571190834046, + "ce_loss_17": 3.195250380039215, + "ce_loss_2": 4.245231747627258, + "ce_loss_4": 3.8044413566589355, + "ce_loss_9": 3.411097288131714, + "epoch": 0.43, + "grad_norm": 1168.0, + "kl_loss_13": 110.6188575744629, + "kl_loss_2": 2207.0928588867187, + "kl_loss_4": 1345.121844482422, + "kl_loss_9": 482.6415710449219, + "learning_rate": 0.0006178794677547138, + "loss": 1032.0334, + "step": 4300 + }, + { + "ce_loss_13": 3.2737072348594665, + "ce_loss_17": 3.219567596912384, + "ce_loss_2": 4.294634616374969, + "ce_loss_4": 3.8541446805000303, + "ce_loss_9": 3.4414506554603577, + "epoch": 0.431, + "grad_norm": 1416.0, + "kl_loss_13": 115.14885368347169, + "kl_loss_2": 2281.1184265136717, + "kl_loss_4": 1406.0447387695312, + "kl_loss_9": 505.76427459716797, + "learning_rate": 0.0006163369394041111, + "loss": 1063.2492, + "step": 4310 + }, + { + "ce_loss_13": 3.2066330432891847, + "ce_loss_17": 3.154304051399231, + "ce_loss_2": 4.251827347278595, + "ce_loss_4": 3.789417731761932, + "ce_loss_9": 3.3820641040802, + "epoch": 0.432, + "grad_norm": 1528.0, + "kl_loss_13": 112.23676261901855, + "kl_loss_2": 2292.7070190429686, + "kl_loss_4": 1396.7613159179687, + "kl_loss_9": 499.70843811035155, + "learning_rate": 0.0006147932395418205, + "loss": 1092.2501, + "step": 4320 + }, + { + "ce_loss_13": 3.2508907079696656, + "ce_loss_17": 3.1966634154319764, + "ce_loss_2": 4.2479990243911745, + "ce_loss_4": 3.803886926174164, + "ce_loss_9": 3.4115867972373963, + "epoch": 0.433, + "grad_norm": 1192.0, + "kl_loss_13": 112.61089324951172, + "kl_loss_2": 2225.1054626464843, + "kl_loss_4": 1354.7594055175782, + "kl_loss_9": 495.6437133789062, + "learning_rate": 0.0006132483837128823, + "loss": 1044.7, + "step": 4330 + }, + { + "ce_loss_13": 3.2294129371643066, + "ce_loss_17": 3.175628292560577, + "ce_loss_2": 4.256321203708649, + "ce_loss_4": 3.7945444345474244, + "ce_loss_9": 3.395311963558197, + "epoch": 0.434, + "grad_norm": 980.0, + "kl_loss_13": 112.15752220153809, + "kl_loss_2": 2295.1910034179687, + "kl_loss_4": 1382.2321899414062, + "kl_loss_9": 495.2871948242188, + "learning_rate": 0.0006117023874739772, + "loss": 1066.0332, + "step": 4340 + }, + { + "ce_loss_13": 3.219388794898987, + "ce_loss_17": 3.1654563188552856, + "ce_loss_2": 4.25599182844162, + "ce_loss_4": 3.7939063549041747, + "ce_loss_9": 3.386433410644531, + "epoch": 0.435, + "grad_norm": 892.0, + "kl_loss_13": 112.37171630859375, + "kl_loss_2": 2288.638220214844, + "kl_loss_4": 1383.72587890625, + "kl_loss_9": 499.08514404296875, + "learning_rate": 0.0006101552663932703, + "loss": 1079.2202, + "step": 4350 + }, + { + "ce_loss_13": 3.251091992855072, + "ce_loss_17": 3.196838343143463, + "ce_loss_2": 4.258292531967163, + "ce_loss_4": 3.8165655851364138, + "ce_loss_9": 3.417253577709198, + "epoch": 0.436, + "grad_norm": 1120.0, + "kl_loss_13": 115.40013999938965, + "kl_loss_2": 2245.9327697753906, + "kl_loss_4": 1378.902362060547, + "kl_loss_9": 499.42090911865233, + "learning_rate": 0.0006086070360502539, + "loss": 1062.2396, + "step": 4360 + }, + { + "ce_loss_13": 3.2624617218971252, + "ce_loss_17": 3.2081745982170107, + "ce_loss_2": 4.266116559505463, + "ce_loss_4": 3.821986746788025, + "ce_loss_9": 3.4228087067604065, + "epoch": 0.437, + "grad_norm": 964.0, + "kl_loss_13": 114.69169464111329, + "kl_loss_2": 2262.5513244628905, + "kl_loss_4": 1375.5499267578125, + "kl_loss_9": 496.18865966796875, + "learning_rate": 0.0006070577120355903, + "loss": 1078.5221, + "step": 4370 + }, + { + "ce_loss_13": 3.259449505805969, + "ce_loss_17": 3.2051392197608948, + "ce_loss_2": 4.248458623886108, + "ce_loss_4": 3.8276082277297974, + "ce_loss_9": 3.429605758190155, + "epoch": 0.438, + "grad_norm": 1020.0, + "kl_loss_13": 113.97110328674316, + "kl_loss_2": 2179.389337158203, + "kl_loss_4": 1357.2092346191407, + "kl_loss_9": 496.8188720703125, + "learning_rate": 0.0006055073099509549, + "loss": 1054.0235, + "step": 4380 + }, + { + "ce_loss_13": 3.3189528346061707, + "ce_loss_17": 3.2636242508888245, + "ce_loss_2": 4.300916457176209, + "ce_loss_4": 3.862388384342194, + "ce_loss_9": 3.4761409997940063, + "epoch": 0.439, + "grad_norm": 980.0, + "kl_loss_13": 114.51318397521973, + "kl_loss_2": 2209.7271057128905, + "kl_loss_4": 1351.2647888183594, + "kl_loss_9": 496.4955139160156, + "learning_rate": 0.0006039558454088796, + "loss": 1063.8635, + "step": 4390 + }, + { + "ce_loss_13": 3.291050970554352, + "ce_loss_17": 3.234801483154297, + "ce_loss_2": 4.302252006530762, + "ce_loss_4": 3.8625045299530028, + "ce_loss_9": 3.4606791973114013, + "epoch": 0.44, + "grad_norm": 1008.0, + "kl_loss_13": 116.85295829772949, + "kl_loss_2": 2242.7557678222656, + "kl_loss_4": 1374.2103576660156, + "kl_loss_9": 498.915576171875, + "learning_rate": 0.0006024033340325954, + "loss": 1041.0988, + "step": 4400 + }, + { + "ce_loss_13": 3.351902258396149, + "ce_loss_17": 3.299048125743866, + "ce_loss_2": 4.318098855018616, + "ce_loss_4": 3.8939815878868105, + "ce_loss_9": 3.508348751068115, + "epoch": 0.441, + "grad_norm": 988.0, + "kl_loss_13": 114.71414108276367, + "kl_loss_2": 2142.451794433594, + "kl_loss_4": 1318.9441284179688, + "kl_loss_9": 477.57823791503904, + "learning_rate": 0.0006008497914558743, + "loss": 1032.8287, + "step": 4410 + }, + { + "ce_loss_13": 3.2981741189956666, + "ce_loss_17": 3.2423023343086244, + "ce_loss_2": 4.317156338691712, + "ce_loss_4": 3.8690537929534914, + "ce_loss_9": 3.4694420337677, + "epoch": 0.442, + "grad_norm": 1072.0, + "kl_loss_13": 122.18346138000489, + "kl_loss_2": 2273.7421020507813, + "kl_loss_4": 1391.2163696289062, + "kl_loss_9": 509.2335968017578, + "learning_rate": 0.0005992952333228728, + "loss": 1068.8811, + "step": 4420 + }, + { + "ce_loss_13": 3.2410339593887327, + "ce_loss_17": 3.185625505447388, + "ce_loss_2": 4.251394236087799, + "ce_loss_4": 3.8080117464065553, + "ce_loss_9": 3.4030250549316405, + "epoch": 0.443, + "grad_norm": 1096.0, + "kl_loss_13": 114.89693069458008, + "kl_loss_2": 2266.7773986816405, + "kl_loss_4": 1385.264501953125, + "kl_loss_9": 494.91763458251955, + "learning_rate": 0.0005977396752879741, + "loss": 1059.9248, + "step": 4430 + }, + { + "ce_loss_13": 3.1623576045036317, + "ce_loss_17": 3.108631801605225, + "ce_loss_2": 4.185509836673736, + "ce_loss_4": 3.738750946521759, + "ce_loss_9": 3.3378326296806335, + "epoch": 0.444, + "grad_norm": 1200.0, + "kl_loss_13": 114.97311859130859, + "kl_loss_2": 2271.976428222656, + "kl_loss_4": 1393.9525390625, + "kl_loss_9": 502.9504119873047, + "learning_rate": 0.0005961831330156305, + "loss": 1053.7545, + "step": 4440 + }, + { + "ce_loss_13": 3.30624281167984, + "ce_loss_17": 3.253134572505951, + "ce_loss_2": 4.33638311624527, + "ce_loss_4": 3.886413300037384, + "ce_loss_9": 3.475689005851746, + "epoch": 0.445, + "grad_norm": 988.0, + "kl_loss_13": 115.7089630126953, + "kl_loss_2": 2289.9570556640624, + "kl_loss_4": 1405.3046569824219, + "kl_loss_9": 498.2691284179688, + "learning_rate": 0.0005946256221802051, + "loss": 1084.5883, + "step": 4450 + }, + { + "ce_loss_13": 3.2804324507713316, + "ce_loss_17": 3.2293506026268006, + "ce_loss_2": 4.245705950260162, + "ce_loss_4": 3.815113198757172, + "ce_loss_9": 3.43248291015625, + "epoch": 0.446, + "grad_norm": 1168.0, + "kl_loss_13": 112.15541458129883, + "kl_loss_2": 2153.1458435058594, + "kl_loss_4": 1318.20908203125, + "kl_loss_9": 476.1432647705078, + "learning_rate": 0.0005930671584658151, + "loss": 1078.3115, + "step": 4460 + }, + { + "ce_loss_13": 3.288266897201538, + "ce_loss_17": 3.2371907114982603, + "ce_loss_2": 4.283300936222076, + "ce_loss_4": 3.8474048137664796, + "ce_loss_9": 3.450649082660675, + "epoch": 0.447, + "grad_norm": 1152.0, + "kl_loss_13": 113.39079551696777, + "kl_loss_2": 2228.3823059082033, + "kl_loss_4": 1364.92958984375, + "kl_loss_9": 493.24566345214845, + "learning_rate": 0.0005915077575661722, + "loss": 1065.0667, + "step": 4470 + }, + { + "ce_loss_13": 3.301390814781189, + "ce_loss_17": 3.2460404992103578, + "ce_loss_2": 4.315133488178253, + "ce_loss_4": 3.8610748529434202, + "ce_loss_9": 3.469024384021759, + "epoch": 0.448, + "grad_norm": 1048.0, + "kl_loss_13": 117.51225967407227, + "kl_loss_2": 2265.7343627929686, + "kl_loss_4": 1381.2666931152344, + "kl_loss_9": 504.8206512451172, + "learning_rate": 0.000589947435184427, + "loss": 1049.0127, + "step": 4480 + }, + { + "ce_loss_13": 3.3659312605857847, + "ce_loss_17": 3.3143599390983582, + "ce_loss_2": 4.310119581222534, + "ce_loss_4": 3.8942948460578917, + "ce_loss_9": 3.522876226902008, + "epoch": 0.449, + "grad_norm": 1008.0, + "kl_loss_13": 112.49709777832031, + "kl_loss_2": 2147.956866455078, + "kl_loss_4": 1331.3360290527344, + "kl_loss_9": 488.6355804443359, + "learning_rate": 0.0005883862070330078, + "loss": 1039.0609, + "step": 4490 + }, + { + "ce_loss_13": 3.298302376270294, + "ce_loss_17": 3.244524359703064, + "ce_loss_2": 4.301443207263946, + "ce_loss_4": 3.8609938740730287, + "ce_loss_9": 3.468320333957672, + "epoch": 0.45, + "grad_norm": 1008.0, + "kl_loss_13": 113.07883567810059, + "kl_loss_2": 2227.5302673339843, + "kl_loss_4": 1363.0703918457032, + "kl_loss_9": 497.58085479736326, + "learning_rate": 0.0005868240888334653, + "loss": 1042.5594, + "step": 4500 + }, + { + "ce_loss_13": 3.1868873238563538, + "ce_loss_17": 3.132264184951782, + "ce_loss_2": 4.22307413816452, + "ce_loss_4": 3.7738311529159545, + "ce_loss_9": 3.3573249697685243, + "epoch": 0.451, + "grad_norm": 1112.0, + "kl_loss_13": 115.24787178039551, + "kl_loss_2": 2290.6529052734377, + "kl_loss_4": 1411.1690856933594, + "kl_loss_9": 506.4848266601563, + "learning_rate": 0.0005852610963163119, + "loss": 1069.5668, + "step": 4510 + }, + { + "ce_loss_13": 3.207856297492981, + "ce_loss_17": 3.1569076657295225, + "ce_loss_2": 4.198264253139496, + "ce_loss_4": 3.7651442885398865, + "ce_loss_9": 3.3682425737380983, + "epoch": 0.452, + "grad_norm": 944.0, + "kl_loss_13": 111.95808868408203, + "kl_loss_2": 2206.5390747070314, + "kl_loss_4": 1352.5202270507812, + "kl_loss_9": 487.54320373535154, + "learning_rate": 0.0005836972452208654, + "loss": 1033.7531, + "step": 4520 + }, + { + "ce_loss_13": 3.215216076374054, + "ce_loss_17": 3.1628613650798796, + "ce_loss_2": 4.230946063995361, + "ce_loss_4": 3.7814112663269044, + "ce_loss_9": 3.3801348328590395, + "epoch": 0.453, + "grad_norm": 1012.0, + "kl_loss_13": 113.33771629333496, + "kl_loss_2": 2251.2802307128904, + "kl_loss_4": 1371.236083984375, + "kl_loss_9": 494.96768341064455, + "learning_rate": 0.0005821325512950885, + "loss": 1057.0058, + "step": 4530 + }, + { + "ce_loss_13": 3.2350626587867737, + "ce_loss_17": 3.1852386474609373, + "ce_loss_2": 4.234014999866486, + "ce_loss_4": 3.796155297756195, + "ce_loss_9": 3.400549292564392, + "epoch": 0.454, + "grad_norm": 960.0, + "kl_loss_13": 109.30186195373535, + "kl_loss_2": 2187.7409912109374, + "kl_loss_4": 1328.7613220214844, + "kl_loss_9": 476.477880859375, + "learning_rate": 0.0005805670302954321, + "loss": 1045.7387, + "step": 4540 + }, + { + "ce_loss_13": 3.2508257031440735, + "ce_loss_17": 3.1998579263687135, + "ce_loss_2": 4.239289700984955, + "ce_loss_4": 3.7977746725082397, + "ce_loss_9": 3.411236810684204, + "epoch": 0.455, + "grad_norm": 936.0, + "kl_loss_13": 108.79041748046875, + "kl_loss_2": 2202.4040466308593, + "kl_loss_4": 1340.7225463867187, + "kl_loss_9": 484.11951751708983, + "learning_rate": 0.000579000697986675, + "loss": 1032.8826, + "step": 4550 + }, + { + "ce_loss_13": 3.2050686836242677, + "ce_loss_17": 3.149174356460571, + "ce_loss_2": 4.245373201370239, + "ce_loss_4": 3.7939256906509398, + "ce_loss_9": 3.3826231002807616, + "epoch": 0.456, + "grad_norm": 964.0, + "kl_loss_13": 115.6324676513672, + "kl_loss_2": 2302.1110290527345, + "kl_loss_4": 1406.6975891113282, + "kl_loss_9": 510.40406341552733, + "learning_rate": 0.0005774335701417662, + "loss": 1059.7803, + "step": 4560 + }, + { + "ce_loss_13": 3.1977349400520323, + "ce_loss_17": 3.146476686000824, + "ce_loss_2": 4.242737972736359, + "ce_loss_4": 3.780718648433685, + "ce_loss_9": 3.363146352767944, + "epoch": 0.457, + "grad_norm": 916.0, + "kl_loss_13": 110.78387298583985, + "kl_loss_2": 2330.0321655273438, + "kl_loss_4": 1403.4038452148438, + "kl_loss_9": 498.5827133178711, + "learning_rate": 0.0005758656625416658, + "loss": 1067.3129, + "step": 4570 + }, + { + "ce_loss_13": 3.2525212168693542, + "ce_loss_17": 3.1976067066192626, + "ce_loss_2": 4.2534533619880674, + "ce_loss_4": 3.812270760536194, + "ce_loss_9": 3.4136794567108155, + "epoch": 0.458, + "grad_norm": 900.0, + "kl_loss_13": 113.4214656829834, + "kl_loss_2": 2216.9472412109376, + "kl_loss_4": 1360.3324768066407, + "kl_loss_9": 490.9329086303711, + "learning_rate": 0.0005742969909751859, + "loss": 1033.167, + "step": 4580 + }, + { + "ce_loss_13": 3.2646499037742616, + "ce_loss_17": 3.2118812084197996, + "ce_loss_2": 4.280008375644684, + "ce_loss_4": 3.815813684463501, + "ce_loss_9": 3.42454617023468, + "epoch": 0.459, + "grad_norm": 1144.0, + "kl_loss_13": 112.87073135375977, + "kl_loss_2": 2267.6935302734373, + "kl_loss_4": 1358.0788940429688, + "kl_loss_9": 493.472509765625, + "learning_rate": 0.0005727275712388318, + "loss": 1061.4747, + "step": 4590 + }, + { + "ce_loss_13": 3.2865273714065553, + "ce_loss_17": 3.237425136566162, + "ce_loss_2": 4.2610726118087765, + "ce_loss_4": 3.832384634017944, + "ce_loss_9": 3.4468985319137575, + "epoch": 0.46, + "grad_norm": 1416.0, + "kl_loss_13": 109.99393463134766, + "kl_loss_2": 2177.620721435547, + "kl_loss_4": 1327.9073791503906, + "kl_loss_9": 480.4010375976562, + "learning_rate": 0.0005711574191366427, + "loss": 1038.2758, + "step": 4600 + }, + { + "ce_loss_13": 3.2426883578300476, + "ce_loss_17": 3.1919708967208864, + "ce_loss_2": 4.227907609939575, + "ce_loss_4": 3.7876739740371703, + "ce_loss_9": 3.401491367816925, + "epoch": 0.461, + "grad_norm": 856.0, + "kl_loss_13": 110.47371635437011, + "kl_loss_2": 2218.3276916503905, + "kl_loss_4": 1340.7212158203124, + "kl_loss_9": 481.55714111328126, + "learning_rate": 0.0005695865504800327, + "loss": 1035.931, + "step": 4610 + }, + { + "ce_loss_13": 3.1780589699745176, + "ce_loss_17": 3.1205894589424132, + "ce_loss_2": 4.273273622989654, + "ce_loss_4": 3.8013526678085325, + "ce_loss_9": 3.3556954741477965, + "epoch": 0.462, + "grad_norm": 1216.0, + "kl_loss_13": 116.01806144714355, + "kl_loss_2": 2412.004357910156, + "kl_loss_4": 1483.5377563476563, + "kl_loss_9": 524.9045639038086, + "learning_rate": 0.0005680149810876322, + "loss": 1087.6727, + "step": 4620 + }, + { + "ce_loss_13": 3.234540843963623, + "ce_loss_17": 3.1828582525253295, + "ce_loss_2": 4.253381359577179, + "ce_loss_4": 3.791711854934692, + "ce_loss_9": 3.3954779863357545, + "epoch": 0.463, + "grad_norm": 1288.0, + "kl_loss_13": 110.05101470947265, + "kl_loss_2": 2254.9929077148436, + "kl_loss_4": 1357.5958251953125, + "kl_loss_9": 485.29139404296876, + "learning_rate": 0.0005664427267851271, + "loss": 1044.0494, + "step": 4630 + }, + { + "ce_loss_13": 3.1544872760772704, + "ce_loss_17": 3.1011232256889345, + "ce_loss_2": 4.1691277623176575, + "ce_loss_4": 3.7214199781417845, + "ce_loss_9": 3.313951563835144, + "epoch": 0.464, + "grad_norm": 968.0, + "kl_loss_13": 109.78980827331543, + "kl_loss_2": 2245.476446533203, + "kl_loss_4": 1354.0137329101562, + "kl_loss_9": 481.89545288085935, + "learning_rate": 0.0005648698034051009, + "loss": 1041.4527, + "step": 4640 + }, + { + "ce_loss_13": 3.2633049845695496, + "ce_loss_17": 3.2092364549636843, + "ce_loss_2": 4.306076383590698, + "ce_loss_4": 3.8452648401260374, + "ce_loss_9": 3.42974169254303, + "epoch": 0.465, + "grad_norm": 964.0, + "kl_loss_13": 111.22759056091309, + "kl_loss_2": 2293.011083984375, + "kl_loss_4": 1386.7623474121094, + "kl_loss_9": 486.6575759887695, + "learning_rate": 0.0005632962267868747, + "loss": 1042.301, + "step": 4650 + }, + { + "ce_loss_13": 3.203655731678009, + "ce_loss_17": 3.153251326084137, + "ce_loss_2": 4.199740731716156, + "ce_loss_4": 3.763210356235504, + "ce_loss_9": 3.3652966260910033, + "epoch": 0.466, + "grad_norm": 1288.0, + "kl_loss_13": 107.88800239562988, + "kl_loss_2": 2203.660821533203, + "kl_loss_4": 1341.0546997070312, + "kl_loss_9": 476.93409118652346, + "learning_rate": 0.0005617220127763474, + "loss": 1048.7657, + "step": 4660 + }, + { + "ce_loss_13": 3.283149230480194, + "ce_loss_17": 3.2315285325050356, + "ce_loss_2": 4.270407438278198, + "ce_loss_4": 3.838921403884888, + "ce_loss_9": 3.4391672015190125, + "epoch": 0.467, + "grad_norm": 944.0, + "kl_loss_13": 110.06097297668457, + "kl_loss_2": 2186.371514892578, + "kl_loss_4": 1343.689013671875, + "kl_loss_9": 481.6672653198242, + "learning_rate": 0.0005601471772258368, + "loss": 1047.4086, + "step": 4670 + }, + { + "ce_loss_13": 3.270585501194, + "ce_loss_17": 3.2172582030296324, + "ce_loss_2": 4.250234508514405, + "ce_loss_4": 3.814559781551361, + "ce_loss_9": 3.432374918460846, + "epoch": 0.468, + "grad_norm": 928.0, + "kl_loss_13": 110.6790729522705, + "kl_loss_2": 2173.3947143554688, + "kl_loss_4": 1313.260076904297, + "kl_loss_9": 477.43175048828124, + "learning_rate": 0.0005585717359939192, + "loss": 1044.8605, + "step": 4680 + }, + { + "ce_loss_13": 3.1767726302146913, + "ce_loss_17": 3.125308394432068, + "ce_loss_2": 4.167539250850678, + "ce_loss_4": 3.7297667503356933, + "ce_loss_9": 3.337337589263916, + "epoch": 0.469, + "grad_norm": 1248.0, + "kl_loss_13": 111.45461730957031, + "kl_loss_2": 2181.8961791992188, + "kl_loss_4": 1331.780029296875, + "kl_loss_9": 480.65440521240237, + "learning_rate": 0.0005569957049452703, + "loss": 1053.8356, + "step": 4690 + }, + { + "ce_loss_13": 3.236863577365875, + "ce_loss_17": 3.182290959358215, + "ce_loss_2": 4.252595901489258, + "ce_loss_4": 3.804863929748535, + "ce_loss_9": 3.4005256414413454, + "epoch": 0.47, + "grad_norm": 1224.0, + "kl_loss_13": 113.201411819458, + "kl_loss_2": 2253.833605957031, + "kl_loss_4": 1371.784326171875, + "kl_loss_9": 491.1901519775391, + "learning_rate": 0.0005554190999505056, + "loss": 1055.8584, + "step": 4700 + }, + { + "ce_loss_13": 3.3600334405899046, + "ce_loss_17": 3.3064367294311525, + "ce_loss_2": 4.3621403813362125, + "ce_loss_4": 3.9184364795684816, + "ce_loss_9": 3.525863695144653, + "epoch": 0.471, + "grad_norm": 1832.0, + "kl_loss_13": 114.85836601257324, + "kl_loss_2": 2245.316845703125, + "kl_loss_4": 1372.714208984375, + "kl_loss_9": 498.2320281982422, + "learning_rate": 0.0005538419368860196, + "loss": 1020.0092, + "step": 4710 + }, + { + "ce_loss_13": 3.279603111743927, + "ce_loss_17": 3.2279528260231016, + "ce_loss_2": 4.276900470256805, + "ce_loss_4": 3.8301114559173586, + "ce_loss_9": 3.436483359336853, + "epoch": 0.472, + "grad_norm": 952.0, + "kl_loss_13": 111.89427909851074, + "kl_loss_2": 2218.647540283203, + "kl_loss_4": 1337.0641845703126, + "kl_loss_9": 485.2105972290039, + "learning_rate": 0.0005522642316338268, + "loss": 1061.181, + "step": 4720 + }, + { + "ce_loss_13": 3.2930665612220764, + "ce_loss_17": 3.243084716796875, + "ce_loss_2": 4.277700531482696, + "ce_loss_4": 3.83689523935318, + "ce_loss_9": 3.4494389414787294, + "epoch": 0.473, + "grad_norm": 1544.0, + "kl_loss_13": 111.53352355957031, + "kl_loss_2": 2209.90634765625, + "kl_loss_4": 1340.559375, + "kl_loss_9": 489.29090423583983, + "learning_rate": 0.0005506860000814017, + "loss": 1063.3098, + "step": 4730 + }, + { + "ce_loss_13": 3.3136982679367066, + "ce_loss_17": 3.264804482460022, + "ce_loss_2": 4.2714087724685665, + "ce_loss_4": 3.850303387641907, + "ce_loss_9": 3.475944495201111, + "epoch": 0.474, + "grad_norm": 892.0, + "kl_loss_13": 108.09279823303223, + "kl_loss_2": 2142.868975830078, + "kl_loss_4": 1308.8129943847657, + "kl_loss_9": 474.7614212036133, + "learning_rate": 0.0005491072581215186, + "loss": 1034.6965, + "step": 4740 + }, + { + "ce_loss_13": 3.3083571076393126, + "ce_loss_17": 3.2537139296531676, + "ce_loss_2": 4.294279718399048, + "ce_loss_4": 3.853098380565643, + "ce_loss_9": 3.4682462096214293, + "epoch": 0.475, + "grad_norm": 1104.0, + "kl_loss_13": 114.12852745056152, + "kl_loss_2": 2222.1892333984374, + "kl_loss_4": 1351.5962280273438, + "kl_loss_9": 491.66186065673827, + "learning_rate": 0.0005475280216520913, + "loss": 1022.1295, + "step": 4750 + }, + { + "ce_loss_13": 3.2300382375717165, + "ce_loss_17": 3.179151678085327, + "ce_loss_2": 4.211649882793426, + "ce_loss_4": 3.776401126384735, + "ce_loss_9": 3.389701724052429, + "epoch": 0.476, + "grad_norm": 1112.0, + "kl_loss_13": 108.27315292358398, + "kl_loss_2": 2165.7446716308596, + "kl_loss_4": 1321.9624267578124, + "kl_loss_9": 473.8345779418945, + "learning_rate": 0.0005459483065760138, + "loss": 1050.3727, + "step": 4760 + }, + { + "ce_loss_13": 3.1679397225379944, + "ce_loss_17": 3.11603444814682, + "ce_loss_2": 4.229493188858032, + "ce_loss_4": 3.763473379611969, + "ce_loss_9": 3.3382099628448487, + "epoch": 0.477, + "grad_norm": 996.0, + "kl_loss_13": 110.51704330444336, + "kl_loss_2": 2333.7154846191406, + "kl_loss_4": 1422.4058227539062, + "kl_loss_9": 494.48961486816404, + "learning_rate": 0.0005443681288009991, + "loss": 1060.2763, + "step": 4770 + }, + { + "ce_loss_13": 3.228108215332031, + "ce_loss_17": 3.17510107755661, + "ce_loss_2": 4.232603669166565, + "ce_loss_4": 3.7905447483062744, + "ce_loss_9": 3.3814319372177124, + "epoch": 0.478, + "grad_norm": 1056.0, + "kl_loss_13": 109.66107788085938, + "kl_loss_2": 2245.638464355469, + "kl_loss_4": 1369.1700744628906, + "kl_loss_9": 481.25022583007814, + "learning_rate": 0.0005427875042394199, + "loss": 1054.5191, + "step": 4780 + }, + { + "ce_loss_13": 3.256838548183441, + "ce_loss_17": 3.2014452815055847, + "ce_loss_2": 4.246220719814301, + "ce_loss_4": 3.8117555141448975, + "ce_loss_9": 3.417004668712616, + "epoch": 0.479, + "grad_norm": 968.0, + "kl_loss_13": 111.60623626708984, + "kl_loss_2": 2183.6200927734376, + "kl_loss_4": 1354.45224609375, + "kl_loss_9": 484.35677032470704, + "learning_rate": 0.0005412064488081482, + "loss": 1052.2033, + "step": 4790 + }, + { + "ce_loss_13": 3.258776915073395, + "ce_loss_17": 3.206698942184448, + "ce_loss_2": 4.235857367515564, + "ce_loss_4": 3.8036677718162535, + "ce_loss_9": 3.4132760643959044, + "epoch": 0.48, + "grad_norm": 932.0, + "kl_loss_13": 107.48110961914062, + "kl_loss_2": 2173.7078491210937, + "kl_loss_4": 1325.2386047363282, + "kl_loss_9": 470.6855178833008, + "learning_rate": 0.0005396249784283942, + "loss": 1023.3161, + "step": 4800 + }, + { + "ce_loss_13": 3.276000273227692, + "ce_loss_17": 3.2228050947189333, + "ce_loss_2": 4.307373595237732, + "ce_loss_4": 3.8528510093688966, + "ce_loss_9": 3.439617323875427, + "epoch": 0.481, + "grad_norm": 1296.0, + "kl_loss_13": 112.76452827453613, + "kl_loss_2": 2280.9171447753906, + "kl_loss_4": 1391.2562377929687, + "kl_loss_9": 493.9219543457031, + "learning_rate": 0.0005380431090255476, + "loss": 1060.5157, + "step": 4810 + }, + { + "ce_loss_13": 3.2773956537246702, + "ce_loss_17": 3.227415692806244, + "ce_loss_2": 4.246558117866516, + "ce_loss_4": 3.8142582535743714, + "ce_loss_9": 3.4314499258995057, + "epoch": 0.482, + "grad_norm": 1184.0, + "kl_loss_13": 106.4849853515625, + "kl_loss_2": 2173.4614135742186, + "kl_loss_4": 1315.6811462402343, + "kl_loss_9": 467.9435760498047, + "learning_rate": 0.0005364608565290155, + "loss": 1021.682, + "step": 4820 + }, + { + "ce_loss_13": 3.280693018436432, + "ce_loss_17": 3.229283595085144, + "ce_loss_2": 4.281590461730957, + "ce_loss_4": 3.836788702011108, + "ce_loss_9": 3.4402923941612245, + "epoch": 0.483, + "grad_norm": 980.0, + "kl_loss_13": 110.87230529785157, + "kl_loss_2": 2223.226153564453, + "kl_loss_4": 1341.8607788085938, + "kl_loss_9": 483.7612579345703, + "learning_rate": 0.0005348782368720626, + "loss": 1040.8236, + "step": 4830 + }, + { + "ce_loss_13": 3.214833343029022, + "ce_loss_17": 3.1662728071212767, + "ce_loss_2": 4.207602524757386, + "ce_loss_4": 3.7587535858154295, + "ce_loss_9": 3.3715001344680786, + "epoch": 0.484, + "grad_norm": 936.0, + "kl_loss_13": 107.31052589416504, + "kl_loss_2": 2177.160748291016, + "kl_loss_4": 1309.6916381835938, + "kl_loss_9": 469.6512908935547, + "learning_rate": 0.000533295265991652, + "loss": 1035.5555, + "step": 4840 + }, + { + "ce_loss_13": 3.287393403053284, + "ce_loss_17": 3.2344298481941225, + "ce_loss_2": 4.261090314388275, + "ce_loss_4": 3.8333823919296264, + "ce_loss_9": 3.4509487748146057, + "epoch": 0.485, + "grad_norm": 1072.0, + "kl_loss_13": 108.91606903076172, + "kl_loss_2": 2159.747033691406, + "kl_loss_4": 1321.8405029296875, + "kl_loss_9": 480.07725219726564, + "learning_rate": 0.0005317119598282822, + "loss": 1022.9067, + "step": 4850 + }, + { + "ce_loss_13": 3.290719723701477, + "ce_loss_17": 3.238418161869049, + "ce_loss_2": 4.2707842826843265, + "ce_loss_4": 3.83918879032135, + "ce_loss_9": 3.44874883890152, + "epoch": 0.486, + "grad_norm": 1032.0, + "kl_loss_13": 109.74356002807617, + "kl_loss_2": 2177.9513732910154, + "kl_loss_4": 1336.8647766113281, + "kl_loss_9": 481.4572998046875, + "learning_rate": 0.0005301283343258293, + "loss": 1030.6471, + "step": 4860 + }, + { + "ce_loss_13": 3.346208024024963, + "ce_loss_17": 3.2939000010490416, + "ce_loss_2": 4.310344386100769, + "ce_loss_4": 3.889477014541626, + "ce_loss_9": 3.503834545612335, + "epoch": 0.487, + "grad_norm": 1288.0, + "kl_loss_13": 110.86674461364746, + "kl_loss_2": 2148.84541015625, + "kl_loss_4": 1318.5968444824218, + "kl_loss_9": 477.80435943603516, + "learning_rate": 0.000528544405431384, + "loss": 1017.3928, + "step": 4870 + }, + { + "ce_loss_13": 3.2290894865989683, + "ce_loss_17": 3.174159383773804, + "ce_loss_2": 4.233102631568909, + "ce_loss_4": 3.792231833934784, + "ce_loss_9": 3.394144558906555, + "epoch": 0.488, + "grad_norm": 1264.0, + "kl_loss_13": 112.18865776062012, + "kl_loss_2": 2239.877301025391, + "kl_loss_4": 1375.2481628417968, + "kl_loss_9": 495.68187103271487, + "learning_rate": 0.000526960189095093, + "loss": 1052.9785, + "step": 4880 + }, + { + "ce_loss_13": 3.213497483730316, + "ce_loss_17": 3.161575698852539, + "ce_loss_2": 4.205214536190033, + "ce_loss_4": 3.7677800059318542, + "ce_loss_9": 3.3718682527542114, + "epoch": 0.489, + "grad_norm": 1104.0, + "kl_loss_13": 107.91000404357911, + "kl_loss_2": 2186.121960449219, + "kl_loss_4": 1325.5592468261718, + "kl_loss_9": 475.09495239257814, + "learning_rate": 0.0005253757012699972, + "loss": 1028.5498, + "step": 4890 + }, + { + "ce_loss_13": 3.2871811270713804, + "ce_loss_17": 3.2368084311485292, + "ce_loss_2": 4.271547615528107, + "ce_loss_4": 3.83217339515686, + "ce_loss_9": 3.4459805607795717, + "epoch": 0.49, + "grad_norm": 972.0, + "kl_loss_13": 109.13588638305664, + "kl_loss_2": 2178.5744873046874, + "kl_loss_4": 1324.7280822753905, + "kl_loss_9": 478.6420501708984, + "learning_rate": 0.0005237909579118712, + "loss": 1042.8182, + "step": 4900 + }, + { + "ce_loss_13": 3.2541236519813537, + "ce_loss_17": 3.202117133140564, + "ce_loss_2": 4.274856173992157, + "ce_loss_4": 3.819451022148132, + "ce_loss_9": 3.4207903385162353, + "epoch": 0.491, + "grad_norm": 1168.0, + "kl_loss_13": 112.19763107299805, + "kl_loss_2": 2272.8551635742188, + "kl_loss_4": 1373.2723571777344, + "kl_loss_9": 494.2281951904297, + "learning_rate": 0.0005222059749790631, + "loss": 1055.1499, + "step": 4910 + }, + { + "ce_loss_13": 3.3191882729530335, + "ce_loss_17": 3.265893840789795, + "ce_loss_2": 4.259448051452637, + "ce_loss_4": 3.842312014102936, + "ce_loss_9": 3.474241018295288, + "epoch": 0.492, + "grad_norm": 856.0, + "kl_loss_13": 109.15383377075196, + "kl_loss_2": 2124.7238586425783, + "kl_loss_4": 1298.0829772949219, + "kl_loss_9": 472.2763168334961, + "learning_rate": 0.0005206207684323337, + "loss": 1004.4076, + "step": 4920 + }, + { + "ce_loss_13": 3.298674702644348, + "ce_loss_17": 3.2478388071060182, + "ce_loss_2": 4.280341839790344, + "ce_loss_4": 3.845925140380859, + "ce_loss_9": 3.455951476097107, + "epoch": 0.493, + "grad_norm": 1104.0, + "kl_loss_13": 111.8846549987793, + "kl_loss_2": 2195.7177307128904, + "kl_loss_4": 1338.984307861328, + "kl_loss_9": 483.6929077148437, + "learning_rate": 0.000519035354234695, + "loss": 1051.7047, + "step": 4930 + }, + { + "ce_loss_13": 3.2714035511016846, + "ce_loss_17": 3.215863215923309, + "ce_loss_2": 4.262099242210388, + "ce_loss_4": 3.8290010571479796, + "ce_loss_9": 3.4361720085144043, + "epoch": 0.494, + "grad_norm": 1216.0, + "kl_loss_13": 112.78949012756348, + "kl_loss_2": 2180.284765625, + "kl_loss_4": 1332.1336059570312, + "kl_loss_9": 481.2816329956055, + "learning_rate": 0.0005174497483512506, + "loss": 1017.8218, + "step": 4940 + }, + { + "ce_loss_13": 3.3174590826034547, + "ce_loss_17": 3.270202100276947, + "ce_loss_2": 4.286599111557007, + "ce_loss_4": 3.8568575024604796, + "ce_loss_9": 3.4731018781661986, + "epoch": 0.495, + "grad_norm": 1168.0, + "kl_loss_13": 108.25534095764161, + "kl_loss_2": 2174.4831970214846, + "kl_loss_4": 1324.2088500976563, + "kl_loss_9": 474.80159759521484, + "learning_rate": 0.0005158639667490339, + "loss": 1044.1715, + "step": 4950 + }, + { + "ce_loss_13": 3.226637125015259, + "ce_loss_17": 3.1736985564231874, + "ce_loss_2": 4.221232938766479, + "ce_loss_4": 3.782878804206848, + "ce_loss_9": 3.38752304315567, + "epoch": 0.496, + "grad_norm": 1232.0, + "kl_loss_13": 108.65655517578125, + "kl_loss_2": 2204.9200927734373, + "kl_loss_4": 1341.1760009765626, + "kl_loss_9": 483.80982818603513, + "learning_rate": 0.0005142780253968481, + "loss": 1031.0737, + "step": 4960 + }, + { + "ce_loss_13": 3.174634885787964, + "ce_loss_17": 3.1229832410812377, + "ce_loss_2": 4.151046252250671, + "ce_loss_4": 3.7142460227012633, + "ce_loss_9": 3.3272785902023316, + "epoch": 0.497, + "grad_norm": 1004.0, + "kl_loss_13": 105.21072120666504, + "kl_loss_2": 2165.0536926269533, + "kl_loss_4": 1307.7681701660156, + "kl_loss_9": 464.3578475952148, + "learning_rate": 0.0005126919402651053, + "loss": 1000.4867, + "step": 4970 + }, + { + "ce_loss_13": 3.240077257156372, + "ce_loss_17": 3.185662806034088, + "ce_loss_2": 4.251433145999909, + "ce_loss_4": 3.8115333199501036, + "ce_loss_9": 3.402933120727539, + "epoch": 0.498, + "grad_norm": 1272.0, + "kl_loss_13": 111.33021049499511, + "kl_loss_2": 2216.136126708984, + "kl_loss_4": 1357.0237365722655, + "kl_loss_9": 484.2033325195313, + "learning_rate": 0.0005111057273256647, + "loss": 1042.8983, + "step": 4980 + }, + { + "ce_loss_13": 3.343287134170532, + "ce_loss_17": 3.29540411233902, + "ce_loss_2": 4.255156755447388, + "ce_loss_4": 3.8461005330085754, + "ce_loss_9": 3.4873583793640135, + "epoch": 0.499, + "grad_norm": 1216.0, + "kl_loss_13": 105.00677680969238, + "kl_loss_2": 2057.784045410156, + "kl_loss_4": 1257.2170288085938, + "kl_loss_9": 455.3284118652344, + "learning_rate": 0.0005095194025516733, + "loss": 988.157, + "step": 4990 + }, + { + "ce_loss_13": 3.270824205875397, + "ce_loss_17": 3.2217456102371216, + "ce_loss_2": 4.238569843769073, + "ce_loss_4": 3.8028435945510863, + "ce_loss_9": 3.4210456013679504, + "epoch": 0.5, + "grad_norm": 992.0, + "kl_loss_13": 105.67387733459472, + "kl_loss_2": 2144.187335205078, + "kl_loss_4": 1296.3322814941407, + "kl_loss_9": 464.3164535522461, + "learning_rate": 0.000507932981917404, + "loss": 1045.0859, + "step": 5000 + }, + { + "ce_loss_13": 3.222704768180847, + "ce_loss_17": 3.171173429489136, + "ce_loss_2": 4.254918313026428, + "ce_loss_4": 3.8063995003700257, + "ce_loss_9": 3.3885273575782775, + "epoch": 0.501, + "grad_norm": 1224.0, + "kl_loss_13": 113.85473251342773, + "kl_loss_2": 2289.4469482421873, + "kl_loss_4": 1406.094580078125, + "kl_loss_9": 495.9951766967773, + "learning_rate": 0.0005063464813980949, + "loss": 1067.6688, + "step": 5010 + }, + { + "ce_loss_13": 3.208712577819824, + "ce_loss_17": 3.1586388826370237, + "ce_loss_2": 4.1936874151229855, + "ce_loss_4": 3.7563283324241636, + "ce_loss_9": 3.364860200881958, + "epoch": 0.502, + "grad_norm": 872.0, + "kl_loss_13": 108.19618873596191, + "kl_loss_2": 2210.4015869140626, + "kl_loss_4": 1352.5886840820312, + "kl_loss_9": 480.4805206298828, + "learning_rate": 0.0005047599169697884, + "loss": 1028.723, + "step": 5020 + }, + { + "ce_loss_13": 3.1455006837844848, + "ce_loss_17": 3.0943350315093996, + "ce_loss_2": 4.153426742553711, + "ce_loss_4": 3.703087866306305, + "ce_loss_9": 3.3071312069892884, + "epoch": 0.503, + "grad_norm": 1088.0, + "kl_loss_13": 109.03148651123047, + "kl_loss_2": 2212.201318359375, + "kl_loss_4": 1324.3646301269532, + "kl_loss_9": 475.39954528808596, + "learning_rate": 0.000503173304609171, + "loss": 1009.9896, + "step": 5030 + }, + { + "ce_loss_13": 3.2657658576965334, + "ce_loss_17": 3.21257461309433, + "ce_loss_2": 4.242002463340759, + "ce_loss_4": 3.816751551628113, + "ce_loss_9": 3.424900007247925, + "epoch": 0.504, + "grad_norm": 1040.0, + "kl_loss_13": 111.41729011535645, + "kl_loss_2": 2165.790808105469, + "kl_loss_4": 1330.8880737304687, + "kl_loss_9": 475.275813293457, + "learning_rate": 0.0005015866602934111, + "loss": 1013.2565, + "step": 5040 + }, + { + "ce_loss_13": 3.23921936750412, + "ce_loss_17": 3.1856309294700624, + "ce_loss_2": 4.261603832244873, + "ce_loss_4": 3.815677559375763, + "ce_loss_9": 3.407161271572113, + "epoch": 0.505, + "grad_norm": 1008.0, + "kl_loss_13": 114.81494789123535, + "kl_loss_2": 2262.851110839844, + "kl_loss_4": 1389.6484619140624, + "kl_loss_9": 502.0806716918945, + "learning_rate": 0.0005, + "loss": 1049.0623, + "step": 5050 + }, + { + "ce_loss_13": 3.230252742767334, + "ce_loss_17": 3.178666591644287, + "ce_loss_2": 4.21544029712677, + "ce_loss_4": 3.7766329526901243, + "ce_loss_9": 3.3884277820587156, + "epoch": 0.506, + "grad_norm": 1176.0, + "kl_loss_13": 111.81233978271484, + "kl_loss_2": 2198.957598876953, + "kl_loss_4": 1338.3859069824218, + "kl_loss_9": 480.9139739990234, + "learning_rate": 0.0004984133397065889, + "loss": 1018.4827, + "step": 5060 + }, + { + "ce_loss_13": 3.235624635219574, + "ce_loss_17": 3.1809684872627257, + "ce_loss_2": 4.242417931556702, + "ce_loss_4": 3.8105843305587768, + "ce_loss_9": 3.4018515825271605, + "epoch": 0.507, + "grad_norm": 1112.0, + "kl_loss_13": 110.32775650024413, + "kl_loss_2": 2212.3127258300783, + "kl_loss_4": 1354.6189331054688, + "kl_loss_9": 483.2311676025391, + "learning_rate": 0.0004968266953908291, + "loss": 1019.2341, + "step": 5070 + }, + { + "ce_loss_13": 3.276156723499298, + "ce_loss_17": 3.2256416201591493, + "ce_loss_2": 4.281172335147858, + "ce_loss_4": 3.832336127758026, + "ce_loss_9": 3.4308794260025026, + "epoch": 0.508, + "grad_norm": 1080.0, + "kl_loss_13": 106.78249778747559, + "kl_loss_2": 2219.6784118652345, + "kl_loss_4": 1340.677099609375, + "kl_loss_9": 473.00534973144534, + "learning_rate": 0.0004952400830302117, + "loss": 1029.2022, + "step": 5080 + }, + { + "ce_loss_13": 3.1995387196540834, + "ce_loss_17": 3.1476324558258058, + "ce_loss_2": 4.22556174993515, + "ce_loss_4": 3.77113823890686, + "ce_loss_9": 3.3625010371208193, + "epoch": 0.509, + "grad_norm": 1136.0, + "kl_loss_13": 109.94557342529296, + "kl_loss_2": 2253.514764404297, + "kl_loss_4": 1367.4345336914062, + "kl_loss_9": 487.09239959716797, + "learning_rate": 0.0004936535186019053, + "loss": 1032.7042, + "step": 5090 + }, + { + "ce_loss_13": 3.2978178739547728, + "ce_loss_17": 3.250880241394043, + "ce_loss_2": 4.254099154472351, + "ce_loss_4": 3.8262134075164793, + "ce_loss_9": 3.4515664935112, + "epoch": 0.51, + "grad_norm": 956.0, + "kl_loss_13": 105.78404998779297, + "kl_loss_2": 2117.003942871094, + "kl_loss_4": 1281.8258117675782, + "kl_loss_9": 461.229052734375, + "learning_rate": 0.000492067018082596, + "loss": 1007.5633, + "step": 5100 + }, + { + "ce_loss_13": 3.2365919232368467, + "ce_loss_17": 3.1842783212661745, + "ce_loss_2": 4.280776762962342, + "ce_loss_4": 3.8157904982566833, + "ce_loss_9": 3.3977736473083495, + "epoch": 0.511, + "grad_norm": 1004.0, + "kl_loss_13": 111.49824409484863, + "kl_loss_2": 2284.035919189453, + "kl_loss_4": 1389.0912048339844, + "kl_loss_9": 492.51202697753905, + "learning_rate": 0.0004904805974483267, + "loss": 1071.0456, + "step": 5110 + }, + { + "ce_loss_13": 3.3491473317146303, + "ce_loss_17": 3.294369626045227, + "ce_loss_2": 4.358141589164734, + "ce_loss_4": 3.923525071144104, + "ce_loss_9": 3.5212703108787538, + "epoch": 0.512, + "grad_norm": 992.0, + "kl_loss_13": 115.72470321655274, + "kl_loss_2": 2263.9080322265627, + "kl_loss_4": 1403.2633850097657, + "kl_loss_9": 511.30066986083983, + "learning_rate": 0.0004888942726743353, + "loss": 1082.3979, + "step": 5120 + }, + { + "ce_loss_13": 3.225136196613312, + "ce_loss_17": 3.1711061358451844, + "ce_loss_2": 4.225401782989502, + "ce_loss_4": 3.7794183254241944, + "ce_loss_9": 3.3844812989234923, + "epoch": 0.513, + "grad_norm": 1040.0, + "kl_loss_13": 110.7390235900879, + "kl_loss_2": 2239.033251953125, + "kl_loss_4": 1358.2411804199219, + "kl_loss_9": 486.7144241333008, + "learning_rate": 0.0004873080597348947, + "loss": 1050.067, + "step": 5130 + }, + { + "ce_loss_13": 3.110567259788513, + "ce_loss_17": 3.058481764793396, + "ce_loss_2": 4.1664651989936825, + "ce_loss_4": 3.7035851001739504, + "ce_loss_9": 3.276204264163971, + "epoch": 0.514, + "grad_norm": 1080.0, + "kl_loss_13": 107.96892433166504, + "kl_loss_2": 2336.91787109375, + "kl_loss_4": 1417.5145324707032, + "kl_loss_9": 492.3893783569336, + "learning_rate": 0.0004857219746031519, + "loss": 1056.3621, + "step": 5140 + }, + { + "ce_loss_13": 3.285878098011017, + "ce_loss_17": 3.234000086784363, + "ce_loss_2": 4.25599856376648, + "ce_loss_4": 3.825782763957977, + "ce_loss_9": 3.4363691091537474, + "epoch": 0.515, + "grad_norm": 876.0, + "kl_loss_13": 108.96701393127441, + "kl_loss_2": 2172.479949951172, + "kl_loss_4": 1325.5762634277344, + "kl_loss_9": 474.7083343505859, + "learning_rate": 0.0004841360332509663, + "loss": 1035.6582, + "step": 5150 + }, + { + "ce_loss_13": 3.2344977498054504, + "ce_loss_17": 3.18514369726181, + "ce_loss_2": 4.21552494764328, + "ce_loss_4": 3.774991989135742, + "ce_loss_9": 3.3879132509231566, + "epoch": 0.516, + "grad_norm": 952.0, + "kl_loss_13": 105.33232688903809, + "kl_loss_2": 2162.049597167969, + "kl_loss_4": 1304.0826171875, + "kl_loss_9": 466.10711364746095, + "learning_rate": 0.0004825502516487497, + "loss": 989.9464, + "step": 5160 + }, + { + "ce_loss_13": 3.1986342787742617, + "ce_loss_17": 3.15169472694397, + "ce_loss_2": 4.208314824104309, + "ce_loss_4": 3.763399827480316, + "ce_loss_9": 3.358096885681152, + "epoch": 0.517, + "grad_norm": 1488.0, + "kl_loss_13": 107.77177772521972, + "kl_loss_2": 2252.6386474609376, + "kl_loss_4": 1367.6669311523438, + "kl_loss_9": 484.79732208251954, + "learning_rate": 0.00048096464576530507, + "loss": 1049.5946, + "step": 5170 + }, + { + "ce_loss_13": 3.301984977722168, + "ce_loss_17": 3.2530831694602966, + "ce_loss_2": 4.245057606697083, + "ce_loss_4": 3.8227354407310488, + "ce_loss_9": 3.454120099544525, + "epoch": 0.518, + "grad_norm": 1400.0, + "kl_loss_13": 107.68561592102051, + "kl_loss_2": 2115.5692626953123, + "kl_loss_4": 1287.362078857422, + "kl_loss_9": 466.56765899658205, + "learning_rate": 0.00047937923156766646, + "loss": 1001.7566, + "step": 5180 + }, + { + "ce_loss_13": 3.3438785672187805, + "ce_loss_17": 3.2939836263656614, + "ce_loss_2": 4.281368744373322, + "ce_loss_4": 3.855742335319519, + "ce_loss_9": 3.4901130199432373, + "epoch": 0.519, + "grad_norm": 964.0, + "kl_loss_13": 105.55763473510743, + "kl_loss_2": 2111.768273925781, + "kl_loss_4": 1275.2513366699218, + "kl_loss_9": 463.88387603759764, + "learning_rate": 0.00047779402502093696, + "loss": 1006.8702, + "step": 5190 + }, + { + "ce_loss_13": 3.309345471858978, + "ce_loss_17": 3.2597888231277468, + "ce_loss_2": 4.279218089580536, + "ce_loss_4": 3.8487032413482667, + "ce_loss_9": 3.466625654697418, + "epoch": 0.52, + "grad_norm": 1392.0, + "kl_loss_13": 107.53615341186523, + "kl_loss_2": 2151.816961669922, + "kl_loss_4": 1306.5434936523438, + "kl_loss_9": 472.96825714111327, + "learning_rate": 0.0004762090420881289, + "loss": 1021.7607, + "step": 5200 + }, + { + "ce_loss_13": 3.2263648748397826, + "ce_loss_17": 3.178256297111511, + "ce_loss_2": 4.193283033370972, + "ce_loss_4": 3.7647646069526672, + "ce_loss_9": 3.379043686389923, + "epoch": 0.521, + "grad_norm": 1000.0, + "kl_loss_13": 107.35119972229003, + "kl_loss_2": 2144.997277832031, + "kl_loss_4": 1308.7781066894531, + "kl_loss_9": 466.6482498168945, + "learning_rate": 0.00047462429873000296, + "loss": 998.5607, + "step": 5210 + }, + { + "ce_loss_13": 3.3140349864959715, + "ce_loss_17": 3.2648982405662537, + "ce_loss_2": 4.272081685066223, + "ce_loss_4": 3.83947434425354, + "ce_loss_9": 3.4667728424072264, + "epoch": 0.522, + "grad_norm": 1032.0, + "kl_loss_13": 108.13305015563965, + "kl_loss_2": 2152.734210205078, + "kl_loss_4": 1297.7600891113282, + "kl_loss_9": 465.7859146118164, + "learning_rate": 0.0004730398109049071, + "loss": 1007.0312, + "step": 5220 + }, + { + "ce_loss_13": 3.2415935397148132, + "ce_loss_17": 3.189415395259857, + "ce_loss_2": 4.258810150623321, + "ce_loss_4": 3.807445538043976, + "ce_loss_9": 3.4064001560211183, + "epoch": 0.523, + "grad_norm": 1072.0, + "kl_loss_13": 111.09775276184082, + "kl_loss_2": 2255.535504150391, + "kl_loss_4": 1375.0738159179687, + "kl_loss_9": 494.41556549072266, + "learning_rate": 0.000471455594568616, + "loss": 1037.1305, + "step": 5230 + }, + { + "ce_loss_13": 3.311601257324219, + "ce_loss_17": 3.262132716178894, + "ce_loss_2": 4.251528978347778, + "ce_loss_4": 3.8337440371513365, + "ce_loss_9": 3.4593356370925905, + "epoch": 0.524, + "grad_norm": 1136.0, + "kl_loss_13": 108.04674186706544, + "kl_loss_2": 2109.9256896972656, + "kl_loss_4": 1283.6365844726563, + "kl_loss_9": 463.1393981933594, + "learning_rate": 0.00046987166567417086, + "loss": 1016.2275, + "step": 5240 + }, + { + "ce_loss_13": 3.2324166893959045, + "ce_loss_17": 3.1840704679489136, + "ce_loss_2": 4.214827346801758, + "ce_loss_4": 3.779071307182312, + "ce_loss_9": 3.387071359157562, + "epoch": 0.525, + "grad_norm": 920.0, + "kl_loss_13": 106.59242362976075, + "kl_loss_2": 2177.0141723632814, + "kl_loss_4": 1318.8663513183594, + "kl_loss_9": 471.74820098876955, + "learning_rate": 0.00046828804017171776, + "loss": 991.893, + "step": 5250 + }, + { + "ce_loss_13": 3.275792896747589, + "ce_loss_17": 3.2220492482185366, + "ce_loss_2": 4.290846729278565, + "ce_loss_4": 3.83939551115036, + "ce_loss_9": 3.4407880663871766, + "epoch": 0.526, + "grad_norm": 1136.0, + "kl_loss_13": 109.19308052062988, + "kl_loss_2": 2221.9618408203123, + "kl_loss_4": 1340.5270935058593, + "kl_loss_9": 476.9682327270508, + "learning_rate": 0.00046670473400834805, + "loss": 1040.4307, + "step": 5260 + }, + { + "ce_loss_13": 3.214914882183075, + "ce_loss_17": 3.1649786710739134, + "ce_loss_2": 4.185833942890167, + "ce_loss_4": 3.739937722682953, + "ce_loss_9": 3.3634310603141784, + "epoch": 0.527, + "grad_norm": 880.0, + "kl_loss_13": 104.75299758911133, + "kl_loss_2": 2147.398400878906, + "kl_loss_4": 1297.228009033203, + "kl_loss_9": 461.8563003540039, + "learning_rate": 0.00046512176312793734, + "loss": 1040.058, + "step": 5270 + }, + { + "ce_loss_13": 3.206827437877655, + "ce_loss_17": 3.1552355170249937, + "ce_loss_2": 4.187953531742096, + "ce_loss_4": 3.7473872661590577, + "ce_loss_9": 3.3625742316246034, + "epoch": 0.528, + "grad_norm": 1096.0, + "kl_loss_13": 106.52910118103027, + "kl_loss_2": 2187.5312744140624, + "kl_loss_4": 1322.4962463378906, + "kl_loss_9": 469.03101501464846, + "learning_rate": 0.00046353914347098467, + "loss": 1031.2357, + "step": 5280 + }, + { + "ce_loss_13": 3.305253803730011, + "ce_loss_17": 3.252510154247284, + "ce_loss_2": 4.285699701309204, + "ce_loss_4": 3.842504251003265, + "ce_loss_9": 3.4528360605239867, + "epoch": 0.529, + "grad_norm": 1056.0, + "kl_loss_13": 106.41786842346191, + "kl_loss_2": 2176.531140136719, + "kl_loss_4": 1306.1036376953125, + "kl_loss_9": 460.55931396484374, + "learning_rate": 0.0004619568909744524, + "loss": 1028.1643, + "step": 5290 + }, + { + "ce_loss_13": 3.307948422431946, + "ce_loss_17": 3.2569443583488464, + "ce_loss_2": 4.26685893535614, + "ce_loss_4": 3.834060752391815, + "ce_loss_9": 3.4600693821907043, + "epoch": 0.53, + "grad_norm": 1304.0, + "kl_loss_13": 107.5014633178711, + "kl_loss_2": 2140.457586669922, + "kl_loss_4": 1306.8389831542968, + "kl_loss_9": 467.36107482910154, + "learning_rate": 0.00046037502157160573, + "loss": 1022.1671, + "step": 5300 + }, + { + "ce_loss_13": 3.1830863833427427, + "ce_loss_17": 3.131933069229126, + "ce_loss_2": 4.17400164604187, + "ce_loss_4": 3.7360609650611876, + "ce_loss_9": 3.343784141540527, + "epoch": 0.531, + "grad_norm": 916.0, + "kl_loss_13": 108.30407600402832, + "kl_loss_2": 2198.6561645507813, + "kl_loss_4": 1332.3593627929688, + "kl_loss_9": 476.6579849243164, + "learning_rate": 0.00045879355119185207, + "loss": 1033.7932, + "step": 5310 + }, + { + "ce_loss_13": 3.2634960412979126, + "ce_loss_17": 3.2128461956977845, + "ce_loss_2": 4.2584593296051025, + "ce_loss_4": 3.8088276505470278, + "ce_loss_9": 3.423557090759277, + "epoch": 0.532, + "grad_norm": 1016.0, + "kl_loss_13": 109.0525390625, + "kl_loss_2": 2215.382958984375, + "kl_loss_4": 1344.674462890625, + "kl_loss_9": 483.93562469482424, + "learning_rate": 0.0004572124957605803, + "loss": 1045.333, + "step": 5320 + }, + { + "ce_loss_13": 3.2787564635276794, + "ce_loss_17": 3.2269508719444273, + "ce_loss_2": 4.256575560569763, + "ce_loss_4": 3.8199068903923035, + "ce_loss_9": 3.434715986251831, + "epoch": 0.533, + "grad_norm": 928.0, + "kl_loss_13": 108.03712043762206, + "kl_loss_2": 2182.551055908203, + "kl_loss_4": 1316.497491455078, + "kl_loss_9": 472.1242965698242, + "learning_rate": 0.00045563187119900103, + "loss": 1008.6424, + "step": 5330 + }, + { + "ce_loss_13": 3.1260906934738157, + "ce_loss_17": 3.0763007402420044, + "ce_loss_2": 4.137292850017547, + "ce_loss_4": 3.688651609420776, + "ce_loss_9": 3.285020625591278, + "epoch": 0.534, + "grad_norm": 1128.0, + "kl_loss_13": 107.27054481506347, + "kl_loss_2": 2240.3926513671877, + "kl_loss_4": 1355.5620910644532, + "kl_loss_9": 478.71144714355466, + "learning_rate": 0.00045405169342398633, + "loss": 1037.7574, + "step": 5340 + }, + { + "ce_loss_13": 3.2156192779541017, + "ce_loss_17": 3.1625937700271605, + "ce_loss_2": 4.220162975788116, + "ce_loss_4": 3.7749172687530517, + "ce_loss_9": 3.373515546321869, + "epoch": 0.535, + "grad_norm": 1032.0, + "kl_loss_13": 109.34287300109864, + "kl_loss_2": 2223.4241149902346, + "kl_loss_4": 1345.4398864746095, + "kl_loss_9": 476.4727493286133, + "learning_rate": 0.0004524719783479088, + "loss": 1016.2339, + "step": 5350 + }, + { + "ce_loss_13": 3.169036400318146, + "ce_loss_17": 3.1177022576332094, + "ce_loss_2": 4.195098793506622, + "ce_loss_4": 3.743642437458038, + "ce_loss_9": 3.333178925514221, + "epoch": 0.536, + "grad_norm": 1012.0, + "kl_loss_13": 109.25458374023438, + "kl_loss_2": 2278.1389343261717, + "kl_loss_4": 1380.7042236328125, + "kl_loss_9": 485.84498291015626, + "learning_rate": 0.00045089274187848144, + "loss": 1027.5537, + "step": 5360 + }, + { + "ce_loss_13": 3.2890742897987364, + "ce_loss_17": 3.2406167149543763, + "ce_loss_2": 4.258069348335266, + "ce_loss_4": 3.814786446094513, + "ce_loss_9": 3.439096474647522, + "epoch": 0.537, + "grad_norm": 1360.0, + "kl_loss_13": 105.89108619689941, + "kl_loss_2": 2161.1524658203125, + "kl_loss_4": 1292.0111572265625, + "kl_loss_9": 466.79013977050784, + "learning_rate": 0.00044931399991859835, + "loss": 1007.4661, + "step": 5370 + }, + { + "ce_loss_13": 3.149510049819946, + "ce_loss_17": 3.097951829433441, + "ce_loss_2": 4.137627530097961, + "ce_loss_4": 3.689609134197235, + "ce_loss_9": 3.305711901187897, + "epoch": 0.538, + "grad_norm": 992.0, + "kl_loss_13": 105.90481109619141, + "kl_loss_2": 2211.254040527344, + "kl_loss_4": 1328.7205688476563, + "kl_loss_9": 469.3480453491211, + "learning_rate": 0.00044773576836617336, + "loss": 1013.1835, + "step": 5380 + }, + { + "ce_loss_13": 3.236725854873657, + "ce_loss_17": 3.185615932941437, + "ce_loss_2": 4.233655059337616, + "ce_loss_4": 3.7948179721832274, + "ce_loss_9": 3.4014445185661315, + "epoch": 0.539, + "grad_norm": 1096.0, + "kl_loss_13": 108.59077491760254, + "kl_loss_2": 2215.704730224609, + "kl_loss_4": 1349.9497009277343, + "kl_loss_9": 482.33070068359376, + "learning_rate": 0.00044615806311398056, + "loss": 1052.5212, + "step": 5390 + }, + { + "ce_loss_13": 3.313949739933014, + "ce_loss_17": 3.267702412605286, + "ce_loss_2": 4.232453298568726, + "ce_loss_4": 3.8164884209632874, + "ce_loss_9": 3.459679901599884, + "epoch": 0.54, + "grad_norm": 976.0, + "kl_loss_13": 105.0506088256836, + "kl_loss_2": 2074.752337646484, + "kl_loss_4": 1256.4076843261719, + "kl_loss_9": 456.01829833984374, + "learning_rate": 0.00044458090004949454, + "loss": 1011.3523, + "step": 5400 + }, + { + "ce_loss_13": 3.174528408050537, + "ce_loss_17": 3.120724880695343, + "ce_loss_2": 4.2255029559135435, + "ce_loss_4": 3.760796058177948, + "ce_loss_9": 3.343866789340973, + "epoch": 0.541, + "grad_norm": 856.0, + "kl_loss_13": 112.00466651916504, + "kl_loss_2": 2339.7271423339844, + "kl_loss_4": 1414.7299194335938, + "kl_loss_9": 500.8909164428711, + "learning_rate": 0.0004430042950547297, + "loss": 1042.9648, + "step": 5410 + }, + { + "ce_loss_13": 3.2675156354904176, + "ce_loss_17": 3.2129441499710083, + "ce_loss_2": 4.268678307533264, + "ce_loss_4": 3.8227189064025877, + "ce_loss_9": 3.4316918730735777, + "epoch": 0.542, + "grad_norm": 1216.0, + "kl_loss_13": 111.6023162841797, + "kl_loss_2": 2220.746160888672, + "kl_loss_4": 1339.9001770019531, + "kl_loss_9": 484.6208038330078, + "learning_rate": 0.0004414282640060809, + "loss": 1028.1423, + "step": 5420 + }, + { + "ce_loss_13": 3.354524922370911, + "ce_loss_17": 3.3029243111610413, + "ce_loss_2": 4.306393313407898, + "ce_loss_4": 3.896896743774414, + "ce_loss_9": 3.532094955444336, + "epoch": 0.543, + "grad_norm": 1200.0, + "kl_loss_13": 108.4278579711914, + "kl_loss_2": 2109.394207763672, + "kl_loss_4": 1302.832470703125, + "kl_loss_9": 511.4583312988281, + "learning_rate": 0.0004398528227741633, + "loss": 1046.2162, + "step": 5430 + }, + { + "ce_loss_13": 3.2286988139152526, + "ce_loss_17": 3.177917945384979, + "ce_loss_2": 4.212862086296082, + "ce_loss_4": 3.784455132484436, + "ce_loss_9": 3.39864102602005, + "epoch": 0.544, + "grad_norm": 1056.0, + "kl_loss_13": 108.80259017944336, + "kl_loss_2": 2164.3222534179686, + "kl_loss_4": 1331.6523803710938, + "kl_loss_9": 498.5959197998047, + "learning_rate": 0.00043827798722365264, + "loss": 1041.6467, + "step": 5440 + }, + { + "ce_loss_13": 3.3431167721748354, + "ce_loss_17": 3.2934610247612, + "ce_loss_2": 4.288112020492553, + "ce_loss_4": 3.8623299598693848, + "ce_loss_9": 3.4948390245437624, + "epoch": 0.545, + "grad_norm": 1088.0, + "kl_loss_13": 108.57424964904786, + "kl_loss_2": 2127.683697509766, + "kl_loss_4": 1286.5928771972656, + "kl_loss_9": 474.2645523071289, + "learning_rate": 0.00043670377321312535, + "loss": 1004.8467, + "step": 5450 + }, + { + "ce_loss_13": 3.347521185874939, + "ce_loss_17": 3.2985552310943604, + "ce_loss_2": 4.28309155702591, + "ce_loss_4": 3.867686402797699, + "ce_loss_9": 3.4948613047599792, + "epoch": 0.546, + "grad_norm": 920.0, + "kl_loss_13": 107.39347343444824, + "kl_loss_2": 2107.3383605957033, + "kl_loss_4": 1289.5321411132813, + "kl_loss_9": 469.7123199462891, + "learning_rate": 0.0004351301965948991, + "loss": 1017.9658, + "step": 5460 + }, + { + "ce_loss_13": 3.258587098121643, + "ce_loss_17": 3.2080526828765867, + "ce_loss_2": 4.202140700817108, + "ce_loss_4": 3.781372916698456, + "ce_loss_9": 3.4050683259963987, + "epoch": 0.547, + "grad_norm": 1088.0, + "kl_loss_13": 105.97619361877442, + "kl_loss_2": 2098.9670166015626, + "kl_loss_4": 1278.442724609375, + "kl_loss_9": 458.47902221679686, + "learning_rate": 0.000433557273214873, + "loss": 1007.1094, + "step": 5470 + }, + { + "ce_loss_13": 3.2443623661994936, + "ce_loss_17": 3.195618045330048, + "ce_loss_2": 4.204770863056183, + "ce_loss_4": 3.7743229389190676, + "ce_loss_9": 3.3987857580184935, + "epoch": 0.548, + "grad_norm": 952.0, + "kl_loss_13": 107.47964782714844, + "kl_loss_2": 2137.7500183105467, + "kl_loss_4": 1290.947314453125, + "kl_loss_9": 466.47139434814454, + "learning_rate": 0.000431985018912368, + "loss": 994.8767, + "step": 5480 + }, + { + "ce_loss_13": 3.2204198360443117, + "ce_loss_17": 3.1687081933021544, + "ce_loss_2": 4.232752060890197, + "ce_loss_4": 3.7934465646743774, + "ce_loss_9": 3.3796839833259584, + "epoch": 0.549, + "grad_norm": 1064.0, + "kl_loss_13": 108.58690719604492, + "kl_loss_2": 2230.613244628906, + "kl_loss_4": 1368.739990234375, + "kl_loss_9": 481.31530609130857, + "learning_rate": 0.0004304134495199674, + "loss": 1012.5309, + "step": 5490 + }, + { + "ce_loss_13": 3.249473285675049, + "ce_loss_17": 3.197075438499451, + "ce_loss_2": 4.228789019584656, + "ce_loss_4": 3.7995336055755615, + "ce_loss_9": 3.410299515724182, + "epoch": 0.55, + "grad_norm": 1008.0, + "kl_loss_13": 108.9456859588623, + "kl_loss_2": 2213.9209594726562, + "kl_loss_4": 1358.5975769042968, + "kl_loss_9": 489.80407562255857, + "learning_rate": 0.0004288425808633575, + "loss": 1024.2502, + "step": 5500 + }, + { + "ce_loss_13": 3.226742923259735, + "ce_loss_17": 3.1780293107032778, + "ce_loss_2": 4.20872814655304, + "ce_loss_4": 3.7648880004882814, + "ce_loss_9": 3.375022864341736, + "epoch": 0.551, + "grad_norm": 1080.0, + "kl_loss_13": 105.94135360717773, + "kl_loss_2": 2181.6394165039064, + "kl_loss_4": 1320.9524353027343, + "kl_loss_9": 468.8830764770508, + "learning_rate": 0.0004272724287611684, + "loss": 1022.648, + "step": 5510 + }, + { + "ce_loss_13": 3.200169563293457, + "ce_loss_17": 3.147791862487793, + "ce_loss_2": 4.206039774417877, + "ce_loss_4": 3.747302758693695, + "ce_loss_9": 3.35833854675293, + "epoch": 0.552, + "grad_norm": 1024.0, + "kl_loss_13": 108.29860572814941, + "kl_loss_2": 2230.6957458496095, + "kl_loss_4": 1334.3414184570313, + "kl_loss_9": 477.27283172607423, + "learning_rate": 0.00042570300902481425, + "loss": 1027.3209, + "step": 5520 + }, + { + "ce_loss_13": 3.2276618480682373, + "ce_loss_17": 3.180474281311035, + "ce_loss_2": 4.1941629409790036, + "ce_loss_4": 3.7642850279808044, + "ce_loss_9": 3.3778549075126647, + "epoch": 0.553, + "grad_norm": 1024.0, + "kl_loss_13": 104.75520858764648, + "kl_loss_2": 2171.4489807128907, + "kl_loss_4": 1310.355047607422, + "kl_loss_9": 467.9978424072266, + "learning_rate": 0.00042413433745833423, + "loss": 1014.252, + "step": 5530 + }, + { + "ce_loss_13": 3.226637804508209, + "ce_loss_17": 3.178347134590149, + "ce_loss_2": 4.220327818393708, + "ce_loss_4": 3.778496539592743, + "ce_loss_9": 3.38158597946167, + "epoch": 0.554, + "grad_norm": 1064.0, + "kl_loss_13": 106.28964271545411, + "kl_loss_2": 2193.3007385253904, + "kl_loss_4": 1312.8393188476562, + "kl_loss_9": 468.48512878417966, + "learning_rate": 0.0004225664298582339, + "loss": 991.7201, + "step": 5540 + }, + { + "ce_loss_13": 3.311958611011505, + "ce_loss_17": 3.2612168908119203, + "ce_loss_2": 4.261777949333191, + "ce_loss_4": 3.8381883263587953, + "ce_loss_9": 3.458965373039246, + "epoch": 0.555, + "grad_norm": 940.0, + "kl_loss_13": 105.46303863525391, + "kl_loss_2": 2126.867779541016, + "kl_loss_4": 1290.0499877929688, + "kl_loss_9": 464.66784820556643, + "learning_rate": 0.000420999302013325, + "loss": 996.4063, + "step": 5550 + }, + { + "ce_loss_13": 3.213218593597412, + "ce_loss_17": 3.1599210858345033, + "ce_loss_2": 4.246359586715698, + "ce_loss_4": 3.7806575655937196, + "ce_loss_9": 3.3735639691352843, + "epoch": 0.556, + "grad_norm": 948.0, + "kl_loss_13": 110.27377281188964, + "kl_loss_2": 2278.909295654297, + "kl_loss_4": 1350.4434387207032, + "kl_loss_9": 483.4982437133789, + "learning_rate": 0.000419432969704568, + "loss": 1023.4113, + "step": 5560 + }, + { + "ce_loss_13": 3.2502139806747437, + "ce_loss_17": 3.200896847248077, + "ce_loss_2": 4.216920232772827, + "ce_loss_4": 3.781894659996033, + "ce_loss_9": 3.402904415130615, + "epoch": 0.557, + "grad_norm": 988.0, + "kl_loss_13": 106.63406486511231, + "kl_loss_2": 2131.5817016601563, + "kl_loss_4": 1289.1296752929688, + "kl_loss_9": 466.41649932861327, + "learning_rate": 0.00041786744870491154, + "loss": 1029.9557, + "step": 5570 + }, + { + "ce_loss_13": 3.1923250198364257, + "ce_loss_17": 3.1415870547294618, + "ce_loss_2": 4.173611140251159, + "ce_loss_4": 3.738932120800018, + "ce_loss_9": 3.3525986433029176, + "epoch": 0.558, + "grad_norm": 1056.0, + "kl_loss_13": 109.15409851074219, + "kl_loss_2": 2184.7519958496096, + "kl_loss_4": 1331.7179138183594, + "kl_loss_9": 480.9710983276367, + "learning_rate": 0.0004163027547791347, + "loss": 1019.7738, + "step": 5580 + }, + { + "ce_loss_13": 3.175397825241089, + "ce_loss_17": 3.1253713369369507, + "ce_loss_2": 4.207211792469025, + "ce_loss_4": 3.7362369775772093, + "ce_loss_9": 3.333598756790161, + "epoch": 0.559, + "grad_norm": 984.0, + "kl_loss_13": 108.09348678588867, + "kl_loss_2": 2271.22587890625, + "kl_loss_4": 1356.5689270019532, + "kl_loss_9": 479.36707458496096, + "learning_rate": 0.0004147389036836881, + "loss": 1030.4446, + "step": 5590 + }, + { + "ce_loss_13": 3.2243221521377565, + "ce_loss_17": 3.174144923686981, + "ce_loss_2": 4.220474207401276, + "ce_loss_4": 3.783113384246826, + "ce_loss_9": 3.3823321223258973, + "epoch": 0.56, + "grad_norm": 1336.0, + "kl_loss_13": 108.44959182739258, + "kl_loss_2": 2206.8712463378906, + "kl_loss_4": 1349.97109375, + "kl_loss_9": 480.26465301513673, + "learning_rate": 0.00041317591116653486, + "loss": 1045.5668, + "step": 5600 + }, + { + "ce_loss_13": 3.250491189956665, + "ce_loss_17": 3.1992578983306883, + "ce_loss_2": 4.251442277431488, + "ce_loss_4": 3.803864026069641, + "ce_loss_9": 3.410554790496826, + "epoch": 0.561, + "grad_norm": 1248.0, + "kl_loss_13": 110.18686599731446, + "kl_loss_2": 2221.180987548828, + "kl_loss_4": 1334.6320068359375, + "kl_loss_9": 481.29314575195315, + "learning_rate": 0.0004116137929669921, + "loss": 1018.3037, + "step": 5610 + }, + { + "ce_loss_13": 3.247248661518097, + "ce_loss_17": 3.196286952495575, + "ce_loss_2": 4.2242255210876465, + "ce_loss_4": 3.7908988118171694, + "ce_loss_9": 3.404494285583496, + "epoch": 0.562, + "grad_norm": 1128.0, + "kl_loss_13": 105.28205261230468, + "kl_loss_2": 2175.927453613281, + "kl_loss_4": 1323.4735595703125, + "kl_loss_9": 472.6299102783203, + "learning_rate": 0.00041005256481557305, + "loss": 1005.9586, + "step": 5620 + }, + { + "ce_loss_13": 3.340807831287384, + "ce_loss_17": 3.2939091444015505, + "ce_loss_2": 4.265756905078888, + "ce_loss_4": 3.8510836720466615, + "ce_loss_9": 3.485648810863495, + "epoch": 0.563, + "grad_norm": 1048.0, + "kl_loss_13": 102.68735733032227, + "kl_loss_2": 2069.946008300781, + "kl_loss_4": 1252.8001892089844, + "kl_loss_9": 453.3646759033203, + "learning_rate": 0.00040849224243382767, + "loss": 987.3195, + "step": 5630 + }, + { + "ce_loss_13": 3.204978549480438, + "ce_loss_17": 3.1556813597679136, + "ce_loss_2": 4.187444388866425, + "ce_loss_4": 3.7457319378852842, + "ce_loss_9": 3.3552908301353455, + "epoch": 0.564, + "grad_norm": 1048.0, + "kl_loss_13": 107.18028144836425, + "kl_loss_2": 2189.4162841796874, + "kl_loss_4": 1327.5060729980469, + "kl_loss_9": 474.95810546875, + "learning_rate": 0.000406932841534185, + "loss": 1003.8793, + "step": 5640 + }, + { + "ce_loss_13": 3.1665628433227537, + "ce_loss_17": 3.114697265625, + "ce_loss_2": 4.16257119178772, + "ce_loss_4": 3.7231425046920776, + "ce_loss_9": 3.3221094965934754, + "epoch": 0.565, + "grad_norm": 1200.0, + "kl_loss_13": 107.15737495422363, + "kl_loss_2": 2211.5795349121095, + "kl_loss_4": 1340.2088684082032, + "kl_loss_9": 473.61424713134767, + "learning_rate": 0.0004053743778197951, + "loss": 1046.642, + "step": 5650 + }, + { + "ce_loss_13": 3.2677687764167787, + "ce_loss_17": 3.217938768863678, + "ce_loss_2": 4.240297913551331, + "ce_loss_4": 3.808706891536713, + "ce_loss_9": 3.424710714817047, + "epoch": 0.566, + "grad_norm": 848.0, + "kl_loss_13": 108.83717041015625, + "kl_loss_2": 2152.098468017578, + "kl_loss_4": 1314.6810974121095, + "kl_loss_9": 477.20800323486327, + "learning_rate": 0.0004038168669843697, + "loss": 1031.1551, + "step": 5660 + }, + { + "ce_loss_13": 3.226331329345703, + "ce_loss_17": 3.1779110193252564, + "ce_loss_2": 4.176452052593231, + "ce_loss_4": 3.7551100611686707, + "ce_loss_9": 3.381741237640381, + "epoch": 0.567, + "grad_norm": 996.0, + "kl_loss_13": 105.135693359375, + "kl_loss_2": 2110.937890625, + "kl_loss_4": 1279.2918518066406, + "kl_loss_9": 463.6072463989258, + "learning_rate": 0.000402260324712026, + "loss": 1019.4232, + "step": 5670 + }, + { + "ce_loss_13": 3.267211210727692, + "ce_loss_17": 3.2202460289001467, + "ce_loss_2": 4.269510877132416, + "ce_loss_4": 3.8227261900901794, + "ce_loss_9": 3.425837850570679, + "epoch": 0.568, + "grad_norm": 1200.0, + "kl_loss_13": 104.88587112426758, + "kl_loss_2": 2207.832159423828, + "kl_loss_4": 1332.200506591797, + "kl_loss_9": 472.66163482666013, + "learning_rate": 0.00040070476667712743, + "loss": 1009.2877, + "step": 5680 + }, + { + "ce_loss_13": 3.2995219230651855, + "ce_loss_17": 3.248284697532654, + "ce_loss_2": 4.269084072113037, + "ce_loss_4": 3.8354527711868287, + "ce_loss_9": 3.449599826335907, + "epoch": 0.569, + "grad_norm": 1040.0, + "kl_loss_13": 107.38444290161132, + "kl_loss_2": 2152.9364685058595, + "kl_loss_4": 1305.1982849121093, + "kl_loss_9": 465.0652328491211, + "learning_rate": 0.0003991502085441259, + "loss": 1016.1959, + "step": 5690 + }, + { + "ce_loss_13": 3.338280963897705, + "ce_loss_17": 3.29004043340683, + "ce_loss_2": 4.264880752563476, + "ce_loss_4": 3.8434857964515685, + "ce_loss_9": 3.4819197177886965, + "epoch": 0.57, + "grad_norm": 1128.0, + "kl_loss_13": 104.25477027893066, + "kl_loss_2": 2071.682019042969, + "kl_loss_4": 1246.6133117675781, + "kl_loss_9": 451.98497619628904, + "learning_rate": 0.0003975966659674047, + "loss": 999.24, + "step": 5700 + }, + { + "ce_loss_13": 3.304381275177002, + "ce_loss_17": 3.2547492742538453, + "ce_loss_2": 4.2747913002967834, + "ce_loss_4": 3.832672679424286, + "ce_loss_9": 3.456703174114227, + "epoch": 0.571, + "grad_norm": 1040.0, + "kl_loss_13": 106.02146835327149, + "kl_loss_2": 2157.36142578125, + "kl_loss_4": 1288.7392883300781, + "kl_loss_9": 464.78004302978513, + "learning_rate": 0.0003960441545911204, + "loss": 998.801, + "step": 5710 + }, + { + "ce_loss_13": 3.2945245265960694, + "ce_loss_17": 3.244948410987854, + "ce_loss_2": 4.2505226016044615, + "ce_loss_4": 3.822256624698639, + "ce_loss_9": 3.444940137863159, + "epoch": 0.572, + "grad_norm": 1224.0, + "kl_loss_13": 106.71220932006835, + "kl_loss_2": 2147.4679260253906, + "kl_loss_4": 1306.8393249511719, + "kl_loss_9": 469.7888442993164, + "learning_rate": 0.0003944926900490452, + "loss": 1007.3723, + "step": 5720 + }, + { + "ce_loss_13": 3.2140813946723936, + "ce_loss_17": 3.1621242642402647, + "ce_loss_2": 4.2208818912506105, + "ce_loss_4": 3.7676998734474183, + "ce_loss_9": 3.3754167795181274, + "epoch": 0.573, + "grad_norm": 1144.0, + "kl_loss_13": 107.32836990356445, + "kl_loss_2": 2213.2598205566405, + "kl_loss_4": 1333.5973754882812, + "kl_loss_9": 477.7337875366211, + "learning_rate": 0.0003929422879644099, + "loss": 1009.975, + "step": 5730 + }, + { + "ce_loss_13": 3.2143573999404906, + "ce_loss_17": 3.1666715502738954, + "ce_loss_2": 4.180016028881073, + "ce_loss_4": 3.737887442111969, + "ce_loss_9": 3.3648409247398376, + "epoch": 0.574, + "grad_norm": 1224.0, + "kl_loss_13": 103.48085556030273, + "kl_loss_2": 2142.003973388672, + "kl_loss_4": 1292.037646484375, + "kl_loss_9": 461.6825668334961, + "learning_rate": 0.0003913929639497462, + "loss": 981.0994, + "step": 5740 + }, + { + "ce_loss_13": 3.1730966091156008, + "ce_loss_17": 3.123303234577179, + "ce_loss_2": 4.17813618183136, + "ce_loss_4": 3.727213191986084, + "ce_loss_9": 3.3294308304786684, + "epoch": 0.575, + "grad_norm": 1128.0, + "kl_loss_13": 105.16019668579102, + "kl_loss_2": 2217.6057250976564, + "kl_loss_4": 1333.008563232422, + "kl_loss_9": 467.17459869384766, + "learning_rate": 0.00038984473360672965, + "loss": 1002.6323, + "step": 5750 + }, + { + "ce_loss_13": 3.1798732042312623, + "ce_loss_17": 3.130116605758667, + "ce_loss_2": 4.180464386940002, + "ce_loss_4": 3.7306759119033814, + "ce_loss_9": 3.33222382068634, + "epoch": 0.576, + "grad_norm": 1032.0, + "kl_loss_13": 104.2150936126709, + "kl_loss_2": 2204.584893798828, + "kl_loss_4": 1320.6196411132812, + "kl_loss_9": 466.8303695678711, + "learning_rate": 0.0003882976125260229, + "loss": 999.3053, + "step": 5760 + }, + { + "ce_loss_13": 3.249653995037079, + "ce_loss_17": 3.198180115222931, + "ce_loss_2": 4.225491976737976, + "ce_loss_4": 3.784458315372467, + "ce_loss_9": 3.398803198337555, + "epoch": 0.577, + "grad_norm": 1064.0, + "kl_loss_13": 105.69968490600586, + "kl_loss_2": 2163.3163513183595, + "kl_loss_4": 1298.5627563476562, + "kl_loss_9": 462.13909912109375, + "learning_rate": 0.00038675161628711776, + "loss": 1012.3885, + "step": 5770 + }, + { + "ce_loss_13": 3.281804645061493, + "ce_loss_17": 3.2332274556159972, + "ce_loss_2": 4.2343097448349, + "ce_loss_4": 3.8072708368301393, + "ce_loss_9": 3.4310219168663023, + "epoch": 0.578, + "grad_norm": 1088.0, + "kl_loss_13": 105.93903160095215, + "kl_loss_2": 2116.0813720703127, + "kl_loss_4": 1283.2293762207032, + "kl_loss_9": 466.1056701660156, + "learning_rate": 0.0003852067604581794, + "loss": 1026.1375, + "step": 5780 + }, + { + "ce_loss_13": 3.231726658344269, + "ce_loss_17": 3.183363842964172, + "ce_loss_2": 4.217213237285614, + "ce_loss_4": 3.7710780262947083, + "ce_loss_9": 3.384087860584259, + "epoch": 0.579, + "grad_norm": 1304.0, + "kl_loss_13": 105.16792488098145, + "kl_loss_2": 2199.165490722656, + "kl_loss_4": 1327.1482482910155, + "kl_loss_9": 473.02309265136716, + "learning_rate": 0.0003836630605958888, + "loss": 1009.5799, + "step": 5790 + }, + { + "ce_loss_13": 3.285310745239258, + "ce_loss_17": 3.2366268157958986, + "ce_loss_2": 4.253984498977661, + "ce_loss_4": 3.8166111588478087, + "ce_loss_9": 3.437506675720215, + "epoch": 0.58, + "grad_norm": 1216.0, + "kl_loss_13": 106.23129653930664, + "kl_loss_2": 2174.561871337891, + "kl_loss_4": 1310.150457763672, + "kl_loss_9": 469.5129028320313, + "learning_rate": 0.0003821205322452863, + "loss": 1048.4742, + "step": 5800 + }, + { + "ce_loss_13": 3.2684043049812317, + "ce_loss_17": 3.2208677172660827, + "ce_loss_2": 4.2259564876556395, + "ce_loss_4": 3.7933168411254883, + "ce_loss_9": 3.4158771276474, + "epoch": 0.581, + "grad_norm": 836.0, + "kl_loss_13": 104.45205192565918, + "kl_loss_2": 2152.362384033203, + "kl_loss_4": 1302.3035400390625, + "kl_loss_9": 461.311328125, + "learning_rate": 0.0003805791909396155, + "loss": 1009.0611, + "step": 5810 + }, + { + "ce_loss_13": 3.2196359395980836, + "ce_loss_17": 3.1725378155708315, + "ce_loss_2": 4.202900910377503, + "ce_loss_4": 3.7574003100395204, + "ce_loss_9": 3.3732016682624817, + "epoch": 0.582, + "grad_norm": 1040.0, + "kl_loss_13": 104.37162628173829, + "kl_loss_2": 2172.766259765625, + "kl_loss_4": 1303.2477233886718, + "kl_loss_9": 461.41520233154296, + "learning_rate": 0.0003790390522001662, + "loss": 1018.1101, + "step": 5820 + }, + { + "ce_loss_13": 3.163041388988495, + "ce_loss_17": 3.1155371189117433, + "ce_loss_2": 4.147301125526428, + "ce_loss_4": 3.6993380546569825, + "ce_loss_9": 3.3110084414482115, + "epoch": 0.583, + "grad_norm": 1072.0, + "kl_loss_13": 102.91978492736817, + "kl_loss_2": 2206.9365173339843, + "kl_loss_4": 1321.4876586914063, + "kl_loss_9": 461.0155319213867, + "learning_rate": 0.0003775001315361183, + "loss": 1000.7617, + "step": 5830 + }, + { + "ce_loss_13": 3.2620732307434084, + "ce_loss_17": 3.2116411328315735, + "ce_loss_2": 4.253238999843598, + "ce_loss_4": 3.8036014914512633, + "ce_loss_9": 3.4165867805480956, + "epoch": 0.584, + "grad_norm": 980.0, + "kl_loss_13": 106.43870391845704, + "kl_loss_2": 2193.1389343261717, + "kl_loss_4": 1310.7002563476562, + "kl_loss_9": 465.5862838745117, + "learning_rate": 0.0003759624444443858, + "loss": 1015.6199, + "step": 5840 + }, + { + "ce_loss_13": 3.2950802087783813, + "ce_loss_17": 3.247869575023651, + "ce_loss_2": 4.25583291053772, + "ce_loss_4": 3.814383125305176, + "ce_loss_9": 3.44400874376297, + "epoch": 0.585, + "grad_norm": 1088.0, + "kl_loss_13": 104.62295837402344, + "kl_loss_2": 2148.91650390625, + "kl_loss_4": 1292.0863830566407, + "kl_loss_9": 458.55144653320315, + "learning_rate": 0.00037442600640946044, + "loss": 989.8988, + "step": 5850 + }, + { + "ce_loss_13": 3.259762477874756, + "ce_loss_17": 3.209624469280243, + "ce_loss_2": 4.210914778709411, + "ce_loss_4": 3.7830132484436034, + "ce_loss_9": 3.4094659805297853, + "epoch": 0.586, + "grad_norm": 884.0, + "kl_loss_13": 104.18981819152832, + "kl_loss_2": 2139.6120849609374, + "kl_loss_4": 1295.595281982422, + "kl_loss_9": 464.49849395751954, + "learning_rate": 0.00037289083290325663, + "loss": 982.6902, + "step": 5860 + }, + { + "ce_loss_13": 3.241625726222992, + "ce_loss_17": 3.1911665201187134, + "ce_loss_2": 4.195501685142517, + "ce_loss_4": 3.766813588142395, + "ce_loss_9": 3.38678138256073, + "epoch": 0.587, + "grad_norm": 1224.0, + "kl_loss_13": 104.56239128112793, + "kl_loss_2": 2111.3352966308594, + "kl_loss_4": 1270.1345153808593, + "kl_loss_9": 453.6889419555664, + "learning_rate": 0.0003713569393849543, + "loss": 987.5633, + "step": 5870 + }, + { + "ce_loss_13": 3.2889536738395693, + "ce_loss_17": 3.2411887526512144, + "ce_loss_2": 4.245223808288574, + "ce_loss_4": 3.820126986503601, + "ce_loss_9": 3.439180147647858, + "epoch": 0.588, + "grad_norm": 1064.0, + "kl_loss_13": 105.63932228088379, + "kl_loss_2": 2143.903204345703, + "kl_loss_4": 1300.479736328125, + "kl_loss_9": 466.0010681152344, + "learning_rate": 0.00036982434130084397, + "loss": 1007.8867, + "step": 5880 + }, + { + "ce_loss_13": 3.204426145553589, + "ce_loss_17": 3.15376101732254, + "ce_loss_2": 4.168929398059845, + "ce_loss_4": 3.7413434267044066, + "ce_loss_9": 3.359885025024414, + "epoch": 0.589, + "grad_norm": 1176.0, + "kl_loss_13": 107.33417320251465, + "kl_loss_2": 2148.308868408203, + "kl_loss_4": 1305.3046813964843, + "kl_loss_9": 471.7650177001953, + "learning_rate": 0.00036829305408417166, + "loss": 1014.3895, + "step": 5890 + }, + { + "ce_loss_13": 3.188737714290619, + "ce_loss_17": 3.1381046533584596, + "ce_loss_2": 4.196311950683594, + "ce_loss_4": 3.7427588939666747, + "ce_loss_9": 3.3490108847618103, + "epoch": 0.59, + "grad_norm": 920.0, + "kl_loss_13": 107.5712989807129, + "kl_loss_2": 2213.5058166503904, + "kl_loss_4": 1333.904315185547, + "kl_loss_9": 474.17527923583987, + "learning_rate": 0.0003667630931549826, + "loss": 1015.3334, + "step": 5900 + }, + { + "ce_loss_13": 3.163939726352692, + "ce_loss_17": 3.112599182128906, + "ce_loss_2": 4.1923185110092165, + "ce_loss_4": 3.7266955614089965, + "ce_loss_9": 3.3202701926231386, + "epoch": 0.591, + "grad_norm": 1224.0, + "kl_loss_13": 105.56490974426269, + "kl_loss_2": 2278.30625, + "kl_loss_4": 1359.6305053710937, + "kl_loss_9": 476.89821472167966, + "learning_rate": 0.00036523447391996613, + "loss": 1034.77, + "step": 5910 + }, + { + "ce_loss_13": 3.246525001525879, + "ce_loss_17": 3.1980230808258057, + "ce_loss_2": 4.2038208723068236, + "ce_loss_4": 3.772247862815857, + "ce_loss_9": 3.396599757671356, + "epoch": 0.592, + "grad_norm": 1024.0, + "kl_loss_13": 103.29920616149903, + "kl_loss_2": 2129.308709716797, + "kl_loss_4": 1283.3960571289062, + "kl_loss_9": 460.65132598876954, + "learning_rate": 0.00036370721177230114, + "loss": 990.7611, + "step": 5920 + }, + { + "ce_loss_13": 3.2498660683631897, + "ce_loss_17": 3.1988643765449525, + "ce_loss_2": 4.2426536798477175, + "ce_loss_4": 3.7951523661613464, + "ce_loss_9": 3.4062994480133058, + "epoch": 0.593, + "grad_norm": 1200.0, + "kl_loss_13": 107.33820114135742, + "kl_loss_2": 2186.1585998535156, + "kl_loss_4": 1317.0520080566407, + "kl_loss_9": 471.22791900634763, + "learning_rate": 0.00036218132209150044, + "loss": 1010.8336, + "step": 5930 + }, + { + "ce_loss_13": 3.199877750873566, + "ce_loss_17": 3.147655165195465, + "ce_loss_2": 4.225960111618042, + "ce_loss_4": 3.7693408370018004, + "ce_loss_9": 3.3650948286056517, + "epoch": 0.594, + "grad_norm": 988.0, + "kl_loss_13": 109.9642177581787, + "kl_loss_2": 2272.393426513672, + "kl_loss_4": 1375.5148315429688, + "kl_loss_9": 485.6137893676758, + "learning_rate": 0.0003606568202432562, + "loss": 1030.8201, + "step": 5940 + }, + { + "ce_loss_13": 3.2716619610786437, + "ce_loss_17": 3.2230890512466432, + "ce_loss_2": 4.276122367382049, + "ce_loss_4": 3.8125640869140627, + "ce_loss_9": 3.4222139358520507, + "epoch": 0.595, + "grad_norm": 1664.0, + "kl_loss_13": 106.80064086914062, + "kl_loss_2": 2246.651641845703, + "kl_loss_4": 1337.2862243652344, + "kl_loss_9": 472.8148498535156, + "learning_rate": 0.0003591337215792851, + "loss": 1007.8102, + "step": 5950 + }, + { + "ce_loss_13": 3.312983274459839, + "ce_loss_17": 3.2664036750793457, + "ce_loss_2": 4.2408958911895756, + "ce_loss_4": 3.822533071041107, + "ce_loss_9": 3.45985826253891, + "epoch": 0.596, + "grad_norm": 1144.0, + "kl_loss_13": 103.2039421081543, + "kl_loss_2": 2106.0645263671877, + "kl_loss_4": 1275.7274047851563, + "kl_loss_9": 459.8647399902344, + "learning_rate": 0.00035761204143717383, + "loss": 1003.2287, + "step": 5960 + }, + { + "ce_loss_13": 3.262428116798401, + "ce_loss_17": 3.2125823974609373, + "ce_loss_2": 4.233766210079193, + "ce_loss_4": 3.7987877130508423, + "ce_loss_9": 3.4145849466323854, + "epoch": 0.597, + "grad_norm": 1192.0, + "kl_loss_13": 105.41806068420411, + "kl_loss_2": 2175.652551269531, + "kl_loss_4": 1315.5428344726563, + "kl_loss_9": 467.65365142822264, + "learning_rate": 0.0003560917951402245, + "loss": 1032.5457, + "step": 5970 + }, + { + "ce_loss_13": 3.245500075817108, + "ce_loss_17": 3.1974679589271546, + "ce_loss_2": 4.2172334551811215, + "ce_loss_4": 3.7752400279045104, + "ce_loss_9": 3.3959237217903135, + "epoch": 0.598, + "grad_norm": 1576.0, + "kl_loss_13": 103.92341575622558, + "kl_loss_2": 2149.972381591797, + "kl_loss_4": 1289.4344360351563, + "kl_loss_9": 459.9585586547852, + "learning_rate": 0.00035457299799730046, + "loss": 999.1177, + "step": 5980 + }, + { + "ce_loss_13": 3.3038394451141357, + "ce_loss_17": 3.255398166179657, + "ce_loss_2": 4.2515393733978275, + "ce_loss_4": 3.8280488014221192, + "ce_loss_9": 3.4515564799308778, + "epoch": 0.599, + "grad_norm": 1012.0, + "kl_loss_13": 104.42323455810546, + "kl_loss_2": 2122.2318725585938, + "kl_loss_4": 1280.4359741210938, + "kl_loss_9": 460.97376861572263, + "learning_rate": 0.0003530556653026721, + "loss": 1007.8557, + "step": 5990 + }, + { + "ce_loss_13": 3.2271589994430543, + "ce_loss_17": 3.1783417582511904, + "ce_loss_2": 4.21160455942154, + "ce_loss_4": 3.760587203502655, + "ce_loss_9": 3.376507580280304, + "epoch": 0.6, + "grad_norm": 2960.0, + "kl_loss_13": 103.70793304443359, + "kl_loss_2": 2180.3646911621095, + "kl_loss_4": 1298.9149047851563, + "kl_loss_9": 453.79684295654295, + "learning_rate": 0.00035153981233586274, + "loss": 1012.7484, + "step": 6000 + }, + { + "ce_loss_13": 3.196209728717804, + "ce_loss_17": 3.1477805376052856, + "ce_loss_2": 4.182122385501861, + "ce_loss_4": 3.7424838185310363, + "ce_loss_9": 3.353682827949524, + "epoch": 0.601, + "grad_norm": 976.0, + "kl_loss_13": 102.67936897277832, + "kl_loss_2": 2174.152783203125, + "kl_loss_4": 1309.452081298828, + "kl_loss_9": 462.7125442504883, + "learning_rate": 0.00035002545436149473, + "loss": 1039.3401, + "step": 6010 + }, + { + "ce_loss_13": 3.210281789302826, + "ce_loss_17": 3.160186004638672, + "ce_loss_2": 4.201205313205719, + "ce_loss_4": 3.756035101413727, + "ce_loss_9": 3.361941027641296, + "epoch": 0.602, + "grad_norm": 1120.0, + "kl_loss_13": 107.11726303100586, + "kl_loss_2": 2211.111956787109, + "kl_loss_4": 1337.2409912109374, + "kl_loss_9": 473.2167694091797, + "learning_rate": 0.0003485126066291364, + "loss": 1003.5432, + "step": 6020 + }, + { + "ce_loss_13": 3.2568456649780275, + "ce_loss_17": 3.206075870990753, + "ce_loss_2": 4.235669422149658, + "ce_loss_4": 3.798553490638733, + "ce_loss_9": 3.403362047672272, + "epoch": 0.603, + "grad_norm": 1328.0, + "kl_loss_13": 103.70798225402832, + "kl_loss_2": 2173.763946533203, + "kl_loss_4": 1308.6146911621095, + "kl_loss_9": 458.2526031494141, + "learning_rate": 0.0003470012843731476, + "loss": 1012.1658, + "step": 6030 + }, + { + "ce_loss_13": 3.195317542552948, + "ce_loss_17": 3.146851134300232, + "ce_loss_2": 4.182744765281678, + "ce_loss_4": 3.7498085737228393, + "ce_loss_9": 3.352411425113678, + "epoch": 0.604, + "grad_norm": 1144.0, + "kl_loss_13": 104.77651710510254, + "kl_loss_2": 2180.915155029297, + "kl_loss_4": 1331.8111083984375, + "kl_loss_9": 466.05229034423826, + "learning_rate": 0.00034549150281252633, + "loss": 1033.7506, + "step": 6040 + }, + { + "ce_loss_13": 3.17526798248291, + "ce_loss_17": 3.1275426745414734, + "ce_loss_2": 4.138425624370575, + "ce_loss_4": 3.7111088275909423, + "ce_loss_9": 3.3338120102882387, + "epoch": 0.605, + "grad_norm": 1112.0, + "kl_loss_13": 104.60922775268554, + "kl_loss_2": 2114.93232421875, + "kl_loss_4": 1280.9937622070313, + "kl_loss_9": 458.72052612304685, + "learning_rate": 0.0003439832771507565, + "loss": 990.048, + "step": 6050 + }, + { + "ce_loss_13": 3.1868816018104553, + "ce_loss_17": 3.13805034160614, + "ce_loss_2": 4.172012794017792, + "ce_loss_4": 3.7354788899421694, + "ce_loss_9": 3.3404654026031495, + "epoch": 0.606, + "grad_norm": 944.0, + "kl_loss_13": 104.38705711364746, + "kl_loss_2": 2190.2447143554687, + "kl_loss_4": 1327.617758178711, + "kl_loss_9": 465.23214263916014, + "learning_rate": 0.0003424766225756537, + "loss": 1005.7731, + "step": 6060 + }, + { + "ce_loss_13": 3.2416218638420107, + "ce_loss_17": 3.1923166275024415, + "ce_loss_2": 4.216587698459625, + "ce_loss_4": 3.78301146030426, + "ce_loss_9": 3.3988449692726137, + "epoch": 0.607, + "grad_norm": 976.0, + "kl_loss_13": 104.24121742248535, + "kl_loss_2": 2156.653253173828, + "kl_loss_4": 1299.6509338378905, + "kl_loss_9": 466.19153289794923, + "learning_rate": 0.00034097155425921255, + "loss": 989.775, + "step": 6070 + }, + { + "ce_loss_13": 3.140945649147034, + "ce_loss_17": 3.0902453541755674, + "ce_loss_2": 4.134526884555816, + "ce_loss_4": 3.6841949820518494, + "ce_loss_9": 3.2898115873336793, + "epoch": 0.608, + "grad_norm": 1000.0, + "kl_loss_13": 105.24062995910644, + "kl_loss_2": 2215.407196044922, + "kl_loss_4": 1321.4732482910156, + "kl_loss_9": 465.3280715942383, + "learning_rate": 0.0003394680873574546, + "loss": 1008.4233, + "step": 6080 + }, + { + "ce_loss_13": 3.243442642688751, + "ce_loss_17": 3.193169891834259, + "ce_loss_2": 4.238304376602173, + "ce_loss_4": 3.794393229484558, + "ce_loss_9": 3.3994379162788393, + "epoch": 0.609, + "grad_norm": 1264.0, + "kl_loss_13": 106.26813545227051, + "kl_loss_2": 2206.4529235839846, + "kl_loss_4": 1336.7542907714844, + "kl_loss_9": 469.1918319702148, + "learning_rate": 0.0003379662370102747, + "loss": 1008.0615, + "step": 6090 + }, + { + "ce_loss_13": 3.251959192752838, + "ce_loss_17": 3.204720401763916, + "ce_loss_2": 4.204728186130524, + "ce_loss_4": 3.7797337055206297, + "ce_loss_9": 3.4027763843536376, + "epoch": 0.61, + "grad_norm": 1120.0, + "kl_loss_13": 103.9813003540039, + "kl_loss_2": 2146.872106933594, + "kl_loss_4": 1305.0810302734376, + "kl_loss_9": 464.12479400634766, + "learning_rate": 0.0003364660183412892, + "loss": 1010.1797, + "step": 6100 + }, + { + "ce_loss_13": 3.236882722377777, + "ce_loss_17": 3.1889939188957213, + "ce_loss_2": 4.19646600484848, + "ce_loss_4": 3.764793562889099, + "ce_loss_9": 3.386471688747406, + "epoch": 0.611, + "grad_norm": 992.0, + "kl_loss_13": 105.31930580139161, + "kl_loss_2": 2162.4851989746094, + "kl_loss_4": 1303.3688415527345, + "kl_loss_9": 467.3864410400391, + "learning_rate": 0.0003349674464576834, + "loss": 1019.5473, + "step": 6110 + }, + { + "ce_loss_13": 3.1896036744117735, + "ce_loss_17": 3.1396544218063354, + "ce_loss_2": 4.176100075244904, + "ce_loss_4": 3.7310116052627564, + "ce_loss_9": 3.3424534797668457, + "epoch": 0.612, + "grad_norm": 1032.0, + "kl_loss_13": 105.2658634185791, + "kl_loss_2": 2196.6162109375, + "kl_loss_4": 1314.2333984375, + "kl_loss_9": 465.91847229003906, + "learning_rate": 0.00033347053645005966, + "loss": 991.5459, + "step": 6120 + }, + { + "ce_loss_13": 3.2873972058296204, + "ce_loss_17": 3.2408745646476746, + "ce_loss_2": 4.230761992931366, + "ce_loss_4": 3.810734987258911, + "ce_loss_9": 3.434909200668335, + "epoch": 0.613, + "grad_norm": 1056.0, + "kl_loss_13": 102.8981834411621, + "kl_loss_2": 2092.7922607421874, + "kl_loss_4": 1274.9353942871094, + "kl_loss_9": 453.7485107421875, + "learning_rate": 0.00033197530339228485, + "loss": 1000.4702, + "step": 6130 + }, + { + "ce_loss_13": 3.2483258724212645, + "ce_loss_17": 3.197148072719574, + "ce_loss_2": 4.218422901630402, + "ce_loss_4": 3.785948061943054, + "ce_loss_9": 3.4022462129592896, + "epoch": 0.614, + "grad_norm": 1496.0, + "kl_loss_13": 106.79622459411621, + "kl_loss_2": 2153.293865966797, + "kl_loss_4": 1309.2161193847655, + "kl_loss_9": 471.3808197021484, + "learning_rate": 0.00033048176234133967, + "loss": 1000.4318, + "step": 6140 + }, + { + "ce_loss_13": 3.240337109565735, + "ce_loss_17": 3.1915099143981935, + "ce_loss_2": 4.199856150150299, + "ce_loss_4": 3.774345266819, + "ce_loss_9": 3.39338276386261, + "epoch": 0.615, + "grad_norm": 1168.0, + "kl_loss_13": 106.1420654296875, + "kl_loss_2": 2143.172637939453, + "kl_loss_4": 1306.88564453125, + "kl_loss_9": 470.33663177490234, + "learning_rate": 0.0003289899283371657, + "loss": 1013.5041, + "step": 6150 + }, + { + "ce_loss_13": 3.251935601234436, + "ce_loss_17": 3.2037736535072328, + "ce_loss_2": 4.234608602523804, + "ce_loss_4": 3.7864712119102477, + "ce_loss_9": 3.401926612854004, + "epoch": 0.616, + "grad_norm": 1136.0, + "kl_loss_13": 103.44955558776856, + "kl_loss_2": 2165.1412048339844, + "kl_loss_4": 1291.0008056640625, + "kl_loss_9": 457.18701171875, + "learning_rate": 0.0003274998164025148, + "loss": 1015.5461, + "step": 6160 + }, + { + "ce_loss_13": 3.287897837162018, + "ce_loss_17": 3.2381337761878966, + "ce_loss_2": 4.243561959266662, + "ce_loss_4": 3.817963790893555, + "ce_loss_9": 3.4391323208808897, + "epoch": 0.617, + "grad_norm": 1008.0, + "kl_loss_13": 105.92054252624511, + "kl_loss_2": 2127.7518920898438, + "kl_loss_4": 1291.0977844238282, + "kl_loss_9": 465.2182113647461, + "learning_rate": 0.0003260114415427975, + "loss": 1020.3175, + "step": 6170 + }, + { + "ce_loss_13": 3.210032606124878, + "ce_loss_17": 3.163334584236145, + "ce_loss_2": 4.208543956279755, + "ce_loss_4": 3.7583757996559144, + "ce_loss_9": 3.364535939693451, + "epoch": 0.618, + "grad_norm": 1232.0, + "kl_loss_13": 104.31474609375, + "kl_loss_2": 2209.049249267578, + "kl_loss_4": 1316.9962646484375, + "kl_loss_9": 460.4062561035156, + "learning_rate": 0.0003245248187459323, + "loss": 1031.5932, + "step": 6180 + }, + { + "ce_loss_13": 3.200501525402069, + "ce_loss_17": 3.156936466693878, + "ce_loss_2": 4.139424109458924, + "ce_loss_4": 3.7125108361244203, + "ce_loss_9": 3.3476163029670714, + "epoch": 0.619, + "grad_norm": 1112.0, + "kl_loss_13": 99.683984375, + "kl_loss_2": 2099.4702697753905, + "kl_loss_4": 1254.0724609375, + "kl_loss_9": 445.6229675292969, + "learning_rate": 0.00032303996298219416, + "loss": 980.149, + "step": 6190 + }, + { + "ce_loss_13": 3.281447637081146, + "ce_loss_17": 3.2324851989746093, + "ce_loss_2": 4.22553573846817, + "ce_loss_4": 3.8030932188034057, + "ce_loss_9": 3.42909117937088, + "epoch": 0.62, + "grad_norm": 904.0, + "kl_loss_13": 102.7640983581543, + "kl_loss_2": 2084.6480407714844, + "kl_loss_4": 1257.8009033203125, + "kl_loss_9": 448.228271484375, + "learning_rate": 0.00032155688920406414, + "loss": 979.9359, + "step": 6200 + }, + { + "ce_loss_13": 3.1930267095565794, + "ce_loss_17": 3.1435487985610964, + "ce_loss_2": 4.1974161028862, + "ce_loss_4": 3.7443273782730104, + "ce_loss_9": 3.342049038410187, + "epoch": 0.621, + "grad_norm": 1080.0, + "kl_loss_13": 106.1325756072998, + "kl_loss_2": 2211.7973876953124, + "kl_loss_4": 1327.9992065429688, + "kl_loss_9": 462.88376770019534, + "learning_rate": 0.0003200756123460788, + "loss": 1032.9165, + "step": 6210 + }, + { + "ce_loss_13": 3.234103286266327, + "ce_loss_17": 3.181319165229797, + "ce_loss_2": 4.2249065160751345, + "ce_loss_4": 3.7768866658210754, + "ce_loss_9": 3.3874099969863893, + "epoch": 0.622, + "grad_norm": 1072.0, + "kl_loss_13": 107.58830261230469, + "kl_loss_2": 2216.287457275391, + "kl_loss_4": 1329.381524658203, + "kl_loss_9": 473.97355499267576, + "learning_rate": 0.00031859614732467957, + "loss": 1029.472, + "step": 6220 + }, + { + "ce_loss_13": 3.275028681755066, + "ce_loss_17": 3.2258262038230896, + "ce_loss_2": 4.225001573562622, + "ce_loss_4": 3.793428909778595, + "ce_loss_9": 3.4213786482810975, + "epoch": 0.623, + "grad_norm": 884.0, + "kl_loss_13": 103.21855010986329, + "kl_loss_2": 2106.8102172851563, + "kl_loss_4": 1269.1572021484376, + "kl_loss_9": 450.4051086425781, + "learning_rate": 0.00031711850903806275, + "loss": 985.1707, + "step": 6230 + }, + { + "ce_loss_13": 3.1819409370422362, + "ce_loss_17": 3.1322739839553835, + "ce_loss_2": 4.175078105926514, + "ce_loss_4": 3.733454203605652, + "ce_loss_9": 3.3365514516830443, + "epoch": 0.624, + "grad_norm": 1136.0, + "kl_loss_13": 107.71724395751953, + "kl_loss_2": 2206.1650939941405, + "kl_loss_4": 1332.8798706054688, + "kl_loss_9": 474.23154602050784, + "learning_rate": 0.0003156427123660297, + "loss": 1007.2943, + "step": 6240 + }, + { + "ce_loss_13": 3.2704357385635374, + "ce_loss_17": 3.2225410461425783, + "ce_loss_2": 4.209833788871765, + "ce_loss_4": 3.790596091747284, + "ce_loss_9": 3.421807146072388, + "epoch": 0.625, + "grad_norm": 996.0, + "kl_loss_13": 104.33633689880371, + "kl_loss_2": 2099.1803771972654, + "kl_loss_4": 1278.2591857910156, + "kl_loss_9": 458.1471405029297, + "learning_rate": 0.0003141687721698363, + "loss": 999.332, + "step": 6250 + }, + { + "ce_loss_13": 3.2414236307144164, + "ce_loss_17": 3.1960933685302733, + "ce_loss_2": 4.165024363994599, + "ce_loss_4": 3.747212052345276, + "ce_loss_9": 3.385522425174713, + "epoch": 0.626, + "grad_norm": 888.0, + "kl_loss_13": 99.80004425048828, + "kl_loss_2": 2052.9569763183595, + "kl_loss_4": 1230.8111267089844, + "kl_loss_9": 440.13487548828124, + "learning_rate": 0.00031269670329204396, + "loss": 984.5798, + "step": 6260 + }, + { + "ce_loss_13": 3.280908465385437, + "ce_loss_17": 3.234300422668457, + "ce_loss_2": 4.206663715839386, + "ce_loss_4": 3.7964126467704773, + "ce_loss_9": 3.42737158536911, + "epoch": 0.627, + "grad_norm": 1072.0, + "kl_loss_13": 104.54029731750488, + "kl_loss_2": 2086.112091064453, + "kl_loss_4": 1266.8228820800782, + "kl_loss_9": 458.22731323242186, + "learning_rate": 0.00031122652055637015, + "loss": 997.6929, + "step": 6270 + }, + { + "ce_loss_13": 3.2415281653404238, + "ce_loss_17": 3.194524824619293, + "ce_loss_2": 4.21508618593216, + "ce_loss_4": 3.7768426299095155, + "ce_loss_9": 3.3972652673721315, + "epoch": 0.628, + "grad_norm": 884.0, + "kl_loss_13": 104.50228538513184, + "kl_loss_2": 2178.8778686523438, + "kl_loss_4": 1313.34853515625, + "kl_loss_9": 464.4199188232422, + "learning_rate": 0.0003097582387675385, + "loss": 995.9834, + "step": 6280 + }, + { + "ce_loss_13": 3.2811944127082824, + "ce_loss_17": 3.234877955913544, + "ce_loss_2": 4.241111028194427, + "ce_loss_4": 3.809443771839142, + "ce_loss_9": 3.4293747186660766, + "epoch": 0.629, + "grad_norm": 972.0, + "kl_loss_13": 104.42979125976562, + "kl_loss_2": 2162.802593994141, + "kl_loss_4": 1301.1986999511719, + "kl_loss_9": 461.3689819335938, + "learning_rate": 0.00030829187271113034, + "loss": 996.6279, + "step": 6290 + }, + { + "ce_loss_13": 3.273211932182312, + "ce_loss_17": 3.2255611658096313, + "ce_loss_2": 4.215889430046081, + "ce_loss_4": 3.788710916042328, + "ce_loss_9": 3.416086220741272, + "epoch": 0.63, + "grad_norm": 1312.0, + "kl_loss_13": 102.39365730285644, + "kl_loss_2": 2102.2348388671876, + "kl_loss_4": 1264.0921630859375, + "kl_loss_9": 449.0651657104492, + "learning_rate": 0.00030682743715343565, + "loss": 1004.2732, + "step": 6300 + }, + { + "ce_loss_13": 3.2214179158210756, + "ce_loss_17": 3.1716656565666197, + "ce_loss_2": 4.200562572479248, + "ce_loss_4": 3.766305387020111, + "ce_loss_9": 3.3748429894447325, + "epoch": 0.631, + "grad_norm": 1008.0, + "kl_loss_13": 107.51469841003419, + "kl_loss_2": 2158.5488220214843, + "kl_loss_4": 1310.2103698730468, + "kl_loss_9": 468.02489013671874, + "learning_rate": 0.0003053649468413043, + "loss": 1021.0883, + "step": 6310 + }, + { + "ce_loss_13": 3.3317938446998596, + "ce_loss_17": 3.2813440680503847, + "ce_loss_2": 4.274353742599487, + "ce_loss_4": 3.858440887928009, + "ce_loss_9": 3.4813751101493837, + "epoch": 0.632, + "grad_norm": 1152.0, + "kl_loss_13": 105.76119689941406, + "kl_loss_2": 2120.442108154297, + "kl_loss_4": 1294.5304077148437, + "kl_loss_9": 463.05701751708983, + "learning_rate": 0.00030390441650199725, + "loss": 992.1391, + "step": 6320 + }, + { + "ce_loss_13": 3.23501238822937, + "ce_loss_17": 3.189093732833862, + "ce_loss_2": 4.195493984222412, + "ce_loss_4": 3.7625299096107483, + "ce_loss_9": 3.3860408782958986, + "epoch": 0.633, + "grad_norm": 1216.0, + "kl_loss_13": 103.58341941833496, + "kl_loss_2": 2130.9781127929687, + "kl_loss_4": 1287.248583984375, + "kl_loss_9": 459.33980255126954, + "learning_rate": 0.00030244586084303903, + "loss": 989.9657, + "step": 6330 + }, + { + "ce_loss_13": 3.2015799045562745, + "ce_loss_17": 3.1522209286689757, + "ce_loss_2": 4.191288423538208, + "ce_loss_4": 3.7551525950431826, + "ce_loss_9": 3.3566280722618105, + "epoch": 0.634, + "grad_norm": 900.0, + "kl_loss_13": 106.068558883667, + "kl_loss_2": 2197.0657348632812, + "kl_loss_4": 1333.6278137207032, + "kl_loss_9": 470.49120025634767, + "learning_rate": 0.00030098929455206903, + "loss": 997.0803, + "step": 6340 + }, + { + "ce_loss_13": 3.209775185585022, + "ce_loss_17": 3.1625745058059693, + "ce_loss_2": 4.177516996860504, + "ce_loss_4": 3.7348382115364074, + "ce_loss_9": 3.358668029308319, + "epoch": 0.635, + "grad_norm": 1096.0, + "kl_loss_13": 102.4077766418457, + "kl_loss_2": 2168.529180908203, + "kl_loss_4": 1296.4439208984375, + "kl_loss_9": 457.2828735351562, + "learning_rate": 0.00029953473229669324, + "loss": 1023.8111, + "step": 6350 + }, + { + "ce_loss_13": 3.2389843583106996, + "ce_loss_17": 3.192874884605408, + "ce_loss_2": 4.208077490329742, + "ce_loss_4": 3.7804375171661375, + "ce_loss_9": 3.395845854282379, + "epoch": 0.636, + "grad_norm": 940.0, + "kl_loss_13": 102.62096939086913, + "kl_loss_2": 2146.139056396484, + "kl_loss_4": 1294.9204406738281, + "kl_loss_9": 464.46875762939453, + "learning_rate": 0.00029808218872433767, + "loss": 989.9184, + "step": 6360 + }, + { + "ce_loss_13": 3.29668550491333, + "ce_loss_17": 3.2472703099250793, + "ce_loss_2": 4.247447431087494, + "ce_loss_4": 3.820680856704712, + "ce_loss_9": 3.4459635615348816, + "epoch": 0.637, + "grad_norm": 988.0, + "kl_loss_13": 103.13903846740723, + "kl_loss_2": 2120.7714294433595, + "kl_loss_4": 1282.2728271484375, + "kl_loss_9": 454.44849700927733, + "learning_rate": 0.0002966316784621, + "loss": 982.4893, + "step": 6370 + }, + { + "ce_loss_13": 3.213066005706787, + "ce_loss_17": 3.162489891052246, + "ce_loss_2": 4.202362954616547, + "ce_loss_4": 3.7600911974906923, + "ce_loss_9": 3.3690034627914427, + "epoch": 0.638, + "grad_norm": 1080.0, + "kl_loss_13": 105.10025787353516, + "kl_loss_2": 2189.3020263671874, + "kl_loss_4": 1318.2996520996094, + "kl_loss_9": 472.2932327270508, + "learning_rate": 0.0002951832161166024, + "loss": 997.1781, + "step": 6380 + }, + { + "ce_loss_13": 3.2860203266143797, + "ce_loss_17": 3.2354984164237974, + "ce_loss_2": 4.246008443832397, + "ce_loss_4": 3.8232905983924867, + "ce_loss_9": 3.4389090299606324, + "epoch": 0.639, + "grad_norm": 988.0, + "kl_loss_13": 105.26334724426269, + "kl_loss_2": 2128.4377197265626, + "kl_loss_4": 1295.216290283203, + "kl_loss_9": 462.69873199462893, + "learning_rate": 0.0002937368162738445, + "loss": 979.7954, + "step": 6390 + }, + { + "ce_loss_13": 3.232245588302612, + "ce_loss_17": 3.1863335490226747, + "ce_loss_2": 4.182946264743805, + "ce_loss_4": 3.7543675661087037, + "ce_loss_9": 3.3733017563819887, + "epoch": 0.64, + "grad_norm": 1192.0, + "kl_loss_13": 99.80812568664551, + "kl_loss_2": 2130.761614990234, + "kl_loss_4": 1283.79404296875, + "kl_loss_9": 445.4130569458008, + "learning_rate": 0.0002922924934990568, + "loss": 1004.1253, + "step": 6400 + }, + { + "ce_loss_13": 3.170327401161194, + "ce_loss_17": 3.1216153860092164, + "ce_loss_2": 4.174137079715729, + "ce_loss_4": 3.7283746719360353, + "ce_loss_9": 3.3237895011901855, + "epoch": 0.641, + "grad_norm": 904.0, + "kl_loss_13": 103.18271827697754, + "kl_loss_2": 2220.125421142578, + "kl_loss_4": 1338.4816772460938, + "kl_loss_9": 464.634814453125, + "learning_rate": 0.0002908502623365536, + "loss": 1009.507, + "step": 6410 + }, + { + "ce_loss_13": 3.1035951495170595, + "ce_loss_17": 3.055507016181946, + "ce_loss_2": 4.113869214057923, + "ce_loss_4": 3.657516372203827, + "ce_loss_9": 3.2565125226974487, + "epoch": 0.642, + "grad_norm": 1424.0, + "kl_loss_13": 102.2946002960205, + "kl_loss_2": 2238.432958984375, + "kl_loss_4": 1344.4247802734376, + "kl_loss_9": 464.67442932128904, + "learning_rate": 0.0002894101373095867, + "loss": 1016.3265, + "step": 6420 + }, + { + "ce_loss_13": 3.3120118260383604, + "ce_loss_17": 3.2641794323921203, + "ce_loss_2": 4.254489946365356, + "ce_loss_4": 3.8334326267242433, + "ce_loss_9": 3.4581778764724733, + "epoch": 0.643, + "grad_norm": 1096.0, + "kl_loss_13": 104.83038482666015, + "kl_loss_2": 2120.8044860839846, + "kl_loss_4": 1284.6194396972655, + "kl_loss_9": 457.8020477294922, + "learning_rate": 0.00028797213292019926, + "loss": 993.7502, + "step": 6430 + }, + { + "ce_loss_13": 3.2902072072029114, + "ce_loss_17": 3.2400644421577454, + "ce_loss_2": 4.245911180973053, + "ce_loss_4": 3.814227652549744, + "ce_loss_9": 3.438106060028076, + "epoch": 0.644, + "grad_norm": 840.0, + "kl_loss_13": 104.72585029602051, + "kl_loss_2": 2128.230145263672, + "kl_loss_4": 1286.1579162597657, + "kl_loss_9": 463.4535629272461, + "learning_rate": 0.0002865362636490791, + "loss": 1014.3291, + "step": 6440 + }, + { + "ce_loss_13": 3.306964325904846, + "ce_loss_17": 3.260729491710663, + "ce_loss_2": 4.253649592399597, + "ce_loss_4": 3.8314553260803224, + "ce_loss_9": 3.454626405239105, + "epoch": 0.645, + "grad_norm": 968.0, + "kl_loss_13": 103.07368850708008, + "kl_loss_2": 2117.1407836914063, + "kl_loss_4": 1290.5475646972657, + "kl_loss_9": 454.7432098388672, + "learning_rate": 0.0002851025439554142, + "loss": 988.6116, + "step": 6450 + }, + { + "ce_loss_13": 3.2901487827301024, + "ce_loss_17": 3.241109323501587, + "ce_loss_2": 4.22101411819458, + "ce_loss_4": 3.8168194770812987, + "ce_loss_9": 3.4408981561660767, + "epoch": 0.646, + "grad_norm": 952.0, + "kl_loss_13": 103.53351631164551, + "kl_loss_2": 2070.153704833984, + "kl_loss_4": 1278.0226745605469, + "kl_loss_9": 457.4317153930664, + "learning_rate": 0.00028367098827674573, + "loss": 983.8079, + "step": 6460 + }, + { + "ce_loss_13": 3.2246446132659914, + "ce_loss_17": 3.1765026092529296, + "ce_loss_2": 4.182961690425873, + "ce_loss_4": 3.7429243206977842, + "ce_loss_9": 3.3690723180770874, + "epoch": 0.647, + "grad_norm": 1048.0, + "kl_loss_13": 100.93603286743163, + "kl_loss_2": 2129.2001525878904, + "kl_loss_4": 1271.9819885253905, + "kl_loss_9": 451.08668518066406, + "learning_rate": 0.00028224161102882397, + "loss": 994.8107, + "step": 6470 + }, + { + "ce_loss_13": 3.1979884624481203, + "ce_loss_17": 3.154080641269684, + "ce_loss_2": 4.133810496330261, + "ce_loss_4": 3.7249322056770326, + "ce_loss_9": 3.3427038550376893, + "epoch": 0.648, + "grad_norm": 1176.0, + "kl_loss_13": 100.88006210327148, + "kl_loss_2": 2092.920129394531, + "kl_loss_4": 1275.9655517578126, + "kl_loss_9": 452.5431106567383, + "learning_rate": 0.00028081442660546124, + "loss": 991.6566, + "step": 6480 + }, + { + "ce_loss_13": 3.2644654393196104, + "ce_loss_17": 3.2164513230323792, + "ce_loss_2": 4.204858922958374, + "ce_loss_4": 3.777467978000641, + "ce_loss_9": 3.4090405821800234, + "epoch": 0.649, + "grad_norm": 1072.0, + "kl_loss_13": 104.46651840209961, + "kl_loss_2": 2107.4126525878905, + "kl_loss_4": 1260.0858520507813, + "kl_loss_9": 454.9138488769531, + "learning_rate": 0.0002793894493783892, + "loss": 989.8436, + "step": 6490 + }, + { + "ce_loss_13": 3.2795921087265016, + "ce_loss_17": 3.2319431066513062, + "ce_loss_2": 4.233194398880005, + "ce_loss_4": 3.79156676530838, + "ce_loss_9": 3.4226823449134827, + "epoch": 0.65, + "grad_norm": 980.0, + "kl_loss_13": 101.57818145751953, + "kl_loss_2": 2114.493798828125, + "kl_loss_4": 1257.4849426269532, + "kl_loss_9": 448.1490447998047, + "learning_rate": 0.0002779666936971129, + "loss": 979.8436, + "step": 6500 + }, + { + "ce_loss_13": 3.2866982102394102, + "ce_loss_17": 3.2400903701782227, + "ce_loss_2": 4.253109622001648, + "ce_loss_4": 3.820470058917999, + "ce_loss_9": 3.4409501194953918, + "epoch": 0.651, + "grad_norm": 1184.0, + "kl_loss_13": 103.64112167358398, + "kl_loss_2": 2151.6502685546875, + "kl_loss_4": 1301.0191528320313, + "kl_loss_9": 462.5331665039063, + "learning_rate": 0.00027654617388876614, + "loss": 1007.7316, + "step": 6510 + }, + { + "ce_loss_13": 3.3100210428237915, + "ce_loss_17": 3.260994827747345, + "ce_loss_2": 4.257573354244232, + "ce_loss_4": 3.828768563270569, + "ce_loss_9": 3.454323935508728, + "epoch": 0.652, + "grad_norm": 1012.0, + "kl_loss_13": 104.20470161437989, + "kl_loss_2": 2127.4112182617187, + "kl_loss_4": 1279.7777465820313, + "kl_loss_9": 456.177424621582, + "learning_rate": 0.0002751279042579672, + "loss": 993.2917, + "step": 6520 + }, + { + "ce_loss_13": 3.2570806622505186, + "ce_loss_17": 3.2090718030929564, + "ce_loss_2": 4.199404299259186, + "ce_loss_4": 3.7739688396453857, + "ce_loss_9": 3.4020878672599792, + "epoch": 0.653, + "grad_norm": 1136.0, + "kl_loss_13": 100.88242416381836, + "kl_loss_2": 2104.5212585449217, + "kl_loss_4": 1266.602606201172, + "kl_loss_9": 450.30115509033203, + "learning_rate": 0.00027371189908667604, + "loss": 999.6396, + "step": 6530 + }, + { + "ce_loss_13": 3.306930994987488, + "ce_loss_17": 3.2566690683364867, + "ce_loss_2": 4.294221377372741, + "ce_loss_4": 3.8480350017547607, + "ce_loss_9": 3.4587231397628786, + "epoch": 0.654, + "grad_norm": 904.0, + "kl_loss_13": 107.04639892578125, + "kl_loss_2": 2182.273889160156, + "kl_loss_4": 1305.1680114746093, + "kl_loss_9": 463.10042877197264, + "learning_rate": 0.00027229817263404863, + "loss": 1021.0101, + "step": 6540 + }, + { + "ce_loss_13": 3.2906779766082765, + "ce_loss_17": 3.2438485503196715, + "ce_loss_2": 4.19850103855133, + "ce_loss_4": 3.787437987327576, + "ce_loss_9": 3.4315773367881777, + "epoch": 0.655, + "grad_norm": 964.0, + "kl_loss_13": 101.8517406463623, + "kl_loss_2": 2047.837579345703, + "kl_loss_4": 1241.574639892578, + "kl_loss_9": 447.4479278564453, + "learning_rate": 0.0002708867391362948, + "loss": 979.7929, + "step": 6550 + }, + { + "ce_loss_13": 3.2700411915779113, + "ce_loss_17": 3.2241607189178465, + "ce_loss_2": 4.18927526473999, + "ce_loss_4": 3.767679417133331, + "ce_loss_9": 3.4116820216178896, + "epoch": 0.656, + "grad_norm": 1040.0, + "kl_loss_13": 100.81562309265136, + "kl_loss_2": 2053.5175903320314, + "kl_loss_4": 1219.6805786132813, + "kl_loss_9": 435.45800628662107, + "learning_rate": 0.0002694776128065345, + "loss": 980.6092, + "step": 6560 + }, + { + "ce_loss_13": 3.2063382029533387, + "ce_loss_17": 3.159927189350128, + "ce_loss_2": 4.1649257898330685, + "ce_loss_4": 3.736240255832672, + "ce_loss_9": 3.356412422657013, + "epoch": 0.657, + "grad_norm": 900.0, + "kl_loss_13": 103.89412498474121, + "kl_loss_2": 2148.0609741210938, + "kl_loss_4": 1304.7820617675782, + "kl_loss_9": 464.2591094970703, + "learning_rate": 0.00026807080783465374, + "loss": 985.6221, + "step": 6570 + }, + { + "ce_loss_13": 3.3128485321998595, + "ce_loss_17": 3.264893925189972, + "ce_loss_2": 4.268110942840576, + "ce_loss_4": 3.846385049819946, + "ce_loss_9": 3.461671328544617, + "epoch": 0.658, + "grad_norm": 876.0, + "kl_loss_13": 104.39120063781738, + "kl_loss_2": 2137.850030517578, + "kl_loss_4": 1302.5514770507812, + "kl_loss_9": 462.6618316650391, + "learning_rate": 0.00026666633838716316, + "loss": 1005.1484, + "step": 6580 + }, + { + "ce_loss_13": 3.211996853351593, + "ce_loss_17": 3.16183180809021, + "ce_loss_2": 4.184601545333862, + "ce_loss_4": 3.74734867811203, + "ce_loss_9": 3.366036319732666, + "epoch": 0.659, + "grad_norm": 1020.0, + "kl_loss_13": 106.22913360595703, + "kl_loss_2": 2161.1759338378906, + "kl_loss_4": 1315.378515625, + "kl_loss_9": 465.3143035888672, + "learning_rate": 0.00026526421860705474, + "loss": 1014.7031, + "step": 6590 + }, + { + "ce_loss_13": 3.235409379005432, + "ce_loss_17": 3.1856571078300475, + "ce_loss_2": 4.201184010505676, + "ce_loss_4": 3.767891824245453, + "ce_loss_9": 3.3857994198799135, + "epoch": 0.66, + "grad_norm": 1048.0, + "kl_loss_13": 105.19897346496582, + "kl_loss_2": 2143.557482910156, + "kl_loss_4": 1285.1950744628907, + "kl_loss_9": 460.3154861450195, + "learning_rate": 0.0002638644626136587, + "loss": 991.368, + "step": 6600 + }, + { + "ce_loss_13": 3.2461226105690004, + "ce_loss_17": 3.2000025629997255, + "ce_loss_2": 4.201997339725494, + "ce_loss_4": 3.773619270324707, + "ce_loss_9": 3.3955109477043153, + "epoch": 0.661, + "grad_norm": 1272.0, + "kl_loss_13": 102.3372974395752, + "kl_loss_2": 2124.0232177734374, + "kl_loss_4": 1288.3472900390625, + "kl_loss_9": 458.7044876098633, + "learning_rate": 0.00026246708450250255, + "loss": 995.341, + "step": 6610 + }, + { + "ce_loss_13": 3.237837779521942, + "ce_loss_17": 3.191122889518738, + "ce_loss_2": 4.178948020935058, + "ce_loss_4": 3.758194422721863, + "ce_loss_9": 3.3838201999664306, + "epoch": 0.662, + "grad_norm": 1296.0, + "kl_loss_13": 101.85936317443847, + "kl_loss_2": 2097.706512451172, + "kl_loss_4": 1268.7511413574218, + "kl_loss_9": 450.9641387939453, + "learning_rate": 0.00026107209834516854, + "loss": 986.7556, + "step": 6620 + }, + { + "ce_loss_13": 3.1941364645957946, + "ce_loss_17": 3.145762324333191, + "ce_loss_2": 4.193040311336517, + "ce_loss_4": 3.744525933265686, + "ce_loss_9": 3.3457990765571592, + "epoch": 0.663, + "grad_norm": 1192.0, + "kl_loss_13": 104.02223472595215, + "kl_loss_2": 2222.0311584472656, + "kl_loss_4": 1333.690606689453, + "kl_loss_9": 464.6977844238281, + "learning_rate": 0.0002596795181891514, + "loss": 1022.1763, + "step": 6630 + }, + { + "ce_loss_13": 3.200673055648804, + "ce_loss_17": 3.151281237602234, + "ce_loss_2": 4.17911251783371, + "ce_loss_4": 3.737889790534973, + "ce_loss_9": 3.353026473522186, + "epoch": 0.664, + "grad_norm": 1064.0, + "kl_loss_13": 106.82000122070312, + "kl_loss_2": 2175.008709716797, + "kl_loss_4": 1313.3203918457032, + "kl_loss_9": 471.40569458007815, + "learning_rate": 0.000258289358057718, + "loss": 1042.0936, + "step": 6640 + }, + { + "ce_loss_13": 3.2718376517295837, + "ce_loss_17": 3.2207493782043457, + "ce_loss_2": 4.245374727249145, + "ce_loss_4": 3.8042579174041746, + "ce_loss_9": 3.4281726717948913, + "epoch": 0.665, + "grad_norm": 932.0, + "kl_loss_13": 106.7474594116211, + "kl_loss_2": 2165.606689453125, + "kl_loss_4": 1296.6082336425782, + "kl_loss_9": 468.31117553710936, + "learning_rate": 0.0002569016319497657, + "loss": 1009.9519, + "step": 6650 + }, + { + "ce_loss_13": 3.2552980065345762, + "ce_loss_17": 3.2047754287719727, + "ce_loss_2": 4.2274405598640445, + "ce_loss_4": 3.7844682335853577, + "ce_loss_9": 3.411592173576355, + "epoch": 0.666, + "grad_norm": 932.0, + "kl_loss_13": 106.34464378356934, + "kl_loss_2": 2185.2001770019533, + "kl_loss_4": 1305.817300415039, + "kl_loss_9": 471.4287475585937, + "learning_rate": 0.00025551635383968066, + "loss": 1024.333, + "step": 6660 + }, + { + "ce_loss_13": 3.1725062131881714, + "ce_loss_17": 3.1228858113288878, + "ce_loss_2": 4.1487244963645935, + "ce_loss_4": 3.7083927750587464, + "ce_loss_9": 3.321577000617981, + "epoch": 0.667, + "grad_norm": 988.0, + "kl_loss_13": 105.71883163452148, + "kl_loss_2": 2189.703155517578, + "kl_loss_4": 1316.3324340820313, + "kl_loss_9": 467.3186279296875, + "learning_rate": 0.00025413353767719804, + "loss": 1015.5817, + "step": 6670 + }, + { + "ce_loss_13": 3.2266804456710814, + "ce_loss_17": 3.1812199473381044, + "ce_loss_2": 4.190030992031097, + "ce_loss_4": 3.7505768537521362, + "ce_loss_9": 3.374610793590546, + "epoch": 0.668, + "grad_norm": 1480.0, + "kl_loss_13": 101.84782752990722, + "kl_loss_2": 2157.1388427734373, + "kl_loss_4": 1299.3576293945312, + "kl_loss_9": 455.4222579956055, + "learning_rate": 0.0002527531973872617, + "loss": 1003.8312, + "step": 6680 + }, + { + "ce_loss_13": 3.2432339787483215, + "ce_loss_17": 3.1954752445220946, + "ce_loss_2": 4.192810535430908, + "ce_loss_4": 3.775682270526886, + "ce_loss_9": 3.391780209541321, + "epoch": 0.669, + "grad_norm": 1072.0, + "kl_loss_13": 102.44257354736328, + "kl_loss_2": 2125.15107421875, + "kl_loss_4": 1301.0641723632812, + "kl_loss_9": 459.88512115478517, + "learning_rate": 0.0002513753468698826, + "loss": 995.1326, + "step": 6690 + }, + { + "ce_loss_13": 3.2071149706840516, + "ce_loss_17": 3.1601174592971804, + "ce_loss_2": 4.185392951965332, + "ce_loss_4": 3.748111701011658, + "ce_loss_9": 3.359476351737976, + "epoch": 0.67, + "grad_norm": 960.0, + "kl_loss_13": 104.4536247253418, + "kl_loss_2": 2183.4039611816406, + "kl_loss_4": 1318.0630004882812, + "kl_loss_9": 470.28516540527346, + "learning_rate": 0.0002500000000000001, + "loss": 1011.6139, + "step": 6700 + }, + { + "ce_loss_13": 3.3195304751396177, + "ce_loss_17": 3.276082456111908, + "ce_loss_2": 4.226193523406982, + "ce_loss_4": 3.820376527309418, + "ce_loss_9": 3.464117670059204, + "epoch": 0.671, + "grad_norm": 852.0, + "kl_loss_13": 100.98722305297852, + "kl_loss_2": 2040.2514709472657, + "kl_loss_4": 1238.9735900878907, + "kl_loss_9": 447.40814208984375, + "learning_rate": 0.0002486271706273421, + "loss": 1005.8499, + "step": 6710 + }, + { + "ce_loss_13": 3.260366415977478, + "ce_loss_17": 3.2149436235427857, + "ce_loss_2": 4.176659274101257, + "ce_loss_4": 3.7612442135810853, + "ce_loss_9": 3.3989365220069887, + "epoch": 0.672, + "grad_norm": 1072.0, + "kl_loss_13": 99.98232765197754, + "kl_loss_2": 2042.1004516601563, + "kl_loss_4": 1234.8556060791016, + "kl_loss_9": 438.04880676269534, + "learning_rate": 0.0002472568725762853, + "loss": 984.6168, + "step": 6720 + }, + { + "ce_loss_13": 3.2528300404548647, + "ce_loss_17": 3.2054027199745176, + "ce_loss_2": 4.1674515962600704, + "ce_loss_4": 3.754186975955963, + "ce_loss_9": 3.3945667028427122, + "epoch": 0.673, + "grad_norm": 1096.0, + "kl_loss_13": 99.29492797851563, + "kl_loss_2": 2067.0465576171873, + "kl_loss_4": 1241.0871154785157, + "kl_loss_9": 436.12317810058596, + "learning_rate": 0.00024588911964571554, + "loss": 974.8271, + "step": 6730 + }, + { + "ce_loss_13": 3.2680162668228148, + "ce_loss_17": 3.215383195877075, + "ce_loss_2": 4.243164575099945, + "ce_loss_4": 3.8106635451316833, + "ce_loss_9": 3.425454580783844, + "epoch": 0.674, + "grad_norm": 1224.0, + "kl_loss_13": 109.05999984741212, + "kl_loss_2": 2166.109094238281, + "kl_loss_4": 1322.577215576172, + "kl_loss_9": 473.5297653198242, + "learning_rate": 0.00024452392560888974, + "loss": 998.7105, + "step": 6740 + }, + { + "ce_loss_13": 3.1634591460227965, + "ce_loss_17": 3.114391732215881, + "ce_loss_2": 4.115790605545044, + "ce_loss_4": 3.6860440731048585, + "ce_loss_9": 3.3140316009521484, + "epoch": 0.675, + "grad_norm": 1136.0, + "kl_loss_13": 101.23067245483398, + "kl_loss_2": 2138.4682189941404, + "kl_loss_4": 1289.8099182128906, + "kl_loss_9": 455.61485748291017, + "learning_rate": 0.00024316130421329695, + "loss": 985.8256, + "step": 6750 + }, + { + "ce_loss_13": 3.2400077104568483, + "ce_loss_17": 3.1933814525604247, + "ce_loss_2": 4.1784663438797, + "ce_loss_4": 3.7577054858207704, + "ce_loss_9": 3.3869038105010985, + "epoch": 0.676, + "grad_norm": 832.0, + "kl_loss_13": 101.22082328796387, + "kl_loss_2": 2096.6354370117188, + "kl_loss_4": 1270.7260131835938, + "kl_loss_9": 450.51924896240234, + "learning_rate": 0.00024180126918051909, + "loss": 990.6471, + "step": 6760 + }, + { + "ce_loss_13": 3.281948673725128, + "ce_loss_17": 3.2340904235839845, + "ce_loss_2": 4.221386134624481, + "ce_loss_4": 3.795566427707672, + "ce_loss_9": 3.4324694752693174, + "epoch": 0.677, + "grad_norm": 964.0, + "kl_loss_13": 103.02061500549317, + "kl_loss_2": 2092.3447204589843, + "kl_loss_4": 1259.3734558105468, + "kl_loss_9": 453.64100952148436, + "learning_rate": 0.00024044383420609406, + "loss": 977.0837, + "step": 6770 + }, + { + "ce_loss_13": 3.2922922015190124, + "ce_loss_17": 3.2476919531822204, + "ce_loss_2": 4.204925155639648, + "ce_loss_4": 3.789861559867859, + "ce_loss_9": 3.433992123603821, + "epoch": 0.678, + "grad_norm": 1004.0, + "kl_loss_13": 100.87908744812012, + "kl_loss_2": 2064.591204833984, + "kl_loss_4": 1248.3886657714843, + "kl_loss_9": 447.58892517089845, + "learning_rate": 0.00023908901295937712, + "loss": 994.1336, + "step": 6780 + }, + { + "ce_loss_13": 3.2830626487731935, + "ce_loss_17": 3.2329793572425842, + "ce_loss_2": 4.224538588523865, + "ce_loss_4": 3.7981178402900695, + "ce_loss_9": 3.4267912983894346, + "epoch": 0.679, + "grad_norm": 1176.0, + "kl_loss_13": 102.39090576171876, + "kl_loss_2": 2090.019598388672, + "kl_loss_4": 1256.8080810546876, + "kl_loss_9": 445.41021423339845, + "learning_rate": 0.00023773681908340283, + "loss": 999.7303, + "step": 6790 + }, + { + "ce_loss_13": 3.2627842664718627, + "ce_loss_17": 3.210619592666626, + "ce_loss_2": 4.24127299785614, + "ce_loss_4": 3.7977648973464966, + "ce_loss_9": 3.419055187702179, + "epoch": 0.68, + "grad_norm": 1200.0, + "kl_loss_13": 107.6691349029541, + "kl_loss_2": 2189.826251220703, + "kl_loss_4": 1323.4154174804687, + "kl_loss_9": 478.61189422607424, + "learning_rate": 0.00023638726619474876, + "loss": 1028.3978, + "step": 6800 + }, + { + "ce_loss_13": 3.2457313418388365, + "ce_loss_17": 3.196158730983734, + "ce_loss_2": 4.246055316925049, + "ce_loss_4": 3.8066432952880858, + "ce_loss_9": 3.404669535160065, + "epoch": 0.681, + "grad_norm": 1048.0, + "kl_loss_13": 105.44283752441406, + "kl_loss_2": 2193.1047180175783, + "kl_loss_4": 1339.202215576172, + "kl_loss_9": 470.3429260253906, + "learning_rate": 0.0002350403678833976, + "loss": 1012.7346, + "step": 6810 + }, + { + "ce_loss_13": 3.1812268257141114, + "ce_loss_17": 3.1330739974975588, + "ce_loss_2": 4.152749073505402, + "ce_loss_4": 3.7164103388786316, + "ce_loss_9": 3.3301640272140505, + "epoch": 0.682, + "grad_norm": 1016.0, + "kl_loss_13": 101.76590156555176, + "kl_loss_2": 2158.500848388672, + "kl_loss_4": 1308.8683349609375, + "kl_loss_9": 455.3803970336914, + "learning_rate": 0.00023369613771260007, + "loss": 994.8266, + "step": 6820 + }, + { + "ce_loss_13": 3.2936524748802185, + "ce_loss_17": 3.2442342877388, + "ce_loss_2": 4.257334411144257, + "ce_loss_4": 3.823900580406189, + "ce_loss_9": 3.445124554634094, + "epoch": 0.683, + "grad_norm": 1304.0, + "kl_loss_13": 104.39794845581055, + "kl_loss_2": 2154.306201171875, + "kl_loss_4": 1295.963397216797, + "kl_loss_9": 461.85015258789065, + "learning_rate": 0.00023235458921873925, + "loss": 1010.3116, + "step": 6830 + }, + { + "ce_loss_13": 3.2474116683006287, + "ce_loss_17": 3.197927749156952, + "ce_loss_2": 4.2535091757774355, + "ce_loss_4": 3.8051849842071532, + "ce_loss_9": 3.4092461466789246, + "epoch": 0.684, + "grad_norm": 1312.0, + "kl_loss_13": 107.95975723266602, + "kl_loss_2": 2240.894091796875, + "kl_loss_4": 1354.4358825683594, + "kl_loss_9": 480.27411499023435, + "learning_rate": 0.0002310157359111938, + "loss": 1036.5108, + "step": 6840 + }, + { + "ce_loss_13": 3.1398362040519716, + "ce_loss_17": 3.088029706478119, + "ce_loss_2": 4.197815501689911, + "ce_loss_4": 3.7188588857650755, + "ce_loss_9": 3.3065144419670105, + "epoch": 0.685, + "grad_norm": 1408.0, + "kl_loss_13": 104.71201629638672, + "kl_loss_2": 2311.3247253417967, + "kl_loss_4": 1374.8434204101563, + "kl_loss_9": 477.91454010009767, + "learning_rate": 0.0002296795912722014, + "loss": 1038.1281, + "step": 6850 + }, + { + "ce_loss_13": 3.2793092727661133, + "ce_loss_17": 3.2312268137931826, + "ce_loss_2": 4.20980396270752, + "ce_loss_4": 3.784900200366974, + "ce_loss_9": 3.427792024612427, + "epoch": 0.686, + "grad_norm": 976.0, + "kl_loss_13": 102.50910835266113, + "kl_loss_2": 2085.1129516601563, + "kl_loss_4": 1257.3842529296876, + "kl_loss_9": 450.30498046875, + "learning_rate": 0.0002283461687567236, + "loss": 969.9803, + "step": 6860 + }, + { + "ce_loss_13": 3.3375165104866027, + "ce_loss_17": 3.2890327334403993, + "ce_loss_2": 4.24235463142395, + "ce_loss_4": 3.840671956539154, + "ce_loss_9": 3.4789104104042052, + "epoch": 0.687, + "grad_norm": 856.0, + "kl_loss_13": 102.10050659179687, + "kl_loss_2": 2033.2359436035156, + "kl_loss_4": 1244.7200073242188, + "kl_loss_9": 442.48499755859376, + "learning_rate": 0.00022701548179231045, + "loss": 988.8264, + "step": 6870 + }, + { + "ce_loss_13": 3.289671754837036, + "ce_loss_17": 3.240048110485077, + "ce_loss_2": 4.244234001636505, + "ce_loss_4": 3.814144825935364, + "ce_loss_9": 3.438059628009796, + "epoch": 0.688, + "grad_norm": 1032.0, + "kl_loss_13": 104.05793571472168, + "kl_loss_2": 2136.1650634765624, + "kl_loss_4": 1287.3386291503907, + "kl_loss_9": 458.53736572265626, + "learning_rate": 0.00022568754377896516, + "loss": 981.4267, + "step": 6880 + }, + { + "ce_loss_13": 3.2787294387817383, + "ce_loss_17": 3.2297904253005982, + "ce_loss_2": 4.2119832873344425, + "ce_loss_4": 3.793773424625397, + "ce_loss_9": 3.4266915202140806, + "epoch": 0.689, + "grad_norm": 1224.0, + "kl_loss_13": 103.35405807495117, + "kl_loss_2": 2106.5705017089845, + "kl_loss_4": 1271.8478515625, + "kl_loss_9": 458.0431076049805, + "learning_rate": 0.00022436236808900844, + "loss": 985.9883, + "step": 6890 + }, + { + "ce_loss_13": 3.179608941078186, + "ce_loss_17": 3.129877281188965, + "ce_loss_2": 4.145314931869507, + "ce_loss_4": 3.7118398785591125, + "ce_loss_9": 3.33070969581604, + "epoch": 0.69, + "grad_norm": 1112.0, + "kl_loss_13": 103.02140884399414, + "kl_loss_2": 2162.8433837890625, + "kl_loss_4": 1303.9549865722656, + "kl_loss_9": 457.87942962646486, + "learning_rate": 0.00022303996806694487, + "loss": 994.5302, + "step": 6900 + }, + { + "ce_loss_13": 3.249576508998871, + "ce_loss_17": 3.202440345287323, + "ce_loss_2": 4.2110624551773075, + "ce_loss_4": 3.776653838157654, + "ce_loss_9": 3.400467002391815, + "epoch": 0.691, + "grad_norm": 1688.0, + "kl_loss_13": 101.58654479980468, + "kl_loss_2": 2147.4699096679688, + "kl_loss_4": 1293.2305297851562, + "kl_loss_9": 455.65966796875, + "learning_rate": 0.00022172035702932823, + "loss": 990.926, + "step": 6910 + }, + { + "ce_loss_13": 3.2973915934562683, + "ce_loss_17": 3.2501471638679504, + "ce_loss_2": 4.218091928958893, + "ce_loss_4": 3.810104763507843, + "ce_loss_9": 3.444382297992706, + "epoch": 0.692, + "grad_norm": 1280.0, + "kl_loss_13": 102.84040832519531, + "kl_loss_2": 2051.6656982421873, + "kl_loss_4": 1254.1827270507813, + "kl_loss_9": 448.6915740966797, + "learning_rate": 0.00022040354826462666, + "loss": 975.1661, + "step": 6920 + }, + { + "ce_loss_13": 3.2260066270828247, + "ce_loss_17": 3.180059814453125, + "ce_loss_2": 4.186605894565583, + "ce_loss_4": 3.7482640147209167, + "ce_loss_9": 3.3698163747787477, + "epoch": 0.693, + "grad_norm": 1280.0, + "kl_loss_13": 100.90071716308594, + "kl_loss_2": 2130.3034545898436, + "kl_loss_4": 1272.9785888671875, + "kl_loss_9": 446.72293548583986, + "learning_rate": 0.0002190895550330899, + "loss": 998.8228, + "step": 6930 + }, + { + "ce_loss_13": 3.163276433944702, + "ce_loss_17": 3.11296660900116, + "ce_loss_2": 4.153519976139068, + "ce_loss_4": 3.7158083319664, + "ce_loss_9": 3.3186029314994814, + "epoch": 0.694, + "grad_norm": 1536.0, + "kl_loss_13": 105.05643081665039, + "kl_loss_2": 2189.6942504882813, + "kl_loss_4": 1328.123114013672, + "kl_loss_9": 468.58738861083987, + "learning_rate": 0.00021777839056661552, + "loss": 995.8514, + "step": 6940 + }, + { + "ce_loss_13": 3.24445219039917, + "ce_loss_17": 3.1964301824569703, + "ce_loss_2": 4.189391398429871, + "ce_loss_4": 3.762847971916199, + "ce_loss_9": 3.3889621257781983, + "epoch": 0.695, + "grad_norm": 1104.0, + "kl_loss_13": 102.36992073059082, + "kl_loss_2": 2106.068566894531, + "kl_loss_4": 1270.103271484375, + "kl_loss_9": 447.5800811767578, + "learning_rate": 0.0002164700680686147, + "loss": 971.9291, + "step": 6950 + }, + { + "ce_loss_13": 3.289232635498047, + "ce_loss_17": 3.2411341667175293, + "ce_loss_2": 4.218494701385498, + "ce_loss_4": 3.7959846377372743, + "ce_loss_9": 3.4362203001976015, + "epoch": 0.696, + "grad_norm": 1064.0, + "kl_loss_13": 103.23946914672851, + "kl_loss_2": 2072.047473144531, + "kl_loss_4": 1243.9751831054687, + "kl_loss_9": 450.554914855957, + "learning_rate": 0.0002151646007138806, + "loss": 976.4219, + "step": 6960 + }, + { + "ce_loss_13": 3.1716899514198302, + "ce_loss_17": 3.1234277844429017, + "ce_loss_2": 4.1481289744377134, + "ce_loss_4": 3.7061108231544493, + "ce_loss_9": 3.3231279134750364, + "epoch": 0.697, + "grad_norm": 1016.0, + "kl_loss_13": 104.04178924560547, + "kl_loss_2": 2185.198107910156, + "kl_loss_4": 1312.1087951660156, + "kl_loss_9": 462.21910247802737, + "learning_rate": 0.00021386200164845526, + "loss": 999.273, + "step": 6970 + }, + { + "ce_loss_13": 3.345793628692627, + "ce_loss_17": 3.2997822880744936, + "ce_loss_2": 4.243441462516785, + "ce_loss_4": 3.842452323436737, + "ce_loss_9": 3.487791085243225, + "epoch": 0.698, + "grad_norm": 924.0, + "kl_loss_13": 101.88482551574707, + "kl_loss_2": 2035.6734741210937, + "kl_loss_4": 1236.5458068847656, + "kl_loss_9": 445.43784790039064, + "learning_rate": 0.0002125622839894964, + "loss": 964.5474, + "step": 6980 + }, + { + "ce_loss_13": 3.28831205368042, + "ce_loss_17": 3.2422478437423705, + "ce_loss_2": 4.221863973140716, + "ce_loss_4": 3.7983975052833556, + "ce_loss_9": 3.431994414329529, + "epoch": 0.699, + "grad_norm": 1072.0, + "kl_loss_13": 101.97261772155761, + "kl_loss_2": 2062.595147705078, + "kl_loss_4": 1239.8757873535155, + "kl_loss_9": 442.8819320678711, + "learning_rate": 0.00021126546082514663, + "loss": 971.5902, + "step": 6990 + }, + { + "ce_loss_13": 3.3109893918037416, + "ce_loss_17": 3.265090274810791, + "ce_loss_2": 4.219663059711456, + "ce_loss_4": 3.810477948188782, + "ce_loss_9": 3.4486504673957823, + "epoch": 0.7, + "grad_norm": 916.0, + "kl_loss_13": 102.23057060241699, + "kl_loss_2": 2056.7913146972655, + "kl_loss_4": 1248.7921997070312, + "kl_loss_9": 446.58590240478514, + "learning_rate": 0.00020997154521440098, + "loss": 967.2799, + "step": 7000 + }, + { + "ce_loss_13": 3.25771621465683, + "ce_loss_17": 3.2107723474502565, + "ce_loss_2": 4.193326783180237, + "ce_loss_4": 3.763080906867981, + "ce_loss_9": 3.3964253544807432, + "epoch": 0.701, + "grad_norm": 900.0, + "kl_loss_13": 101.07011260986329, + "kl_loss_2": 2101.0982971191406, + "kl_loss_4": 1262.2381164550782, + "kl_loss_9": 447.86836700439454, + "learning_rate": 0.0002086805501869749, + "loss": 972.7094, + "step": 7010 + }, + { + "ce_loss_13": 3.2292993783950807, + "ce_loss_17": 3.1790473818778993, + "ce_loss_2": 4.215039098262787, + "ce_loss_4": 3.777962052822113, + "ce_loss_9": 3.3856145977973937, + "epoch": 0.702, + "grad_norm": 1240.0, + "kl_loss_13": 104.63607139587403, + "kl_loss_2": 2196.8508117675783, + "kl_loss_4": 1321.345721435547, + "kl_loss_9": 470.0806350708008, + "learning_rate": 0.0002073924887431744, + "loss": 1003.4582, + "step": 7020 + }, + { + "ce_loss_13": 3.235413110256195, + "ce_loss_17": 3.189332604408264, + "ce_loss_2": 4.192441380023956, + "ce_loss_4": 3.763059604167938, + "ce_loss_9": 3.386086845397949, + "epoch": 0.703, + "grad_norm": 1512.0, + "kl_loss_13": 102.21333808898926, + "kl_loss_2": 2147.6332580566404, + "kl_loss_4": 1297.4617736816406, + "kl_loss_9": 455.28276977539065, + "learning_rate": 0.00020610737385376348, + "loss": 1019.6457, + "step": 7030 + }, + { + "ce_loss_13": 3.2884790897369385, + "ce_loss_17": 3.2432208180427553, + "ce_loss_2": 4.195916604995728, + "ce_loss_4": 3.7990179657936096, + "ce_loss_9": 3.4342324495315553, + "epoch": 0.704, + "grad_norm": 1200.0, + "kl_loss_13": 101.49304580688477, + "kl_loss_2": 2032.1216613769532, + "kl_loss_4": 1240.2603881835937, + "kl_loss_9": 443.6861206054688, + "learning_rate": 0.00020482521845983521, + "loss": 987.3602, + "step": 7040 + }, + { + "ce_loss_13": 3.2843542814254763, + "ce_loss_17": 3.235620582103729, + "ce_loss_2": 4.2434363007545475, + "ce_loss_4": 3.807620084285736, + "ce_loss_9": 3.4326597929000853, + "epoch": 0.705, + "grad_norm": 1416.0, + "kl_loss_13": 105.09378509521484, + "kl_loss_2": 2131.4164306640623, + "kl_loss_4": 1272.9880981445312, + "kl_loss_9": 456.50634765625, + "learning_rate": 0.00020354603547267987, + "loss": 1002.6995, + "step": 7050 + }, + { + "ce_loss_13": 3.269883382320404, + "ce_loss_17": 3.2183038711547853, + "ce_loss_2": 4.243802499771118, + "ce_loss_4": 3.8104315876960753, + "ce_loss_9": 3.422291946411133, + "epoch": 0.706, + "grad_norm": 940.0, + "kl_loss_13": 104.86704368591309, + "kl_loss_2": 2146.307440185547, + "kl_loss_4": 1294.3465270996094, + "kl_loss_9": 463.04643096923826, + "learning_rate": 0.00020226983777365604, + "loss": 1023.8993, + "step": 7060 + }, + { + "ce_loss_13": 3.1801671266555784, + "ce_loss_17": 3.133017432689667, + "ce_loss_2": 4.178548872470856, + "ce_loss_4": 3.7255703806877136, + "ce_loss_9": 3.3312161564826965, + "epoch": 0.707, + "grad_norm": 1012.0, + "kl_loss_13": 99.52737426757812, + "kl_loss_2": 2187.274932861328, + "kl_loss_4": 1303.1950317382812, + "kl_loss_9": 445.7186340332031, + "learning_rate": 0.00020099663821406056, + "loss": 994.9156, + "step": 7070 + }, + { + "ce_loss_13": 3.27728613615036, + "ce_loss_17": 3.2302020311355593, + "ce_loss_2": 4.20411890745163, + "ce_loss_4": 3.7864834785461428, + "ce_loss_9": 3.4216678977012633, + "epoch": 0.708, + "grad_norm": 1264.0, + "kl_loss_13": 100.65608100891113, + "kl_loss_2": 2055.3018432617187, + "kl_loss_4": 1237.0408508300782, + "kl_loss_9": 440.6377151489258, + "learning_rate": 0.00019972644961499853, + "loss": 989.3201, + "step": 7080 + }, + { + "ce_loss_13": 3.2457040548324585, + "ce_loss_17": 3.1968128442764283, + "ce_loss_2": 4.221124994754791, + "ce_loss_4": 3.778362786769867, + "ce_loss_9": 3.3978989005088804, + "epoch": 0.709, + "grad_norm": 932.0, + "kl_loss_13": 105.33441162109375, + "kl_loss_2": 2174.3044189453126, + "kl_loss_4": 1306.544854736328, + "kl_loss_9": 464.2111541748047, + "learning_rate": 0.00019845928476725522, + "loss": 1002.0223, + "step": 7090 + }, + { + "ce_loss_13": 3.320943057537079, + "ce_loss_17": 3.2715343475341796, + "ce_loss_2": 4.261563336849212, + "ce_loss_4": 3.8357497334480284, + "ce_loss_9": 3.4658365130424498, + "epoch": 0.71, + "grad_norm": 1312.0, + "kl_loss_13": 103.61341514587403, + "kl_loss_2": 2093.532598876953, + "kl_loss_4": 1263.7469604492187, + "kl_loss_9": 453.82713470458987, + "learning_rate": 0.00019719515643116677, + "loss": 1015.4979, + "step": 7100 + }, + { + "ce_loss_13": 3.2649486303329467, + "ce_loss_17": 3.216744267940521, + "ce_loss_2": 4.210716032981873, + "ce_loss_4": 3.779931879043579, + "ce_loss_9": 3.4105352401733398, + "epoch": 0.711, + "grad_norm": 1424.0, + "kl_loss_13": 101.52503089904785, + "kl_loss_2": 2106.6075073242187, + "kl_loss_4": 1257.0684509277344, + "kl_loss_9": 448.6671569824219, + "learning_rate": 0.0001959340773364911, + "loss": 992.6879, + "step": 7110 + }, + { + "ce_loss_13": 3.278683769702911, + "ce_loss_17": 3.229863095283508, + "ce_loss_2": 4.2382872104644775, + "ce_loss_4": 3.7976195931434633, + "ce_loss_9": 3.4241430044174193, + "epoch": 0.712, + "grad_norm": 1020.0, + "kl_loss_13": 102.51830368041992, + "kl_loss_2": 2138.4018127441404, + "kl_loss_4": 1281.7517028808593, + "kl_loss_9": 456.4356094360352, + "learning_rate": 0.0001946760601822809, + "loss": 976.7356, + "step": 7120 + }, + { + "ce_loss_13": 3.330082154273987, + "ce_loss_17": 3.2839014172554015, + "ce_loss_2": 4.261330091953278, + "ce_loss_4": 3.8422109842300416, + "ce_loss_9": 3.475909078121185, + "epoch": 0.713, + "grad_norm": 944.0, + "kl_loss_13": 102.10880088806152, + "kl_loss_2": 2080.713134765625, + "kl_loss_4": 1253.0130493164063, + "kl_loss_9": 449.0329879760742, + "learning_rate": 0.00019342111763675512, + "loss": 963.1166, + "step": 7130 + }, + { + "ce_loss_13": 3.3258482694625853, + "ce_loss_17": 3.280312216281891, + "ce_loss_2": 4.234133088588715, + "ce_loss_4": 3.8280014991760254, + "ce_loss_9": 3.4662880301475525, + "epoch": 0.714, + "grad_norm": 980.0, + "kl_loss_13": 103.50119819641114, + "kl_loss_2": 2046.9655517578126, + "kl_loss_4": 1244.9883239746093, + "kl_loss_9": 448.0644012451172, + "learning_rate": 0.00019216926233717085, + "loss": 963.7633, + "step": 7140 + }, + { + "ce_loss_13": 3.2199657201766967, + "ce_loss_17": 3.1739187836647034, + "ce_loss_2": 4.227598297595978, + "ce_loss_4": 3.770006847381592, + "ce_loss_9": 3.368194031715393, + "epoch": 0.715, + "grad_norm": 1008.0, + "kl_loss_13": 100.99191169738769, + "kl_loss_2": 2216.086053466797, + "kl_loss_4": 1315.6239807128907, + "kl_loss_9": 444.15630493164065, + "learning_rate": 0.00019092050688969737, + "loss": 1013.7867, + "step": 7150 + }, + { + "ce_loss_13": 3.289920115470886, + "ce_loss_17": 3.2437126517295836, + "ce_loss_2": 4.211596512794495, + "ce_loss_4": 3.7951135277748107, + "ce_loss_9": 3.4325337171554566, + "epoch": 0.716, + "grad_norm": 1304.0, + "kl_loss_13": 101.67895431518555, + "kl_loss_2": 2102.002764892578, + "kl_loss_4": 1273.8563293457032, + "kl_loss_9": 446.5319396972656, + "learning_rate": 0.00018967486386928817, + "loss": 978.4325, + "step": 7160 + }, + { + "ce_loss_13": 3.1637759447097777, + "ce_loss_17": 3.1165361285209654, + "ce_loss_2": 4.142092621326446, + "ce_loss_4": 3.704055166244507, + "ce_loss_9": 3.3138237476348875, + "epoch": 0.717, + "grad_norm": 880.0, + "kl_loss_13": 103.00496063232421, + "kl_loss_2": 2184.2013305664063, + "kl_loss_4": 1316.35, + "kl_loss_9": 463.64766998291014, + "learning_rate": 0.00018843234581955443, + "loss": 1032.0961, + "step": 7170 + }, + { + "ce_loss_13": 3.1771233916282653, + "ce_loss_17": 3.126755440235138, + "ce_loss_2": 4.140774643421173, + "ce_loss_4": 3.714804983139038, + "ce_loss_9": 3.330841541290283, + "epoch": 0.718, + "grad_norm": 1136.0, + "kl_loss_13": 103.78391532897949, + "kl_loss_2": 2150.5621032714844, + "kl_loss_4": 1307.5394226074218, + "kl_loss_9": 461.04087524414064, + "learning_rate": 0.00018719296525263924, + "loss": 1003.4609, + "step": 7180 + }, + { + "ce_loss_13": 3.274753415584564, + "ce_loss_17": 3.2296223521232603, + "ce_loss_2": 4.190529310703278, + "ce_loss_4": 3.7737638115882874, + "ce_loss_9": 3.4123709201812744, + "epoch": 0.719, + "grad_norm": 972.0, + "kl_loss_13": 102.13137435913086, + "kl_loss_2": 2051.735986328125, + "kl_loss_4": 1233.3234313964845, + "kl_loss_9": 442.374739074707, + "learning_rate": 0.0001859567346490913, + "loss": 965.5097, + "step": 7190 + }, + { + "ce_loss_13": 3.253644645214081, + "ce_loss_17": 3.204506719112396, + "ce_loss_2": 4.221274256706238, + "ce_loss_4": 3.7849581718444822, + "ce_loss_9": 3.4022349238395693, + "epoch": 0.72, + "grad_norm": 1024.0, + "kl_loss_13": 103.78521919250488, + "kl_loss_2": 2160.165985107422, + "kl_loss_4": 1290.6940490722657, + "kl_loss_9": 459.7044219970703, + "learning_rate": 0.0001847236664577389, + "loss": 983.8235, + "step": 7200 + }, + { + "ce_loss_13": 3.2759679913520814, + "ce_loss_17": 3.2273041367530824, + "ce_loss_2": 4.188448047637939, + "ce_loss_4": 3.7774073123931884, + "ce_loss_9": 3.420163369178772, + "epoch": 0.721, + "grad_norm": 888.0, + "kl_loss_13": 100.77950859069824, + "kl_loss_2": 2035.1171813964843, + "kl_loss_4": 1226.6566772460938, + "kl_loss_9": 438.77677764892576, + "learning_rate": 0.00018349377309556487, + "loss": 955.6046, + "step": 7210 + }, + { + "ce_loss_13": 3.2242144465446474, + "ce_loss_17": 3.175799787044525, + "ce_loss_2": 4.223109292984009, + "ce_loss_4": 3.7641687035560607, + "ce_loss_9": 3.374901843070984, + "epoch": 0.722, + "grad_norm": 1056.0, + "kl_loss_13": 103.63941764831543, + "kl_loss_2": 2238.734014892578, + "kl_loss_4": 1330.0077026367187, + "kl_loss_9": 464.37327728271487, + "learning_rate": 0.00018226706694758193, + "loss": 1013.5134, + "step": 7220 + }, + { + "ce_loss_13": 3.2951212406158445, + "ce_loss_17": 3.2491416931152344, + "ce_loss_2": 4.238140666484833, + "ce_loss_4": 3.8119378805160524, + "ce_loss_9": 3.4382128715515137, + "epoch": 0.723, + "grad_norm": 1152.0, + "kl_loss_13": 102.50158958435058, + "kl_loss_2": 2116.6992248535157, + "kl_loss_4": 1284.8026794433595, + "kl_loss_9": 454.30648040771484, + "learning_rate": 0.0001810435603667075, + "loss": 1013.9983, + "step": 7230 + }, + { + "ce_loss_13": 3.1493306159973145, + "ce_loss_17": 3.101602244377136, + "ce_loss_2": 4.114741802215576, + "ce_loss_4": 3.6743094325065613, + "ce_loss_9": 3.2947826266288756, + "epoch": 0.724, + "grad_norm": 1064.0, + "kl_loss_13": 99.61135520935059, + "kl_loss_2": 2136.101428222656, + "kl_loss_4": 1280.5566650390624, + "kl_loss_9": 447.623030090332, + "learning_rate": 0.0001798232656736389, + "loss": 1008.8657, + "step": 7240 + }, + { + "ce_loss_13": 3.3198463678359986, + "ce_loss_17": 3.2713189244270326, + "ce_loss_2": 4.226815581321716, + "ce_loss_4": 3.8106476068496704, + "ce_loss_9": 3.4620877146720885, + "epoch": 0.725, + "grad_norm": 944.0, + "kl_loss_13": 101.66799049377441, + "kl_loss_2": 2034.7107482910155, + "kl_loss_4": 1223.4187133789062, + "kl_loss_9": 439.388330078125, + "learning_rate": 0.0001786061951567303, + "loss": 970.7255, + "step": 7250 + }, + { + "ce_loss_13": 3.2318737745285033, + "ce_loss_17": 3.18269305229187, + "ce_loss_2": 4.186606872081756, + "ce_loss_4": 3.7551080346107484, + "ce_loss_9": 3.384189856052399, + "epoch": 0.726, + "grad_norm": 1320.0, + "kl_loss_13": 103.28277626037598, + "kl_loss_2": 2110.865338134766, + "kl_loss_4": 1271.9143615722655, + "kl_loss_9": 452.49157562255857, + "learning_rate": 0.00017739236107186857, + "loss": 996.2609, + "step": 7260 + }, + { + "ce_loss_13": 3.3279162645339966, + "ce_loss_17": 3.281880331039429, + "ce_loss_2": 4.219913053512573, + "ce_loss_4": 3.8133899450302122, + "ce_loss_9": 3.463189160823822, + "epoch": 0.727, + "grad_norm": 972.0, + "kl_loss_13": 100.67747802734375, + "kl_loss_2": 2025.626904296875, + "kl_loss_4": 1222.1552734375, + "kl_loss_9": 437.16917114257814, + "learning_rate": 0.00017618177564234904, + "loss": 964.968, + "step": 7270 + }, + { + "ce_loss_13": 3.2964897394180297, + "ce_loss_17": 3.251171123981476, + "ce_loss_2": 4.198864662647248, + "ce_loss_4": 3.7930629730224608, + "ce_loss_9": 3.4340954542160036, + "epoch": 0.728, + "grad_norm": 956.0, + "kl_loss_13": 99.37322654724122, + "kl_loss_2": 2012.8637329101562, + "kl_loss_4": 1215.3342407226562, + "kl_loss_9": 430.8265319824219, + "learning_rate": 0.00017497445105875377, + "loss": 959.9689, + "step": 7280 + }, + { + "ce_loss_13": 3.2095884799957277, + "ce_loss_17": 3.160313975811005, + "ce_loss_2": 4.182150101661682, + "ce_loss_4": 3.7413028120994567, + "ce_loss_9": 3.360980784893036, + "epoch": 0.729, + "grad_norm": 952.0, + "kl_loss_13": 102.95348663330078, + "kl_loss_2": 2171.3057067871096, + "kl_loss_4": 1302.643048095703, + "kl_loss_9": 462.14252014160155, + "learning_rate": 0.000173770399478828, + "loss": 998.101, + "step": 7290 + }, + { + "ce_loss_13": 3.134935402870178, + "ce_loss_17": 3.0897037267684935, + "ce_loss_2": 4.0793322086334225, + "ce_loss_4": 3.6437400221824645, + "ce_loss_9": 3.2767797231674196, + "epoch": 0.73, + "grad_norm": 1168.0, + "kl_loss_13": 100.00651206970215, + "kl_loss_2": 2116.372998046875, + "kl_loss_4": 1261.404229736328, + "kl_loss_9": 444.44177398681643, + "learning_rate": 0.0001725696330273575, + "loss": 1007.8494, + "step": 7300 + }, + { + "ce_loss_13": 3.313021719455719, + "ce_loss_17": 3.266770362854004, + "ce_loss_2": 4.2282095670700075, + "ce_loss_4": 3.818688249588013, + "ce_loss_9": 3.4571586728096007, + "epoch": 0.731, + "grad_norm": 868.0, + "kl_loss_13": 100.3053020477295, + "kl_loss_2": 2033.5826416015625, + "kl_loss_4": 1230.6662658691407, + "kl_loss_9": 440.2505874633789, + "learning_rate": 0.00017137216379604724, + "loss": 957.9258, + "step": 7310 + }, + { + "ce_loss_13": 3.198021948337555, + "ce_loss_17": 3.1496949195861816, + "ce_loss_2": 4.156942975521088, + "ce_loss_4": 3.7194634914398192, + "ce_loss_9": 3.342144012451172, + "epoch": 0.732, + "grad_norm": 1004.0, + "kl_loss_13": 101.31270408630371, + "kl_loss_2": 2115.138153076172, + "kl_loss_4": 1260.4626708984374, + "kl_loss_9": 441.09951934814455, + "learning_rate": 0.00017017800384339925, + "loss": 986.9494, + "step": 7320 + }, + { + "ce_loss_13": 3.1509732246398925, + "ce_loss_17": 3.1032688856124877, + "ce_loss_2": 4.1420656681060795, + "ce_loss_4": 3.6995010137557984, + "ce_loss_9": 3.3060576558113097, + "epoch": 0.733, + "grad_norm": 1120.0, + "kl_loss_13": 102.62201347351075, + "kl_loss_2": 2193.893243408203, + "kl_loss_4": 1313.6682373046874, + "kl_loss_9": 460.35191802978517, + "learning_rate": 0.00016898716519459073, + "loss": 982.2604, + "step": 7330 + }, + { + "ce_loss_13": 3.2725262880325316, + "ce_loss_17": 3.2226198077201844, + "ce_loss_2": 4.260886538028717, + "ce_loss_4": 3.812357759475708, + "ce_loss_9": 3.4250252842903137, + "epoch": 0.734, + "grad_norm": 1000.0, + "kl_loss_13": 105.24427146911621, + "kl_loss_2": 2175.9248474121096, + "kl_loss_4": 1301.1353271484375, + "kl_loss_9": 464.1909408569336, + "learning_rate": 0.00016779965984135375, + "loss": 994.7641, + "step": 7340 + }, + { + "ce_loss_13": 3.1878478288650514, + "ce_loss_17": 3.140075206756592, + "ce_loss_2": 4.145504760742187, + "ce_loss_4": 3.7042070388793946, + "ce_loss_9": 3.332831788063049, + "epoch": 0.735, + "grad_norm": 976.0, + "kl_loss_13": 99.37413673400879, + "kl_loss_2": 2122.98564453125, + "kl_loss_4": 1254.6998474121094, + "kl_loss_9": 441.175715637207, + "learning_rate": 0.00016661549974185424, + "loss": 981.7744, + "step": 7350 + }, + { + "ce_loss_13": 3.2209058523178102, + "ce_loss_17": 3.173720991611481, + "ce_loss_2": 4.1701184749603275, + "ce_loss_4": 3.744451093673706, + "ce_loss_9": 3.369333827495575, + "epoch": 0.736, + "grad_norm": 1016.0, + "kl_loss_13": 103.34624824523925, + "kl_loss_2": 2116.6716369628907, + "kl_loss_4": 1276.6074157714843, + "kl_loss_9": 455.69161529541014, + "learning_rate": 0.00016543469682057105, + "loss": 975.9254, + "step": 7360 + }, + { + "ce_loss_13": 3.247960650920868, + "ce_loss_17": 3.198669099807739, + "ce_loss_2": 4.192428553104401, + "ce_loss_4": 3.7728428721427916, + "ce_loss_9": 3.39778048992157, + "epoch": 0.737, + "grad_norm": 1000.0, + "kl_loss_13": 103.9306884765625, + "kl_loss_2": 2108.7956665039064, + "kl_loss_4": 1282.23974609375, + "kl_loss_9": 456.1327133178711, + "learning_rate": 0.00016425726296817632, + "loss": 982.9344, + "step": 7370 + }, + { + "ce_loss_13": 3.2609863758087156, + "ce_loss_17": 3.2138314604759217, + "ce_loss_2": 4.191257441043854, + "ce_loss_4": 3.7736965894699095, + "ce_loss_9": 3.406478762626648, + "epoch": 0.738, + "grad_norm": 984.0, + "kl_loss_13": 100.7370590209961, + "kl_loss_2": 2062.5892944335938, + "kl_loss_4": 1242.0026062011718, + "kl_loss_9": 440.29749298095703, + "learning_rate": 0.00016308321004141607, + "loss": 971.1008, + "step": 7380 + }, + { + "ce_loss_13": 3.2182164669036863, + "ce_loss_17": 3.1670512914657594, + "ce_loss_2": 4.17956907749176, + "ce_loss_4": 3.751479411125183, + "ce_loss_9": 3.3711397647857666, + "epoch": 0.739, + "grad_norm": 1096.0, + "kl_loss_13": 104.85634346008301, + "kl_loss_2": 2129.237933349609, + "kl_loss_4": 1285.3275573730468, + "kl_loss_9": 456.14708404541017, + "learning_rate": 0.00016191254986299043, + "loss": 978.3435, + "step": 7390 + }, + { + "ce_loss_13": 3.2611135482788085, + "ce_loss_17": 3.2160486221313476, + "ce_loss_2": 4.179602634906769, + "ce_loss_4": 3.7670073986053465, + "ce_loss_9": 3.3967936873435973, + "epoch": 0.74, + "grad_norm": 1472.0, + "kl_loss_13": 99.44545249938965, + "kl_loss_2": 2073.338488769531, + "kl_loss_4": 1256.3492004394532, + "kl_loss_9": 438.98588409423826, + "learning_rate": 0.00016074529422143398, + "loss": 988.5254, + "step": 7400 + }, + { + "ce_loss_13": 3.2178966283798216, + "ce_loss_17": 3.1717904090881346, + "ce_loss_2": 4.18922336101532, + "ce_loss_4": 3.746052587032318, + "ce_loss_9": 3.367326760292053, + "epoch": 0.741, + "grad_norm": 1584.0, + "kl_loss_13": 101.81290588378906, + "kl_loss_2": 2145.7101928710936, + "kl_loss_4": 1284.2736755371093, + "kl_loss_9": 451.10104217529295, + "learning_rate": 0.0001595814548709983, + "loss": 1001.89, + "step": 7410 + }, + { + "ce_loss_13": 3.282708501815796, + "ce_loss_17": 3.231583368778229, + "ce_loss_2": 4.23915159702301, + "ce_loss_4": 3.810225987434387, + "ce_loss_9": 3.433437633514404, + "epoch": 0.742, + "grad_norm": 944.0, + "kl_loss_13": 105.40448913574218, + "kl_loss_2": 2156.3843505859377, + "kl_loss_4": 1297.7621337890625, + "kl_loss_9": 465.99937438964844, + "learning_rate": 0.00015842104353153285, + "loss": 1000.2484, + "step": 7420 + }, + { + "ce_loss_13": 3.294725751876831, + "ce_loss_17": 3.2486682176589965, + "ce_loss_2": 4.240672910213471, + "ce_loss_4": 3.8200744152069093, + "ce_loss_9": 3.4452807664871217, + "epoch": 0.743, + "grad_norm": 952.0, + "kl_loss_13": 103.84085502624512, + "kl_loss_2": 2111.9013122558595, + "kl_loss_4": 1279.530047607422, + "kl_loss_9": 453.24732971191406, + "learning_rate": 0.0001572640718883667, + "loss": 1006.1846, + "step": 7430 + }, + { + "ce_loss_13": 3.2367947340011596, + "ce_loss_17": 3.19027783870697, + "ce_loss_2": 4.160536992549896, + "ce_loss_4": 3.7421008944511414, + "ce_loss_9": 3.3772555708885195, + "epoch": 0.744, + "grad_norm": 944.0, + "kl_loss_13": 100.11829223632813, + "kl_loss_2": 2067.847723388672, + "kl_loss_4": 1241.6715454101563, + "kl_loss_9": 438.22315368652346, + "learning_rate": 0.0001561105515921915, + "loss": 990.9836, + "step": 7440 + }, + { + "ce_loss_13": 3.0914377212524413, + "ce_loss_17": 3.043926501274109, + "ce_loss_2": 4.08496071100235, + "ce_loss_4": 3.6405054688453675, + "ce_loss_9": 3.242345702648163, + "epoch": 0.745, + "grad_norm": 1112.0, + "kl_loss_13": 99.09310989379883, + "kl_loss_2": 2214.273553466797, + "kl_loss_4": 1320.3433898925782, + "kl_loss_9": 456.71732025146486, + "learning_rate": 0.0001549604942589441, + "loss": 991.5547, + "step": 7450 + }, + { + "ce_loss_13": 3.269583225250244, + "ce_loss_17": 3.2249382853507997, + "ce_loss_2": 4.167814135551453, + "ce_loss_4": 3.7584699749946595, + "ce_loss_9": 3.4090994715690615, + "epoch": 0.746, + "grad_norm": 1176.0, + "kl_loss_13": 98.48213043212891, + "kl_loss_2": 2000.6360900878906, + "kl_loss_4": 1205.6654296875, + "kl_loss_9": 429.5097290039063, + "learning_rate": 0.00015381391146968864, + "loss": 957.9197, + "step": 7460 + }, + { + "ce_loss_13": 3.245890426635742, + "ce_loss_17": 3.2003955006599427, + "ce_loss_2": 4.194105887413025, + "ce_loss_4": 3.771884572505951, + "ce_loss_9": 3.3900243043899536, + "epoch": 0.747, + "grad_norm": 1080.0, + "kl_loss_13": 98.49898338317871, + "kl_loss_2": 2089.197833251953, + "kl_loss_4": 1261.216778564453, + "kl_loss_9": 439.7403793334961, + "learning_rate": 0.00015267081477050133, + "loss": 981.1238, + "step": 7470 + }, + { + "ce_loss_13": 3.3379173636436463, + "ce_loss_17": 3.2907899141311647, + "ce_loss_2": 4.248699605464935, + "ce_loss_4": 3.8449817657470704, + "ce_loss_9": 3.482374167442322, + "epoch": 0.748, + "grad_norm": 964.0, + "kl_loss_13": 103.6711540222168, + "kl_loss_2": 2052.2494506835938, + "kl_loss_4": 1243.8148193359375, + "kl_loss_9": 450.21435089111327, + "learning_rate": 0.00015153121567235335, + "loss": 960.5336, + "step": 7480 + }, + { + "ce_loss_13": 3.2361804366111757, + "ce_loss_17": 3.190000581741333, + "ce_loss_2": 4.190605473518372, + "ce_loss_4": 3.75474009513855, + "ce_loss_9": 3.381406915187836, + "epoch": 0.749, + "grad_norm": 1360.0, + "kl_loss_13": 101.31139945983887, + "kl_loss_2": 2140.1505920410154, + "kl_loss_4": 1278.5599365234375, + "kl_loss_9": 447.76602630615236, + "learning_rate": 0.00015039512565099468, + "loss": 964.2838, + "step": 7490 + }, + { + "ce_loss_13": 3.3013733625411987, + "ce_loss_17": 3.2542645215988157, + "ce_loss_2": 4.225599646568298, + "ce_loss_4": 3.8054020285606383, + "ce_loss_9": 3.4421015858650206, + "epoch": 0.75, + "grad_norm": 856.0, + "kl_loss_13": 101.51166076660157, + "kl_loss_2": 2077.1557006835938, + "kl_loss_4": 1248.4674377441406, + "kl_loss_9": 443.1510940551758, + "learning_rate": 0.00014926255614683932, + "loss": 1011.2238, + "step": 7500 + }, + { + "ce_loss_13": 3.2377806067466737, + "ce_loss_17": 3.1915971159935, + "ce_loss_2": 4.170037150382996, + "ce_loss_4": 3.739166593551636, + "ce_loss_9": 3.380395936965942, + "epoch": 0.751, + "grad_norm": 1000.0, + "kl_loss_13": 101.47249946594238, + "kl_loss_2": 2094.5810424804686, + "kl_loss_4": 1244.3457336425781, + "kl_loss_9": 444.2610153198242, + "learning_rate": 0.0001481335185648498, + "loss": 979.9605, + "step": 7510 + }, + { + "ce_loss_13": 3.253994345664978, + "ce_loss_17": 3.2062817573547364, + "ce_loss_2": 4.191251564025879, + "ce_loss_4": 3.770757627487183, + "ce_loss_9": 3.401432716846466, + "epoch": 0.752, + "grad_norm": 1056.0, + "kl_loss_13": 101.32072486877442, + "kl_loss_2": 2097.450604248047, + "kl_loss_4": 1261.2585571289062, + "kl_loss_9": 452.81489715576174, + "learning_rate": 0.0001470080242744218, + "loss": 972.5561, + "step": 7520 + }, + { + "ce_loss_13": 3.2490274548530578, + "ce_loss_17": 3.2040793180465696, + "ce_loss_2": 4.201032567024231, + "ce_loss_4": 3.764175629615784, + "ce_loss_9": 3.3911293387413024, + "epoch": 0.753, + "grad_norm": 1200.0, + "kl_loss_13": 99.14194602966309, + "kl_loss_2": 2119.8879943847655, + "kl_loss_4": 1269.5832458496093, + "kl_loss_9": 442.64300842285155, + "learning_rate": 0.0001458860846092705, + "loss": 987.3602, + "step": 7530 + }, + { + "ce_loss_13": 3.287778925895691, + "ce_loss_17": 3.239814305305481, + "ce_loss_2": 4.192265236377716, + "ce_loss_4": 3.7909107208251953, + "ce_loss_9": 3.4307783365249636, + "epoch": 0.754, + "grad_norm": 1096.0, + "kl_loss_13": 101.26725044250489, + "kl_loss_2": 2032.7541137695312, + "kl_loss_4": 1239.180029296875, + "kl_loss_9": 441.51709899902346, + "learning_rate": 0.00014476771086731566, + "loss": 952.5381, + "step": 7540 + }, + { + "ce_loss_13": 3.3816287398338316, + "ce_loss_17": 3.3320206880569456, + "ce_loss_2": 4.303283953666687, + "ce_loss_4": 3.88857958316803, + "ce_loss_9": 3.52352888584137, + "epoch": 0.755, + "grad_norm": 996.0, + "kl_loss_13": 105.0622169494629, + "kl_loss_2": 2064.4279907226564, + "kl_loss_4": 1243.8705322265625, + "kl_loss_9": 447.0940628051758, + "learning_rate": 0.00014365291431056872, + "loss": 996.5289, + "step": 7550 + }, + { + "ce_loss_13": 3.2195640444755553, + "ce_loss_17": 3.170068109035492, + "ce_loss_2": 4.1696008205413815, + "ce_loss_4": 3.749052035808563, + "ce_loss_9": 3.3705715894699098, + "epoch": 0.756, + "grad_norm": 1256.0, + "kl_loss_13": 104.77289543151855, + "kl_loss_2": 2146.103369140625, + "kl_loss_4": 1290.6354553222657, + "kl_loss_9": 462.94439697265625, + "learning_rate": 0.00014254170616501827, + "loss": 989.2295, + "step": 7560 + }, + { + "ce_loss_13": 3.159365344047546, + "ce_loss_17": 3.1080535411834718, + "ce_loss_2": 4.161156284809112, + "ce_loss_4": 3.7167094945907593, + "ce_loss_9": 3.3157166600227357, + "epoch": 0.757, + "grad_norm": 1288.0, + "kl_loss_13": 104.5283088684082, + "kl_loss_2": 2216.742510986328, + "kl_loss_4": 1340.4740661621095, + "kl_loss_9": 471.3513717651367, + "learning_rate": 0.0001414340976205183, + "loss": 1028.4697, + "step": 7570 + }, + { + "ce_loss_13": 3.16928768157959, + "ce_loss_17": 3.120891749858856, + "ce_loss_2": 4.1503631472587585, + "ce_loss_4": 3.7084590673446653, + "ce_loss_9": 3.3225682973861694, + "epoch": 0.758, + "grad_norm": 1168.0, + "kl_loss_13": 101.72402153015136, + "kl_loss_2": 2164.8482055664062, + "kl_loss_4": 1295.7515747070313, + "kl_loss_9": 453.8439437866211, + "learning_rate": 0.00014033009983067452, + "loss": 990.4742, + "step": 7580 + }, + { + "ce_loss_13": 3.3302480936050416, + "ce_loss_17": 3.2823233127593996, + "ce_loss_2": 4.230685698986053, + "ce_loss_4": 3.8180121898651125, + "ce_loss_9": 3.470197117328644, + "epoch": 0.759, + "grad_norm": 1440.0, + "kl_loss_13": 99.38315925598144, + "kl_loss_2": 2024.4679565429688, + "kl_loss_4": 1211.806396484375, + "kl_loss_9": 432.2859634399414, + "learning_rate": 0.00013922972391273224, + "loss": 962.8225, + "step": 7590 + }, + { + "ce_loss_13": 3.3332573533058167, + "ce_loss_17": 3.285056746006012, + "ce_loss_2": 4.279339623451233, + "ce_loss_4": 3.838828182220459, + "ce_loss_9": 3.4777159094810486, + "epoch": 0.76, + "grad_norm": 1048.0, + "kl_loss_13": 100.32025680541992, + "kl_loss_2": 2095.843603515625, + "kl_loss_4": 1240.1984252929688, + "kl_loss_9": 438.59041290283204, + "learning_rate": 0.0001381329809474649, + "loss": 978.8602, + "step": 7600 + }, + { + "ce_loss_13": 3.236497700214386, + "ce_loss_17": 3.185157799720764, + "ce_loss_2": 4.217888844013214, + "ce_loss_4": 3.7790570855140686, + "ce_loss_9": 3.3904631614685057, + "epoch": 0.761, + "grad_norm": 1104.0, + "kl_loss_13": 104.97152099609374, + "kl_loss_2": 2193.6860168457033, + "kl_loss_4": 1309.6801177978516, + "kl_loss_9": 466.25842437744143, + "learning_rate": 0.0001370398819790621, + "loss": 1008.5524, + "step": 7610 + }, + { + "ce_loss_13": 3.3669647932052613, + "ce_loss_17": 3.320329523086548, + "ce_loss_2": 4.287932395935059, + "ce_loss_4": 3.8653638124465943, + "ce_loss_9": 3.5080355525016786, + "epoch": 0.762, + "grad_norm": 1264.0, + "kl_loss_13": 101.7803783416748, + "kl_loss_2": 2048.445977783203, + "kl_loss_4": 1231.3599426269532, + "kl_loss_9": 442.16017456054686, + "learning_rate": 0.00013595043801501794, + "loss": 960.0238, + "step": 7620 + }, + { + "ce_loss_13": 3.1705962777137757, + "ce_loss_17": 3.122034823894501, + "ce_loss_2": 4.193672704696655, + "ce_loss_4": 3.7323740124702454, + "ce_loss_9": 3.325533401966095, + "epoch": 0.763, + "grad_norm": 1456.0, + "kl_loss_13": 103.72077293395996, + "kl_loss_2": 2254.3762451171874, + "kl_loss_4": 1344.2235290527344, + "kl_loss_9": 466.3963684082031, + "learning_rate": 0.00013486466002602133, + "loss": 1012.6096, + "step": 7630 + }, + { + "ce_loss_13": 3.283672797679901, + "ce_loss_17": 3.237477791309357, + "ce_loss_2": 4.1970849394798275, + "ce_loss_4": 3.7829890608787538, + "ce_loss_9": 3.4260783910751345, + "epoch": 0.764, + "grad_norm": 1104.0, + "kl_loss_13": 100.7426700592041, + "kl_loss_2": 2059.8158935546876, + "kl_loss_4": 1240.9336364746093, + "kl_loss_9": 443.3312591552734, + "learning_rate": 0.00013378255894584462, + "loss": 994.4663, + "step": 7640 + }, + { + "ce_loss_13": 3.2206910967826845, + "ce_loss_17": 3.1713557839393616, + "ce_loss_2": 4.189263188838959, + "ce_loss_4": 3.7519478797912598, + "ce_loss_9": 3.370849001407623, + "epoch": 0.765, + "grad_norm": 936.0, + "kl_loss_13": 103.36294860839844, + "kl_loss_2": 2148.8883666992188, + "kl_loss_4": 1287.0466247558593, + "kl_loss_9": 456.6633544921875, + "learning_rate": 0.0001327041456712334, + "loss": 998.6118, + "step": 7650 + }, + { + "ce_loss_13": 3.255833184719086, + "ce_loss_17": 3.204713594913483, + "ce_loss_2": 4.197321557998658, + "ce_loss_4": 3.7753271102905273, + "ce_loss_9": 3.4034424424171448, + "epoch": 0.766, + "grad_norm": 956.0, + "kl_loss_13": 102.53512382507324, + "kl_loss_2": 2109.8352233886717, + "kl_loss_4": 1277.1186889648438, + "kl_loss_9": 454.2614044189453, + "learning_rate": 0.00013162943106179747, + "loss": 994.7248, + "step": 7660 + }, + { + "ce_loss_13": 3.2431095600128175, + "ce_loss_17": 3.1949783682823183, + "ce_loss_2": 4.1670672535896305, + "ce_loss_4": 3.7470529317855834, + "ce_loss_9": 3.380710184574127, + "epoch": 0.767, + "grad_norm": 1012.0, + "kl_loss_13": 101.14639739990234, + "kl_loss_2": 2076.0106689453123, + "kl_loss_4": 1261.167706298828, + "kl_loss_9": 444.8407943725586, + "learning_rate": 0.00013055842593990132, + "loss": 974.6033, + "step": 7670 + }, + { + "ce_loss_13": 3.184472131729126, + "ce_loss_17": 3.138358461856842, + "ce_loss_2": 4.121903717517853, + "ce_loss_4": 3.6949310183525084, + "ce_loss_9": 3.3316203951835632, + "epoch": 0.768, + "grad_norm": 980.0, + "kl_loss_13": 99.72313537597657, + "kl_loss_2": 2076.2475158691404, + "kl_loss_4": 1245.8335876464844, + "kl_loss_9": 443.2247619628906, + "learning_rate": 0.00012949114109055414, + "loss": 992.0391, + "step": 7680 + }, + { + "ce_loss_13": 3.228347909450531, + "ce_loss_17": 3.1773066878318788, + "ce_loss_2": 4.177132475376129, + "ce_loss_4": 3.7568712711334227, + "ce_loss_9": 3.3794357776641846, + "epoch": 0.769, + "grad_norm": 1032.0, + "kl_loss_13": 101.52271690368653, + "kl_loss_2": 2119.754083251953, + "kl_loss_4": 1283.0621337890625, + "kl_loss_9": 453.93002166748045, + "learning_rate": 0.00012842758726130281, + "loss": 997.065, + "step": 7690 + }, + { + "ce_loss_13": 3.265441524982452, + "ce_loss_17": 3.216144061088562, + "ce_loss_2": 4.244739127159119, + "ce_loss_4": 3.803415894508362, + "ce_loss_9": 3.4191662311553954, + "epoch": 0.77, + "grad_norm": 908.0, + "kl_loss_13": 103.63887405395508, + "kl_loss_2": 2154.948620605469, + "kl_loss_4": 1287.4507873535156, + "kl_loss_9": 459.24183502197263, + "learning_rate": 0.00012736777516212267, + "loss": 984.2212, + "step": 7700 + }, + { + "ce_loss_13": 3.266558563709259, + "ce_loss_17": 3.217825412750244, + "ce_loss_2": 4.210503566265106, + "ce_loss_4": 3.7817785859107973, + "ce_loss_9": 3.413377857208252, + "epoch": 0.771, + "grad_norm": 912.0, + "kl_loss_13": 103.07483367919922, + "kl_loss_2": 2116.633093261719, + "kl_loss_4": 1273.910546875, + "kl_loss_9": 458.2764465332031, + "learning_rate": 0.00012631171546530968, + "loss": 972.7604, + "step": 7710 + }, + { + "ce_loss_13": 3.2736518502235414, + "ce_loss_17": 3.2228816151618958, + "ce_loss_2": 4.2132275223732, + "ce_loss_4": 3.7989086270332337, + "ce_loss_9": 3.4256322145462037, + "epoch": 0.772, + "grad_norm": 956.0, + "kl_loss_13": 104.09769287109376, + "kl_loss_2": 2095.392926025391, + "kl_loss_4": 1275.789239501953, + "kl_loss_9": 456.71444244384764, + "learning_rate": 0.00012525941880537307, + "loss": 999.7916, + "step": 7720 + }, + { + "ce_loss_13": 3.3036232590675354, + "ce_loss_17": 3.2566508531570433, + "ce_loss_2": 4.2416020154953005, + "ce_loss_4": 3.82111679315567, + "ce_loss_9": 3.447795498371124, + "epoch": 0.773, + "grad_norm": 964.0, + "kl_loss_13": 101.40843238830567, + "kl_loss_2": 2078.2499877929686, + "kl_loss_4": 1251.6986694335938, + "kl_loss_9": 440.6107009887695, + "learning_rate": 0.00012421089577892869, + "loss": 978.3182, + "step": 7730 + }, + { + "ce_loss_13": 3.2610741019248963, + "ce_loss_17": 3.2119283437728883, + "ce_loss_2": 4.22058265209198, + "ce_loss_4": 3.78484765291214, + "ce_loss_9": 3.4089120388031007, + "epoch": 0.774, + "grad_norm": 1152.0, + "kl_loss_13": 103.04828567504883, + "kl_loss_2": 2149.300653076172, + "kl_loss_4": 1291.3053649902345, + "kl_loss_9": 459.15892333984374, + "learning_rate": 0.0001231661569445919, + "loss": 995.4051, + "step": 7740 + }, + { + "ce_loss_13": 3.1244083762168886, + "ce_loss_17": 3.076567268371582, + "ce_loss_2": 4.086958360671997, + "ce_loss_4": 3.6492671728134156, + "ce_loss_9": 3.2687422156333925, + "epoch": 0.775, + "grad_norm": 1000.0, + "kl_loss_13": 101.32084350585937, + "kl_loss_2": 2136.0405151367186, + "kl_loss_4": 1275.9732666015625, + "kl_loss_9": 452.86229248046874, + "learning_rate": 0.00012212521282287093, + "loss": 1008.7845, + "step": 7750 + }, + { + "ce_loss_13": 3.2651302099227903, + "ce_loss_17": 3.21563835144043, + "ce_loss_2": 4.199793183803559, + "ce_loss_4": 3.783316135406494, + "ce_loss_9": 3.4151036858558657, + "epoch": 0.776, + "grad_norm": 1512.0, + "kl_loss_13": 103.44102554321289, + "kl_loss_2": 2081.4614318847657, + "kl_loss_4": 1261.7095764160156, + "kl_loss_9": 452.03993835449216, + "learning_rate": 0.00012108807389606158, + "loss": 999.6672, + "step": 7760 + }, + { + "ce_loss_13": 3.260496425628662, + "ce_loss_17": 3.2144153714179993, + "ce_loss_2": 4.200718879699707, + "ce_loss_4": 3.7767830014228823, + "ce_loss_9": 3.405388343334198, + "epoch": 0.777, + "grad_norm": 1280.0, + "kl_loss_13": 99.42475051879883, + "kl_loss_2": 2089.568493652344, + "kl_loss_4": 1259.119384765625, + "kl_loss_9": 441.4059371948242, + "learning_rate": 0.00012005475060814159, + "loss": 976.4806, + "step": 7770 + }, + { + "ce_loss_13": 3.199585807323456, + "ce_loss_17": 3.1530317902565, + "ce_loss_2": 4.172993063926697, + "ce_loss_4": 3.734945833683014, + "ce_loss_9": 3.3503398180007933, + "epoch": 0.778, + "grad_norm": 1096.0, + "kl_loss_13": 102.36463890075683, + "kl_loss_2": 2172.3350341796877, + "kl_loss_4": 1302.6199096679688, + "kl_loss_9": 455.3308990478516, + "learning_rate": 0.00011902525336466464, + "loss": 996.6336, + "step": 7780 + }, + { + "ce_loss_13": 3.1970359563827513, + "ce_loss_17": 3.1460973501205443, + "ce_loss_2": 4.183026456832886, + "ce_loss_4": 3.739704465866089, + "ce_loss_9": 3.3491469383239747, + "epoch": 0.779, + "grad_norm": 1312.0, + "kl_loss_13": 104.08134574890137, + "kl_loss_2": 2197.7756103515626, + "kl_loss_4": 1320.008123779297, + "kl_loss_9": 464.2852355957031, + "learning_rate": 0.00011799959253265668, + "loss": 996.9018, + "step": 7790 + }, + { + "ce_loss_13": 3.2524810671806335, + "ce_loss_17": 3.202907085418701, + "ce_loss_2": 4.2066610455513, + "ce_loss_4": 3.774940252304077, + "ce_loss_9": 3.39757376909256, + "epoch": 0.78, + "grad_norm": 972.0, + "kl_loss_13": 103.14059524536133, + "kl_loss_2": 2140.4659057617187, + "kl_loss_4": 1285.6123168945312, + "kl_loss_9": 457.75340881347654, + "learning_rate": 0.00011697777844051105, + "loss": 992.0883, + "step": 7800 + }, + { + "ce_loss_13": 3.2353628873825073, + "ce_loss_17": 3.1870660066604612, + "ce_loss_2": 4.229414403438568, + "ce_loss_4": 3.7797078251838685, + "ce_loss_9": 3.385956120491028, + "epoch": 0.781, + "grad_norm": 1112.0, + "kl_loss_13": 103.35344543457032, + "kl_loss_2": 2203.8872985839844, + "kl_loss_4": 1320.7568481445312, + "kl_loss_9": 456.38891754150393, + "learning_rate": 0.00011595982137788402, + "loss": 1005.192, + "step": 7810 + }, + { + "ce_loss_13": 3.2123388171195986, + "ce_loss_17": 3.1662863969802855, + "ce_loss_2": 4.140447068214416, + "ce_loss_4": 3.7170390367507933, + "ce_loss_9": 3.35581716299057, + "epoch": 0.782, + "grad_norm": 1024.0, + "kl_loss_13": 100.07517471313477, + "kl_loss_2": 2059.298260498047, + "kl_loss_4": 1239.4554382324218, + "kl_loss_9": 442.8959136962891, + "learning_rate": 0.00011494573159559212, + "loss": 977.8385, + "step": 7820 + }, + { + "ce_loss_13": 3.19861501455307, + "ce_loss_17": 3.151161313056946, + "ce_loss_2": 4.14994592666626, + "ce_loss_4": 3.7293417692184447, + "ce_loss_9": 3.347096335887909, + "epoch": 0.783, + "grad_norm": 928.0, + "kl_loss_13": 102.2232894897461, + "kl_loss_2": 2122.2062927246093, + "kl_loss_4": 1289.788104248047, + "kl_loss_9": 449.8838516235352, + "learning_rate": 0.00011393551930550828, + "loss": 1007.6727, + "step": 7830 + }, + { + "ce_loss_13": 3.3309919476509093, + "ce_loss_17": 3.285180854797363, + "ce_loss_2": 4.243929171562195, + "ce_loss_4": 3.83711017370224, + "ce_loss_9": 3.479169476032257, + "epoch": 0.784, + "grad_norm": 1024.0, + "kl_loss_13": 103.51986732482911, + "kl_loss_2": 2056.4778686523437, + "kl_loss_4": 1253.28623046875, + "kl_loss_9": 452.3572174072266, + "learning_rate": 0.00011292919468045875, + "loss": 973.4789, + "step": 7840 + }, + { + "ce_loss_13": 3.2882464647293093, + "ce_loss_17": 3.2412719130516052, + "ce_loss_2": 4.232723808288574, + "ce_loss_4": 3.8036736130714415, + "ce_loss_9": 3.4296405792236326, + "epoch": 0.785, + "grad_norm": 1152.0, + "kl_loss_13": 102.11189575195313, + "kl_loss_2": 2098.169873046875, + "kl_loss_4": 1266.2834411621093, + "kl_loss_9": 448.7361831665039, + "learning_rate": 0.00011192676785412154, + "loss": 976.3126, + "step": 7850 + }, + { + "ce_loss_13": 3.2317895889282227, + "ce_loss_17": 3.177842390537262, + "ce_loss_2": 4.208509302139282, + "ce_loss_4": 3.7691295862197878, + "ce_loss_9": 3.381120133399963, + "epoch": 0.786, + "grad_norm": 984.0, + "kl_loss_13": 103.42075996398925, + "kl_loss_2": 2159.717492675781, + "kl_loss_4": 1296.4408447265625, + "kl_loss_9": 454.42005920410156, + "learning_rate": 0.00011092824892092374, + "loss": 995.3005, + "step": 7860 + }, + { + "ce_loss_13": 3.165933883190155, + "ce_loss_17": 3.116780197620392, + "ce_loss_2": 4.153436815738678, + "ce_loss_4": 3.706295597553253, + "ce_loss_9": 3.3170262455940245, + "epoch": 0.787, + "grad_norm": 908.0, + "kl_loss_13": 101.23736534118652, + "kl_loss_2": 2189.7651184082033, + "kl_loss_4": 1309.5819763183595, + "kl_loss_9": 456.9874465942383, + "learning_rate": 0.0001099336479359398, + "loss": 987.6969, + "step": 7870 + }, + { + "ce_loss_13": 3.285086214542389, + "ce_loss_17": 3.238777422904968, + "ce_loss_2": 4.206076502799988, + "ce_loss_4": 3.7897098541259764, + "ce_loss_9": 3.4294825553894044, + "epoch": 0.788, + "grad_norm": 868.0, + "kl_loss_13": 101.20943107604981, + "kl_loss_2": 2067.07255859375, + "kl_loss_4": 1249.7016174316407, + "kl_loss_9": 450.14626159667966, + "learning_rate": 0.00010894297491479043, + "loss": 980.3164, + "step": 7880 + }, + { + "ce_loss_13": 3.271121025085449, + "ce_loss_17": 3.2255475759506225, + "ce_loss_2": 4.218229627609253, + "ce_loss_4": 3.7828092336654664, + "ce_loss_9": 3.414362847805023, + "epoch": 0.789, + "grad_norm": 1016.0, + "kl_loss_13": 101.39401206970214, + "kl_loss_2": 2098.4370361328124, + "kl_loss_4": 1256.4552856445312, + "kl_loss_9": 445.91014099121094, + "learning_rate": 0.00010795623983354214, + "loss": 973.8822, + "step": 7890 + }, + { + "ce_loss_13": 3.1709980964660645, + "ce_loss_17": 3.123633337020874, + "ce_loss_2": 4.134852206707, + "ce_loss_4": 3.7005024552345276, + "ce_loss_9": 3.325609254837036, + "epoch": 0.79, + "grad_norm": 940.0, + "kl_loss_13": 104.57198944091797, + "kl_loss_2": 2163.441650390625, + "kl_loss_4": 1303.5415954589844, + "kl_loss_9": 465.8653900146484, + "learning_rate": 0.00010697345262860636, + "loss": 992.7499, + "step": 7900 + }, + { + "ce_loss_13": 3.3076335191726685, + "ce_loss_17": 3.260413885116577, + "ce_loss_2": 4.229410541057587, + "ce_loss_4": 3.803511071205139, + "ce_loss_9": 3.448054051399231, + "epoch": 0.791, + "grad_norm": 1312.0, + "kl_loss_13": 101.77360725402832, + "kl_loss_2": 2073.8045166015627, + "kl_loss_4": 1237.9384033203125, + "kl_loss_9": 446.22881774902345, + "learning_rate": 0.00010599462319663906, + "loss": 967.693, + "step": 7910 + }, + { + "ce_loss_13": 3.2776466846466064, + "ce_loss_17": 3.2310521602630615, + "ce_loss_2": 4.183774328231811, + "ce_loss_4": 3.7728922963142395, + "ce_loss_9": 3.413072574138641, + "epoch": 0.792, + "grad_norm": 872.0, + "kl_loss_13": 100.58659286499024, + "kl_loss_2": 2033.0189331054687, + "kl_loss_4": 1224.5311645507813, + "kl_loss_9": 437.2007431030273, + "learning_rate": 0.00010501976139444191, + "loss": 956.9797, + "step": 7920 + }, + { + "ce_loss_13": 3.30449059009552, + "ce_loss_17": 3.2558288097381594, + "ce_loss_2": 4.230979406833649, + "ce_loss_4": 3.8139089584350585, + "ce_loss_9": 3.4463000535964965, + "epoch": 0.793, + "grad_norm": 1304.0, + "kl_loss_13": 101.18901672363282, + "kl_loss_2": 2078.9392944335937, + "kl_loss_4": 1254.8287658691406, + "kl_loss_9": 442.39510955810545, + "learning_rate": 0.0001040488770388625, + "loss": 987.5436, + "step": 7930 + }, + { + "ce_loss_13": 3.2569679737091066, + "ce_loss_17": 3.210511040687561, + "ce_loss_2": 4.200649857521057, + "ce_loss_4": 3.7724182486534117, + "ce_loss_9": 3.4011685490608214, + "epoch": 0.794, + "grad_norm": 1072.0, + "kl_loss_13": 102.23939933776856, + "kl_loss_2": 2127.973455810547, + "kl_loss_4": 1279.2343688964843, + "kl_loss_9": 453.9692443847656, + "learning_rate": 0.00010308197990669538, + "loss": 982.5158, + "step": 7940 + }, + { + "ce_loss_13": 3.364276683330536, + "ce_loss_17": 3.3144590258598328, + "ce_loss_2": 4.298801422119141, + "ce_loss_4": 3.8767467021942137, + "ce_loss_9": 3.508592891693115, + "epoch": 0.795, + "grad_norm": 1048.0, + "kl_loss_13": 104.11300544738769, + "kl_loss_2": 2095.551922607422, + "kl_loss_4": 1261.4102355957032, + "kl_loss_9": 451.5866165161133, + "learning_rate": 0.0001021190797345839, + "loss": 973.609, + "step": 7950 + }, + { + "ce_loss_13": 3.0919105768203736, + "ce_loss_17": 3.044080901145935, + "ce_loss_2": 4.09787026643753, + "ce_loss_4": 3.6502553701400755, + "ce_loss_9": 3.2520545363426208, + "epoch": 0.796, + "grad_norm": 1328.0, + "kl_loss_13": 106.24812927246094, + "kl_loss_2": 2227.2838134765625, + "kl_loss_4": 1339.3446228027344, + "kl_loss_9": 477.76061096191404, + "learning_rate": 0.00010116018621892236, + "loss": 1006.2994, + "step": 7960 + }, + { + "ce_loss_13": 3.299259805679321, + "ce_loss_17": 3.2519605994224547, + "ce_loss_2": 4.256993734836579, + "ce_loss_4": 3.8238647103309633, + "ce_loss_9": 3.4542850136756895, + "epoch": 0.797, + "grad_norm": 1160.0, + "kl_loss_13": 106.80334930419922, + "kl_loss_2": 2143.125836181641, + "kl_loss_4": 1301.5462768554687, + "kl_loss_9": 469.33297271728514, + "learning_rate": 0.00010020530901575753, + "loss": 976.3502, + "step": 7970 + }, + { + "ce_loss_13": 3.3236821055412293, + "ce_loss_17": 3.276770460605621, + "ce_loss_2": 4.253068566322327, + "ce_loss_4": 3.836443018913269, + "ce_loss_9": 3.46806663274765, + "epoch": 0.798, + "grad_norm": 836.0, + "kl_loss_13": 103.52790412902831, + "kl_loss_2": 2093.7552734375, + "kl_loss_4": 1270.1452270507812, + "kl_loss_9": 453.0150802612305, + "learning_rate": 9.925445774069231e-05, + "loss": 964.5483, + "step": 7980 + }, + { + "ce_loss_13": 3.2782413244247435, + "ce_loss_17": 3.228104555606842, + "ce_loss_2": 4.22121022939682, + "ce_loss_4": 3.7972465634346007, + "ce_loss_9": 3.424219560623169, + "epoch": 0.799, + "grad_norm": 1200.0, + "kl_loss_13": 101.96302680969238, + "kl_loss_2": 2078.939141845703, + "kl_loss_4": 1249.4914245605469, + "kl_loss_9": 445.1977874755859, + "learning_rate": 9.830764196878872e-05, + "loss": 958.9357, + "step": 7990 + }, + { + "ce_loss_13": 3.2221267223358154, + "ce_loss_17": 3.1738629221916197, + "ce_loss_2": 4.178364479541779, + "ce_loss_4": 3.747695779800415, + "ce_loss_9": 3.3700037002563477, + "epoch": 0.8, + "grad_norm": 1168.0, + "kl_loss_13": 101.14788856506348, + "kl_loss_2": 2155.2220703125, + "kl_loss_4": 1296.5777282714844, + "kl_loss_9": 451.07776641845703, + "learning_rate": 9.736487123447069e-05, + "loss": 988.9884, + "step": 8000 + }, + { + "ce_loss_13": 3.1704826235771177, + "ce_loss_17": 3.123217225074768, + "ce_loss_2": 4.1758524656295775, + "ce_loss_4": 3.714837110042572, + "ce_loss_9": 3.3182682514190676, + "epoch": 0.801, + "grad_norm": 928.0, + "kl_loss_13": 102.08486976623536, + "kl_loss_2": 2241.2819580078126, + "kl_loss_4": 1330.4576782226563, + "kl_loss_9": 459.6312774658203, + "learning_rate": 9.642615503142926e-05, + "loss": 1012.2793, + "step": 8010 + }, + { + "ce_loss_13": 3.237340545654297, + "ce_loss_17": 3.190611445903778, + "ce_loss_2": 4.199431025981903, + "ce_loss_4": 3.7531250715255737, + "ce_loss_9": 3.3862797021865845, + "epoch": 0.802, + "grad_norm": 988.0, + "kl_loss_13": 100.68223152160644, + "kl_loss_2": 2142.8732666015626, + "kl_loss_4": 1268.0432250976562, + "kl_loss_9": 447.0963668823242, + "learning_rate": 9.549150281252633e-05, + "loss": 976.3664, + "step": 8020 + }, + { + "ce_loss_13": 3.264307129383087, + "ce_loss_17": 3.2160903811454773, + "ce_loss_2": 4.213959002494812, + "ce_loss_4": 3.776724672317505, + "ce_loss_9": 3.4120821714401246, + "epoch": 0.803, + "grad_norm": 888.0, + "kl_loss_13": 102.52159156799317, + "kl_loss_2": 2122.230120849609, + "kl_loss_4": 1262.4470458984374, + "kl_loss_9": 449.2831726074219, + "learning_rate": 9.4560923989699e-05, + "loss": 994.8857, + "step": 8030 + }, + { + "ce_loss_13": 3.256777024269104, + "ce_loss_17": 3.2098089933395384, + "ce_loss_2": 4.197511219978333, + "ce_loss_4": 3.7704452157020567, + "ce_loss_9": 3.4072154641151426, + "epoch": 0.804, + "grad_norm": 1144.0, + "kl_loss_13": 102.05184783935547, + "kl_loss_2": 2088.0073974609377, + "kl_loss_4": 1250.928515625, + "kl_loss_9": 449.11826171875, + "learning_rate": 9.363442793386607e-05, + "loss": 994.9217, + "step": 8040 + }, + { + "ce_loss_13": 3.229873847961426, + "ce_loss_17": 3.179894042015076, + "ce_loss_2": 4.21276445388794, + "ce_loss_4": 3.7755220413208006, + "ce_loss_9": 3.383824384212494, + "epoch": 0.805, + "grad_norm": 1384.0, + "kl_loss_13": 104.52713813781739, + "kl_loss_2": 2156.66845703125, + "kl_loss_4": 1308.6089965820313, + "kl_loss_9": 463.5359146118164, + "learning_rate": 9.271202397483213e-05, + "loss": 978.8461, + "step": 8050 + }, + { + "ce_loss_13": 3.262869918346405, + "ce_loss_17": 3.217521011829376, + "ce_loss_2": 4.191404068470002, + "ce_loss_4": 3.760555636882782, + "ce_loss_9": 3.403642737865448, + "epoch": 0.806, + "grad_norm": 1096.0, + "kl_loss_13": 99.6948631286621, + "kl_loss_2": 2083.3760375976562, + "kl_loss_4": 1242.0854431152343, + "kl_loss_9": 443.4048080444336, + "learning_rate": 9.179372140119524e-05, + "loss": 988.4973, + "step": 8060 + }, + { + "ce_loss_13": 3.2076014161109923, + "ce_loss_17": 3.1601006031036376, + "ce_loss_2": 4.141274046897888, + "ce_loss_4": 3.7182689189910887, + "ce_loss_9": 3.349162769317627, + "epoch": 0.807, + "grad_norm": 1000.0, + "kl_loss_13": 100.31297187805175, + "kl_loss_2": 2098.6954345703125, + "kl_loss_4": 1251.3703186035157, + "kl_loss_9": 444.0520751953125, + "learning_rate": 9.087952946025175e-05, + "loss": 990.7295, + "step": 8070 + }, + { + "ce_loss_13": 3.3127268314361573, + "ce_loss_17": 3.2658475399017335, + "ce_loss_2": 4.210246276855469, + "ce_loss_4": 3.800182580947876, + "ce_loss_9": 3.45171856880188, + "epoch": 0.808, + "grad_norm": 996.0, + "kl_loss_13": 99.853125, + "kl_loss_2": 2023.9295043945312, + "kl_loss_4": 1211.0155456542968, + "kl_loss_9": 432.6921127319336, + "learning_rate": 8.996945735790446e-05, + "loss": 975.716, + "step": 8080 + }, + { + "ce_loss_13": 3.2095915198326113, + "ce_loss_17": 3.1645363569259644, + "ce_loss_2": 4.1415794610977175, + "ce_loss_4": 3.7311418056488037, + "ce_loss_9": 3.356985855102539, + "epoch": 0.809, + "grad_norm": 1192.0, + "kl_loss_13": 101.8376564025879, + "kl_loss_2": 2099.4591796875, + "kl_loss_4": 1276.8267517089844, + "kl_loss_9": 451.8041000366211, + "learning_rate": 8.906351425856951e-05, + "loss": 992.0324, + "step": 8090 + }, + { + "ce_loss_13": 3.1945524334907534, + "ce_loss_17": 3.147158706188202, + "ce_loss_2": 4.154912292957306, + "ce_loss_4": 3.7292218446731566, + "ce_loss_9": 3.342159354686737, + "epoch": 0.81, + "grad_norm": 1240.0, + "kl_loss_13": 102.25700950622559, + "kl_loss_2": 2160.9501708984376, + "kl_loss_4": 1305.7983276367188, + "kl_loss_9": 455.7219268798828, + "learning_rate": 8.816170928508365e-05, + "loss": 1001.9814, + "step": 8100 + }, + { + "ce_loss_13": 3.1611883997917176, + "ce_loss_17": 3.1136874318122865, + "ce_loss_2": 4.156942415237427, + "ce_loss_4": 3.7047573208808897, + "ce_loss_9": 3.3096653699874876, + "epoch": 0.811, + "grad_norm": 944.0, + "kl_loss_13": 102.17304496765136, + "kl_loss_2": 2208.650274658203, + "kl_loss_4": 1317.0377014160156, + "kl_loss_9": 456.44727478027346, + "learning_rate": 8.7264051518613e-05, + "loss": 1001.4947, + "step": 8110 + }, + { + "ce_loss_13": 3.2558268189430235, + "ce_loss_17": 3.2112725734710694, + "ce_loss_2": 4.172137558460236, + "ce_loss_4": 3.755516004562378, + "ce_loss_9": 3.393725323677063, + "epoch": 0.812, + "grad_norm": 1012.0, + "kl_loss_13": 99.35440063476562, + "kl_loss_2": 2055.748431396484, + "kl_loss_4": 1239.678271484375, + "kl_loss_9": 439.4017578125, + "learning_rate": 8.637054999856148e-05, + "loss": 973.4494, + "step": 8120 + }, + { + "ce_loss_13": 3.2383846640586853, + "ce_loss_17": 3.18854444026947, + "ce_loss_2": 4.193518245220185, + "ce_loss_4": 3.7635922908782957, + "ce_loss_9": 3.388272774219513, + "epoch": 0.813, + "grad_norm": 916.0, + "kl_loss_13": 103.3952850341797, + "kl_loss_2": 2128.638818359375, + "kl_loss_4": 1284.4596435546875, + "kl_loss_9": 452.1752243041992, + "learning_rate": 8.548121372247918e-05, + "loss": 1003.1381, + "step": 8130 + }, + { + "ce_loss_13": 3.3107922434806825, + "ce_loss_17": 3.2646395564079285, + "ce_loss_2": 4.228652036190033, + "ce_loss_4": 3.8030380249023437, + "ce_loss_9": 3.4463945746421816, + "epoch": 0.814, + "grad_norm": 1392.0, + "kl_loss_13": 100.93543128967285, + "kl_loss_2": 2084.018475341797, + "kl_loss_4": 1243.06044921875, + "kl_loss_9": 441.40815887451174, + "learning_rate": 8.459605164597267e-05, + "loss": 973.2101, + "step": 8140 + }, + { + "ce_loss_13": 3.194267153739929, + "ce_loss_17": 3.147884750366211, + "ce_loss_2": 4.142776048183441, + "ce_loss_4": 3.71869353055954, + "ce_loss_9": 3.336175727844238, + "epoch": 0.815, + "grad_norm": 972.0, + "kl_loss_13": 100.9654052734375, + "kl_loss_2": 2112.378308105469, + "kl_loss_4": 1278.4134704589844, + "kl_loss_9": 446.9718948364258, + "learning_rate": 8.371507268261436e-05, + "loss": 988.1336, + "step": 8150 + }, + { + "ce_loss_13": 3.269476282596588, + "ce_loss_17": 3.220861256122589, + "ce_loss_2": 4.221264338493347, + "ce_loss_4": 3.7841190695762634, + "ce_loss_9": 3.419230115413666, + "epoch": 0.816, + "grad_norm": 948.0, + "kl_loss_13": 101.87098503112793, + "kl_loss_2": 2110.8914001464846, + "kl_loss_4": 1259.2027770996094, + "kl_loss_9": 450.92470092773436, + "learning_rate": 8.283828570385238e-05, + "loss": 963.8521, + "step": 8160 + }, + { + "ce_loss_13": 3.26354079246521, + "ce_loss_17": 3.2155607461929323, + "ce_loss_2": 4.209452056884766, + "ce_loss_4": 3.7787928223609923, + "ce_loss_9": 3.4093036532402037, + "epoch": 0.817, + "grad_norm": 1016.0, + "kl_loss_13": 102.47528419494628, + "kl_loss_2": 2086.3271362304686, + "kl_loss_4": 1251.88671875, + "kl_loss_9": 447.65564422607423, + "learning_rate": 8.196569953892202e-05, + "loss": 979.234, + "step": 8170 + }, + { + "ce_loss_13": 3.1943814277648928, + "ce_loss_17": 3.1454592823982237, + "ce_loss_2": 4.142535817623139, + "ce_loss_4": 3.721554160118103, + "ce_loss_9": 3.344161856174469, + "epoch": 0.818, + "grad_norm": 904.0, + "kl_loss_13": 102.15505790710449, + "kl_loss_2": 2102.592907714844, + "kl_loss_4": 1277.155987548828, + "kl_loss_9": 454.5060607910156, + "learning_rate": 8.109732297475635e-05, + "loss": 974.7617, + "step": 8180 + }, + { + "ce_loss_13": 3.162568008899689, + "ce_loss_17": 3.111032009124756, + "ce_loss_2": 4.179723310470581, + "ce_loss_4": 3.7297472953796387, + "ce_loss_9": 3.3250279784202577, + "epoch": 0.819, + "grad_norm": 1280.0, + "kl_loss_13": 105.54817428588868, + "kl_loss_2": 2219.075958251953, + "kl_loss_4": 1347.22265625, + "kl_loss_9": 478.760791015625, + "learning_rate": 8.023316475589754e-05, + "loss": 1017.3179, + "step": 8190 + }, + { + "ce_loss_13": 3.129737615585327, + "ce_loss_17": 3.0786852717399595, + "ce_loss_2": 4.173572134971619, + "ce_loss_4": 3.700889503955841, + "ce_loss_9": 3.291579246520996, + "epoch": 0.82, + "grad_norm": 1416.0, + "kl_loss_13": 107.01251220703125, + "kl_loss_2": 2289.6451904296873, + "kl_loss_4": 1367.956201171875, + "kl_loss_9": 478.4766906738281, + "learning_rate": 7.937323358440934e-05, + "loss": 1032.0778, + "step": 8200 + }, + { + "ce_loss_13": 3.246927225589752, + "ce_loss_17": 3.202515149116516, + "ce_loss_2": 4.159306156635284, + "ce_loss_4": 3.747850489616394, + "ce_loss_9": 3.3908654928207396, + "epoch": 0.821, + "grad_norm": 908.0, + "kl_loss_13": 100.4887752532959, + "kl_loss_2": 2051.482458496094, + "kl_loss_4": 1246.8281860351562, + "kl_loss_9": 441.79815063476565, + "learning_rate": 7.851753811978923e-05, + "loss": 975.0311, + "step": 8210 + }, + { + "ce_loss_13": 3.267043209075928, + "ce_loss_17": 3.2202131032943724, + "ce_loss_2": 4.218369567394257, + "ce_loss_4": 3.7910293459892275, + "ce_loss_9": 3.412215805053711, + "epoch": 0.822, + "grad_norm": 1064.0, + "kl_loss_13": 102.62970008850098, + "kl_loss_2": 2127.5798400878907, + "kl_loss_4": 1287.9249694824218, + "kl_loss_9": 447.6311492919922, + "learning_rate": 7.766608697888095e-05, + "loss": 980.2822, + "step": 8220 + }, + { + "ce_loss_13": 3.2748966097831724, + "ce_loss_17": 3.228378343582153, + "ce_loss_2": 4.23167929649353, + "ce_loss_4": 3.795110809803009, + "ce_loss_9": 3.4215556740760804, + "epoch": 0.823, + "grad_norm": 956.0, + "kl_loss_13": 103.45522193908691, + "kl_loss_2": 2135.0842224121093, + "kl_loss_4": 1277.23076171875, + "kl_loss_9": 452.83446197509767, + "learning_rate": 7.681888873578785e-05, + "loss": 995.732, + "step": 8230 + }, + { + "ce_loss_13": 3.2074190258979796, + "ce_loss_17": 3.1561907529830933, + "ce_loss_2": 4.1820849776268005, + "ce_loss_4": 3.738399529457092, + "ce_loss_9": 3.3608216762542726, + "epoch": 0.824, + "grad_norm": 992.0, + "kl_loss_13": 104.77609939575196, + "kl_loss_2": 2163.8196655273437, + "kl_loss_4": 1287.7794677734375, + "kl_loss_9": 460.43443603515624, + "learning_rate": 7.597595192178702e-05, + "loss": 986.9766, + "step": 8240 + }, + { + "ce_loss_13": 3.203732705116272, + "ce_loss_17": 3.1554117798805237, + "ce_loss_2": 4.193833374977112, + "ce_loss_4": 3.743336892127991, + "ce_loss_9": 3.3571459889411925, + "epoch": 0.825, + "grad_norm": 1048.0, + "kl_loss_13": 104.56061019897462, + "kl_loss_2": 2213.5382385253906, + "kl_loss_4": 1323.4664733886718, + "kl_loss_9": 463.72909088134764, + "learning_rate": 7.513728502524286e-05, + "loss": 1011.0385, + "step": 8250 + }, + { + "ce_loss_13": 3.2062904238700867, + "ce_loss_17": 3.1627049922943113, + "ce_loss_2": 4.146057713031769, + "ce_loss_4": 3.7188182711601256, + "ce_loss_9": 3.3453433990478514, + "epoch": 0.826, + "grad_norm": 1032.0, + "kl_loss_13": 97.80841865539551, + "kl_loss_2": 2092.534802246094, + "kl_loss_4": 1247.9206176757812, + "kl_loss_9": 436.4948669433594, + "learning_rate": 7.430289649152156e-05, + "loss": 988.1217, + "step": 8260 + }, + { + "ce_loss_13": 3.113608407974243, + "ce_loss_17": 3.0663789033889772, + "ce_loss_2": 4.106975650787353, + "ce_loss_4": 3.664884555339813, + "ce_loss_9": 3.2727004051208497, + "epoch": 0.827, + "grad_norm": 1064.0, + "kl_loss_13": 102.66676864624023, + "kl_loss_2": 2228.8986633300783, + "kl_loss_4": 1342.4717102050781, + "kl_loss_9": 466.7908660888672, + "learning_rate": 7.347279472290646e-05, + "loss": 999.7191, + "step": 8270 + }, + { + "ce_loss_13": 3.2472809553146362, + "ce_loss_17": 3.2006909966468813, + "ce_loss_2": 4.212896692752838, + "ce_loss_4": 3.7773235201835633, + "ce_loss_9": 3.3952941298484802, + "epoch": 0.828, + "grad_norm": 1096.0, + "kl_loss_13": 101.43166809082031, + "kl_loss_2": 2145.3037536621096, + "kl_loss_4": 1285.9737915039063, + "kl_loss_9": 452.28861541748046, + "learning_rate": 7.264698807851328e-05, + "loss": 999.7668, + "step": 8280 + }, + { + "ce_loss_13": 3.2246126413345335, + "ce_loss_17": 3.1786208152770996, + "ce_loss_2": 4.147958254814148, + "ce_loss_4": 3.7270422101020815, + "ce_loss_9": 3.364509701728821, + "epoch": 0.829, + "grad_norm": 1004.0, + "kl_loss_13": 98.81834487915039, + "kl_loss_2": 2067.9016967773437, + "kl_loss_4": 1238.2277221679688, + "kl_loss_9": 437.84869995117185, + "learning_rate": 7.182548487420554e-05, + "loss": 977.3588, + "step": 8290 + }, + { + "ce_loss_13": 3.2685453057289124, + "ce_loss_17": 3.222314155101776, + "ce_loss_2": 4.208584856987, + "ce_loss_4": 3.780838119983673, + "ce_loss_9": 3.4178367972373964, + "epoch": 0.83, + "grad_norm": 852.0, + "kl_loss_13": 102.65170021057129, + "kl_loss_2": 2112.604736328125, + "kl_loss_4": 1263.031787109375, + "kl_loss_9": 451.32301940917966, + "learning_rate": 7.100829338251146e-05, + "loss": 979.1859, + "step": 8300 + }, + { + "ce_loss_13": 3.2056555271148683, + "ce_loss_17": 3.155162882804871, + "ce_loss_2": 4.182552087306976, + "ce_loss_4": 3.7506999731063844, + "ce_loss_9": 3.3552916407585145, + "epoch": 0.831, + "grad_norm": 1264.0, + "kl_loss_13": 104.76410217285157, + "kl_loss_2": 2169.3369567871096, + "kl_loss_4": 1312.7925537109375, + "kl_loss_9": 460.9786315917969, + "learning_rate": 7.019542183254046e-05, + "loss": 985.36, + "step": 8310 + }, + { + "ce_loss_13": 3.243871343135834, + "ce_loss_17": 3.1938986778259277, + "ce_loss_2": 4.1779109835624695, + "ce_loss_4": 3.754119002819061, + "ce_loss_9": 3.391617405414581, + "epoch": 0.832, + "grad_norm": 1096.0, + "kl_loss_13": 105.35637702941895, + "kl_loss_2": 2109.878955078125, + "kl_loss_4": 1270.0175659179688, + "kl_loss_9": 456.3101806640625, + "learning_rate": 6.938687840989971e-05, + "loss": 983.1326, + "step": 8320 + }, + { + "ce_loss_13": 3.1818163990974426, + "ce_loss_17": 3.1345704793930054, + "ce_loss_2": 4.140335583686829, + "ce_loss_4": 3.7112187266349794, + "ce_loss_9": 3.3336414337158202, + "epoch": 0.833, + "grad_norm": 1192.0, + "kl_loss_13": 103.37887229919434, + "kl_loss_2": 2119.5376098632814, + "kl_loss_4": 1276.9036376953125, + "kl_loss_9": 456.54979553222654, + "learning_rate": 6.858267125661271e-05, + "loss": 999.7768, + "step": 8330 + }, + { + "ce_loss_13": 3.2426349759101867, + "ce_loss_17": 3.193797743320465, + "ce_loss_2": 4.191949367523193, + "ce_loss_4": 3.7708911538124084, + "ce_loss_9": 3.3941858291625975, + "epoch": 0.834, + "grad_norm": 1376.0, + "kl_loss_13": 101.32920036315917, + "kl_loss_2": 2103.1502197265627, + "kl_loss_4": 1277.3342529296874, + "kl_loss_9": 449.2188217163086, + "learning_rate": 6.778280847103668e-05, + "loss": 1005.9226, + "step": 8340 + }, + { + "ce_loss_13": 3.24954993724823, + "ce_loss_17": 3.200397181510925, + "ce_loss_2": 4.186002504825592, + "ce_loss_4": 3.7711974024772643, + "ce_loss_9": 3.3984900951385497, + "epoch": 0.835, + "grad_norm": 956.0, + "kl_loss_13": 103.5113624572754, + "kl_loss_2": 2117.148028564453, + "kl_loss_4": 1289.7764892578125, + "kl_loss_9": 458.2400787353516, + "learning_rate": 6.698729810778065e-05, + "loss": 984.4787, + "step": 8350 + }, + { + "ce_loss_13": 3.1665493488311767, + "ce_loss_17": 3.117012989521027, + "ce_loss_2": 4.134637403488159, + "ce_loss_4": 3.699856436252594, + "ce_loss_9": 3.315209114551544, + "epoch": 0.836, + "grad_norm": 1152.0, + "kl_loss_13": 100.10919914245605, + "kl_loss_2": 2136.9033813476562, + "kl_loss_4": 1281.1259643554688, + "kl_loss_9": 451.45128631591797, + "learning_rate": 6.619614817762538e-05, + "loss": 995.9742, + "step": 8360 + }, + { + "ce_loss_13": 3.1353330612182617, + "ce_loss_17": 3.0841333150863646, + "ce_loss_2": 4.147654914855957, + "ce_loss_4": 3.692219114303589, + "ce_loss_9": 3.292687475681305, + "epoch": 0.837, + "grad_norm": 1248.0, + "kl_loss_13": 102.13751449584962, + "kl_loss_2": 2238.194476318359, + "kl_loss_4": 1341.3754089355468, + "kl_loss_9": 465.93928680419924, + "learning_rate": 6.540936664744196e-05, + "loss": 1012.2673, + "step": 8370 + }, + { + "ce_loss_13": 3.2731244802474975, + "ce_loss_17": 3.224391186237335, + "ce_loss_2": 4.23663512468338, + "ce_loss_4": 3.805148386955261, + "ce_loss_9": 3.421452081203461, + "epoch": 0.838, + "grad_norm": 964.0, + "kl_loss_13": 103.07422714233398, + "kl_loss_2": 2132.597546386719, + "kl_loss_4": 1284.0694641113282, + "kl_loss_9": 451.873030090332, + "learning_rate": 6.462696144011149e-05, + "loss": 982.847, + "step": 8380 + }, + { + "ce_loss_13": 3.225843274593353, + "ce_loss_17": 3.177779734134674, + "ce_loss_2": 4.162012577056885, + "ce_loss_4": 3.7491840720176697, + "ce_loss_9": 3.37238187789917, + "epoch": 0.839, + "grad_norm": 1056.0, + "kl_loss_13": 103.78873863220215, + "kl_loss_2": 2105.923205566406, + "kl_loss_4": 1290.6177490234375, + "kl_loss_9": 457.21540985107424, + "learning_rate": 6.384894043444567e-05, + "loss": 976.453, + "step": 8390 + }, + { + "ce_loss_13": 3.2544339060783387, + "ce_loss_17": 3.2068969011306763, + "ce_loss_2": 4.217712879180908, + "ce_loss_4": 3.779253029823303, + "ce_loss_9": 3.3995004177093504, + "epoch": 0.84, + "grad_norm": 1112.0, + "kl_loss_13": 102.47664489746094, + "kl_loss_2": 2130.4115600585938, + "kl_loss_4": 1284.221990966797, + "kl_loss_9": 453.3491180419922, + "learning_rate": 6.307531146510753e-05, + "loss": 983.9436, + "step": 8400 + }, + { + "ce_loss_13": 3.2285305142402647, + "ce_loss_17": 3.179581344127655, + "ce_loss_2": 4.155021214485169, + "ce_loss_4": 3.742681312561035, + "ce_loss_9": 3.3759974360466005, + "epoch": 0.841, + "grad_norm": 928.0, + "kl_loss_13": 102.02907791137696, + "kl_loss_2": 2060.980676269531, + "kl_loss_4": 1249.4063110351562, + "kl_loss_9": 446.5684753417969, + "learning_rate": 6.230608232253226e-05, + "loss": 962.4022, + "step": 8410 + }, + { + "ce_loss_13": 3.19055415391922, + "ce_loss_17": 3.1425411820411684, + "ce_loss_2": 4.185473334789276, + "ce_loss_4": 3.7423317432403564, + "ce_loss_9": 3.3456467151641847, + "epoch": 0.842, + "grad_norm": 1112.0, + "kl_loss_13": 103.11506080627441, + "kl_loss_2": 2199.642687988281, + "kl_loss_4": 1323.6161071777344, + "kl_loss_9": 466.18689117431643, + "learning_rate": 6.154126075284855e-05, + "loss": 989.1609, + "step": 8420 + }, + { + "ce_loss_13": 3.2846620202064516, + "ce_loss_17": 3.2368993878364565, + "ce_loss_2": 4.200677371025085, + "ce_loss_4": 3.793342816829681, + "ce_loss_9": 3.4248011231422426, + "epoch": 0.843, + "grad_norm": 992.0, + "kl_loss_13": 99.16912879943848, + "kl_loss_2": 2041.9186462402345, + "kl_loss_4": 1235.149951171875, + "kl_loss_9": 438.9051208496094, + "learning_rate": 6.078085445780129e-05, + "loss": 954.5255, + "step": 8430 + }, + { + "ce_loss_13": 3.2866493344306944, + "ce_loss_17": 3.24077650308609, + "ce_loss_2": 4.245067894458771, + "ce_loss_4": 3.8085723161697387, + "ce_loss_9": 3.4317312121391295, + "epoch": 0.844, + "grad_norm": 1520.0, + "kl_loss_13": 103.00012092590332, + "kl_loss_2": 2148.400012207031, + "kl_loss_4": 1279.3104125976563, + "kl_loss_9": 451.4021499633789, + "learning_rate": 6.002487109467347e-05, + "loss": 973.5395, + "step": 8440 + }, + { + "ce_loss_13": 3.2867504596710204, + "ce_loss_17": 3.2391888380050657, + "ce_loss_2": 4.2166420221328735, + "ce_loss_4": 3.8021249175071716, + "ce_loss_9": 3.4364368438720705, + "epoch": 0.845, + "grad_norm": 952.0, + "kl_loss_13": 104.41244621276856, + "kl_loss_2": 2097.564044189453, + "kl_loss_4": 1273.9579956054688, + "kl_loss_9": 461.5078826904297, + "learning_rate": 5.927331827620902e-05, + "loss": 975.9972, + "step": 8450 + }, + { + "ce_loss_13": 3.2792110085487365, + "ce_loss_17": 3.229340398311615, + "ce_loss_2": 4.177789902687072, + "ce_loss_4": 3.773702526092529, + "ce_loss_9": 3.4206575036048887, + "epoch": 0.846, + "grad_norm": 1072.0, + "kl_loss_13": 100.93832015991211, + "kl_loss_2": 2024.2252990722657, + "kl_loss_4": 1227.3581420898438, + "kl_loss_9": 442.4761108398437, + "learning_rate": 5.852620357053651e-05, + "loss": 970.7976, + "step": 8460 + }, + { + "ce_loss_13": 3.313369107246399, + "ce_loss_17": 3.2671704053878785, + "ce_loss_2": 4.231478774547577, + "ce_loss_4": 3.820213425159454, + "ce_loss_9": 3.456589865684509, + "epoch": 0.847, + "grad_norm": 1088.0, + "kl_loss_13": 99.90718040466308, + "kl_loss_2": 2059.377099609375, + "kl_loss_4": 1239.5423706054687, + "kl_loss_9": 442.8251693725586, + "learning_rate": 5.778353450109286e-05, + "loss": 971.7859, + "step": 8470 + }, + { + "ce_loss_13": 3.3482731223106383, + "ce_loss_17": 3.2992701530456543, + "ce_loss_2": 4.297941255569458, + "ce_loss_4": 3.868474280834198, + "ce_loss_9": 3.496585726737976, + "epoch": 0.848, + "grad_norm": 1048.0, + "kl_loss_13": 103.3643856048584, + "kl_loss_2": 2110.0072326660156, + "kl_loss_4": 1264.4103515625, + "kl_loss_9": 454.29853515625, + "learning_rate": 5.7045318546547206e-05, + "loss": 979.1016, + "step": 8480 + }, + { + "ce_loss_13": 3.2434808254241942, + "ce_loss_17": 3.1958300948143004, + "ce_loss_2": 4.206908857822418, + "ce_loss_4": 3.7733861327171327, + "ce_loss_9": 3.3931934595108033, + "epoch": 0.849, + "grad_norm": 932.0, + "kl_loss_13": 101.7448802947998, + "kl_loss_2": 2140.2457275390625, + "kl_loss_4": 1291.944073486328, + "kl_loss_9": 449.5452056884766, + "learning_rate": 5.631156314072605e-05, + "loss": 979.9654, + "step": 8490 + }, + { + "ce_loss_13": 3.2683048844337463, + "ce_loss_17": 3.2216777205467224, + "ce_loss_2": 4.183839499950409, + "ce_loss_4": 3.76673846244812, + "ce_loss_9": 3.4118457555770876, + "epoch": 0.85, + "grad_norm": 1012.0, + "kl_loss_13": 100.51690673828125, + "kl_loss_2": 2051.498669433594, + "kl_loss_4": 1235.3427124023438, + "kl_loss_9": 437.6060394287109, + "learning_rate": 5.5582275672538315e-05, + "loss": 965.0143, + "step": 8500 + }, + { + "ce_loss_13": 3.1779184103012086, + "ce_loss_17": 3.1292684674263, + "ce_loss_2": 4.1969388961792, + "ce_loss_4": 3.7422746419906616, + "ce_loss_9": 3.3321661353111267, + "epoch": 0.851, + "grad_norm": 1112.0, + "kl_loss_13": 104.94289093017578, + "kl_loss_2": 2237.122448730469, + "kl_loss_4": 1343.809259033203, + "kl_loss_9": 465.33240966796876, + "learning_rate": 5.4857463485900484e-05, + "loss": 1010.6073, + "step": 8510 + }, + { + "ce_loss_13": 3.239231359958649, + "ce_loss_17": 3.1926878809928896, + "ce_loss_2": 4.171153104305267, + "ce_loss_4": 3.754164123535156, + "ce_loss_9": 3.385264503955841, + "epoch": 0.852, + "grad_norm": 1040.0, + "kl_loss_13": 100.37816047668457, + "kl_loss_2": 2091.125354003906, + "kl_loss_4": 1262.6582275390624, + "kl_loss_9": 448.4439956665039, + "learning_rate": 5.413713387966329e-05, + "loss": 977.2996, + "step": 8520 + }, + { + "ce_loss_13": 3.2544985890388487, + "ce_loss_17": 3.2055798292160036, + "ce_loss_2": 4.213623034954071, + "ce_loss_4": 3.778373920917511, + "ce_loss_9": 3.4003939032554626, + "epoch": 0.853, + "grad_norm": 1104.0, + "kl_loss_13": 102.68244323730468, + "kl_loss_2": 2136.201379394531, + "kl_loss_4": 1282.6224853515625, + "kl_loss_9": 451.0315628051758, + "learning_rate": 5.34212941075381e-05, + "loss": 989.5346, + "step": 8530 + }, + { + "ce_loss_13": 3.2683287382125856, + "ce_loss_17": 3.223601984977722, + "ce_loss_2": 4.185144472122192, + "ce_loss_4": 3.7635194420814515, + "ce_loss_9": 3.4065766453742983, + "epoch": 0.854, + "grad_norm": 940.0, + "kl_loss_13": 100.08591079711914, + "kl_loss_2": 2060.322967529297, + "kl_loss_4": 1228.7866821289062, + "kl_loss_9": 431.9791000366211, + "learning_rate": 5.270995137802315e-05, + "loss": 966.8654, + "step": 8540 + }, + { + "ce_loss_13": 3.2042253613471985, + "ce_loss_17": 3.1586084365844727, + "ce_loss_2": 4.148164641857147, + "ce_loss_4": 3.7191488981246947, + "ce_loss_9": 3.34843966960907, + "epoch": 0.855, + "grad_norm": 980.0, + "kl_loss_13": 99.83077392578124, + "kl_loss_2": 2122.355340576172, + "kl_loss_4": 1270.1538818359375, + "kl_loss_9": 447.014338684082, + "learning_rate": 5.2003112854332125e-05, + "loss": 989.5109, + "step": 8550 + }, + { + "ce_loss_13": 3.2019141554832458, + "ce_loss_17": 3.1565523505210877, + "ce_loss_2": 4.132523250579834, + "ce_loss_4": 3.7114076137542726, + "ce_loss_9": 3.343767786026001, + "epoch": 0.856, + "grad_norm": 1020.0, + "kl_loss_13": 99.79981956481933, + "kl_loss_2": 2101.3156677246093, + "kl_loss_4": 1270.280987548828, + "kl_loss_9": 442.9563323974609, + "learning_rate": 5.130078565432089e-05, + "loss": 962.4086, + "step": 8560 + }, + { + "ce_loss_13": 3.268127417564392, + "ce_loss_17": 3.2234100341796874, + "ce_loss_2": 4.177941250801086, + "ce_loss_4": 3.7700358748435976, + "ce_loss_9": 3.4080262303352358, + "epoch": 0.857, + "grad_norm": 984.0, + "kl_loss_13": 98.53223495483398, + "kl_loss_2": 2064.2197692871096, + "kl_loss_4": 1248.5998779296874, + "kl_loss_9": 437.4849822998047, + "learning_rate": 5.060297685041659e-05, + "loss": 955.3677, + "step": 8570 + }, + { + "ce_loss_13": 3.200807571411133, + "ce_loss_17": 3.1519843697547913, + "ce_loss_2": 4.162305295467377, + "ce_loss_4": 3.728400158882141, + "ce_loss_9": 3.348930525779724, + "epoch": 0.858, + "grad_norm": 1112.0, + "kl_loss_13": 103.72225723266601, + "kl_loss_2": 2151.137109375, + "kl_loss_4": 1292.6536865234375, + "kl_loss_9": 455.32340545654296, + "learning_rate": 4.99096934695461e-05, + "loss": 1003.1525, + "step": 8580 + }, + { + "ce_loss_13": 3.256711792945862, + "ce_loss_17": 3.209336316585541, + "ce_loss_2": 4.203397381305694, + "ce_loss_4": 3.7791913390159606, + "ce_loss_9": 3.40111049413681, + "epoch": 0.859, + "grad_norm": 1012.0, + "kl_loss_13": 100.52773323059083, + "kl_loss_2": 2103.793536376953, + "kl_loss_4": 1264.7725708007813, + "kl_loss_9": 447.57261962890624, + "learning_rate": 4.922094249306558e-05, + "loss": 967.135, + "step": 8590 + }, + { + "ce_loss_13": 3.2862537622451784, + "ce_loss_17": 3.2389361023902894, + "ce_loss_2": 4.2373674511909485, + "ce_loss_4": 3.8063544273376464, + "ce_loss_9": 3.4311780095100404, + "epoch": 0.86, + "grad_norm": 1344.0, + "kl_loss_13": 103.32524604797364, + "kl_loss_2": 2123.043212890625, + "kl_loss_4": 1269.6117736816407, + "kl_loss_9": 452.6595977783203, + "learning_rate": 4.853673085668947e-05, + "loss": 964.0638, + "step": 8600 + }, + { + "ce_loss_13": 3.307164764404297, + "ce_loss_17": 3.259320616722107, + "ce_loss_2": 4.248125827312469, + "ce_loss_4": 3.8175243616104124, + "ce_loss_9": 3.452924072742462, + "epoch": 0.861, + "grad_norm": 1104.0, + "kl_loss_13": 101.06617546081543, + "kl_loss_2": 2109.3964294433595, + "kl_loss_4": 1258.0895751953126, + "kl_loss_9": 443.64978942871096, + "learning_rate": 4.78570654504214e-05, + "loss": 982.8816, + "step": 8610 + }, + { + "ce_loss_13": 3.2527210235595705, + "ce_loss_17": 3.2034615874290466, + "ce_loss_2": 4.200418400764465, + "ce_loss_4": 3.777254056930542, + "ce_loss_9": 3.3991082191467283, + "epoch": 0.862, + "grad_norm": 1144.0, + "kl_loss_13": 100.54085197448731, + "kl_loss_2": 2125.900250244141, + "kl_loss_4": 1284.1296875, + "kl_loss_9": 450.4908782958984, + "learning_rate": 4.7181953118484556e-05, + "loss": 985.0406, + "step": 8620 + }, + { + "ce_loss_13": 3.2747682452201845, + "ce_loss_17": 3.229109525680542, + "ce_loss_2": 4.2062456727027895, + "ce_loss_4": 3.7882620334625243, + "ce_loss_9": 3.4201944351196287, + "epoch": 0.863, + "grad_norm": 1056.0, + "kl_loss_13": 100.08700332641601, + "kl_loss_2": 2045.2134948730468, + "kl_loss_4": 1243.1802307128905, + "kl_loss_9": 439.36628875732424, + "learning_rate": 4.651140065925269e-05, + "loss": 988.7731, + "step": 8630 + }, + { + "ce_loss_13": 3.214710056781769, + "ce_loss_17": 3.1660316228866576, + "ce_loss_2": 4.149816310405731, + "ce_loss_4": 3.7266596436500548, + "ce_loss_9": 3.358921194076538, + "epoch": 0.864, + "grad_norm": 1112.0, + "kl_loss_13": 101.1618537902832, + "kl_loss_2": 2112.379748535156, + "kl_loss_4": 1267.4575988769532, + "kl_loss_9": 448.4175231933594, + "learning_rate": 4.58454148251814e-05, + "loss": 988.7574, + "step": 8640 + }, + { + "ce_loss_13": 3.227523624897003, + "ce_loss_17": 3.1797904133796693, + "ce_loss_2": 4.200765824317932, + "ce_loss_4": 3.7670172810554505, + "ce_loss_9": 3.373416578769684, + "epoch": 0.865, + "grad_norm": 952.0, + "kl_loss_13": 101.38066825866699, + "kl_loss_2": 2151.5326965332033, + "kl_loss_4": 1297.120867919922, + "kl_loss_9": 448.66112060546874, + "learning_rate": 4.518400232274078e-05, + "loss": 983.0388, + "step": 8650 + }, + { + "ce_loss_13": 3.2499554991722106, + "ce_loss_17": 3.200919044017792, + "ce_loss_2": 4.193244934082031, + "ce_loss_4": 3.7635411620140076, + "ce_loss_9": 3.3950576543807984, + "epoch": 0.866, + "grad_norm": 1192.0, + "kl_loss_13": 101.92468910217285, + "kl_loss_2": 2091.9731689453124, + "kl_loss_4": 1255.2327575683594, + "kl_loss_9": 445.1644973754883, + "learning_rate": 4.452716981234745e-05, + "loss": 952.505, + "step": 8660 + }, + { + "ce_loss_13": 3.23288733959198, + "ce_loss_17": 3.185954523086548, + "ce_loss_2": 4.1606947660446165, + "ce_loss_4": 3.733210062980652, + "ce_loss_9": 3.3727546095848084, + "epoch": 0.867, + "grad_norm": 920.0, + "kl_loss_13": 98.55878868103028, + "kl_loss_2": 2086.489959716797, + "kl_loss_4": 1254.2104919433593, + "kl_loss_9": 443.2176208496094, + "learning_rate": 4.3874923908297335e-05, + "loss": 956.8958, + "step": 8670 + }, + { + "ce_loss_13": 3.276777219772339, + "ce_loss_17": 3.228739786148071, + "ce_loss_2": 4.231963181495667, + "ce_loss_4": 3.795181357860565, + "ce_loss_9": 3.422040593624115, + "epoch": 0.868, + "grad_norm": 856.0, + "kl_loss_13": 103.66386680603027, + "kl_loss_2": 2135.990856933594, + "kl_loss_4": 1280.0503356933593, + "kl_loss_9": 452.3242919921875, + "learning_rate": 4.322727117869951e-05, + "loss": 981.0543, + "step": 8680 + }, + { + "ce_loss_13": 3.2885834217071532, + "ce_loss_17": 3.2391077399253847, + "ce_loss_2": 4.238253498077393, + "ce_loss_4": 3.800091791152954, + "ce_loss_9": 3.430259144306183, + "epoch": 0.869, + "grad_norm": 1248.0, + "kl_loss_13": 102.45274505615234, + "kl_loss_2": 2136.7294494628904, + "kl_loss_4": 1274.0455200195313, + "kl_loss_9": 450.96560821533205, + "learning_rate": 4.2584218145409916e-05, + "loss": 976.5607, + "step": 8690 + }, + { + "ce_loss_13": 3.3277555465698243, + "ce_loss_17": 3.282154953479767, + "ce_loss_2": 4.228168916702271, + "ce_loss_4": 3.817222785949707, + "ce_loss_9": 3.465859019756317, + "epoch": 0.87, + "grad_norm": 1216.0, + "kl_loss_13": 99.73889389038087, + "kl_loss_2": 2035.5429443359376, + "kl_loss_4": 1225.6856811523437, + "kl_loss_9": 436.75442962646486, + "learning_rate": 4.194577128396521e-05, + "loss": 952.7639, + "step": 8700 + }, + { + "ce_loss_13": 3.2104090809822083, + "ce_loss_17": 3.1636773347854614, + "ce_loss_2": 4.1500626564025875, + "ce_loss_4": 3.719528067111969, + "ce_loss_9": 3.352318322658539, + "epoch": 0.871, + "grad_norm": 964.0, + "kl_loss_13": 99.25862579345703, + "kl_loss_2": 2109.141259765625, + "kl_loss_4": 1257.7720520019532, + "kl_loss_9": 437.88904876708983, + "learning_rate": 4.1311937023518264e-05, + "loss": 987.9172, + "step": 8710 + }, + { + "ce_loss_13": 3.232139790058136, + "ce_loss_17": 3.1878427386283876, + "ce_loss_2": 4.208202242851257, + "ce_loss_4": 3.74212726354599, + "ce_loss_9": 3.3677379846572877, + "epoch": 0.872, + "grad_norm": 948.0, + "kl_loss_13": 97.73515968322754, + "kl_loss_2": 2177.1430725097657, + "kl_loss_4": 1257.6459411621095, + "kl_loss_9": 431.63305206298827, + "learning_rate": 4.0682721746773344e-05, + "loss": 976.3083, + "step": 8720 + }, + { + "ce_loss_13": 3.0979220867156982, + "ce_loss_17": 3.0500224947929384, + "ce_loss_2": 4.091311717033387, + "ce_loss_4": 3.6451772093772887, + "ce_loss_9": 3.248476564884186, + "epoch": 0.873, + "grad_norm": 1160.0, + "kl_loss_13": 100.0238193511963, + "kl_loss_2": 2175.1422424316406, + "kl_loss_4": 1303.4404541015624, + "kl_loss_9": 451.34252014160154, + "learning_rate": 4.0058131789920904e-05, + "loss": 973.1121, + "step": 8730 + }, + { + "ce_loss_13": 3.2482882857322695, + "ce_loss_17": 3.202499210834503, + "ce_loss_2": 4.176600193977356, + "ce_loss_4": 3.761054527759552, + "ce_loss_9": 3.391435515880585, + "epoch": 0.874, + "grad_norm": 1008.0, + "kl_loss_13": 99.88175621032715, + "kl_loss_2": 2095.143420410156, + "kl_loss_4": 1267.2463134765626, + "kl_loss_9": 442.6323501586914, + "learning_rate": 3.9438173442575e-05, + "loss": 1003.0488, + "step": 8740 + }, + { + "ce_loss_13": 3.2781424641609194, + "ce_loss_17": 3.231118679046631, + "ce_loss_2": 4.198352777957917, + "ce_loss_4": 3.7846031308174135, + "ce_loss_9": 3.4244203686714174, + "epoch": 0.875, + "grad_norm": 1240.0, + "kl_loss_13": 100.2481674194336, + "kl_loss_2": 2051.8716064453124, + "kl_loss_4": 1242.6941345214843, + "kl_loss_9": 441.20912017822263, + "learning_rate": 3.882285294770937e-05, + "loss": 967.37, + "step": 8750 + }, + { + "ce_loss_13": 3.2420179724693297, + "ce_loss_17": 3.195972001552582, + "ce_loss_2": 4.15534234046936, + "ce_loss_4": 3.7425065636634827, + "ce_loss_9": 3.3827720284461975, + "epoch": 0.876, + "grad_norm": 984.0, + "kl_loss_13": 100.61825942993164, + "kl_loss_2": 2070.0154235839846, + "kl_loss_4": 1249.9717712402344, + "kl_loss_9": 445.3143737792969, + "learning_rate": 3.821217650159453e-05, + "loss": 981.2265, + "step": 8760 + }, + { + "ce_loss_13": 3.116419792175293, + "ce_loss_17": 3.066071403026581, + "ce_loss_2": 4.10679612159729, + "ce_loss_4": 3.67021906375885, + "ce_loss_9": 3.2722548842430115, + "epoch": 0.877, + "grad_norm": 1184.0, + "kl_loss_13": 101.89100036621093, + "kl_loss_2": 2178.528942871094, + "kl_loss_4": 1320.5689880371094, + "kl_loss_9": 458.6724548339844, + "learning_rate": 3.760615025373543e-05, + "loss": 995.7758, + "step": 8770 + }, + { + "ce_loss_13": 3.292972540855408, + "ce_loss_17": 3.2431691765785216, + "ce_loss_2": 4.255655467510223, + "ce_loss_4": 3.819663631916046, + "ce_loss_9": 3.440823757648468, + "epoch": 0.878, + "grad_norm": 968.0, + "kl_loss_13": 105.39409523010254, + "kl_loss_2": 2152.3751708984373, + "kl_loss_4": 1283.293798828125, + "kl_loss_9": 455.04277954101565, + "learning_rate": 3.700478030680987e-05, + "loss": 1000.5841, + "step": 8780 + }, + { + "ce_loss_13": 3.284103202819824, + "ce_loss_17": 3.238202524185181, + "ce_loss_2": 4.224376666545868, + "ce_loss_4": 3.8014718770980833, + "ce_loss_9": 3.4295626521110534, + "epoch": 0.879, + "grad_norm": 1040.0, + "kl_loss_13": 99.58377304077149, + "kl_loss_2": 2097.2333557128904, + "kl_loss_4": 1255.6708190917968, + "kl_loss_9": 444.42860717773436, + "learning_rate": 3.6408072716606344e-05, + "loss": 971.6826, + "step": 8790 + }, + { + "ce_loss_13": 3.2096526980400086, + "ce_loss_17": 3.162070095539093, + "ce_loss_2": 4.180876076221466, + "ce_loss_4": 3.743834745883942, + "ce_loss_9": 3.360402011871338, + "epoch": 0.88, + "grad_norm": 896.0, + "kl_loss_13": 102.0367359161377, + "kl_loss_2": 2163.858605957031, + "kl_loss_4": 1304.3258178710937, + "kl_loss_9": 455.8943267822266, + "learning_rate": 3.5816033491963716e-05, + "loss": 1010.3639, + "step": 8800 + }, + { + "ce_loss_13": 3.0727545380592347, + "ce_loss_17": 3.0262926697731016, + "ce_loss_2": 4.081104552745819, + "ce_loss_4": 3.6180320024490356, + "ce_loss_9": 3.222576153278351, + "epoch": 0.881, + "grad_norm": 1832.0, + "kl_loss_13": 99.94013404846191, + "kl_loss_2": 2215.870013427734, + "kl_loss_4": 1307.6419799804687, + "kl_loss_9": 447.7231246948242, + "learning_rate": 3.522866859471047e-05, + "loss": 992.8438, + "step": 8810 + }, + { + "ce_loss_13": 3.303645062446594, + "ce_loss_17": 3.2592533230781555, + "ce_loss_2": 4.196143448352814, + "ce_loss_4": 3.7870477437973022, + "ce_loss_9": 3.4373404741287232, + "epoch": 0.882, + "grad_norm": 1328.0, + "kl_loss_13": 97.36594696044922, + "kl_loss_2": 2003.4808166503906, + "kl_loss_4": 1201.0601501464844, + "kl_loss_9": 428.9521987915039, + "learning_rate": 3.46459839396045e-05, + "loss": 955.9329, + "step": 8820 + }, + { + "ce_loss_13": 3.2250730752944947, + "ce_loss_17": 3.175858938694, + "ce_loss_2": 4.18024787902832, + "ce_loss_4": 3.755022394657135, + "ce_loss_9": 3.3714071393013, + "epoch": 0.883, + "grad_norm": 848.0, + "kl_loss_13": 102.0469310760498, + "kl_loss_2": 2107.246759033203, + "kl_loss_4": 1281.4510375976563, + "kl_loss_9": 450.33849029541017, + "learning_rate": 3.406798539427386e-05, + "loss": 1004.4393, + "step": 8830 + }, + { + "ce_loss_13": 3.2856566429138185, + "ce_loss_17": 3.2401284098625185, + "ce_loss_2": 4.225487470626831, + "ce_loss_4": 3.7960302948951723, + "ce_loss_9": 3.424995946884155, + "epoch": 0.884, + "grad_norm": 1056.0, + "kl_loss_13": 101.03460884094238, + "kl_loss_2": 2124.606134033203, + "kl_loss_4": 1273.6157531738281, + "kl_loss_9": 447.2622955322266, + "learning_rate": 3.349467877915746e-05, + "loss": 981.8498, + "step": 8840 + }, + { + "ce_loss_13": 3.2454614520072935, + "ce_loss_17": 3.1977991461753845, + "ce_loss_2": 4.207997584342957, + "ce_loss_4": 3.7777808904647827, + "ce_loss_9": 3.3933525919914245, + "epoch": 0.885, + "grad_norm": 1024.0, + "kl_loss_13": 101.58301658630371, + "kl_loss_2": 2167.0832763671874, + "kl_loss_4": 1303.2544982910156, + "kl_loss_9": 455.3290328979492, + "learning_rate": 3.292606986744667e-05, + "loss": 1018.7739, + "step": 8850 + }, + { + "ce_loss_13": 3.2007771253585817, + "ce_loss_17": 3.1567315340042112, + "ce_loss_2": 4.155295968055725, + "ce_loss_4": 3.721710276603699, + "ce_loss_9": 3.3420563459396364, + "epoch": 0.886, + "grad_norm": 936.0, + "kl_loss_13": 99.32695350646972, + "kl_loss_2": 2132.0983337402345, + "kl_loss_4": 1280.4623168945313, + "kl_loss_9": 445.7021026611328, + "learning_rate": 3.23621643850267e-05, + "loss": 984.1855, + "step": 8860 + }, + { + "ce_loss_13": 3.273322248458862, + "ce_loss_17": 3.2260786294937134, + "ce_loss_2": 4.210953342914581, + "ce_loss_4": 3.787678599357605, + "ce_loss_9": 3.4195808291435243, + "epoch": 0.887, + "grad_norm": 1224.0, + "kl_loss_13": 103.04633750915528, + "kl_loss_2": 2106.0673217773438, + "kl_loss_4": 1267.906756591797, + "kl_loss_9": 452.71812591552737, + "learning_rate": 3.180296801041971e-05, + "loss": 966.0686, + "step": 8870 + }, + { + "ce_loss_13": 3.296452188491821, + "ce_loss_17": 3.250930941104889, + "ce_loss_2": 4.246974968910218, + "ce_loss_4": 3.802563285827637, + "ce_loss_9": 3.4385242104530334, + "epoch": 0.888, + "grad_norm": 908.0, + "kl_loss_13": 100.42105331420899, + "kl_loss_2": 2120.5658447265623, + "kl_loss_4": 1252.4359802246095, + "kl_loss_9": 439.27488708496094, + "learning_rate": 3.124848637472688e-05, + "loss": 960.566, + "step": 8880 + }, + { + "ce_loss_13": 3.1211246728897093, + "ce_loss_17": 3.0743814945220946, + "ce_loss_2": 4.085460984706879, + "ce_loss_4": 3.6526371717453, + "ce_loss_9": 3.2656445264816285, + "epoch": 0.889, + "grad_norm": 1320.0, + "kl_loss_13": 98.08441047668457, + "kl_loss_2": 2142.9528198242188, + "kl_loss_4": 1286.9167236328126, + "kl_loss_9": 444.549055480957, + "learning_rate": 3.069872506157212e-05, + "loss": 979.7434, + "step": 8890 + }, + { + "ce_loss_13": 3.2229220390319826, + "ce_loss_17": 3.17815101146698, + "ce_loss_2": 4.168483769893646, + "ce_loss_4": 3.735355567932129, + "ce_loss_9": 3.368432331085205, + "epoch": 0.89, + "grad_norm": 1456.0, + "kl_loss_13": 100.03944244384766, + "kl_loss_2": 2111.732977294922, + "kl_loss_4": 1256.306622314453, + "kl_loss_9": 444.6061477661133, + "learning_rate": 3.0153689607045842e-05, + "loss": 974.7032, + "step": 8900 + }, + { + "ce_loss_13": 3.1256988406181336, + "ce_loss_17": 3.076783847808838, + "ce_loss_2": 4.143813717365265, + "ce_loss_4": 3.681407403945923, + "ce_loss_9": 3.2804441452026367, + "epoch": 0.891, + "grad_norm": 1040.0, + "kl_loss_13": 104.21717376708985, + "kl_loss_2": 2270.210107421875, + "kl_loss_4": 1353.6390441894532, + "kl_loss_9": 466.60550384521486, + "learning_rate": 2.9613385499648926e-05, + "loss": 996.9842, + "step": 8910 + }, + { + "ce_loss_13": 3.173978066444397, + "ce_loss_17": 3.1283139824867248, + "ce_loss_2": 4.11392525434494, + "ce_loss_4": 3.698298728466034, + "ce_loss_9": 3.3246174693107604, + "epoch": 0.892, + "grad_norm": 944.0, + "kl_loss_13": 100.05176887512206, + "kl_loss_2": 2078.5159606933594, + "kl_loss_4": 1262.536114501953, + "kl_loss_9": 442.68208770751954, + "learning_rate": 2.9077818180237692e-05, + "loss": 977.9584, + "step": 8920 + }, + { + "ce_loss_13": 3.2222016572952272, + "ce_loss_17": 3.173809790611267, + "ce_loss_2": 4.189061617851257, + "ce_loss_4": 3.758192002773285, + "ce_loss_9": 3.3684428095817567, + "epoch": 0.893, + "grad_norm": 1920.0, + "kl_loss_13": 100.49244918823243, + "kl_loss_2": 2120.1018798828127, + "kl_loss_4": 1276.4442626953125, + "kl_loss_9": 442.34128723144534, + "learning_rate": 2.8546993041969172e-05, + "loss": 978.0939, + "step": 8930 + }, + { + "ce_loss_13": 3.2594022035598753, + "ce_loss_17": 3.2134727001190186, + "ce_loss_2": 4.1672776222229, + "ce_loss_4": 3.758451211452484, + "ce_loss_9": 3.399921548366547, + "epoch": 0.894, + "grad_norm": 928.0, + "kl_loss_13": 99.06856918334961, + "kl_loss_2": 2047.678240966797, + "kl_loss_4": 1240.7626708984376, + "kl_loss_9": 438.21772918701174, + "learning_rate": 2.802091543024671e-05, + "loss": 973.7527, + "step": 8940 + }, + { + "ce_loss_13": 3.2544695258140566, + "ce_loss_17": 3.2096627593040465, + "ce_loss_2": 4.212161219120025, + "ce_loss_4": 3.780433785915375, + "ce_loss_9": 3.4017921686172485, + "epoch": 0.895, + "grad_norm": 996.0, + "kl_loss_13": 101.25610389709473, + "kl_loss_2": 2148.6890625, + "kl_loss_4": 1294.2809143066406, + "kl_loss_9": 449.90872650146486, + "learning_rate": 2.7499590642665774e-05, + "loss": 1007.6534, + "step": 8950 + }, + { + "ce_loss_13": 3.2645738244056703, + "ce_loss_17": 3.215935003757477, + "ce_loss_2": 4.196627688407898, + "ce_loss_4": 3.7690383076667784, + "ce_loss_9": 3.402366030216217, + "epoch": 0.896, + "grad_norm": 1032.0, + "kl_loss_13": 104.42727851867676, + "kl_loss_2": 2096.6643920898437, + "kl_loss_4": 1248.980078125, + "kl_loss_9": 445.657373046875, + "learning_rate": 2.6983023928961405e-05, + "loss": 968.7039, + "step": 8960 + }, + { + "ce_loss_13": 3.2358240604400637, + "ce_loss_17": 3.1873093128204344, + "ce_loss_2": 4.178699564933777, + "ce_loss_4": 3.7612051129341126, + "ce_loss_9": 3.384795296192169, + "epoch": 0.897, + "grad_norm": 920.0, + "kl_loss_13": 100.57173652648926, + "kl_loss_2": 2089.2233459472654, + "kl_loss_4": 1261.3327880859374, + "kl_loss_9": 444.78974609375, + "learning_rate": 2.6471220490954628e-05, + "loss": 988.0617, + "step": 8970 + }, + { + "ce_loss_13": 3.229248917102814, + "ce_loss_17": 3.183977997303009, + "ce_loss_2": 4.164087510108947, + "ce_loss_4": 3.733451807498932, + "ce_loss_9": 3.367630994319916, + "epoch": 0.898, + "grad_norm": 856.0, + "kl_loss_13": 99.83658943176269, + "kl_loss_2": 2104.553125, + "kl_loss_4": 1254.7056030273438, + "kl_loss_9": 440.1147705078125, + "learning_rate": 2.596418548250029e-05, + "loss": 981.8357, + "step": 8980 + }, + { + "ce_loss_13": 3.2640267848968505, + "ce_loss_17": 3.217205274105072, + "ce_loss_2": 4.190861165523529, + "ce_loss_4": 3.7741482257843018, + "ce_loss_9": 3.4099146366119384, + "epoch": 0.899, + "grad_norm": 936.0, + "kl_loss_13": 102.8364444732666, + "kl_loss_2": 2108.3700744628904, + "kl_loss_4": 1272.7893798828125, + "kl_loss_9": 451.96382598876954, + "learning_rate": 2.5461924009435368e-05, + "loss": 969.4295, + "step": 8990 + }, + { + "ce_loss_13": 3.2565832257270815, + "ce_loss_17": 3.2091624259948732, + "ce_loss_2": 4.195440435409546, + "ce_loss_4": 3.7732958793640137, + "ce_loss_9": 3.3987311244010927, + "epoch": 0.9, + "grad_norm": 992.0, + "kl_loss_13": 102.18106269836426, + "kl_loss_2": 2092.73095703125, + "kl_loss_4": 1259.2377136230468, + "kl_loss_9": 445.48841247558596, + "learning_rate": 2.4964441129527336e-05, + "loss": 992.3745, + "step": 9000 + }, + { + "ce_loss_13": 3.2569509148597717, + "ce_loss_17": 3.2118582010269163, + "ce_loss_2": 4.169191956520081, + "ce_loss_4": 3.7541915655136107, + "ce_loss_9": 3.395655703544617, + "epoch": 0.901, + "grad_norm": 1012.0, + "kl_loss_13": 99.25920944213867, + "kl_loss_2": 2053.3325317382814, + "kl_loss_4": 1231.8896118164062, + "kl_loss_9": 435.1133316040039, + "learning_rate": 2.4471741852423235e-05, + "loss": 959.1539, + "step": 9010 + }, + { + "ce_loss_13": 3.304662549495697, + "ce_loss_17": 3.258540964126587, + "ce_loss_2": 4.236178302764893, + "ce_loss_4": 3.8176496982574464, + "ce_loss_9": 3.450073802471161, + "epoch": 0.902, + "grad_norm": 904.0, + "kl_loss_13": 100.67208137512208, + "kl_loss_2": 2058.0515625, + "kl_loss_4": 1245.5396606445313, + "kl_loss_9": 440.82079925537107, + "learning_rate": 2.3983831139599287e-05, + "loss": 966.7086, + "step": 9020 + }, + { + "ce_loss_13": 3.2276505708694456, + "ce_loss_17": 3.1815640449523928, + "ce_loss_2": 4.158493387699127, + "ce_loss_4": 3.7346356391906737, + "ce_loss_9": 3.367464518547058, + "epoch": 0.903, + "grad_norm": 860.0, + "kl_loss_13": 98.8610065460205, + "kl_loss_2": 2072.151226806641, + "kl_loss_4": 1239.0315246582031, + "kl_loss_9": 432.47070770263673, + "learning_rate": 2.3500713904311022e-05, + "loss": 948.4061, + "step": 9030 + }, + { + "ce_loss_13": 3.2644612789154053, + "ce_loss_17": 3.2201750755310057, + "ce_loss_2": 4.1670746326446535, + "ce_loss_4": 3.7572099208831786, + "ce_loss_9": 3.402985489368439, + "epoch": 0.904, + "grad_norm": 852.0, + "kl_loss_13": 98.05723876953125, + "kl_loss_2": 2014.7668212890626, + "kl_loss_4": 1206.054266357422, + "kl_loss_9": 428.4418411254883, + "learning_rate": 2.3022395011543685e-05, + "loss": 946.4781, + "step": 9040 + }, + { + "ce_loss_13": 3.291104483604431, + "ce_loss_17": 3.243395411968231, + "ce_loss_2": 4.226366579532623, + "ce_loss_4": 3.808375656604767, + "ce_loss_9": 3.442223596572876, + "epoch": 0.905, + "grad_norm": 880.0, + "kl_loss_13": 103.13118743896484, + "kl_loss_2": 2099.974890136719, + "kl_loss_4": 1271.4048156738281, + "kl_loss_9": 454.81058197021486, + "learning_rate": 2.2548879277963063e-05, + "loss": 996.6263, + "step": 9050 + }, + { + "ce_loss_13": 3.209779965877533, + "ce_loss_17": 3.1624715328216553, + "ce_loss_2": 4.140535140037537, + "ce_loss_4": 3.715310525894165, + "ce_loss_9": 3.3525261998176576, + "epoch": 0.906, + "grad_norm": 924.0, + "kl_loss_13": 99.56582946777344, + "kl_loss_2": 2075.5917907714843, + "kl_loss_4": 1244.274395751953, + "kl_loss_9": 438.52172393798827, + "learning_rate": 2.208017147186736e-05, + "loss": 948.4633, + "step": 9060 + }, + { + "ce_loss_13": 3.2073147773742674, + "ce_loss_17": 3.1594581961631776, + "ce_loss_2": 4.143043243885041, + "ce_loss_4": 3.7249698638916016, + "ce_loss_9": 3.350212013721466, + "epoch": 0.907, + "grad_norm": 880.0, + "kl_loss_13": 100.06279525756835, + "kl_loss_2": 2091.8215209960936, + "kl_loss_4": 1263.5703552246093, + "kl_loss_9": 443.9049774169922, + "learning_rate": 2.1616276313139227e-05, + "loss": 966.5273, + "step": 9070 + }, + { + "ce_loss_13": 3.2482252955436706, + "ce_loss_17": 3.20016747713089, + "ce_loss_2": 4.183684349060059, + "ce_loss_4": 3.764383327960968, + "ce_loss_9": 3.3882714748382567, + "epoch": 0.908, + "grad_norm": 976.0, + "kl_loss_13": 100.39715614318848, + "kl_loss_2": 2095.0434753417967, + "kl_loss_4": 1264.0903381347657, + "kl_loss_9": 442.2976638793945, + "learning_rate": 2.1157198473197415e-05, + "loss": 985.0625, + "step": 9080 + }, + { + "ce_loss_13": 3.304949402809143, + "ce_loss_17": 3.257016623020172, + "ce_loss_2": 4.250957870483399, + "ce_loss_4": 3.8193687200546265, + "ce_loss_9": 3.450898289680481, + "epoch": 0.909, + "grad_norm": 1112.0, + "kl_loss_13": 102.42249374389648, + "kl_loss_2": 2101.319580078125, + "kl_loss_4": 1259.2540954589845, + "kl_loss_9": 453.0595275878906, + "learning_rate": 2.0702942574950812e-05, + "loss": 977.3465, + "step": 9090 + }, + { + "ce_loss_13": 3.2333330273628236, + "ce_loss_17": 3.1846224784851076, + "ce_loss_2": 4.184793710708618, + "ce_loss_4": 3.758673644065857, + "ce_loss_9": 3.3803303837776184, + "epoch": 0.91, + "grad_norm": 980.0, + "kl_loss_13": 102.81347427368163, + "kl_loss_2": 2125.416369628906, + "kl_loss_4": 1282.364031982422, + "kl_loss_9": 454.6492172241211, + "learning_rate": 2.025351319275137e-05, + "loss": 983.3227, + "step": 9100 + }, + { + "ce_loss_13": 3.3592461705207826, + "ce_loss_17": 3.310008633136749, + "ce_loss_2": 4.293835592269898, + "ce_loss_4": 3.8722840547561646, + "ce_loss_9": 3.50608172416687, + "epoch": 0.911, + "grad_norm": 1008.0, + "kl_loss_13": 104.87370223999024, + "kl_loss_2": 2116.4436096191407, + "kl_loss_4": 1290.9220703125, + "kl_loss_9": 461.05602874755857, + "learning_rate": 1.9808914852347816e-05, + "loss": 1005.332, + "step": 9110 + }, + { + "ce_loss_13": 3.209620714187622, + "ce_loss_17": 3.160206663608551, + "ce_loss_2": 4.145595002174377, + "ce_loss_4": 3.734630060195923, + "ce_loss_9": 3.3529163718223574, + "epoch": 0.912, + "grad_norm": 1256.0, + "kl_loss_13": 100.62721633911133, + "kl_loss_2": 2073.555010986328, + "kl_loss_4": 1262.6597595214844, + "kl_loss_9": 444.98316497802733, + "learning_rate": 1.9369152030840554e-05, + "loss": 968.9119, + "step": 9120 + }, + { + "ce_loss_13": 3.286211574077606, + "ce_loss_17": 3.241570603847504, + "ce_loss_2": 4.228869473934173, + "ce_loss_4": 3.799513578414917, + "ce_loss_9": 3.425935518741608, + "epoch": 0.913, + "grad_norm": 1176.0, + "kl_loss_13": 100.5714500427246, + "kl_loss_2": 2118.401690673828, + "kl_loss_4": 1268.2083984375, + "kl_loss_9": 441.81104278564453, + "learning_rate": 1.893422915663645e-05, + "loss": 977.7996, + "step": 9130 + }, + { + "ce_loss_13": 3.159002256393433, + "ce_loss_17": 3.110749089717865, + "ce_loss_2": 4.14225801229477, + "ce_loss_4": 3.7038495779037475, + "ce_loss_9": 3.311249113082886, + "epoch": 0.914, + "grad_norm": 976.0, + "kl_loss_13": 101.4797233581543, + "kl_loss_2": 2172.3015258789064, + "kl_loss_4": 1310.25732421875, + "kl_loss_9": 455.07178497314453, + "learning_rate": 1.850415060940386e-05, + "loss": 998.324, + "step": 9140 + }, + { + "ce_loss_13": 3.2790889859199526, + "ce_loss_17": 3.235070991516113, + "ce_loss_2": 4.196420991420746, + "ce_loss_4": 3.7808594942092895, + "ce_loss_9": 3.4224207162857057, + "epoch": 0.915, + "grad_norm": 1144.0, + "kl_loss_13": 100.17375221252442, + "kl_loss_2": 2055.3639587402345, + "kl_loss_4": 1234.2696319580077, + "kl_loss_9": 443.00874481201174, + "learning_rate": 1.8078920720028978e-05, + "loss": 968.3445, + "step": 9150 + }, + { + "ce_loss_13": 3.203438234329224, + "ce_loss_17": 3.1604392290115357, + "ce_loss_2": 4.122346830368042, + "ce_loss_4": 3.712253046035767, + "ce_loss_9": 3.3432358503341675, + "epoch": 0.916, + "grad_norm": 988.0, + "kl_loss_13": 97.228857421875, + "kl_loss_2": 2044.9042419433595, + "kl_loss_4": 1239.07822265625, + "kl_loss_9": 434.1706939697266, + "learning_rate": 1.765854377057219e-05, + "loss": 979.9846, + "step": 9160 + }, + { + "ce_loss_13": 3.1822198033332825, + "ce_loss_17": 3.138648498058319, + "ce_loss_2": 4.112299299240112, + "ce_loss_4": 3.688192677497864, + "ce_loss_9": 3.327041435241699, + "epoch": 0.917, + "grad_norm": 1096.0, + "kl_loss_13": 97.4744026184082, + "kl_loss_2": 2077.7341247558593, + "kl_loss_4": 1236.8712280273437, + "kl_loss_9": 431.72461395263673, + "learning_rate": 1.724302399422456e-05, + "loss": 968.9422, + "step": 9170 + }, + { + "ce_loss_13": 3.150641131401062, + "ce_loss_17": 3.101999652385712, + "ce_loss_2": 4.0972420334815975, + "ce_loss_4": 3.663783919811249, + "ce_loss_9": 3.294815444946289, + "epoch": 0.918, + "grad_norm": 1040.0, + "kl_loss_13": 102.12319526672363, + "kl_loss_2": 2116.698699951172, + "kl_loss_4": 1270.1096984863282, + "kl_loss_9": 452.41722717285154, + "learning_rate": 1.683236557526574e-05, + "loss": 984.1377, + "step": 9180 + }, + { + "ce_loss_13": 3.2643605709075927, + "ce_loss_17": 3.2191934108734133, + "ce_loss_2": 4.162167167663574, + "ce_loss_4": 3.7516831874847414, + "ce_loss_9": 3.401525783538818, + "epoch": 0.919, + "grad_norm": 880.0, + "kl_loss_13": 97.641109085083, + "kl_loss_2": 2010.9575744628905, + "kl_loss_4": 1213.6159240722657, + "kl_loss_9": 427.9808639526367, + "learning_rate": 1.6426572649021475e-05, + "loss": 966.4651, + "step": 9190 + }, + { + "ce_loss_13": 3.2934301257133485, + "ce_loss_17": 3.247780239582062, + "ce_loss_2": 4.178894245624543, + "ce_loss_4": 3.776092267036438, + "ce_loss_9": 3.425630736351013, + "epoch": 0.92, + "grad_norm": 1136.0, + "kl_loss_13": 100.05005683898926, + "kl_loss_2": 2017.3282287597656, + "kl_loss_4": 1221.2224914550782, + "kl_loss_9": 433.5446258544922, + "learning_rate": 1.6025649301821876e-05, + "loss": 955.6616, + "step": 9200 + }, + { + "ce_loss_13": 3.2818232655525206, + "ce_loss_17": 3.2359335899353026, + "ce_loss_2": 4.18104716539383, + "ce_loss_4": 3.7753984689712525, + "ce_loss_9": 3.4274493932723997, + "epoch": 0.921, + "grad_norm": 1360.0, + "kl_loss_13": 100.95476760864258, + "kl_loss_2": 2038.7368774414062, + "kl_loss_4": 1239.1733337402343, + "kl_loss_9": 443.64212799072266, + "learning_rate": 1.5629599570960716e-05, + "loss": 956.5411, + "step": 9210 + }, + { + "ce_loss_13": 3.19571430683136, + "ce_loss_17": 3.1482695937156677, + "ce_loss_2": 4.135426080226898, + "ce_loss_4": 3.6969372153282167, + "ce_loss_9": 3.3368793725967407, + "epoch": 0.922, + "grad_norm": 1120.0, + "kl_loss_13": 99.02320823669433, + "kl_loss_2": 2120.2299377441404, + "kl_loss_4": 1258.3416809082032, + "kl_loss_9": 443.88822174072266, + "learning_rate": 1.5238427444654367e-05, + "loss": 973.0723, + "step": 9220 + }, + { + "ce_loss_13": 3.2446742057800293, + "ce_loss_17": 3.197295343875885, + "ce_loss_2": 4.1721936702728275, + "ce_loss_4": 3.747636079788208, + "ce_loss_9": 3.38888738155365, + "epoch": 0.923, + "grad_norm": 940.0, + "kl_loss_13": 100.02047500610351, + "kl_loss_2": 2069.615399169922, + "kl_loss_4": 1231.2265930175781, + "kl_loss_9": 435.61701049804685, + "learning_rate": 1.4852136862001764e-05, + "loss": 962.7881, + "step": 9230 + }, + { + "ce_loss_13": 3.2174304962158202, + "ce_loss_17": 3.1717190623283384, + "ce_loss_2": 4.131673502922058, + "ce_loss_4": 3.724416470527649, + "ce_loss_9": 3.3621557712554933, + "epoch": 0.924, + "grad_norm": 944.0, + "kl_loss_13": 98.2211524963379, + "kl_loss_2": 2045.8399719238282, + "kl_loss_4": 1246.335382080078, + "kl_loss_9": 434.97546844482423, + "learning_rate": 1.4470731712944884e-05, + "loss": 975.3602, + "step": 9240 + }, + { + "ce_loss_13": 3.2365424394607545, + "ce_loss_17": 3.188973546028137, + "ce_loss_2": 4.17758731842041, + "ce_loss_4": 3.7488979578018187, + "ce_loss_9": 3.384091091156006, + "epoch": 0.925, + "grad_norm": 1024.0, + "kl_loss_13": 101.60187225341797, + "kl_loss_2": 2083.5795166015623, + "kl_loss_4": 1245.118994140625, + "kl_loss_9": 447.4918960571289, + "learning_rate": 1.4094215838229174e-05, + "loss": 992.8758, + "step": 9250 + }, + { + "ce_loss_13": 3.206554102897644, + "ce_loss_17": 3.162507212162018, + "ce_loss_2": 4.164082229137421, + "ce_loss_4": 3.7286868333816527, + "ce_loss_9": 3.3531262159347532, + "epoch": 0.926, + "grad_norm": 1248.0, + "kl_loss_13": 100.37296028137207, + "kl_loss_2": 2139.3466918945314, + "kl_loss_4": 1272.8878051757813, + "kl_loss_9": 447.43287200927733, + "learning_rate": 1.372259302936546e-05, + "loss": 1016.859, + "step": 9260 + }, + { + "ce_loss_13": 3.314477002620697, + "ce_loss_17": 3.265175449848175, + "ce_loss_2": 4.238245666027069, + "ce_loss_4": 3.823108470439911, + "ce_loss_9": 3.4550861358642577, + "epoch": 0.927, + "grad_norm": 868.0, + "kl_loss_13": 104.92797508239747, + "kl_loss_2": 2083.99296875, + "kl_loss_4": 1262.4045471191407, + "kl_loss_9": 451.29290161132815, + "learning_rate": 1.3355867028591206e-05, + "loss": 965.06, + "step": 9270 + }, + { + "ce_loss_13": 3.217471885681152, + "ce_loss_17": 3.1729040503501893, + "ce_loss_2": 4.126240134239197, + "ce_loss_4": 3.7070771932601927, + "ce_loss_9": 3.3539362072944643, + "epoch": 0.928, + "grad_norm": 1024.0, + "kl_loss_13": 99.2825912475586, + "kl_loss_2": 2049.244281005859, + "kl_loss_4": 1230.0013427734375, + "kl_loss_9": 435.8931488037109, + "learning_rate": 1.2994041528833267e-05, + "loss": 957.8826, + "step": 9280 + }, + { + "ce_loss_13": 3.2165645122528077, + "ce_loss_17": 3.169575834274292, + "ce_loss_2": 4.14753155708313, + "ce_loss_4": 3.7267600893974304, + "ce_loss_9": 3.3605769872665405, + "epoch": 0.929, + "grad_norm": 1416.0, + "kl_loss_13": 98.34602661132813, + "kl_loss_2": 2088.528155517578, + "kl_loss_4": 1253.7621276855468, + "kl_loss_9": 439.0786407470703, + "learning_rate": 1.2637120173670358e-05, + "loss": 966.1105, + "step": 9290 + }, + { + "ce_loss_13": 3.2424200773239136, + "ce_loss_17": 3.195589578151703, + "ce_loss_2": 4.184446549415588, + "ce_loss_4": 3.7560463309288026, + "ce_loss_9": 3.392116892337799, + "epoch": 0.93, + "grad_norm": 1208.0, + "kl_loss_13": 101.0448657989502, + "kl_loss_2": 2094.6267944335937, + "kl_loss_4": 1258.8706909179687, + "kl_loss_9": 445.57228088378906, + "learning_rate": 1.2285106557296478e-05, + "loss": 972.0389, + "step": 9300 + }, + { + "ce_loss_13": 3.1237415432929994, + "ce_loss_17": 3.078605842590332, + "ce_loss_2": 4.126356077194214, + "ce_loss_4": 3.6710851550102235, + "ce_loss_9": 3.2731125473976137, + "epoch": 0.931, + "grad_norm": 1096.0, + "kl_loss_13": 100.01634979248047, + "kl_loss_2": 2199.689569091797, + "kl_loss_4": 1317.4813415527344, + "kl_loss_9": 451.8364624023437, + "learning_rate": 1.1938004224484989e-05, + "loss": 991.3918, + "step": 9310 + }, + { + "ce_loss_13": 3.353495955467224, + "ce_loss_17": 3.30627703666687, + "ce_loss_2": 4.270099675655365, + "ce_loss_4": 3.8513387560844423, + "ce_loss_9": 3.496125316619873, + "epoch": 0.932, + "grad_norm": 1048.0, + "kl_loss_13": 102.64877319335938, + "kl_loss_2": 2069.185406494141, + "kl_loss_4": 1251.5001037597656, + "kl_loss_9": 445.19418182373045, + "learning_rate": 1.1595816670552429e-05, + "loss": 985.7512, + "step": 9320 + }, + { + "ce_loss_13": 3.274528515338898, + "ce_loss_17": 3.2265638947486877, + "ce_loss_2": 4.193568253517151, + "ce_loss_4": 3.77582710981369, + "ce_loss_9": 3.4121570944786073, + "epoch": 0.933, + "grad_norm": 964.0, + "kl_loss_13": 101.36200561523438, + "kl_loss_2": 2052.6002990722654, + "kl_loss_4": 1233.77529296875, + "kl_loss_9": 435.19707641601565, + "learning_rate": 1.1258547341323699e-05, + "loss": 956.7293, + "step": 9330 + }, + { + "ce_loss_13": 3.310185956954956, + "ce_loss_17": 3.2623629212379455, + "ce_loss_2": 4.228821527957916, + "ce_loss_4": 3.8128562450408934, + "ce_loss_9": 3.4524996280670166, + "epoch": 0.934, + "grad_norm": 1032.0, + "kl_loss_13": 100.94105491638183, + "kl_loss_2": 2089.229571533203, + "kl_loss_4": 1255.4903869628906, + "kl_loss_9": 448.49928588867186, + "learning_rate": 1.0926199633097156e-05, + "loss": 969.7935, + "step": 9340 + }, + { + "ce_loss_13": 3.3170249819755555, + "ce_loss_17": 3.2737075090408325, + "ce_loss_2": 4.206792187690735, + "ce_loss_4": 3.7933007717132567, + "ce_loss_9": 3.451795446872711, + "epoch": 0.935, + "grad_norm": 928.0, + "kl_loss_13": 98.22538108825684, + "kl_loss_2": 2030.3801025390626, + "kl_loss_4": 1213.0071411132812, + "kl_loss_9": 433.99659271240233, + "learning_rate": 1.0598776892610684e-05, + "loss": 973.3193, + "step": 9350 + }, + { + "ce_loss_13": 3.1363030910491942, + "ce_loss_17": 3.0911741971969606, + "ce_loss_2": 4.077869355678558, + "ce_loss_4": 3.6531554460525513, + "ce_loss_9": 3.2833625435829163, + "epoch": 0.936, + "grad_norm": 996.0, + "kl_loss_13": 97.95297317504883, + "kl_loss_2": 2108.7460998535157, + "kl_loss_4": 1266.1794372558593, + "kl_loss_9": 445.82669525146486, + "learning_rate": 1.0276282417007399e-05, + "loss": 965.5063, + "step": 9360 + }, + { + "ce_loss_13": 3.284811592102051, + "ce_loss_17": 3.2402807474136353, + "ce_loss_2": 4.185770976543426, + "ce_loss_4": 3.779066872596741, + "ce_loss_9": 3.426911973953247, + "epoch": 0.937, + "grad_norm": 1080.0, + "kl_loss_13": 97.99373664855958, + "kl_loss_2": 2032.2546813964843, + "kl_loss_4": 1223.124658203125, + "kl_loss_9": 433.991194152832, + "learning_rate": 9.958719453803277e-06, + "loss": 961.1727, + "step": 9370 + }, + { + "ce_loss_13": 3.2820864081382752, + "ce_loss_17": 3.2344361901283265, + "ce_loss_2": 4.217387580871582, + "ce_loss_4": 3.8025092005729677, + "ce_loss_9": 3.4310351490974424, + "epoch": 0.938, + "grad_norm": 948.0, + "kl_loss_13": 101.0464874267578, + "kl_loss_2": 2097.283587646484, + "kl_loss_4": 1277.6190002441406, + "kl_loss_9": 448.83250579833987, + "learning_rate": 9.646091200853802e-06, + "loss": 970.9555, + "step": 9380 + }, + { + "ce_loss_13": 3.239380931854248, + "ce_loss_17": 3.1941351532936095, + "ce_loss_2": 4.160029315948487, + "ce_loss_4": 3.7426826119422913, + "ce_loss_9": 3.382050836086273, + "epoch": 0.939, + "grad_norm": 932.0, + "kl_loss_13": 97.57374649047851, + "kl_loss_2": 2056.067559814453, + "kl_loss_4": 1236.481982421875, + "kl_loss_9": 434.8818954467773, + "learning_rate": 9.338400806321978e-06, + "loss": 937.6338, + "step": 9390 + }, + { + "ce_loss_13": 3.273054671287537, + "ce_loss_17": 3.225569522380829, + "ce_loss_2": 4.190238308906555, + "ce_loss_4": 3.7751076221466064, + "ce_loss_9": 3.417978584766388, + "epoch": 0.94, + "grad_norm": 912.0, + "kl_loss_13": 101.65242614746094, + "kl_loss_2": 2054.649365234375, + "kl_loss_4": 1235.611590576172, + "kl_loss_9": 443.1382141113281, + "learning_rate": 9.035651368646646e-06, + "loss": 957.3645, + "step": 9400 + }, + { + "ce_loss_13": 3.2743691921234133, + "ce_loss_17": 3.2289324164390565, + "ce_loss_2": 4.184344387054443, + "ce_loss_4": 3.772081124782562, + "ce_loss_9": 3.4122036576271055, + "epoch": 0.941, + "grad_norm": 1248.0, + "kl_loss_13": 99.33428688049317, + "kl_loss_2": 2048.259600830078, + "kl_loss_4": 1237.234393310547, + "kl_loss_9": 434.7593521118164, + "learning_rate": 8.737845936511335e-06, + "loss": 964.4977, + "step": 9410 + }, + { + "ce_loss_13": 3.2317710638046266, + "ce_loss_17": 3.1828368663787843, + "ce_loss_2": 4.175829625129699, + "ce_loss_4": 3.734463465213776, + "ce_loss_9": 3.373307502269745, + "epoch": 0.942, + "grad_norm": 1328.0, + "kl_loss_13": 101.4010066986084, + "kl_loss_2": 2124.009136962891, + "kl_loss_4": 1254.7905334472657, + "kl_loss_9": 447.0837966918945, + "learning_rate": 8.444987508813451e-06, + "loss": 969.6604, + "step": 9420 + }, + { + "ce_loss_13": 3.182221269607544, + "ce_loss_17": 3.135561966896057, + "ce_loss_2": 4.1588412642478945, + "ce_loss_4": 3.715184760093689, + "ce_loss_9": 3.3329781532287597, + "epoch": 0.943, + "grad_norm": 1152.0, + "kl_loss_13": 101.9586784362793, + "kl_loss_2": 2192.0331420898438, + "kl_loss_4": 1302.3288818359374, + "kl_loss_9": 456.5460006713867, + "learning_rate": 8.157079034633974e-06, + "loss": 994.2252, + "step": 9430 + }, + { + "ce_loss_13": 3.1799741983413696, + "ce_loss_17": 3.134570562839508, + "ce_loss_2": 4.1258539915084835, + "ce_loss_4": 3.6969086647033693, + "ce_loss_9": 3.321664845943451, + "epoch": 0.944, + "grad_norm": 904.0, + "kl_loss_13": 99.40446662902832, + "kl_loss_2": 2125.3603637695314, + "kl_loss_4": 1276.7077026367188, + "kl_loss_9": 442.9645172119141, + "learning_rate": 7.874123413208145e-06, + "loss": 972.3776, + "step": 9440 + }, + { + "ce_loss_13": 3.151604950428009, + "ce_loss_17": 3.1044161677360536, + "ce_loss_2": 4.119300007820129, + "ce_loss_4": 3.6832780241966248, + "ce_loss_9": 3.2998377084732056, + "epoch": 0.945, + "grad_norm": 1184.0, + "kl_loss_13": 99.68213806152343, + "kl_loss_2": 2137.3049438476564, + "kl_loss_4": 1274.2804870605469, + "kl_loss_9": 448.83734436035155, + "learning_rate": 7.59612349389599e-06, + "loss": 985.5232, + "step": 9450 + }, + { + "ce_loss_13": 3.2439227223396303, + "ce_loss_17": 3.199974203109741, + "ce_loss_2": 4.151203238964081, + "ce_loss_4": 3.740894412994385, + "ce_loss_9": 3.385606753826141, + "epoch": 0.946, + "grad_norm": 1152.0, + "kl_loss_13": 97.83563690185547, + "kl_loss_2": 2017.320086669922, + "kl_loss_4": 1215.5391723632813, + "kl_loss_9": 432.3570785522461, + "learning_rate": 7.323082076153509e-06, + "loss": 959.1424, + "step": 9460 + }, + { + "ce_loss_13": 3.283904266357422, + "ce_loss_17": 3.2351248502731322, + "ce_loss_2": 4.190016138553619, + "ce_loss_4": 3.7856735467910765, + "ce_loss_9": 3.422900605201721, + "epoch": 0.947, + "grad_norm": 972.0, + "kl_loss_13": 101.97411155700684, + "kl_loss_2": 2031.8303527832031, + "kl_loss_4": 1231.2672973632812, + "kl_loss_9": 443.36658477783203, + "learning_rate": 7.055001909504755e-06, + "loss": 976.001, + "step": 9470 + }, + { + "ce_loss_13": 3.3199066638946535, + "ce_loss_17": 3.273211121559143, + "ce_loss_2": 4.230184531211853, + "ce_loss_4": 3.819090116024017, + "ce_loss_9": 3.463311159610748, + "epoch": 0.948, + "grad_norm": 844.0, + "kl_loss_13": 100.60670051574706, + "kl_loss_2": 2066.559783935547, + "kl_loss_4": 1248.1835083007813, + "kl_loss_9": 444.16493835449216, + "learning_rate": 6.791885693514133e-06, + "loss": 970.0475, + "step": 9480 + }, + { + "ce_loss_13": 3.2259212255477907, + "ce_loss_17": 3.179897928237915, + "ce_loss_2": 4.178668856620789, + "ce_loss_4": 3.7473167419433593, + "ce_loss_9": 3.367188060283661, + "epoch": 0.949, + "grad_norm": 904.0, + "kl_loss_13": 100.98762588500976, + "kl_loss_2": 2135.3104248046875, + "kl_loss_4": 1278.0763793945312, + "kl_loss_9": 443.35464935302736, + "learning_rate": 6.533736077758867e-06, + "loss": 986.3941, + "step": 9490 + }, + { + "ce_loss_13": 3.1879597783088682, + "ce_loss_17": 3.14133483171463, + "ce_loss_2": 4.1672933340072635, + "ce_loss_4": 3.7187579393386843, + "ce_loss_9": 3.3373117685317992, + "epoch": 0.95, + "grad_norm": 1272.0, + "kl_loss_13": 102.77911109924317, + "kl_loss_2": 2194.187353515625, + "kl_loss_4": 1307.5126708984376, + "kl_loss_9": 456.8600448608398, + "learning_rate": 6.2805556618028556e-06, + "loss": 990.1027, + "step": 9500 + }, + { + "ce_loss_13": 3.274174678325653, + "ce_loss_17": 3.2298904180526735, + "ce_loss_2": 4.18009135723114, + "ce_loss_4": 3.759975802898407, + "ce_loss_9": 3.4065104603767393, + "epoch": 0.951, + "grad_norm": 972.0, + "kl_loss_13": 97.43217277526855, + "kl_loss_2": 2023.1135192871093, + "kl_loss_4": 1203.2561462402343, + "kl_loss_9": 423.83628997802737, + "learning_rate": 6.032346995169968e-06, + "loss": 931.4363, + "step": 9510 + }, + { + "ce_loss_13": 3.2770089983940123, + "ce_loss_17": 3.232247793674469, + "ce_loss_2": 4.198025560379028, + "ce_loss_4": 3.7814950585365295, + "ce_loss_9": 3.4177303552627563, + "epoch": 0.952, + "grad_norm": 924.0, + "kl_loss_13": 99.74614677429199, + "kl_loss_2": 2060.5488037109376, + "kl_loss_4": 1243.4562927246093, + "kl_loss_9": 439.9268493652344, + "learning_rate": 5.789112577318789e-06, + "loss": 958.333, + "step": 9520 + }, + { + "ce_loss_13": 3.2576918959617616, + "ce_loss_17": 3.212440812587738, + "ce_loss_2": 4.201545834541321, + "ce_loss_4": 3.769130289554596, + "ce_loss_9": 3.400744640827179, + "epoch": 0.953, + "grad_norm": 1032.0, + "kl_loss_13": 101.39390754699707, + "kl_loss_2": 2123.0364624023437, + "kl_loss_4": 1272.0700439453126, + "kl_loss_9": 446.78711853027346, + "learning_rate": 5.550854857617194e-06, + "loss": 963.8008, + "step": 9530 + }, + { + "ce_loss_13": 3.2441832065582275, + "ce_loss_17": 3.195235586166382, + "ce_loss_2": 4.208361446857452, + "ce_loss_4": 3.767829489707947, + "ce_loss_9": 3.3905673384666444, + "epoch": 0.954, + "grad_norm": 1072.0, + "kl_loss_13": 102.92916793823242, + "kl_loss_2": 2147.9145263671876, + "kl_loss_4": 1277.785137939453, + "kl_loss_9": 451.06036987304685, + "learning_rate": 5.317576235317756e-06, + "loss": 985.4813, + "step": 9540 + }, + { + "ce_loss_13": 3.2710776686668397, + "ce_loss_17": 3.225491261482239, + "ce_loss_2": 4.16835834980011, + "ce_loss_4": 3.759235715866089, + "ce_loss_9": 3.4088730216026306, + "epoch": 0.955, + "grad_norm": 1128.0, + "kl_loss_13": 98.39339408874511, + "kl_loss_2": 2006.6261901855469, + "kl_loss_4": 1200.5606079101562, + "kl_loss_9": 425.8466522216797, + "learning_rate": 5.089279059533658e-06, + "loss": 965.1515, + "step": 9550 + }, + { + "ce_loss_13": 3.323964560031891, + "ce_loss_17": 3.275723373889923, + "ce_loss_2": 4.234560871124268, + "ce_loss_4": 3.827406680583954, + "ce_loss_9": 3.4707021594047545, + "epoch": 0.956, + "grad_norm": 1184.0, + "kl_loss_13": 103.47303161621093, + "kl_loss_2": 2050.3690673828123, + "kl_loss_4": 1254.3973205566406, + "kl_loss_9": 450.9154342651367, + "learning_rate": 4.865965629214819e-06, + "loss": 961.9025, + "step": 9560 + }, + { + "ce_loss_13": 3.271520745754242, + "ce_loss_17": 3.2242767333984377, + "ce_loss_2": 4.207082843780517, + "ce_loss_4": 3.777870786190033, + "ce_loss_9": 3.4115238189697266, + "epoch": 0.957, + "grad_norm": 944.0, + "kl_loss_13": 102.03676414489746, + "kl_loss_2": 2116.9252990722657, + "kl_loss_4": 1271.1083251953125, + "kl_loss_9": 448.79185638427737, + "learning_rate": 4.6476381931251366e-06, + "loss": 960.7486, + "step": 9570 + }, + { + "ce_loss_13": 3.2546559453010557, + "ce_loss_17": 3.2062507033348084, + "ce_loss_2": 4.1746816515922545, + "ce_loss_4": 3.759263265132904, + "ce_loss_9": 3.3981380224227906, + "epoch": 0.958, + "grad_norm": 1112.0, + "kl_loss_13": 99.54115409851075, + "kl_loss_2": 2056.1169250488283, + "kl_loss_4": 1238.817608642578, + "kl_loss_9": 438.92663421630857, + "learning_rate": 4.434298949819449e-06, + "loss": 965.6343, + "step": 9580 + }, + { + "ce_loss_13": 3.2185049772262575, + "ce_loss_17": 3.169291353225708, + "ce_loss_2": 4.186837446689606, + "ce_loss_4": 3.7499027371406557, + "ce_loss_9": 3.3656785011291506, + "epoch": 0.959, + "grad_norm": 992.0, + "kl_loss_13": 104.03815231323242, + "kl_loss_2": 2191.375048828125, + "kl_loss_4": 1322.1709228515624, + "kl_loss_9": 459.67818756103514, + "learning_rate": 4.2259500476214406e-06, + "loss": 993.9736, + "step": 9590 + }, + { + "ce_loss_13": 3.196918773651123, + "ce_loss_17": 3.151186442375183, + "ce_loss_2": 4.145372676849365, + "ce_loss_4": 3.7130317449569703, + "ce_loss_9": 3.3432331919670104, + "epoch": 0.96, + "grad_norm": 1008.0, + "kl_loss_13": 100.60027847290038, + "kl_loss_2": 2123.793426513672, + "kl_loss_4": 1272.8138122558594, + "kl_loss_9": 447.2867904663086, + "learning_rate": 4.02259358460233e-06, + "loss": 970.3404, + "step": 9600 + }, + { + "ce_loss_13": 3.2634061217308044, + "ce_loss_17": 3.2161954641342163, + "ce_loss_2": 4.1841383337974545, + "ce_loss_4": 3.7625487208366395, + "ce_loss_9": 3.4065575122833254, + "epoch": 0.961, + "grad_norm": 988.0, + "kl_loss_13": 101.02522430419921, + "kl_loss_2": 2054.220361328125, + "kl_loss_4": 1223.5812133789063, + "kl_loss_9": 439.5580657958984, + "learning_rate": 3.8242316085594916e-06, + "loss": 952.3349, + "step": 9610 + }, + { + "ce_loss_13": 3.149327003955841, + "ce_loss_17": 3.099954533576965, + "ce_loss_2": 4.148278522491455, + "ce_loss_4": 3.689293098449707, + "ce_loss_9": 3.303290832042694, + "epoch": 0.962, + "grad_norm": 1024.0, + "kl_loss_13": 102.40749549865723, + "kl_loss_2": 2222.956262207031, + "kl_loss_4": 1320.8577514648437, + "kl_loss_9": 462.6416412353516, + "learning_rate": 3.630866116995757e-06, + "loss": 1007.3529, + "step": 9620 + }, + { + "ce_loss_13": 3.304829955101013, + "ce_loss_17": 3.2587551832199098, + "ce_loss_2": 4.208761322498321, + "ce_loss_4": 3.790085244178772, + "ce_loss_9": 3.4374767184257506, + "epoch": 0.963, + "grad_norm": 1136.0, + "kl_loss_13": 99.71647720336914, + "kl_loss_2": 2038.6428649902343, + "kl_loss_4": 1211.9402404785155, + "kl_loss_9": 431.8893249511719, + "learning_rate": 3.4424990570994797e-06, + "loss": 976.4067, + "step": 9630 + }, + { + "ce_loss_13": 3.2909238815307615, + "ce_loss_17": 3.2444775700569153, + "ce_loss_2": 4.200924503803253, + "ce_loss_4": 3.792708945274353, + "ce_loss_9": 3.4305668234825135, + "epoch": 0.964, + "grad_norm": 900.0, + "kl_loss_13": 99.606205368042, + "kl_loss_2": 2057.8226684570313, + "kl_loss_4": 1244.8847045898438, + "kl_loss_9": 442.5407653808594, + "learning_rate": 3.2591323257248896e-06, + "loss": 964.7584, + "step": 9640 + }, + { + "ce_loss_13": 3.1449180960655214, + "ce_loss_17": 3.100285840034485, + "ce_loss_2": 4.096813762187958, + "ce_loss_4": 3.665309524536133, + "ce_loss_9": 3.2900532126426696, + "epoch": 0.965, + "grad_norm": 1200.0, + "kl_loss_13": 98.90424499511718, + "kl_loss_2": 2128.9772094726563, + "kl_loss_4": 1276.8819152832032, + "kl_loss_9": 444.46954345703125, + "learning_rate": 3.0807677693729385e-06, + "loss": 987.6323, + "step": 9650 + }, + { + "ce_loss_13": 3.3292858719825746, + "ce_loss_17": 3.2839693307876585, + "ce_loss_2": 4.237276148796082, + "ce_loss_4": 3.8290547013282774, + "ce_loss_9": 3.473268282413483, + "epoch": 0.966, + "grad_norm": 1064.0, + "kl_loss_13": 100.42622108459473, + "kl_loss_2": 2045.5208312988282, + "kl_loss_4": 1235.3991943359374, + "kl_loss_9": 439.5499893188477, + "learning_rate": 2.9074071841727055e-06, + "loss": 951.5396, + "step": 9660 + }, + { + "ce_loss_13": 3.251963436603546, + "ce_loss_17": 3.205065143108368, + "ce_loss_2": 4.175255036354065, + "ce_loss_4": 3.7691023349761963, + "ce_loss_9": 3.399799370765686, + "epoch": 0.967, + "grad_norm": 1064.0, + "kl_loss_13": 100.6189468383789, + "kl_loss_2": 2061.871160888672, + "kl_loss_4": 1251.4933288574218, + "kl_loss_9": 444.52624053955077, + "learning_rate": 2.739052315863355e-06, + "loss": 945.0748, + "step": 9670 + }, + { + "ce_loss_13": 3.236733603477478, + "ce_loss_17": 3.1912460327148438, + "ce_loss_2": 4.1703333735466, + "ce_loss_4": 3.741064131259918, + "ce_loss_9": 3.374487030506134, + "epoch": 0.968, + "grad_norm": 1088.0, + "kl_loss_13": 100.81112670898438, + "kl_loss_2": 2100.6378967285154, + "kl_loss_4": 1258.6768310546875, + "kl_loss_9": 438.86712036132815, + "learning_rate": 2.5757048597765396e-06, + "loss": 964.0251, + "step": 9680 + }, + { + "ce_loss_13": 3.2489668011665342, + "ce_loss_17": 3.202586901187897, + "ce_loss_2": 4.190460276603699, + "ce_loss_4": 3.7609094619750976, + "ce_loss_9": 3.3913151621818542, + "epoch": 0.969, + "grad_norm": 1352.0, + "kl_loss_13": 99.7973243713379, + "kl_loss_2": 2099.314831542969, + "kl_loss_4": 1267.5642517089843, + "kl_loss_9": 444.0799499511719, + "learning_rate": 2.417366460819359e-06, + "loss": 971.6229, + "step": 9690 + }, + { + "ce_loss_13": 3.2588642239570618, + "ce_loss_17": 3.211339366436005, + "ce_loss_2": 4.218634414672851, + "ce_loss_4": 3.7851798295974732, + "ce_loss_9": 3.4068103432655334, + "epoch": 0.97, + "grad_norm": 1336.0, + "kl_loss_13": 102.64106559753418, + "kl_loss_2": 2143.829064941406, + "kl_loss_4": 1284.4738159179688, + "kl_loss_9": 453.5870697021484, + "learning_rate": 2.2640387134577057e-06, + "loss": 969.0391, + "step": 9700 + }, + { + "ce_loss_13": 3.1910021185874937, + "ce_loss_17": 3.1448869705200195, + "ce_loss_2": 4.086093938350677, + "ce_loss_4": 3.678375315666199, + "ce_loss_9": 3.327124845981598, + "epoch": 0.971, + "grad_norm": 1000.0, + "kl_loss_13": 94.90134620666504, + "kl_loss_2": 1979.2899047851563, + "kl_loss_4": 1190.0685180664063, + "kl_loss_9": 419.7801910400391, + "learning_rate": 2.115723161700278e-06, + "loss": 939.9465, + "step": 9710 + }, + { + "ce_loss_13": 3.1688317179679872, + "ce_loss_17": 3.119938683509827, + "ce_loss_2": 4.133386886119842, + "ce_loss_4": 3.6922177910804748, + "ce_loss_9": 3.314848017692566, + "epoch": 0.972, + "grad_norm": 1104.0, + "kl_loss_13": 102.64487037658691, + "kl_loss_2": 2159.221075439453, + "kl_loss_4": 1286.7723876953125, + "kl_loss_9": 451.5230285644531, + "learning_rate": 1.9724212990830937e-06, + "loss": 993.4086, + "step": 9720 + }, + { + "ce_loss_13": 3.3064271211624146, + "ce_loss_17": 3.2607157588005067, + "ce_loss_2": 4.25134893655777, + "ce_loss_4": 3.8263473868370057, + "ce_loss_9": 3.4552688241004943, + "epoch": 0.973, + "grad_norm": 800.0, + "kl_loss_13": 101.8689754486084, + "kl_loss_2": 2116.0334594726564, + "kl_loss_4": 1274.124072265625, + "kl_loss_9": 449.7283462524414, + "learning_rate": 1.8341345686543331e-06, + "loss": 976.408, + "step": 9730 + }, + { + "ce_loss_13": 3.2885117292404176, + "ce_loss_17": 3.243068742752075, + "ce_loss_2": 4.184333562850952, + "ce_loss_4": 3.7784474372863768, + "ce_loss_9": 3.430349862575531, + "epoch": 0.974, + "grad_norm": 1120.0, + "kl_loss_13": 99.04630851745605, + "kl_loss_2": 2011.7415832519532, + "kl_loss_4": 1210.7414764404298, + "kl_loss_9": 433.8052062988281, + "learning_rate": 1.7008643629596864e-06, + "loss": 970.3138, + "step": 9740 + }, + { + "ce_loss_13": 3.27404271364212, + "ce_loss_17": 3.2283719539642335, + "ce_loss_2": 4.20917158126831, + "ce_loss_4": 3.781008946895599, + "ce_loss_9": 3.4152266144752503, + "epoch": 0.975, + "grad_norm": 920.0, + "kl_loss_13": 100.49046516418457, + "kl_loss_2": 2105.6352600097657, + "kl_loss_4": 1248.0177612304688, + "kl_loss_9": 442.16507110595705, + "learning_rate": 1.5726120240288633e-06, + "loss": 983.2756, + "step": 9750 + }, + { + "ce_loss_13": 3.1785828113555907, + "ce_loss_17": 3.132006084918976, + "ce_loss_2": 4.106027913093567, + "ce_loss_4": 3.6846176981925964, + "ce_loss_9": 3.3189047813415526, + "epoch": 0.976, + "grad_norm": 936.0, + "kl_loss_13": 98.67556114196778, + "kl_loss_2": 2066.7419555664064, + "kl_loss_4": 1243.10107421875, + "kl_loss_9": 438.6411102294922, + "learning_rate": 1.4493788433612708e-06, + "loss": 959.045, + "step": 9760 + }, + { + "ce_loss_13": 3.29163259267807, + "ce_loss_17": 3.2444224834442137, + "ce_loss_2": 4.230081498622894, + "ce_loss_4": 3.8119967222213744, + "ce_loss_9": 3.4395094752311706, + "epoch": 0.977, + "grad_norm": 968.0, + "kl_loss_13": 100.62838783264161, + "kl_loss_2": 2113.931994628906, + "kl_loss_4": 1272.5500366210938, + "kl_loss_9": 447.321337890625, + "learning_rate": 1.3311660619138578e-06, + "loss": 982.3355, + "step": 9770 + }, + { + "ce_loss_13": 3.2895258545875548, + "ce_loss_17": 3.243997132778168, + "ce_loss_2": 4.1737782716751095, + "ce_loss_4": 3.7739211678504945, + "ce_loss_9": 3.4272176027297974, + "epoch": 0.978, + "grad_norm": 1072.0, + "kl_loss_13": 100.62196884155273, + "kl_loss_2": 1992.1853149414062, + "kl_loss_4": 1206.075421142578, + "kl_loss_9": 435.78404388427737, + "learning_rate": 1.2179748700879012e-06, + "loss": 961.1178, + "step": 9780 + }, + { + "ce_loss_13": 3.217727029323578, + "ce_loss_17": 3.170780122280121, + "ce_loss_2": 4.143377149105072, + "ce_loss_4": 3.727510917186737, + "ce_loss_9": 3.361529994010925, + "epoch": 0.979, + "grad_norm": 888.0, + "kl_loss_13": 99.33015480041504, + "kl_loss_2": 2051.4610900878906, + "kl_loss_4": 1237.0142761230468, + "kl_loss_9": 437.1269760131836, + "learning_rate": 1.1098064077174619e-06, + "loss": 968.0482, + "step": 9790 + }, + { + "ce_loss_13": 3.2474601030349732, + "ce_loss_17": 3.198060357570648, + "ce_loss_2": 4.2025586605072025, + "ce_loss_4": 3.773097288608551, + "ce_loss_9": 3.393211269378662, + "epoch": 0.98, + "grad_norm": 1264.0, + "kl_loss_13": 99.8125114440918, + "kl_loss_2": 2128.6316833496094, + "kl_loss_4": 1277.8404907226563, + "kl_loss_9": 442.59876403808596, + "learning_rate": 1.006661764057837e-06, + "loss": 972.1799, + "step": 9800 + }, + { + "ce_loss_13": 3.2554531693458557, + "ce_loss_17": 3.2100130558013915, + "ce_loss_2": 4.182665002346039, + "ce_loss_4": 3.7628962635993957, + "ce_loss_9": 3.3990574717521667, + "epoch": 0.981, + "grad_norm": 1048.0, + "kl_loss_13": 98.79133834838868, + "kl_loss_2": 2077.3804443359377, + "kl_loss_4": 1254.041552734375, + "kl_loss_9": 438.7265090942383, + "learning_rate": 9.085419777743465e-07, + "loss": 960.2199, + "step": 9810 + }, + { + "ce_loss_13": 3.203165817260742, + "ce_loss_17": 3.160168266296387, + "ce_loss_2": 4.130461525917053, + "ce_loss_4": 3.712307906150818, + "ce_loss_9": 3.3453029870986937, + "epoch": 0.982, + "grad_norm": 940.0, + "kl_loss_13": 97.26919097900391, + "kl_loss_2": 2075.6539306640625, + "kl_loss_4": 1251.8951721191406, + "kl_loss_9": 434.7082153320313, + "learning_rate": 8.15448036932176e-07, + "loss": 948.8596, + "step": 9820 + }, + { + "ce_loss_13": 3.2485483288764954, + "ce_loss_17": 3.2023658871650698, + "ce_loss_2": 4.175073170661927, + "ce_loss_4": 3.759416735172272, + "ce_loss_9": 3.391933870315552, + "epoch": 0.983, + "grad_norm": 1020.0, + "kl_loss_13": 99.71344871520996, + "kl_loss_2": 2092.0822631835936, + "kl_loss_4": 1258.6885375976562, + "kl_loss_9": 445.9726898193359, + "learning_rate": 7.273808789862724e-07, + "loss": 976.9576, + "step": 9830 + }, + { + "ce_loss_13": 3.326125967502594, + "ce_loss_17": 3.2803446769714357, + "ce_loss_2": 4.241369450092316, + "ce_loss_4": 3.82564092874527, + "ce_loss_9": 3.465975213050842, + "epoch": 0.984, + "grad_norm": 1136.0, + "kl_loss_13": 101.00319976806641, + "kl_loss_2": 2066.5107177734376, + "kl_loss_4": 1249.1889892578124, + "kl_loss_9": 445.17047729492185, + "learning_rate": 6.443413907720186e-07, + "loss": 960.6353, + "step": 9840 + }, + { + "ce_loss_13": 3.262605834007263, + "ce_loss_17": 3.2163902163505553, + "ce_loss_2": 4.1855257749557495, + "ce_loss_4": 3.7643518924713133, + "ce_loss_9": 3.402821886539459, + "epoch": 0.985, + "grad_norm": 1004.0, + "kl_loss_13": 99.9081314086914, + "kl_loss_2": 2048.3161682128907, + "kl_loss_4": 1231.493310546875, + "kl_loss_9": 435.41772613525393, + "learning_rate": 5.663304084960185e-07, + "loss": 953.618, + "step": 9850 + }, + { + "ce_loss_13": 3.188557481765747, + "ce_loss_17": 3.1420599579811097, + "ce_loss_2": 4.136715650558472, + "ce_loss_4": 3.711104428768158, + "ce_loss_9": 3.335402488708496, + "epoch": 0.986, + "grad_norm": 968.0, + "kl_loss_13": 100.21799049377441, + "kl_loss_2": 2118.813232421875, + "kl_loss_4": 1270.9372802734374, + "kl_loss_9": 446.0380462646484, + "learning_rate": 4.933487177280482e-07, + "loss": 960.2383, + "step": 9860 + }, + { + "ce_loss_13": 3.287597692012787, + "ce_loss_17": 3.2423799872398376, + "ce_loss_2": 4.196546292304992, + "ce_loss_4": 3.784658646583557, + "ce_loss_9": 3.428558623790741, + "epoch": 0.987, + "grad_norm": 960.0, + "kl_loss_13": 97.95343627929688, + "kl_loss_2": 2060.3000305175783, + "kl_loss_4": 1235.2383911132813, + "kl_loss_9": 434.31813659667966, + "learning_rate": 4.2539705339295075e-07, + "loss": 954.3248, + "step": 9870 + }, + { + "ce_loss_13": 3.1471114277839662, + "ce_loss_17": 3.0985710263252257, + "ce_loss_2": 4.095869493484497, + "ce_loss_4": 3.667863595485687, + "ce_loss_9": 3.2908640027046205, + "epoch": 0.988, + "grad_norm": 912.0, + "kl_loss_13": 101.20098686218262, + "kl_loss_2": 2104.124951171875, + "kl_loss_4": 1267.6967834472657, + "kl_loss_9": 447.0945327758789, + "learning_rate": 3.6247609976319816e-07, + "loss": 963.5682, + "step": 9880 + }, + { + "ce_loss_13": 3.2310577154159548, + "ce_loss_17": 3.1835248708724975, + "ce_loss_2": 4.184465396404266, + "ce_loss_4": 3.749646008014679, + "ce_loss_9": 3.377937078475952, + "epoch": 0.989, + "grad_norm": 912.0, + "kl_loss_13": 101.61584167480468, + "kl_loss_2": 2121.2127319335937, + "kl_loss_4": 1268.9790893554687, + "kl_loss_9": 450.99257507324216, + "learning_rate": 3.0458649045211895e-07, + "loss": 995.0213, + "step": 9890 + }, + { + "ce_loss_13": 3.2083937525749207, + "ce_loss_17": 3.160646367073059, + "ce_loss_2": 4.145768737792968, + "ce_loss_4": 3.7313124895095826, + "ce_loss_9": 3.3572762489318846, + "epoch": 0.99, + "grad_norm": 972.0, + "kl_loss_13": 102.85211944580078, + "kl_loss_2": 2091.924981689453, + "kl_loss_4": 1274.8233032226562, + "kl_loss_9": 453.5033782958984, + "learning_rate": 2.517288084074587e-07, + "loss": 991.6736, + "step": 9900 + }, + { + "ce_loss_13": 3.247076725959778, + "ce_loss_17": 3.1952731728553774, + "ce_loss_2": 4.2183543801307675, + "ce_loss_4": 3.783793330192566, + "ce_loss_9": 3.401419472694397, + "epoch": 0.991, + "grad_norm": 876.0, + "kl_loss_13": 103.02739791870117, + "kl_loss_2": 2161.908416748047, + "kl_loss_4": 1301.8399780273437, + "kl_loss_9": 461.16288299560546, + "learning_rate": 2.0390358590538505e-07, + "loss": 987.9509, + "step": 9910 + }, + { + "ce_loss_13": 3.2542178630828857, + "ce_loss_17": 3.2073527336120606, + "ce_loss_2": 4.188078260421753, + "ce_loss_4": 3.7715426087379456, + "ce_loss_9": 3.400512182712555, + "epoch": 0.992, + "grad_norm": 1152.0, + "kl_loss_13": 101.84062767028809, + "kl_loss_2": 2103.5510192871093, + "kl_loss_4": 1276.6694274902343, + "kl_loss_9": 449.5619659423828, + "learning_rate": 1.61111304545436e-07, + "loss": 968.6268, + "step": 9920 + }, + { + "ce_loss_13": 3.2235095143318175, + "ce_loss_17": 3.1772380828857423, + "ce_loss_2": 4.149358475208283, + "ce_loss_4": 3.7247429132461547, + "ce_loss_9": 3.362751579284668, + "epoch": 0.993, + "grad_norm": 972.0, + "kl_loss_13": 99.95104103088379, + "kl_loss_2": 2085.8706787109377, + "kl_loss_4": 1253.4031799316406, + "kl_loss_9": 445.9100814819336, + "learning_rate": 1.2335239524541298e-07, + "loss": 957.6521, + "step": 9930 + }, + { + "ce_loss_13": 3.1896217465400696, + "ce_loss_17": 3.1421361446380613, + "ce_loss_2": 4.117532026767731, + "ce_loss_4": 3.693878543376923, + "ce_loss_9": 3.3318581223487853, + "epoch": 0.994, + "grad_norm": 860.0, + "kl_loss_13": 99.76443748474121, + "kl_loss_2": 2069.033026123047, + "kl_loss_4": 1243.3323059082031, + "kl_loss_9": 437.63356018066406, + "learning_rate": 9.06272382371065e-08, + "loss": 965.0811, + "step": 9940 + }, + { + "ce_loss_13": 3.255199158191681, + "ce_loss_17": 3.2085448861122132, + "ce_loss_2": 4.207074856758117, + "ce_loss_4": 3.7756519436836244, + "ce_loss_9": 3.4032769441604613, + "epoch": 0.995, + "grad_norm": 924.0, + "kl_loss_13": 101.93049049377441, + "kl_loss_2": 2130.4299255371093, + "kl_loss_4": 1283.853662109375, + "kl_loss_9": 452.74979400634766, + "learning_rate": 6.293616306246586e-08, + "loss": 976.6729, + "step": 9950 + }, + { + "ce_loss_13": 3.2502265334129334, + "ce_loss_17": 3.206307113170624, + "ce_loss_2": 4.155244266986847, + "ce_loss_4": 3.744070565700531, + "ce_loss_9": 3.385901117324829, + "epoch": 0.996, + "grad_norm": 908.0, + "kl_loss_13": 97.94876403808594, + "kl_loss_2": 2036.8428344726562, + "kl_loss_4": 1224.213446044922, + "kl_loss_9": 432.75928955078126, + "learning_rate": 4.027944857032395e-08, + "loss": 936.5098, + "step": 9960 + }, + { + "ce_loss_13": 3.243815302848816, + "ce_loss_17": 3.201537036895752, + "ce_loss_2": 4.129641008377075, + "ce_loss_4": 3.7263023614883424, + "ce_loss_9": 3.3750877261161802, + "epoch": 0.997, + "grad_norm": 2320.0, + "kl_loss_13": 95.61507568359374, + "kl_loss_2": 1985.3149658203124, + "kl_loss_4": 1191.212744140625, + "kl_loss_9": 417.67685852050784, + "learning_rate": 2.265732291356626e-08, + "loss": 933.0522, + "step": 9970 + }, + { + "ce_loss_13": 3.292201781272888, + "ce_loss_17": 3.246002995967865, + "ce_loss_2": 4.189475786685944, + "ce_loss_4": 3.7807944893836973, + "ce_loss_9": 3.428407382965088, + "epoch": 0.998, + "grad_norm": 808.0, + "kl_loss_13": 99.77738304138184, + "kl_loss_2": 2022.5568237304688, + "kl_loss_4": 1217.6197814941406, + "kl_loss_9": 432.62791137695314, + "learning_rate": 1.0069963546743833e-08, + "loss": 970.5087, + "step": 9980 + }, + { + "ce_loss_13": 3.2676636815071105, + "ce_loss_17": 3.220956587791443, + "ce_loss_2": 4.202810847759247, + "ce_loss_4": 3.7772077441215517, + "ce_loss_9": 3.41645268201828, + "epoch": 0.999, + "grad_norm": 1064.0, + "kl_loss_13": 99.66792907714844, + "kl_loss_2": 2081.7039794921875, + "kl_loss_4": 1252.134698486328, + "kl_loss_9": 445.26441345214846, + "learning_rate": 2.517497224463483e-09, + "loss": 965.5449, + "step": 9990 + }, + { + "ce_loss_13": 3.2229382157325746, + "ce_loss_17": 3.174089479446411, + "ce_loss_2": 4.204296863079071, + "ce_loss_4": 3.758987283706665, + "ce_loss_9": 3.377695381641388, + "epoch": 1.0, + "grad_norm": 936.0, + "kl_loss_13": 102.52793426513672, + "kl_loss_2": 2185.0192138671873, + "kl_loss_4": 1306.76728515625, + "kl_loss_9": 458.4556060791016, + "learning_rate": 0.0, + "loss": 995.5732, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.447557417823109e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}