Balcony-Model31 / trainer_state.json
adpretko's picture
Upload folder using huggingface_hub
90ac3ca verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"ce_loss_10": 6.949970483779907,
"ce_loss_13": 3.5991063117980957,
"ce_loss_2": 20.74317169189453,
"ce_loss_3": 26.111305236816406,
"ce_loss_7": 10.075343608856201,
"epoch": 0.0001,
"grad_norm": 212992.0,
"kl_loss_10": 7864.61865234375,
"kl_loss_2": 35348.310546875,
"kl_loss_3": 46478.765625,
"kl_loss_7": 14199.76806640625,
"learning_rate": 1e-05,
"loss": 25853.3086,
"step": 1
},
{
"ce_loss_10": 6.125355773501926,
"ce_loss_13": 3.6540163622962103,
"ce_loss_2": 12.076997624503242,
"ce_loss_3": 15.207524087693956,
"ce_loss_7": 7.174011654324001,
"epoch": 0.001,
"grad_norm": 17792.0,
"kl_loss_10": 5945.64690483941,
"kl_loss_2": 17211.485812717016,
"kl_loss_3": 23727.339274088543,
"kl_loss_7": 7859.595960828993,
"learning_rate": 0.0001,
"loss": 13522.6684,
"step": 10
},
{
"ce_loss_10": 4.603781843185425,
"ce_loss_13": 3.6583157896995546,
"ce_loss_2": 6.582165956497192,
"ce_loss_3": 6.530840277671814,
"ce_loss_7": 4.767995834350586,
"epoch": 0.002,
"grad_norm": 2416.0,
"kl_loss_10": 2069.933459472656,
"kl_loss_2": 5383.958471679687,
"kl_loss_3": 5293.375524902343,
"kl_loss_7": 2034.765203857422,
"learning_rate": 0.0002,
"loss": 3721.8547,
"step": 20
},
{
"ce_loss_10": 3.8755152463912963,
"ce_loss_13": 3.440992832183838,
"ce_loss_2": 5.676479697227478,
"ce_loss_3": 5.412591814994812,
"ce_loss_7": 4.082889425754547,
"epoch": 0.003,
"grad_norm": 1440.0,
"kl_loss_10": 764.9044219970704,
"kl_loss_2": 4085.6137573242186,
"kl_loss_3": 3584.689697265625,
"kl_loss_7": 1103.4823669433595,
"learning_rate": 0.0003,
"loss": 2353.6012,
"step": 30
},
{
"ce_loss_10": 3.9151899099349974,
"ce_loss_13": 3.612821674346924,
"ce_loss_2": 5.4220003366470335,
"ce_loss_3": 5.159578323364258,
"ce_loss_7": 4.096081411838531,
"epoch": 0.004,
"grad_norm": 2240.0,
"kl_loss_10": 524.7813415527344,
"kl_loss_2": 3348.9226196289064,
"kl_loss_3": 2841.234338378906,
"kl_loss_7": 838.199349975586,
"learning_rate": 0.0004,
"loss": 1895.9125,
"step": 40
},
{
"ce_loss_10": 3.8530999541282656,
"ce_loss_13": 3.5993613958358766,
"ce_loss_2": 5.269976806640625,
"ce_loss_3": 4.972214603424073,
"ce_loss_7": 4.020862734317779,
"epoch": 0.005,
"grad_norm": 1856.0,
"kl_loss_10": 415.9984130859375,
"kl_loss_2": 3121.74814453125,
"kl_loss_3": 2545.165393066406,
"kl_loss_7": 715.3964080810547,
"learning_rate": 0.0005,
"loss": 1695.3973,
"step": 50
},
{
"ce_loss_10": 3.8142170667648316,
"ce_loss_13": 3.6055872440338135,
"ce_loss_2": 5.112091851234436,
"ce_loss_3": 4.877537202835083,
"ce_loss_7": 3.9615157723426817,
"epoch": 0.006,
"grad_norm": 1256.0,
"kl_loss_10": 352.13475494384767,
"kl_loss_2": 2834.7232299804687,
"kl_loss_3": 2411.520703125,
"kl_loss_7": 626.5961456298828,
"learning_rate": 0.0006,
"loss": 1559.8957,
"step": 60
},
{
"ce_loss_10": 3.7009406328201293,
"ce_loss_13": 3.519935369491577,
"ce_loss_2": 4.975889682769775,
"ce_loss_3": 4.767486214637756,
"ce_loss_7": 3.847541904449463,
"epoch": 0.007,
"grad_norm": 1208.0,
"kl_loss_10": 301.96008453369143,
"kl_loss_2": 2753.304443359375,
"kl_loss_3": 2332.9209411621096,
"kl_loss_7": 579.1212310791016,
"learning_rate": 0.0007,
"loss": 1482.1832,
"step": 70
},
{
"ce_loss_10": 3.691651237010956,
"ce_loss_13": 3.5254875898361204,
"ce_loss_2": 4.9323248863220215,
"ce_loss_3": 4.6815266609191895,
"ce_loss_7": 3.918946826457977,
"epoch": 0.008,
"grad_norm": 1920.0,
"kl_loss_10": 277.7585922241211,
"kl_loss_2": 2673.962158203125,
"kl_loss_3": 2198.915148925781,
"kl_loss_7": 679.1366607666016,
"learning_rate": 0.0008,
"loss": 1458.9709,
"step": 80
},
{
"ce_loss_10": 3.630067002773285,
"ce_loss_13": 3.48206342458725,
"ce_loss_2": 4.835212993621826,
"ce_loss_3": 4.615828561782837,
"ce_loss_7": 3.8199189424514772,
"epoch": 0.009,
"grad_norm": 2336.0,
"kl_loss_10": 252.8492774963379,
"kl_loss_2": 2587.6504516601562,
"kl_loss_3": 2134.85634765625,
"kl_loss_7": 622.173226928711,
"learning_rate": 0.0009000000000000001,
"loss": 1391.4618,
"step": 90
},
{
"ce_loss_10": 3.7532737612724305,
"ce_loss_13": 3.6053535461425783,
"ce_loss_2": 4.9651381254196165,
"ce_loss_3": 4.679249548912049,
"ce_loss_7": 3.9317663073539735,
"epoch": 0.01,
"grad_norm": 2128.0,
"kl_loss_10": 251.45062484741212,
"kl_loss_2": 2572.321484375,
"kl_loss_3": 2009.2118774414062,
"kl_loss_7": 549.0976821899415,
"learning_rate": 0.001,
"loss": 1348.2805,
"step": 100
},
{
"ce_loss_10": 3.7507344126701354,
"ce_loss_13": 3.560739505290985,
"ce_loss_2": 4.900289678573609,
"ce_loss_3": 4.575748300552368,
"ce_loss_7": 3.855139982700348,
"epoch": 0.011,
"grad_norm": 2096.0,
"kl_loss_10": 336.0966377258301,
"kl_loss_2": 2548.9544921875,
"kl_loss_3": 1914.9787902832031,
"kl_loss_7": 503.8795822143555,
"learning_rate": 0.0009999974825027757,
"loss": 1319.618,
"step": 110
},
{
"ce_loss_10": 3.820546102523804,
"ce_loss_13": 3.6168412566184998,
"ce_loss_2": 4.895331883430481,
"ce_loss_3": 4.579937386512756,
"ce_loss_7": 3.905752420425415,
"epoch": 0.012,
"grad_norm": 1216.0,
"kl_loss_10": 360.8111114501953,
"kl_loss_2": 2423.57294921875,
"kl_loss_3": 1821.830096435547,
"kl_loss_7": 497.84374084472654,
"learning_rate": 0.0009999899300364532,
"loss": 1256.7569,
"step": 120
},
{
"ce_loss_10": 3.765466582775116,
"ce_loss_13": 3.588362789154053,
"ce_loss_2": 4.861788105964661,
"ce_loss_3": 4.571296620368957,
"ce_loss_7": 3.876194405555725,
"epoch": 0.013,
"grad_norm": 1920.0,
"kl_loss_10": 300.5776092529297,
"kl_loss_2": 2421.4876953125,
"kl_loss_3": 1850.192041015625,
"kl_loss_7": 510.1988525390625,
"learning_rate": 0.0009999773426770863,
"loss": 1278.616,
"step": 130
},
{
"ce_loss_10": 3.812940168380737,
"ce_loss_13": 3.6267056345939634,
"ce_loss_2": 4.856600952148438,
"ce_loss_3": 4.555375063419342,
"ce_loss_7": 3.9197404980659485,
"epoch": 0.014,
"grad_norm": 1104.0,
"kl_loss_10": 334.16673736572267,
"kl_loss_2": 2343.682012939453,
"kl_loss_3": 1750.5664672851562,
"kl_loss_7": 514.5185562133789,
"learning_rate": 0.0009999597205514296,
"loss": 1248.4314,
"step": 140
},
{
"ce_loss_10": 3.7693222880363466,
"ce_loss_13": 3.5812222719192506,
"ce_loss_2": 4.7746042013168335,
"ce_loss_3": 4.491594767570495,
"ce_loss_7": 3.9013134360313417,
"epoch": 0.015,
"grad_norm": 2000.0,
"kl_loss_10": 301.1280906677246,
"kl_loss_2": 2261.218878173828,
"kl_loss_3": 1705.700830078125,
"kl_loss_7": 572.2008193969726,
"learning_rate": 0.0009999370638369377,
"loss": 1215.427,
"step": 150
},
{
"ce_loss_10": 3.771867072582245,
"ce_loss_13": 3.623634135723114,
"ce_loss_2": 4.793287062644959,
"ce_loss_3": 4.509058833122253,
"ce_loss_7": 3.964562237262726,
"epoch": 0.016,
"grad_norm": 1736.0,
"kl_loss_10": 262.3149169921875,
"kl_loss_2": 2258.9556396484377,
"kl_loss_3": 1685.8056640625,
"kl_loss_7": 604.1481872558594,
"learning_rate": 0.000999909372761763,
"loss": 1207.4248,
"step": 160
},
{
"ce_loss_10": 3.702605497837067,
"ce_loss_13": 3.555959129333496,
"ce_loss_2": 4.72133858203888,
"ce_loss_3": 4.451281535625458,
"ce_loss_7": 3.8406598806381225,
"epoch": 0.017,
"grad_norm": 1536.0,
"kl_loss_10": 242.87019119262695,
"kl_loss_2": 2254.203680419922,
"kl_loss_3": 1717.980206298828,
"kl_loss_7": 507.3527557373047,
"learning_rate": 0.0009998766476047546,
"loss": 1188.5746,
"step": 170
},
{
"ce_loss_10": 3.7377680063247682,
"ce_loss_13": 3.600107181072235,
"ce_loss_2": 4.7649291276931764,
"ce_loss_3": 4.480887150764465,
"ce_loss_7": 3.8928799867630004,
"epoch": 0.018,
"grad_norm": 1096.0,
"kl_loss_10": 231.40118408203125,
"kl_loss_2": 2247.180828857422,
"kl_loss_3": 1668.1500427246094,
"kl_loss_7": 502.5091278076172,
"learning_rate": 0.0009998388886954545,
"loss": 1181.5367,
"step": 180
},
{
"ce_loss_10": 3.693929398059845,
"ce_loss_13": 3.5655275106430055,
"ce_loss_2": 4.720048952102661,
"ce_loss_3": 4.425967907905578,
"ce_loss_7": 3.8348891854286196,
"epoch": 0.019,
"grad_norm": 1032.0,
"kl_loss_10": 214.6924057006836,
"kl_loss_2": 2225.93642578125,
"kl_loss_3": 1640.7731079101563,
"kl_loss_7": 467.8557983398438,
"learning_rate": 0.0009997960964140947,
"loss": 1132.1148,
"step": 190
},
{
"ce_loss_10": 3.682847249507904,
"ce_loss_13": 3.5613570332527162,
"ce_loss_2": 4.7296292066574095,
"ce_loss_3": 4.422930908203125,
"ce_loss_7": 3.8146942615509034,
"epoch": 0.02,
"grad_norm": 1360.0,
"kl_loss_10": 204.76239395141602,
"kl_loss_2": 2234.7769409179687,
"kl_loss_3": 1619.6412719726563,
"kl_loss_7": 443.86146240234376,
"learning_rate": 0.0009997482711915926,
"loss": 1118.6208,
"step": 200
},
{
"ce_loss_10": 3.6386141061782835,
"ce_loss_13": 3.523626208305359,
"ce_loss_2": 4.654034543037414,
"ce_loss_3": 4.351287698745727,
"ce_loss_7": 3.7648990035057066,
"epoch": 0.021,
"grad_norm": 844.0,
"kl_loss_10": 191.48220748901366,
"kl_loss_2": 2176.9761169433596,
"kl_loss_3": 1592.8676696777343,
"kl_loss_7": 425.4680374145508,
"learning_rate": 0.0009996954135095479,
"loss": 1087.035,
"step": 210
},
{
"ce_loss_10": 3.726301395893097,
"ce_loss_13": 3.613930583000183,
"ce_loss_2": 4.689849066734314,
"ce_loss_3": 4.408529257774353,
"ce_loss_7": 3.8633771181106566,
"epoch": 0.022,
"grad_norm": 968.0,
"kl_loss_10": 185.19210891723634,
"kl_loss_2": 2058.4550598144533,
"kl_loss_3": 1519.1773864746094,
"kl_loss_7": 434.87823486328125,
"learning_rate": 0.0009996375239002368,
"loss": 1051.0784,
"step": 220
},
{
"ce_loss_10": 3.7977551460266112,
"ce_loss_13": 3.6818463683128355,
"ce_loss_2": 4.714341163635254,
"ce_loss_3": 4.444690012931824,
"ce_loss_7": 3.9216720938682554,
"epoch": 0.023,
"grad_norm": 792.0,
"kl_loss_10": 197.30275802612306,
"kl_loss_2": 1985.7976623535155,
"kl_loss_3": 1460.261212158203,
"kl_loss_7": 417.5539093017578,
"learning_rate": 0.0009995746029466072,
"loss": 1021.8153,
"step": 230
},
{
"ce_loss_10": 3.5866660118103026,
"ce_loss_13": 3.465270149707794,
"ce_loss_2": 4.572040939331055,
"ce_loss_3": 4.279095077514649,
"ce_loss_7": 3.7078741550445558,
"epoch": 0.024,
"grad_norm": 908.0,
"kl_loss_10": 207.95580520629883,
"kl_loss_2": 2143.852404785156,
"kl_loss_3": 1571.8129760742188,
"kl_loss_7": 426.2864517211914,
"learning_rate": 0.0009995066512822719,
"loss": 1050.4631,
"step": 240
},
{
"ce_loss_10": 3.686765193939209,
"ce_loss_13": 3.5706915736198424,
"ce_loss_2": 4.679925036430359,
"ce_loss_3": 4.384693300724029,
"ce_loss_7": 3.8067264676094057,
"epoch": 0.025,
"grad_norm": 1032.0,
"kl_loss_10": 199.02285385131836,
"kl_loss_2": 2131.8410888671874,
"kl_loss_3": 1544.0210571289062,
"kl_loss_7": 413.1264175415039,
"learning_rate": 0.000999433669591504,
"loss": 1033.4686,
"step": 250
},
{
"ce_loss_10": 3.581657183170319,
"ce_loss_13": 3.472314774990082,
"ce_loss_2": 4.541779208183288,
"ce_loss_3": 4.317579293251038,
"ce_loss_7": 3.7064756393432616,
"epoch": 0.026,
"grad_norm": 932.0,
"kl_loss_10": 189.23829040527343,
"kl_loss_2": 2088.3945861816405,
"kl_loss_3": 1637.8548217773437,
"kl_loss_7": 412.1736145019531,
"learning_rate": 0.000999355658609228,
"loss": 1057.2906,
"step": 260
},
{
"ce_loss_10": 3.6219969391822815,
"ce_loss_13": 3.5048365235328673,
"ce_loss_2": 4.596334934234619,
"ce_loss_3": 4.441597318649292,
"ce_loss_7": 3.7431845664978027,
"epoch": 0.027,
"grad_norm": 900.0,
"kl_loss_10": 188.88880004882813,
"kl_loss_2": 2095.5673889160157,
"kl_loss_3": 1750.0010925292968,
"kl_loss_7": 405.53798522949216,
"learning_rate": 0.0009992726191210138,
"loss": 1093.5438,
"step": 270
},
{
"ce_loss_10": 3.652155375480652,
"ce_loss_13": 3.5428276896476745,
"ce_loss_2": 4.579152154922485,
"ce_loss_3": 4.383497536182404,
"ce_loss_7": 3.789737546443939,
"epoch": 0.028,
"grad_norm": 780.0,
"kl_loss_10": 187.1881446838379,
"kl_loss_2": 2006.3188415527343,
"kl_loss_3": 1629.3374816894532,
"kl_loss_7": 423.36602783203125,
"learning_rate": 0.0009991845519630679,
"loss": 1050.2449,
"step": 280
},
{
"ce_loss_10": 3.535972011089325,
"ce_loss_13": 3.427106332778931,
"ce_loss_2": 4.4744978785514835,
"ce_loss_3": 4.262858963012695,
"ce_loss_7": 3.6743877053260805,
"epoch": 0.029,
"grad_norm": 684.0,
"kl_loss_10": 179.29404220581054,
"kl_loss_2": 2016.5242065429688,
"kl_loss_3": 1582.6825744628907,
"kl_loss_7": 441.13164978027345,
"learning_rate": 0.0009990914580222257,
"loss": 1053.0684,
"step": 290
},
{
"ce_loss_10": 3.668388879299164,
"ce_loss_13": 3.567758929729462,
"ce_loss_2": 4.53744785785675,
"ce_loss_3": 4.309589576721192,
"ce_loss_7": 3.8119096040725706,
"epoch": 0.03,
"grad_norm": 1224.0,
"kl_loss_10": 187.29344177246094,
"kl_loss_2": 1896.2646728515624,
"kl_loss_3": 1456.2583312988281,
"kl_loss_7": 421.26478881835936,
"learning_rate": 0.0009989933382359422,
"loss": 1015.491,
"step": 300
},
{
"ce_loss_10": 3.6792996883392335,
"ce_loss_13": 3.5740679264068604,
"ce_loss_2": 4.557365846633911,
"ce_loss_3": 4.323324573040009,
"ce_loss_7": 3.7865865588188172,
"epoch": 0.031,
"grad_norm": 828.0,
"kl_loss_10": 187.5126724243164,
"kl_loss_2": 1923.0036499023438,
"kl_loss_3": 1442.6469970703124,
"kl_loss_7": 384.39212493896486,
"learning_rate": 0.0009988901935922825,
"loss": 997.117,
"step": 310
},
{
"ce_loss_10": 3.5293397903442383,
"ce_loss_13": 3.4199066400527953,
"ce_loss_2": 4.486744737625122,
"ce_loss_3": 4.227087867259979,
"ce_loss_7": 3.6469008684158326,
"epoch": 0.032,
"grad_norm": 976.0,
"kl_loss_10": 183.28864593505858,
"kl_loss_2": 2055.861083984375,
"kl_loss_3": 1531.5701293945312,
"kl_loss_7": 385.79076690673827,
"learning_rate": 0.0009987820251299122,
"loss": 1008.4045,
"step": 320
},
{
"ce_loss_10": 3.66086140871048,
"ce_loss_13": 3.556379699707031,
"ce_loss_2": 4.536041283607483,
"ce_loss_3": 4.270140862464904,
"ce_loss_7": 3.770153260231018,
"epoch": 0.033,
"grad_norm": 1144.0,
"kl_loss_10": 168.59372940063477,
"kl_loss_2": 1906.9965759277343,
"kl_loss_3": 1385.0498474121093,
"kl_loss_7": 372.18479614257814,
"learning_rate": 0.0009986688339380862,
"loss": 957.1518,
"step": 330
},
{
"ce_loss_10": 3.6052905559539794,
"ce_loss_13": 3.504980742931366,
"ce_loss_2": 4.501095390319824,
"ce_loss_3": 4.218203604221344,
"ce_loss_7": 3.727604556083679,
"epoch": 0.034,
"grad_norm": 1104.0,
"kl_loss_10": 164.38146286010743,
"kl_loss_2": 1931.4434020996093,
"kl_loss_3": 1379.435321044922,
"kl_loss_7": 389.75638275146486,
"learning_rate": 0.0009985506211566387,
"loss": 969.0948,
"step": 340
},
{
"ce_loss_10": 3.6377886295318604,
"ce_loss_13": 3.541017484664917,
"ce_loss_2": 4.482600402832031,
"ce_loss_3": 4.22925614118576,
"ce_loss_7": 3.7690312385559084,
"epoch": 0.035,
"grad_norm": 988.0,
"kl_loss_10": 158.43596343994142,
"kl_loss_2": 1829.8166870117188,
"kl_loss_3": 1337.818865966797,
"kl_loss_7": 388.05591278076173,
"learning_rate": 0.0009984273879759713,
"loss": 933.1328,
"step": 350
},
{
"ce_loss_10": 3.667439329624176,
"ce_loss_13": 3.5666789412498474,
"ce_loss_2": 4.5066794633865355,
"ce_loss_3": 4.2926198720932005,
"ce_loss_7": 3.7826303958892824,
"epoch": 0.036,
"grad_norm": 600.0,
"kl_loss_10": 162.84700927734374,
"kl_loss_2": 1826.3444274902345,
"kl_loss_3": 1395.6122314453125,
"kl_loss_7": 384.24933471679685,
"learning_rate": 0.0009982991356370402,
"loss": 957.8976,
"step": 360
},
{
"ce_loss_10": 3.643305718898773,
"ce_loss_13": 3.545375657081604,
"ce_loss_2": 4.487171721458435,
"ce_loss_3": 4.280910170078277,
"ce_loss_7": 3.767821896076202,
"epoch": 0.037,
"grad_norm": 596.0,
"kl_loss_10": 164.2067985534668,
"kl_loss_2": 1829.6034606933595,
"kl_loss_3": 1399.7697387695312,
"kl_loss_7": 389.38902282714844,
"learning_rate": 0.0009981658654313456,
"loss": 945.4266,
"step": 370
},
{
"ce_loss_10": 3.728627920150757,
"ce_loss_13": 3.628399407863617,
"ce_loss_2": 4.530555677413941,
"ce_loss_3": 4.30595852136612,
"ce_loss_7": 3.83515260219574,
"epoch": 0.038,
"grad_norm": 572.0,
"kl_loss_10": 166.8636932373047,
"kl_loss_2": 1769.3647521972657,
"kl_loss_3": 1309.9726135253907,
"kl_loss_7": 360.24556121826174,
"learning_rate": 0.000998027578700917,
"loss": 918.2047,
"step": 380
},
{
"ce_loss_10": 3.6558377385139464,
"ce_loss_13": 3.5584804892539976,
"ce_loss_2": 4.499538516998291,
"ce_loss_3": 4.255290400981903,
"ce_loss_7": 3.7707452058792112,
"epoch": 0.039,
"grad_norm": 684.0,
"kl_loss_10": 164.3509963989258,
"kl_loss_2": 1842.500555419922,
"kl_loss_3": 1353.7652526855468,
"kl_loss_7": 364.5350601196289,
"learning_rate": 0.0009978842768382998,
"loss": 935.4773,
"step": 390
},
{
"ce_loss_10": 3.6760897040367126,
"ce_loss_13": 3.5800448179244997,
"ce_loss_2": 4.493572664260864,
"ce_loss_3": 4.2423638820648195,
"ce_loss_7": 3.786301875114441,
"epoch": 0.04,
"grad_norm": 968.0,
"kl_loss_10": 161.01671752929687,
"kl_loss_2": 1790.2952514648437,
"kl_loss_3": 1298.0454895019532,
"kl_loss_7": 363.45732421875,
"learning_rate": 0.0009977359612865424,
"loss": 914.3111,
"step": 400
},
{
"ce_loss_10": 3.684686779975891,
"ce_loss_13": 3.586086595058441,
"ce_loss_2": 4.512642502784729,
"ce_loss_3": 4.255100309848785,
"ce_loss_7": 3.805712080001831,
"epoch": 0.041,
"grad_norm": 724.0,
"kl_loss_10": 161.0974250793457,
"kl_loss_2": 1807.8360168457032,
"kl_loss_3": 1310.046209716797,
"kl_loss_7": 391.8801742553711,
"learning_rate": 0.0009975826335391806,
"loss": 914.0043,
"step": 410
},
{
"ce_loss_10": 3.707440197467804,
"ce_loss_13": 3.604601538181305,
"ce_loss_2": 4.522381353378296,
"ce_loss_3": 4.265636503696442,
"ce_loss_7": 3.822117471694946,
"epoch": 0.042,
"grad_norm": 900.0,
"kl_loss_10": 166.57249908447267,
"kl_loss_2": 1773.633642578125,
"kl_loss_3": 1273.706396484375,
"kl_loss_7": 380.59193420410156,
"learning_rate": 0.0009974242951402235,
"loss": 906.3268,
"step": 420
},
{
"ce_loss_10": 3.7127435922622682,
"ce_loss_13": 3.6068360447883605,
"ce_loss_2": 4.534731841087341,
"ce_loss_3": 4.272397923469543,
"ce_loss_7": 3.8254016041755676,
"epoch": 0.043,
"grad_norm": 544.0,
"kl_loss_10": 171.76721878051757,
"kl_loss_2": 1813.8242553710938,
"kl_loss_3": 1297.1632202148437,
"kl_loss_7": 380.753857421875,
"learning_rate": 0.0009972609476841367,
"loss": 907.3121,
"step": 430
},
{
"ce_loss_10": 3.638201355934143,
"ce_loss_13": 3.521967649459839,
"ce_loss_2": 4.476631236076355,
"ce_loss_3": 4.207662534713745,
"ce_loss_7": 3.743925619125366,
"epoch": 0.044,
"grad_norm": 656.0,
"kl_loss_10": 205.51385726928712,
"kl_loss_2": 1862.7595336914062,
"kl_loss_3": 1318.6298767089843,
"kl_loss_7": 397.6446823120117,
"learning_rate": 0.0009970925928158272,
"loss": 947.2434,
"step": 440
},
{
"ce_loss_10": 3.5770766854286196,
"ce_loss_13": 3.463445019721985,
"ce_loss_2": 4.41340719461441,
"ce_loss_3": 4.154228365421295,
"ce_loss_7": 3.683333933353424,
"epoch": 0.045,
"grad_norm": 544.0,
"kl_loss_10": 187.16454544067383,
"kl_loss_2": 1860.5320922851563,
"kl_loss_3": 1349.3171264648438,
"kl_loss_7": 389.9994171142578,
"learning_rate": 0.000996919232230627,
"loss": 931.1581,
"step": 450
},
{
"ce_loss_10": 3.6615111470222472,
"ce_loss_13": 3.5475740671157836,
"ce_loss_2": 4.4471900224685665,
"ce_loss_3": 4.206307077407837,
"ce_loss_7": 3.769719123840332,
"epoch": 0.046,
"grad_norm": 792.0,
"kl_loss_10": 189.3802345275879,
"kl_loss_2": 1767.1443359375,
"kl_loss_3": 1299.2234252929688,
"kl_loss_7": 404.1760650634766,
"learning_rate": 0.0009967408676742752,
"loss": 896.3932,
"step": 460
},
{
"ce_loss_10": 3.815341627597809,
"ce_loss_13": 3.6976951956748962,
"ce_loss_2": 4.575603008270264,
"ce_loss_3": 4.349165272712708,
"ce_loss_7": 3.926807904243469,
"epoch": 0.047,
"grad_norm": 1020.0,
"kl_loss_10": 193.17176513671876,
"kl_loss_2": 1722.3591735839843,
"kl_loss_3": 1269.6951721191406,
"kl_loss_7": 399.5799560546875,
"learning_rate": 0.0009965575009429006,
"loss": 911.5342,
"step": 470
},
{
"ce_loss_10": 3.5749866485595705,
"ce_loss_13": 3.471819591522217,
"ce_loss_2": 4.3897274255752565,
"ce_loss_3": 4.163278090953827,
"ce_loss_7": 3.6932021975517273,
"epoch": 0.048,
"grad_norm": 832.0,
"kl_loss_10": 173.3515739440918,
"kl_loss_2": 1803.337139892578,
"kl_loss_3": 1356.6680847167968,
"kl_loss_7": 384.8886749267578,
"learning_rate": 0.0009963691338830043,
"loss": 913.6404,
"step": 480
},
{
"ce_loss_10": 3.6706506490707396,
"ce_loss_13": 3.5724706411361695,
"ce_loss_2": 4.442422878742218,
"ce_loss_3": 4.223198866844177,
"ce_loss_7": 3.7754740715026855,
"epoch": 0.049,
"grad_norm": 664.0,
"kl_loss_10": 163.7422233581543,
"kl_loss_2": 1726.0343017578125,
"kl_loss_3": 1283.6178405761718,
"kl_loss_7": 355.8869354248047,
"learning_rate": 0.0009961757683914405,
"loss": 866.413,
"step": 490
},
{
"ce_loss_10": 3.657481300830841,
"ce_loss_13": 3.561222219467163,
"ce_loss_2": 4.412674343585968,
"ce_loss_3": 4.190000641345978,
"ce_loss_7": 3.7463939428329467,
"epoch": 0.05,
"grad_norm": 552.0,
"kl_loss_10": 171.74871139526368,
"kl_loss_2": 1693.8871337890625,
"kl_loss_3": 1238.5546569824219,
"kl_loss_7": 333.96338348388673,
"learning_rate": 0.0009959774064153978,
"loss": 867.9215,
"step": 500
},
{
"ce_loss_10": 3.6671042442321777,
"ce_loss_13": 3.5669935941696167,
"ce_loss_2": 4.402782237529754,
"ce_loss_3": 4.179040241241455,
"ce_loss_7": 3.7529300928115843,
"epoch": 0.051,
"grad_norm": 548.0,
"kl_loss_10": 165.0301971435547,
"kl_loss_2": 1649.0840270996093,
"kl_loss_3": 1201.8943420410155,
"kl_loss_7": 327.53272857666013,
"learning_rate": 0.0009957740499523787,
"loss": 850.5875,
"step": 510
},
{
"ce_loss_10": 3.692741870880127,
"ce_loss_13": 3.5905726313591004,
"ce_loss_2": 4.450686037540436,
"ce_loss_3": 4.220798969268799,
"ce_loss_7": 3.785287392139435,
"epoch": 0.052,
"grad_norm": 560.0,
"kl_loss_10": 160.49609146118163,
"kl_loss_2": 1681.6054443359376,
"kl_loss_3": 1234.1128479003905,
"kl_loss_7": 330.61391296386716,
"learning_rate": 0.0009955657010501807,
"loss": 859.9023,
"step": 520
},
{
"ce_loss_10": 3.654950940608978,
"ce_loss_13": 3.554408383369446,
"ce_loss_2": 4.4271773338317875,
"ce_loss_3": 4.2007159948348995,
"ce_loss_7": 3.7458335757255554,
"epoch": 0.053,
"grad_norm": 560.0,
"kl_loss_10": 160.82289505004883,
"kl_loss_2": 1731.140557861328,
"kl_loss_3": 1270.9948364257812,
"kl_loss_7": 331.9795379638672,
"learning_rate": 0.000995352361806875,
"loss": 862.8967,
"step": 530
},
{
"ce_loss_10": 3.6911896467208862,
"ce_loss_13": 3.5907997369766234,
"ce_loss_2": 4.458630633354187,
"ce_loss_3": 4.2227191686630245,
"ce_loss_7": 3.7843895673751833,
"epoch": 0.054,
"grad_norm": 552.0,
"kl_loss_10": 166.47220001220703,
"kl_loss_2": 1722.0740966796875,
"kl_loss_3": 1249.600811767578,
"kl_loss_7": 335.7519927978516,
"learning_rate": 0.0009951340343707852,
"loss": 876.934,
"step": 540
},
{
"ce_loss_10": 3.7539408445358275,
"ce_loss_13": 3.6503811120986938,
"ce_loss_2": 4.52553424835205,
"ce_loss_3": 4.282987451553344,
"ce_loss_7": 3.839770758152008,
"epoch": 0.055,
"grad_norm": 512.0,
"kl_loss_10": 162.00789489746094,
"kl_loss_2": 1707.214862060547,
"kl_loss_3": 1233.2592041015625,
"kl_loss_7": 323.39465484619143,
"learning_rate": 0.0009949107209404665,
"loss": 863.0879,
"step": 550
},
{
"ce_loss_10": 3.6489940643310548,
"ce_loss_13": 3.5539053201675417,
"ce_loss_2": 4.41527898311615,
"ce_loss_3": 4.180786430835724,
"ce_loss_7": 3.750082802772522,
"epoch": 0.056,
"grad_norm": 540.0,
"kl_loss_10": 157.5300537109375,
"kl_loss_2": 1703.8317993164062,
"kl_loss_3": 1234.3482482910156,
"kl_loss_7": 346.96667022705077,
"learning_rate": 0.0009946824237646824,
"loss": 859.4348,
"step": 560
},
{
"ce_loss_10": 3.5962815046310426,
"ce_loss_13": 3.501141941547394,
"ce_loss_2": 4.377828812599182,
"ce_loss_3": 4.148308992385864,
"ce_loss_7": 3.7191163897514343,
"epoch": 0.057,
"grad_norm": 764.0,
"kl_loss_10": 153.23575592041016,
"kl_loss_2": 1739.7751159667969,
"kl_loss_3": 1272.3390563964845,
"kl_loss_7": 396.63781890869143,
"learning_rate": 0.0009944491451423828,
"loss": 901.9479,
"step": 570
},
{
"ce_loss_10": 3.594892370700836,
"ce_loss_13": 3.500366282463074,
"ce_loss_2": 4.390602493286133,
"ce_loss_3": 4.148435056209564,
"ce_loss_7": 3.710537350177765,
"epoch": 0.058,
"grad_norm": 804.0,
"kl_loss_10": 153.60133438110353,
"kl_loss_2": 1753.4079895019531,
"kl_loss_3": 1272.2657775878906,
"kl_loss_7": 368.931379699707,
"learning_rate": 0.0009942108874226813,
"loss": 870.9764,
"step": 580
},
{
"ce_loss_10": 3.7256513595581056,
"ce_loss_13": 3.6301231741905213,
"ce_loss_2": 4.468952918052674,
"ce_loss_3": 4.23498455286026,
"ce_loss_7": 3.8281203866004945,
"epoch": 0.059,
"grad_norm": 494.0,
"kl_loss_10": 155.00701828002929,
"kl_loss_2": 1650.434881591797,
"kl_loss_3": 1188.881414794922,
"kl_loss_7": 349.0711242675781,
"learning_rate": 0.00099396765300483,
"loss": 829.2725,
"step": 590
},
{
"ce_loss_10": 3.688658046722412,
"ce_loss_13": 3.600668156147003,
"ce_loss_2": 4.441829895973205,
"ce_loss_3": 4.204685604572296,
"ce_loss_7": 3.7927687644958494,
"epoch": 0.06,
"grad_norm": 700.0,
"kl_loss_10": 147.73722648620605,
"kl_loss_2": 1665.102276611328,
"kl_loss_3": 1201.0594848632813,
"kl_loss_7": 336.5929977416992,
"learning_rate": 0.0009937194443381972,
"loss": 836.3632,
"step": 600
},
{
"ce_loss_10": 3.7074933648109436,
"ce_loss_13": 3.6225223660469057,
"ce_loss_2": 4.444479322433471,
"ce_loss_3": 4.212475669384003,
"ce_loss_7": 3.806171452999115,
"epoch": 0.061,
"grad_norm": 490.0,
"kl_loss_10": 145.92314338684082,
"kl_loss_2": 1647.2934875488281,
"kl_loss_3": 1192.1070617675782,
"kl_loss_7": 330.35746612548826,
"learning_rate": 0.0009934662639222412,
"loss": 841.5062,
"step": 610
},
{
"ce_loss_10": 3.6668009042739866,
"ce_loss_13": 3.5791383743286134,
"ce_loss_2": 4.436111927032471,
"ce_loss_3": 4.192030191421509,
"ce_loss_7": 3.7709102272987365,
"epoch": 0.062,
"grad_norm": 548.0,
"kl_loss_10": 142.56752128601073,
"kl_loss_2": 1707.9285888671875,
"kl_loss_3": 1224.2419647216798,
"kl_loss_7": 333.47724609375,
"learning_rate": 0.000993208114306486,
"loss": 843.8041,
"step": 620
},
{
"ce_loss_10": 3.5789570450782775,
"ce_loss_13": 3.4927441477775574,
"ce_loss_2": 4.355255722999573,
"ce_loss_3": 4.113215839862823,
"ce_loss_7": 3.6797728538513184,
"epoch": 0.063,
"grad_norm": 684.0,
"kl_loss_10": 142.47375717163087,
"kl_loss_2": 1703.1561950683595,
"kl_loss_3": 1224.8568115234375,
"kl_loss_7": 327.08728790283203,
"learning_rate": 0.0009929449980904952,
"loss": 827.3757,
"step": 630
},
{
"ce_loss_10": 3.6368979692459105,
"ce_loss_13": 3.552669334411621,
"ce_loss_2": 4.39526858329773,
"ce_loss_3": 4.161888694763183,
"ce_loss_7": 3.7305431842803953,
"epoch": 0.064,
"grad_norm": 604.0,
"kl_loss_10": 145.31115531921387,
"kl_loss_2": 1675.4742797851563,
"kl_loss_3": 1206.004461669922,
"kl_loss_7": 311.66287689208986,
"learning_rate": 0.0009926769179238466,
"loss": 830.4232,
"step": 640
},
{
"ce_loss_10": 3.708518397808075,
"ce_loss_13": 3.6032424330711366,
"ce_loss_2": 4.449502897262573,
"ce_loss_3": 4.213514125347137,
"ce_loss_7": 3.7848907709121704,
"epoch": 0.065,
"grad_norm": 572.0,
"kl_loss_10": 183.44921951293946,
"kl_loss_2": 1690.6953979492187,
"kl_loss_3": 1209.2367431640625,
"kl_loss_7": 320.96158905029296,
"learning_rate": 0.000992403876506104,
"loss": 845.6277,
"step": 650
},
{
"ce_loss_10": 3.6422240853309633,
"ce_loss_13": 3.5376295328140257,
"ce_loss_2": 4.388966178894043,
"ce_loss_3": 4.148260116577148,
"ce_loss_7": 3.721568763256073,
"epoch": 0.066,
"grad_norm": 516.0,
"kl_loss_10": 166.38197479248046,
"kl_loss_2": 1675.3920837402343,
"kl_loss_3": 1201.2981964111327,
"kl_loss_7": 311.2148132324219,
"learning_rate": 0.0009921258765867918,
"loss": 834.6085,
"step": 660
},
{
"ce_loss_10": 3.593488574028015,
"ce_loss_13": 3.5049474120140074,
"ce_loss_2": 4.357883477210999,
"ce_loss_3": 4.115947949886322,
"ce_loss_7": 3.6764505982398985,
"epoch": 0.067,
"grad_norm": 600.0,
"kl_loss_10": 148.19011993408202,
"kl_loss_2": 1717.049383544922,
"kl_loss_3": 1223.6348205566405,
"kl_loss_7": 306.2616958618164,
"learning_rate": 0.0009918429209653662,
"loss": 833.8985,
"step": 670
},
{
"ce_loss_10": 3.648161160945892,
"ce_loss_13": 3.559934389591217,
"ce_loss_2": 4.409848690032959,
"ce_loss_3": 4.171260499954224,
"ce_loss_7": 3.7374308466911317,
"epoch": 0.068,
"grad_norm": 596.0,
"kl_loss_10": 147.48591995239258,
"kl_loss_2": 1679.9023315429688,
"kl_loss_3": 1208.9112182617187,
"kl_loss_7": 313.83782348632815,
"learning_rate": 0.0009915550124911866,
"loss": 822.998,
"step": 680
},
{
"ce_loss_10": 3.6632981300354004,
"ce_loss_13": 3.573338711261749,
"ce_loss_2": 4.395955181121826,
"ce_loss_3": 4.164679610729218,
"ce_loss_7": 3.7496419668197634,
"epoch": 0.069,
"grad_norm": 636.0,
"kl_loss_10": 148.64911651611328,
"kl_loss_2": 1629.2107971191406,
"kl_loss_3": 1186.3095611572267,
"kl_loss_7": 309.2399566650391,
"learning_rate": 0.0009912621540634887,
"loss": 816.0117,
"step": 690
},
{
"ce_loss_10": 3.6952749490737915,
"ce_loss_13": 3.608550024032593,
"ce_loss_2": 4.3951560974121096,
"ce_loss_3": 4.167996168136597,
"ce_loss_7": 3.778964614868164,
"epoch": 0.07,
"grad_norm": 524.0,
"kl_loss_10": 140.3430618286133,
"kl_loss_2": 1575.8819396972656,
"kl_loss_3": 1123.8694213867188,
"kl_loss_7": 294.651708984375,
"learning_rate": 0.0009909643486313534,
"loss": 794.9152,
"step": 700
},
{
"ce_loss_10": 3.5606731176376343,
"ce_loss_13": 3.4771942019462587,
"ce_loss_2": 4.3175184488296505,
"ce_loss_3": 4.074072551727295,
"ce_loss_7": 3.650731146335602,
"epoch": 0.071,
"grad_norm": 600.0,
"kl_loss_10": 135.5239990234375,
"kl_loss_2": 1676.284442138672,
"kl_loss_3": 1193.2137634277344,
"kl_loss_7": 307.0430740356445,
"learning_rate": 0.000990661599193678,
"loss": 839.2205,
"step": 710
},
{
"ce_loss_10": 3.7052354335784914,
"ce_loss_13": 3.6190937519073487,
"ce_loss_2": 4.42795637845993,
"ce_loss_3": 4.203339767456055,
"ce_loss_7": 3.7865342020988466,
"epoch": 0.072,
"grad_norm": 708.0,
"kl_loss_10": 139.11955757141112,
"kl_loss_2": 1630.3473266601563,
"kl_loss_3": 1169.5729766845702,
"kl_loss_7": 299.42345809936523,
"learning_rate": 0.0009903539087991462,
"loss": 803.8498,
"step": 720
},
{
"ce_loss_10": 3.6689595699310305,
"ce_loss_13": 3.586829674243927,
"ce_loss_2": 4.399398994445801,
"ce_loss_3": 4.174283814430237,
"ce_loss_7": 3.7554702758789062,
"epoch": 0.073,
"grad_norm": 860.0,
"kl_loss_10": 133.20760345458984,
"kl_loss_2": 1626.384521484375,
"kl_loss_3": 1158.8964233398438,
"kl_loss_7": 296.6233856201172,
"learning_rate": 0.0009900412805461966,
"loss": 810.3949,
"step": 730
},
{
"ce_loss_10": 3.7475465893745423,
"ce_loss_13": 3.6637478709220885,
"ce_loss_2": 4.477526593208313,
"ce_loss_3": 4.232499527931213,
"ce_loss_7": 3.834572732448578,
"epoch": 0.074,
"grad_norm": 756.0,
"kl_loss_10": 136.13002281188966,
"kl_loss_2": 1615.2621215820313,
"kl_loss_3": 1135.3047760009765,
"kl_loss_7": 302.71751708984374,
"learning_rate": 0.0009897237175829927,
"loss": 812.032,
"step": 740
},
{
"ce_loss_10": 3.633454430103302,
"ce_loss_13": 3.546045184135437,
"ce_loss_2": 4.386739385128021,
"ce_loss_3": 4.157361710071564,
"ce_loss_7": 3.7257861375808714,
"epoch": 0.075,
"grad_norm": 624.0,
"kl_loss_10": 138.0735656738281,
"kl_loss_2": 1664.8802978515625,
"kl_loss_3": 1209.155780029297,
"kl_loss_7": 314.29449157714845,
"learning_rate": 0.0009894012231073895,
"loss": 820.1248,
"step": 750
},
{
"ce_loss_10": 3.675256085395813,
"ce_loss_13": 3.591258680820465,
"ce_loss_2": 4.3781631827354435,
"ce_loss_3": 4.169950652122497,
"ce_loss_7": 3.7596523761749268,
"epoch": 0.076,
"grad_norm": 596.0,
"kl_loss_10": 137.33892288208008,
"kl_loss_2": 1570.5589111328125,
"kl_loss_3": 1161.7032104492187,
"kl_loss_7": 298.7381622314453,
"learning_rate": 0.0009890738003669028,
"loss": 801.2431,
"step": 760
},
{
"ce_loss_10": 3.64959534406662,
"ce_loss_13": 3.5664158701896667,
"ce_loss_2": 4.371342432498932,
"ce_loss_3": 4.150311291217804,
"ce_loss_7": 3.7354934453964233,
"epoch": 0.077,
"grad_norm": 540.0,
"kl_loss_10": 136.36218070983887,
"kl_loss_2": 1622.240057373047,
"kl_loss_3": 1172.391793823242,
"kl_loss_7": 304.76952667236327,
"learning_rate": 0.0009887414526586764,
"loss": 787.9819,
"step": 770
},
{
"ce_loss_10": 3.708216655254364,
"ce_loss_13": 3.625141477584839,
"ce_loss_2": 4.414664888381958,
"ce_loss_3": 4.183781635761261,
"ce_loss_7": 3.8081562399864195,
"epoch": 0.078,
"grad_norm": 596.0,
"kl_loss_10": 133.56560096740722,
"kl_loss_2": 1562.47041015625,
"kl_loss_3": 1106.445620727539,
"kl_loss_7": 312.0776168823242,
"learning_rate": 0.0009884041833294476,
"loss": 768.2491,
"step": 780
},
{
"ce_loss_10": 3.706817853450775,
"ce_loss_13": 3.622973358631134,
"ce_loss_2": 4.41116281747818,
"ce_loss_3": 4.179266679286957,
"ce_loss_7": 3.8186426639556883,
"epoch": 0.079,
"grad_norm": 632.0,
"kl_loss_10": 132.2478443145752,
"kl_loss_2": 1599.446923828125,
"kl_loss_3": 1117.8709930419923,
"kl_loss_7": 368.3747268676758,
"learning_rate": 0.000988061995775515,
"loss": 815.0693,
"step": 790
},
{
"ce_loss_10": 3.641828775405884,
"ce_loss_13": 3.5547205209732056,
"ce_loss_2": 4.335572981834412,
"ce_loss_3": 4.108006286621094,
"ce_loss_7": 3.7402275919914247,
"epoch": 0.08,
"grad_norm": 516.0,
"kl_loss_10": 141.807564163208,
"kl_loss_2": 1570.8703674316407,
"kl_loss_3": 1110.252996826172,
"kl_loss_7": 321.9771667480469,
"learning_rate": 0.0009877148934427035,
"loss": 786.1404,
"step": 800
},
{
"ce_loss_10": 3.681752073764801,
"ce_loss_13": 3.596014940738678,
"ce_loss_2": 4.380065774917602,
"ce_loss_3": 4.151778030395508,
"ce_loss_7": 3.7655294299125672,
"epoch": 0.081,
"grad_norm": 496.0,
"kl_loss_10": 145.9334274291992,
"kl_loss_2": 1572.8681091308595,
"kl_loss_3": 1116.675845336914,
"kl_loss_7": 297.05968246459963,
"learning_rate": 0.0009873628798263297,
"loss": 776.0455,
"step": 810
},
{
"ce_loss_10": 3.6424105167388916,
"ce_loss_13": 3.5447566747665404,
"ce_loss_2": 4.312346494197845,
"ce_loss_3": 4.088560962677002,
"ce_loss_7": 3.7104405045509337,
"epoch": 0.082,
"grad_norm": 478.0,
"kl_loss_10": 152.06344909667968,
"kl_loss_2": 1539.9718017578125,
"kl_loss_3": 1091.6052520751953,
"kl_loss_7": 286.7229400634766,
"learning_rate": 0.0009870059584711668,
"loss": 790.5065,
"step": 820
},
{
"ce_loss_10": 3.6575138568878174,
"ce_loss_13": 3.5694735765457155,
"ce_loss_2": 4.352469277381897,
"ce_loss_3": 4.124953854084015,
"ce_loss_7": 3.7358759164810182,
"epoch": 0.083,
"grad_norm": 516.0,
"kl_loss_10": 158.90749130249023,
"kl_loss_2": 1569.4235595703126,
"kl_loss_3": 1125.5340545654296,
"kl_loss_7": 290.7964630126953,
"learning_rate": 0.000986644132971409,
"loss": 786.8994,
"step": 830
},
{
"ce_loss_10": 3.6558743476867677,
"ce_loss_13": 3.5544149518013,
"ce_loss_2": 4.354631888866424,
"ce_loss_3": 4.1281127572059635,
"ce_loss_7": 3.727833020687103,
"epoch": 0.084,
"grad_norm": 576.0,
"kl_loss_10": 158.36446990966797,
"kl_loss_2": 1584.950128173828,
"kl_loss_3": 1138.0484100341796,
"kl_loss_7": 300.7915969848633,
"learning_rate": 0.0009862774069706345,
"loss": 786.4536,
"step": 840
},
{
"ce_loss_10": 3.7631431221961975,
"ce_loss_13": 3.6783902406692506,
"ce_loss_2": 4.423887753486634,
"ce_loss_3": 4.210748863220215,
"ce_loss_7": 3.8459392905235292,
"epoch": 0.085,
"grad_norm": 720.0,
"kl_loss_10": 144.1476722717285,
"kl_loss_2": 1526.45078125,
"kl_loss_3": 1098.919091796875,
"kl_loss_7": 305.10309143066405,
"learning_rate": 0.000985905784161771,
"loss": 773.6244,
"step": 850
},
{
"ce_loss_10": 3.693523097038269,
"ce_loss_13": 3.6117894887924193,
"ce_loss_2": 4.374859690666199,
"ce_loss_3": 4.145905554294586,
"ce_loss_7": 3.799845337867737,
"epoch": 0.086,
"grad_norm": 648.0,
"kl_loss_10": 141.55279006958008,
"kl_loss_2": 1548.1404724121094,
"kl_loss_3": 1092.3538146972655,
"kl_loss_7": 338.8992858886719,
"learning_rate": 0.000985529268287055,
"loss": 780.1624,
"step": 860
},
{
"ce_loss_10": 3.6179853677749634,
"ce_loss_13": 3.532400143146515,
"ce_loss_2": 4.3180185675621034,
"ce_loss_3": 4.092539095878601,
"ce_loss_7": 3.716568684577942,
"epoch": 0.087,
"grad_norm": 584.0,
"kl_loss_10": 138.25293006896973,
"kl_loss_2": 1583.606640625,
"kl_loss_3": 1113.2994171142577,
"kl_loss_7": 327.81214904785156,
"learning_rate": 0.0009851478631379982,
"loss": 787.4821,
"step": 870
},
{
"ce_loss_10": 3.6815198183059694,
"ce_loss_13": 3.5956546545028685,
"ce_loss_2": 4.367411196231842,
"ce_loss_3": 4.13222428560257,
"ce_loss_7": 3.7695237517356874,
"epoch": 0.088,
"grad_norm": 628.0,
"kl_loss_10": 140.43244590759278,
"kl_loss_2": 1545.1064147949219,
"kl_loss_3": 1094.267755126953,
"kl_loss_7": 312.3658508300781,
"learning_rate": 0.0009847615725553456,
"loss": 767.0908,
"step": 880
},
{
"ce_loss_10": 3.739601492881775,
"ce_loss_13": 3.657086157798767,
"ce_loss_2": 4.379729843139648,
"ce_loss_3": 4.177917766571045,
"ce_loss_7": 3.820488429069519,
"epoch": 0.089,
"grad_norm": 552.0,
"kl_loss_10": 134.12742614746094,
"kl_loss_2": 1464.5765686035156,
"kl_loss_3": 1051.556851196289,
"kl_loss_7": 283.62481689453125,
"learning_rate": 0.0009843704004290394,
"loss": 761.853,
"step": 890
},
{
"ce_loss_10": 3.6452771425247192,
"ce_loss_13": 3.5613077044487,
"ce_loss_2": 4.318511128425598,
"ce_loss_3": 4.107461535930634,
"ce_loss_7": 3.726675534248352,
"epoch": 0.09,
"grad_norm": 474.0,
"kl_loss_10": 136.06297454833984,
"kl_loss_2": 1542.6724487304687,
"kl_loss_3": 1117.772933959961,
"kl_loss_7": 292.2666213989258,
"learning_rate": 0.0009839743506981783,
"loss": 768.8108,
"step": 900
},
{
"ce_loss_10": 3.5574649572372437,
"ce_loss_13": 3.4748517513275146,
"ce_loss_2": 4.266572868824005,
"ce_loss_3": 4.057099211215973,
"ce_loss_7": 3.6422529578208924,
"epoch": 0.091,
"grad_norm": 516.0,
"kl_loss_10": 139.13952560424804,
"kl_loss_2": 1603.9869201660156,
"kl_loss_3": 1170.3635620117188,
"kl_loss_7": 298.2760665893555,
"learning_rate": 0.0009835734273509786,
"loss": 783.7168,
"step": 910
},
{
"ce_loss_10": 3.6700770974159242,
"ce_loss_13": 3.5813122153282166,
"ce_loss_2": 4.351845908164978,
"ce_loss_3": 4.139319920539856,
"ce_loss_7": 3.7498608589172364,
"epoch": 0.092,
"grad_norm": 516.0,
"kl_loss_10": 139.36617164611818,
"kl_loss_2": 1526.7721801757812,
"kl_loss_3": 1107.183511352539,
"kl_loss_7": 287.28514404296874,
"learning_rate": 0.0009831676344247342,
"loss": 768.4225,
"step": 920
},
{
"ce_loss_10": 3.684238874912262,
"ce_loss_13": 3.6015963315963746,
"ce_loss_2": 4.3427834749221805,
"ce_loss_3": 4.138106441497802,
"ce_loss_7": 3.75754714012146,
"epoch": 0.093,
"grad_norm": 490.0,
"kl_loss_10": 135.07495460510253,
"kl_loss_2": 1516.6379028320312,
"kl_loss_3": 1094.0326538085938,
"kl_loss_7": 277.64155731201174,
"learning_rate": 0.0009827569760057755,
"loss": 762.3584,
"step": 930
},
{
"ce_loss_10": 3.5946595072746277,
"ce_loss_13": 3.512081265449524,
"ce_loss_2": 4.322237813472748,
"ce_loss_3": 4.095906281471253,
"ce_loss_7": 3.6798322200775146,
"epoch": 0.094,
"grad_norm": 728.0,
"kl_loss_10": 138.28199310302733,
"kl_loss_2": 1619.1793823242188,
"kl_loss_3": 1165.3315551757812,
"kl_loss_7": 295.293204498291,
"learning_rate": 0.000982341456229428,
"loss": 780.917,
"step": 940
},
{
"ce_loss_10": 3.69069162607193,
"ce_loss_13": 3.6100045323371885,
"ce_loss_2": 4.376732325553894,
"ce_loss_3": 4.16404242515564,
"ce_loss_7": 3.7701812386512756,
"epoch": 0.095,
"grad_norm": 688.0,
"kl_loss_10": 131.1420455932617,
"kl_loss_2": 1575.732354736328,
"kl_loss_3": 1138.4372924804688,
"kl_loss_7": 285.67282180786134,
"learning_rate": 0.000981921079279971,
"loss": 765.979,
"step": 950
},
{
"ce_loss_10": 3.7074394822120667,
"ce_loss_13": 3.62913464307785,
"ce_loss_2": 4.366938805580139,
"ce_loss_3": 4.150120985507965,
"ce_loss_7": 3.7818633675575257,
"epoch": 0.096,
"grad_norm": 720.0,
"kl_loss_10": 130.51903839111327,
"kl_loss_2": 1507.3517028808594,
"kl_loss_3": 1076.092593383789,
"kl_loss_7": 272.2766448974609,
"learning_rate": 0.0009814958493905962,
"loss": 753.6946,
"step": 960
},
{
"ce_loss_10": 3.658416414260864,
"ce_loss_13": 3.576970672607422,
"ce_loss_2": 4.346470355987549,
"ce_loss_3": 4.128688275814056,
"ce_loss_7": 3.7415476202964784,
"epoch": 0.097,
"grad_norm": 512.0,
"kl_loss_10": 128.56299629211426,
"kl_loss_2": 1557.0646423339845,
"kl_loss_3": 1112.28828125,
"kl_loss_7": 279.6500648498535,
"learning_rate": 0.0009810657708433637,
"loss": 775.217,
"step": 970
},
{
"ce_loss_10": 3.7308164954185488,
"ce_loss_13": 3.6533005952835085,
"ce_loss_2": 4.3734122037887575,
"ce_loss_3": 4.170846402645111,
"ce_loss_7": 3.8050424695014953,
"epoch": 0.098,
"grad_norm": 716.0,
"kl_loss_10": 124.60902214050293,
"kl_loss_2": 1475.1663879394532,
"kl_loss_3": 1054.8542236328126,
"kl_loss_7": 269.9375114440918,
"learning_rate": 0.0009806308479691594,
"loss": 736.7519,
"step": 980
},
{
"ce_loss_10": 3.750465714931488,
"ce_loss_13": 3.668341946601868,
"ce_loss_2": 4.426263308525085,
"ce_loss_3": 4.20258377790451,
"ce_loss_7": 3.836391198635101,
"epoch": 0.099,
"grad_norm": 644.0,
"kl_loss_10": 131.81643409729003,
"kl_loss_2": 1535.673388671875,
"kl_loss_3": 1090.963656616211,
"kl_loss_7": 289.6497604370117,
"learning_rate": 0.0009801910851476522,
"loss": 754.2551,
"step": 990
},
{
"ce_loss_10": 3.653952169418335,
"ce_loss_13": 3.577095854282379,
"ce_loss_2": 4.349191665649414,
"ce_loss_3": 4.125085318088532,
"ce_loss_7": 3.7413162350654603,
"epoch": 0.1,
"grad_norm": 478.0,
"kl_loss_10": 128.62464637756347,
"kl_loss_2": 1573.8733642578125,
"kl_loss_3": 1114.0891967773437,
"kl_loss_7": 292.6377975463867,
"learning_rate": 0.0009797464868072487,
"loss": 758.6713,
"step": 1000
},
{
"ce_loss_10": 3.6456503033638,
"ce_loss_13": 3.5667870163917543,
"ce_loss_2": 4.3237790822982785,
"ce_loss_3": 4.11080631017685,
"ce_loss_7": 3.7275813579559327,
"epoch": 0.101,
"grad_norm": 432.0,
"kl_loss_10": 128.03596534729004,
"kl_loss_2": 1525.84384765625,
"kl_loss_3": 1094.1039764404297,
"kl_loss_7": 291.2514984130859,
"learning_rate": 0.0009792970574250492,
"loss": 758.2494,
"step": 1010
},
{
"ce_loss_10": 3.677238702774048,
"ce_loss_13": 3.597763466835022,
"ce_loss_2": 4.345702481269837,
"ce_loss_3": 4.1323373198509215,
"ce_loss_7": 3.757745099067688,
"epoch": 0.102,
"grad_norm": 480.0,
"kl_loss_10": 126.84439620971679,
"kl_loss_2": 1518.9351745605468,
"kl_loss_3": 1090.8279510498046,
"kl_loss_7": 289.6885223388672,
"learning_rate": 0.0009788428015268028,
"loss": 746.4768,
"step": 1020
},
{
"ce_loss_10": 3.670746088027954,
"ce_loss_13": 3.5901795506477354,
"ce_loss_2": 4.326242756843567,
"ce_loss_3": 4.110077440738678,
"ce_loss_7": 3.7697238445281984,
"epoch": 0.103,
"grad_norm": 520.0,
"kl_loss_10": 147.23381576538085,
"kl_loss_2": 1500.6592041015624,
"kl_loss_3": 1064.419287109375,
"kl_loss_7": 309.967301940918,
"learning_rate": 0.0009783837236868609,
"loss": 752.1227,
"step": 1030
},
{
"ce_loss_10": 3.665172076225281,
"ce_loss_13": 3.559502327442169,
"ce_loss_2": 4.309127068519592,
"ce_loss_3": 4.0925112009048465,
"ce_loss_7": 3.730722725391388,
"epoch": 0.104,
"grad_norm": 624.0,
"kl_loss_10": 168.995276260376,
"kl_loss_2": 1506.7355590820312,
"kl_loss_3": 1077.4224884033204,
"kl_loss_7": 306.44668807983396,
"learning_rate": 0.0009779198285281327,
"loss": 758.6978,
"step": 1040
},
{
"ce_loss_10": 3.6450916528701782,
"ce_loss_13": 3.5567120909690857,
"ce_loss_2": 4.307799768447876,
"ce_loss_3": 4.096536159515381,
"ce_loss_7": 3.7174035549163817,
"epoch": 0.105,
"grad_norm": 464.0,
"kl_loss_10": 145.4011459350586,
"kl_loss_2": 1511.9253051757812,
"kl_loss_3": 1079.955551147461,
"kl_loss_7": 290.32603912353517,
"learning_rate": 0.0009774511207220368,
"loss": 751.4335,
"step": 1050
},
{
"ce_loss_10": 3.6726208686828614,
"ce_loss_13": 3.5870786190032957,
"ce_loss_2": 4.340760517120361,
"ce_loss_3": 4.122452509403229,
"ce_loss_7": 3.7611562490463255,
"epoch": 0.106,
"grad_norm": 516.0,
"kl_loss_10": 146.77743186950684,
"kl_loss_2": 1523.6993774414063,
"kl_loss_3": 1080.6748168945312,
"kl_loss_7": 305.8709197998047,
"learning_rate": 0.0009769776049884564,
"loss": 759.1102,
"step": 1060
},
{
"ce_loss_10": 3.5789316415786745,
"ce_loss_13": 3.4973001360893248,
"ce_loss_2": 4.2655829906463625,
"ce_loss_3": 4.0485687255859375,
"ce_loss_7": 3.664482927322388,
"epoch": 0.107,
"grad_norm": 512.0,
"kl_loss_10": 138.50279579162597,
"kl_loss_2": 1555.6201232910157,
"kl_loss_3": 1112.0815826416015,
"kl_loss_7": 305.1489685058594,
"learning_rate": 0.0009764992860956889,
"loss": 779.55,
"step": 1070
},
{
"ce_loss_10": 3.7416428446769716,
"ce_loss_13": 3.6618621706962586,
"ce_loss_2": 4.364235496520996,
"ce_loss_3": 4.161127758026123,
"ce_loss_7": 3.8312565684318542,
"epoch": 0.108,
"grad_norm": 612.0,
"kl_loss_10": 132.17280654907228,
"kl_loss_2": 1434.9408752441407,
"kl_loss_3": 1021.1740997314453,
"kl_loss_7": 306.5512954711914,
"learning_rate": 0.0009760161688604008,
"loss": 729.6794,
"step": 1080
},
{
"ce_loss_10": 3.74602724313736,
"ce_loss_13": 3.6610143184661865,
"ce_loss_2": 4.390218591690063,
"ce_loss_3": 4.1842693328857425,
"ce_loss_7": 3.8411784768104553,
"epoch": 0.109,
"grad_norm": 576.0,
"kl_loss_10": 133.29356536865234,
"kl_loss_2": 1472.9822204589843,
"kl_loss_3": 1051.5467559814454,
"kl_loss_7": 310.2565521240234,
"learning_rate": 0.0009755282581475768,
"loss": 747.7812,
"step": 1090
},
{
"ce_loss_10": 3.801929402351379,
"ce_loss_13": 3.715673303604126,
"ce_loss_2": 4.428924131393432,
"ce_loss_3": 4.217723715305328,
"ce_loss_7": 3.886538052558899,
"epoch": 0.11,
"grad_norm": 552.0,
"kl_loss_10": 141.27199668884276,
"kl_loss_2": 1455.4054565429688,
"kl_loss_3": 1032.206768798828,
"kl_loss_7": 311.90191345214845,
"learning_rate": 0.0009750355588704727,
"loss": 730.8727,
"step": 1100
},
{
"ce_loss_10": 3.6245179295539858,
"ce_loss_13": 3.5434940338134764,
"ce_loss_2": 4.285040807723999,
"ce_loss_3": 4.064236760139465,
"ce_loss_7": 3.722900152206421,
"epoch": 0.111,
"grad_norm": 536.0,
"kl_loss_10": 128.93951110839845,
"kl_loss_2": 1479.008282470703,
"kl_loss_3": 1042.542987060547,
"kl_loss_7": 309.4350082397461,
"learning_rate": 0.0009745380759905647,
"loss": 755.6506,
"step": 1110
},
{
"ce_loss_10": 3.5733557820320128,
"ce_loss_13": 3.497096002101898,
"ce_loss_2": 4.2416357636451725,
"ce_loss_3": 4.02953668832779,
"ce_loss_7": 3.6843501210212706,
"epoch": 0.112,
"grad_norm": 584.0,
"kl_loss_10": 128.8479259490967,
"kl_loss_2": 1501.6354125976563,
"kl_loss_3": 1078.8607055664063,
"kl_loss_7": 309.57103271484374,
"learning_rate": 0.0009740358145174998,
"loss": 782.7103,
"step": 1120
},
{
"ce_loss_10": 3.7390747904777526,
"ce_loss_13": 3.654650056362152,
"ce_loss_2": 4.359592080116272,
"ce_loss_3": 4.165994334220886,
"ce_loss_7": 3.8400262117385866,
"epoch": 0.113,
"grad_norm": 434.0,
"kl_loss_10": 134.35130157470704,
"kl_loss_2": 1442.04345703125,
"kl_loss_3": 1051.9576324462892,
"kl_loss_7": 334.60899047851564,
"learning_rate": 0.0009735287795090455,
"loss": 747.7461,
"step": 1130
},
{
"ce_loss_10": 3.6206952929496765,
"ce_loss_13": 3.5408340215682985,
"ce_loss_2": 4.278374576568604,
"ce_loss_3": 4.073013770580292,
"ce_loss_7": 3.709249567985535,
"epoch": 0.114,
"grad_norm": 560.0,
"kl_loss_10": 129.40065841674806,
"kl_loss_2": 1489.2802490234376,
"kl_loss_3": 1078.2224548339843,
"kl_loss_7": 308.7394744873047,
"learning_rate": 0.0009730169760710386,
"loss": 743.8783,
"step": 1140
},
{
"ce_loss_10": 3.7078137516975405,
"ce_loss_13": 3.6258071303367614,
"ce_loss_2": 4.352467465400696,
"ce_loss_3": 4.14322521686554,
"ce_loss_7": 3.793818712234497,
"epoch": 0.115,
"grad_norm": 532.0,
"kl_loss_10": 132.82089805603027,
"kl_loss_2": 1462.5518798828125,
"kl_loss_3": 1047.3691467285157,
"kl_loss_7": 303.57424392700193,
"learning_rate": 0.0009725004093573342,
"loss": 741.0269,
"step": 1150
},
{
"ce_loss_10": 3.641883647441864,
"ce_loss_13": 3.5618484139442446,
"ce_loss_2": 4.298850560188294,
"ce_loss_3": 4.0857291460037235,
"ce_loss_7": 3.732511842250824,
"epoch": 0.116,
"grad_norm": 500.0,
"kl_loss_10": 125.74126281738282,
"kl_loss_2": 1472.2821716308595,
"kl_loss_3": 1051.9892150878907,
"kl_loss_7": 293.47923736572267,
"learning_rate": 0.0009719790845697534,
"loss": 730.1605,
"step": 1160
},
{
"ce_loss_10": 3.588691568374634,
"ce_loss_13": 3.514021909236908,
"ce_loss_2": 4.223182845115661,
"ce_loss_3": 4.0243830442428585,
"ce_loss_7": 3.668225383758545,
"epoch": 0.117,
"grad_norm": 544.0,
"kl_loss_10": 118.70261917114257,
"kl_loss_2": 1445.7062133789063,
"kl_loss_3": 1032.7508636474608,
"kl_loss_7": 274.055322265625,
"learning_rate": 0.0009714530069580309,
"loss": 718.2419,
"step": 1170
},
{
"ce_loss_10": 3.6957285404205322,
"ce_loss_13": 3.618162250518799,
"ce_loss_2": 4.352480411529541,
"ce_loss_3": 4.145036590099335,
"ce_loss_7": 3.7782084584236144,
"epoch": 0.118,
"grad_norm": 536.0,
"kl_loss_10": 127.76230659484864,
"kl_loss_2": 1480.515966796875,
"kl_loss_3": 1059.5112030029297,
"kl_loss_7": 282.41261138916013,
"learning_rate": 0.0009709221818197624,
"loss": 734.455,
"step": 1180
},
{
"ce_loss_10": 3.721509063243866,
"ce_loss_13": 3.6461830377578734,
"ce_loss_2": 4.384591698646545,
"ce_loss_3": 4.175746941566468,
"ce_loss_7": 3.804957926273346,
"epoch": 0.119,
"grad_norm": 454.0,
"kl_loss_10": 121.90502281188965,
"kl_loss_2": 1485.071533203125,
"kl_loss_3": 1060.364013671875,
"kl_loss_7": 273.83970947265624,
"learning_rate": 0.0009703866145003512,
"loss": 735.9141,
"step": 1190
},
{
"ce_loss_10": 3.6931097984313963,
"ce_loss_13": 3.618978762626648,
"ce_loss_2": 4.338696074485779,
"ce_loss_3": 4.131911754608154,
"ce_loss_7": 3.771903729438782,
"epoch": 0.12,
"grad_norm": 404.0,
"kl_loss_10": 117.82418823242188,
"kl_loss_2": 1472.2267517089845,
"kl_loss_3": 1051.9043701171875,
"kl_loss_7": 267.1518127441406,
"learning_rate": 0.0009698463103929542,
"loss": 740.661,
"step": 1200
},
{
"ce_loss_10": 3.658799970149994,
"ce_loss_13": 3.5842487812042236,
"ce_loss_2": 4.312227940559387,
"ce_loss_3": 4.10740053653717,
"ce_loss_7": 3.7385629415512085,
"epoch": 0.121,
"grad_norm": 412.0,
"kl_loss_10": 122.52205390930176,
"kl_loss_2": 1466.4009826660156,
"kl_loss_3": 1056.7593078613281,
"kl_loss_7": 272.06723709106444,
"learning_rate": 0.0009693012749384279,
"loss": 737.0117,
"step": 1210
},
{
"ce_loss_10": 3.679182291030884,
"ce_loss_13": 3.6015621542930605,
"ce_loss_2": 4.328339552879333,
"ce_loss_3": 4.114730060100555,
"ce_loss_7": 3.7581299543380737,
"epoch": 0.122,
"grad_norm": 500.0,
"kl_loss_10": 124.13164978027343,
"kl_loss_2": 1486.9410522460937,
"kl_loss_3": 1053.6353607177734,
"kl_loss_7": 279.761865234375,
"learning_rate": 0.0009687515136252732,
"loss": 728.5778,
"step": 1220
},
{
"ce_loss_10": 3.6272791743278505,
"ce_loss_13": 3.55220046043396,
"ce_loss_2": 4.301675605773926,
"ce_loss_3": 4.08597983121872,
"ce_loss_7": 3.70781672000885,
"epoch": 0.123,
"grad_norm": 568.0,
"kl_loss_10": 121.01997566223145,
"kl_loss_2": 1522.73505859375,
"kl_loss_3": 1086.6077362060546,
"kl_loss_7": 279.3069206237793,
"learning_rate": 0.0009681970319895803,
"loss": 759.7192,
"step": 1230
},
{
"ce_loss_10": 3.71327965259552,
"ce_loss_13": 3.6385520815849306,
"ce_loss_2": 4.354242825508118,
"ce_loss_3": 4.150368654727936,
"ce_loss_7": 3.7916497707366945,
"epoch": 0.124,
"grad_norm": 414.0,
"kl_loss_10": 124.44540634155274,
"kl_loss_2": 1443.4426513671874,
"kl_loss_3": 1030.0522674560548,
"kl_loss_7": 268.6724395751953,
"learning_rate": 0.0009676378356149733,
"loss": 722.6414,
"step": 1240
},
{
"ce_loss_10": 3.6944294214248656,
"ce_loss_13": 3.6130942940711974,
"ce_loss_2": 4.31483553647995,
"ce_loss_3": 4.113047087192536,
"ce_loss_7": 3.7628474831581116,
"epoch": 0.125,
"grad_norm": 572.0,
"kl_loss_10": 133.05949897766112,
"kl_loss_2": 1434.6165405273437,
"kl_loss_3": 1024.405093383789,
"kl_loss_7": 265.9820045471191,
"learning_rate": 0.0009670739301325534,
"loss": 721.2149,
"step": 1250
},
{
"ce_loss_10": 3.6491236448287965,
"ce_loss_13": 3.5683398127555845,
"ce_loss_2": 4.294895899295807,
"ce_loss_3": 4.082043838500977,
"ce_loss_7": 3.721854901313782,
"epoch": 0.126,
"grad_norm": 506.0,
"kl_loss_10": 130.2590259552002,
"kl_loss_2": 1460.9762817382812,
"kl_loss_3": 1047.9487213134767,
"kl_loss_7": 271.88867340087893,
"learning_rate": 0.0009665053212208426,
"loss": 732.2017,
"step": 1260
},
{
"ce_loss_10": 3.6933886647224425,
"ce_loss_13": 3.6137840390205382,
"ce_loss_2": 4.336398506164551,
"ce_loss_3": 4.125988566875458,
"ce_loss_7": 3.7641473054885863,
"epoch": 0.127,
"grad_norm": 470.0,
"kl_loss_10": 131.11599006652833,
"kl_loss_2": 1466.260760498047,
"kl_loss_3": 1047.477099609375,
"kl_loss_7": 271.31814041137693,
"learning_rate": 0.0009659320146057262,
"loss": 729.9061,
"step": 1270
},
{
"ce_loss_10": 3.6932409524917604,
"ce_loss_13": 3.6162060022354128,
"ce_loss_2": 4.326151037216187,
"ce_loss_3": 4.118873739242554,
"ce_loss_7": 3.7665857672691345,
"epoch": 0.128,
"grad_norm": 488.0,
"kl_loss_10": 126.3920768737793,
"kl_loss_2": 1439.7459106445312,
"kl_loss_3": 1023.4634368896484,
"kl_loss_7": 263.45100021362305,
"learning_rate": 0.0009653540160603955,
"loss": 714.3654,
"step": 1280
},
{
"ce_loss_10": 3.695625138282776,
"ce_loss_13": 3.619100844860077,
"ce_loss_2": 4.322635555267334,
"ce_loss_3": 4.121702527999878,
"ce_loss_7": 3.764703559875488,
"epoch": 0.129,
"grad_norm": 516.0,
"kl_loss_10": 125.06153717041016,
"kl_loss_2": 1449.9097961425782,
"kl_loss_3": 1036.939111328125,
"kl_loss_7": 261.42085418701174,
"learning_rate": 0.0009647713314052896,
"loss": 709.775,
"step": 1290
},
{
"ce_loss_10": 3.645713412761688,
"ce_loss_13": 3.5693684458732604,
"ce_loss_2": 4.318705654144287,
"ce_loss_3": 4.105780220031738,
"ce_loss_7": 3.721473240852356,
"epoch": 0.13,
"grad_norm": 504.0,
"kl_loss_10": 125.77756729125977,
"kl_loss_2": 1515.6605834960938,
"kl_loss_3": 1082.791098022461,
"kl_loss_7": 268.05779418945315,
"learning_rate": 0.0009641839665080363,
"loss": 739.9956,
"step": 1300
},
{
"ce_loss_10": 3.6060986638069155,
"ce_loss_13": 3.532057249546051,
"ce_loss_2": 4.258748412132263,
"ce_loss_3": 4.044409060478211,
"ce_loss_7": 3.6810790419578554,
"epoch": 0.131,
"grad_norm": 576.0,
"kl_loss_10": 120.33845672607421,
"kl_loss_2": 1464.5880249023437,
"kl_loss_3": 1035.7476196289062,
"kl_loss_7": 258.05460357666016,
"learning_rate": 0.0009635919272833937,
"loss": 712.5358,
"step": 1310
},
{
"ce_loss_10": 3.6437799096107484,
"ce_loss_13": 3.567954385280609,
"ce_loss_2": 4.29961267709732,
"ce_loss_3": 4.091295349597931,
"ce_loss_7": 3.7204660773277283,
"epoch": 0.132,
"grad_norm": 520.0,
"kl_loss_10": 123.54525375366211,
"kl_loss_2": 1460.5380920410157,
"kl_loss_3": 1039.971875,
"kl_loss_7": 264.77962188720704,
"learning_rate": 0.0009629952196931902,
"loss": 712.4777,
"step": 1320
},
{
"ce_loss_10": 3.63455046415329,
"ce_loss_13": 3.557650101184845,
"ce_loss_2": 4.270671212673188,
"ce_loss_3": 4.062439024448395,
"ce_loss_7": 3.702920150756836,
"epoch": 0.133,
"grad_norm": 434.0,
"kl_loss_10": 123.04212112426758,
"kl_loss_2": 1444.6365600585937,
"kl_loss_3": 1028.4689849853517,
"kl_loss_7": 258.92202911376955,
"learning_rate": 0.0009623938497462645,
"loss": 713.1292,
"step": 1330
},
{
"ce_loss_10": 3.6247922778129578,
"ce_loss_13": 3.5494427919387816,
"ce_loss_2": 4.2690158009529116,
"ce_loss_3": 4.058642566204071,
"ce_loss_7": 3.6972993493080137,
"epoch": 0.134,
"grad_norm": 478.0,
"kl_loss_10": 120.75489349365235,
"kl_loss_2": 1456.6053527832032,
"kl_loss_3": 1037.5501190185546,
"kl_loss_7": 266.06713485717773,
"learning_rate": 0.0009617878234984055,
"loss": 726.2297,
"step": 1340
},
{
"ce_loss_10": 3.717451739311218,
"ce_loss_13": 3.642001247406006,
"ce_loss_2": 4.3319720983505245,
"ce_loss_3": 4.123037731647491,
"ce_loss_7": 3.790008616447449,
"epoch": 0.135,
"grad_norm": 548.0,
"kl_loss_10": 120.84955863952636,
"kl_loss_2": 1400.965509033203,
"kl_loss_3": 989.1194274902343,
"kl_loss_7": 260.0563507080078,
"learning_rate": 0.0009611771470522907,
"loss": 704.2836,
"step": 1350
},
{
"ce_loss_10": 3.6397268891334535,
"ce_loss_13": 3.565677487850189,
"ce_loss_2": 4.285844933986664,
"ce_loss_3": 4.075031089782715,
"ce_loss_7": 3.7177743196487425,
"epoch": 0.136,
"grad_norm": 548.0,
"kl_loss_10": 119.19244728088378,
"kl_loss_2": 1430.9075134277343,
"kl_loss_3": 1014.8560333251953,
"kl_loss_7": 264.9954383850098,
"learning_rate": 0.0009605618265574251,
"loss": 706.0607,
"step": 1360
},
{
"ce_loss_10": 3.6019657135009764,
"ce_loss_13": 3.5283800959587097,
"ce_loss_2": 4.247595989704132,
"ce_loss_3": 4.0493292808532715,
"ce_loss_7": 3.682861661911011,
"epoch": 0.137,
"grad_norm": 544.0,
"kl_loss_10": 120.26506729125977,
"kl_loss_2": 1482.0683837890624,
"kl_loss_3": 1078.6839630126954,
"kl_loss_7": 272.45921630859374,
"learning_rate": 0.0009599418682100792,
"loss": 727.2132,
"step": 1370
},
{
"ce_loss_10": 3.645335590839386,
"ce_loss_13": 3.570277786254883,
"ce_loss_2": 4.2866430401802065,
"ce_loss_3": 4.071622550487518,
"ce_loss_7": 3.717624640464783,
"epoch": 0.138,
"grad_norm": 612.0,
"kl_loss_10": 119.49271163940429,
"kl_loss_2": 1442.7004455566407,
"kl_loss_3": 1025.406283569336,
"kl_loss_7": 261.58585968017576,
"learning_rate": 0.0009593172782532268,
"loss": 717.2026,
"step": 1380
},
{
"ce_loss_10": 3.6908539175987243,
"ce_loss_13": 3.617784011363983,
"ce_loss_2": 4.313388335704803,
"ce_loss_3": 4.114008998870849,
"ce_loss_7": 3.7639716506004333,
"epoch": 0.139,
"grad_norm": 476.0,
"kl_loss_10": 120.62876358032227,
"kl_loss_2": 1423.3504333496094,
"kl_loss_3": 1012.8270812988281,
"kl_loss_7": 261.68493881225584,
"learning_rate": 0.0009586880629764817,
"loss": 706.5565,
"step": 1390
},
{
"ce_loss_10": 3.61561758518219,
"ce_loss_13": 3.5403517365455626,
"ce_loss_2": 4.258929216861725,
"ce_loss_3": 4.060744059085846,
"ce_loss_7": 3.687643599510193,
"epoch": 0.14,
"grad_norm": 792.0,
"kl_loss_10": 120.44653511047363,
"kl_loss_2": 1437.7734375,
"kl_loss_3": 1068.6304412841796,
"kl_loss_7": 272.7257308959961,
"learning_rate": 0.0009580542287160348,
"loss": 716.5157,
"step": 1400
},
{
"ce_loss_10": 3.579881501197815,
"ce_loss_13": 3.505436861515045,
"ce_loss_2": 4.2163320779800415,
"ce_loss_3": 4.016775751113892,
"ce_loss_7": 3.6602004766464233,
"epoch": 0.141,
"grad_norm": 740.0,
"kl_loss_10": 119.12874908447266,
"kl_loss_2": 1437.7877868652345,
"kl_loss_3": 1029.927035522461,
"kl_loss_7": 274.3121276855469,
"learning_rate": 0.0009574157818545901,
"loss": 704.4754,
"step": 1410
},
{
"ce_loss_10": 3.654617667198181,
"ce_loss_13": 3.581939327716827,
"ce_loss_2": 4.266410648822784,
"ce_loss_3": 4.076312291622162,
"ce_loss_7": 3.7350067377090452,
"epoch": 0.142,
"grad_norm": 788.0,
"kl_loss_10": 117.25881233215333,
"kl_loss_2": 1402.042706298828,
"kl_loss_3": 1008.1144165039062,
"kl_loss_7": 268.0317886352539,
"learning_rate": 0.0009567727288213005,
"loss": 712.509,
"step": 1420
},
{
"ce_loss_10": 3.62344468832016,
"ce_loss_13": 3.552217972278595,
"ce_loss_2": 4.242154741287232,
"ce_loss_3": 4.047549939155578,
"ce_loss_7": 3.6985828638076783,
"epoch": 0.143,
"grad_norm": 466.0,
"kl_loss_10": 115.47184524536132,
"kl_loss_2": 1418.5062316894532,
"kl_loss_3": 1018.4080627441406,
"kl_loss_7": 270.1881278991699,
"learning_rate": 0.0009561250760917027,
"loss": 702.7143,
"step": 1430
},
{
"ce_loss_10": 3.6490369558334352,
"ce_loss_13": 3.5760287642478943,
"ce_loss_2": 4.275133848190308,
"ce_loss_3": 4.07141832113266,
"ce_loss_7": 3.7269250392913817,
"epoch": 0.144,
"grad_norm": 524.0,
"kl_loss_10": 119.90954666137695,
"kl_loss_2": 1441.103936767578,
"kl_loss_3": 1028.5968627929688,
"kl_loss_7": 267.66681442260744,
"learning_rate": 0.0009554728301876525,
"loss": 698.2885,
"step": 1440
},
{
"ce_loss_10": 3.7067930936813354,
"ce_loss_13": 3.629922258853912,
"ce_loss_2": 4.314408445358277,
"ce_loss_3": 4.132321739196778,
"ce_loss_7": 3.7809135794639586,
"epoch": 0.145,
"grad_norm": 632.0,
"kl_loss_10": 122.64454803466796,
"kl_loss_2": 1398.767156982422,
"kl_loss_3": 1023.6964874267578,
"kl_loss_7": 262.7124481201172,
"learning_rate": 0.0009548159976772592,
"loss": 721.9051,
"step": 1450
},
{
"ce_loss_10": 3.641107952594757,
"ce_loss_13": 3.5679691076278686,
"ce_loss_2": 4.273185658454895,
"ce_loss_3": 4.074773287773132,
"ce_loss_7": 3.715712809562683,
"epoch": 0.146,
"grad_norm": 472.0,
"kl_loss_10": 119.45023498535156,
"kl_loss_2": 1426.5163330078126,
"kl_loss_3": 1022.3043518066406,
"kl_loss_7": 264.85511245727537,
"learning_rate": 0.0009541545851748186,
"loss": 702.8599,
"step": 1460
},
{
"ce_loss_10": 3.5100984811782836,
"ce_loss_13": 3.436799693107605,
"ce_loss_2": 4.161883985996246,
"ce_loss_3": 3.955858516693115,
"ce_loss_7": 3.594360911846161,
"epoch": 0.147,
"grad_norm": 556.0,
"kl_loss_10": 116.41493873596191,
"kl_loss_2": 1467.1879943847657,
"kl_loss_3": 1037.354574584961,
"kl_loss_7": 266.5866645812988,
"learning_rate": 0.0009534885993407473,
"loss": 713.4948,
"step": 1470
},
{
"ce_loss_10": 3.6824231266975405,
"ce_loss_13": 3.608121466636658,
"ce_loss_2": 4.328725492954254,
"ce_loss_3": 4.115788686275482,
"ce_loss_7": 3.755172336101532,
"epoch": 0.148,
"grad_norm": 560.0,
"kl_loss_10": 118.46903686523437,
"kl_loss_2": 1448.8276794433593,
"kl_loss_3": 1029.914727783203,
"kl_loss_7": 263.37538986206056,
"learning_rate": 0.0009528180468815154,
"loss": 714.8544,
"step": 1480
},
{
"ce_loss_10": 3.7179338216781614,
"ce_loss_13": 3.6484482169151304,
"ce_loss_2": 4.323283433914185,
"ce_loss_3": 4.127403116226196,
"ce_loss_7": 3.7899964809417725,
"epoch": 0.149,
"grad_norm": 480.0,
"kl_loss_10": 114.30357208251954,
"kl_loss_2": 1395.1859313964844,
"kl_loss_3": 989.8287170410156,
"kl_loss_7": 257.14686279296876,
"learning_rate": 0.0009521429345495787,
"loss": 690.7114,
"step": 1490
},
{
"ce_loss_10": 3.7034213185310363,
"ce_loss_13": 3.6311787962913513,
"ce_loss_2": 4.309155285358429,
"ce_loss_3": 4.092868828773499,
"ce_loss_7": 3.7654823780059816,
"epoch": 0.15,
"grad_norm": 448.0,
"kl_loss_10": 116.55960197448731,
"kl_loss_2": 1382.6544982910157,
"kl_loss_3": 969.2838195800781,
"kl_loss_7": 249.21936950683593,
"learning_rate": 0.0009514632691433108,
"loss": 688.2995,
"step": 1500
},
{
"ce_loss_10": 3.6700626373291017,
"ce_loss_13": 3.5945797085762026,
"ce_loss_2": 4.289841759204864,
"ce_loss_3": 4.08635276556015,
"ce_loss_7": 3.738140869140625,
"epoch": 0.151,
"grad_norm": 448.0,
"kl_loss_10": 129.3878589630127,
"kl_loss_2": 1424.29853515625,
"kl_loss_3": 1001.7422454833984,
"kl_loss_7": 254.78705520629882,
"learning_rate": 0.0009507790575069346,
"loss": 706.6927,
"step": 1510
},
{
"ce_loss_10": 3.65303395986557,
"ce_loss_13": 3.5714800715446473,
"ce_loss_2": 4.28388956785202,
"ce_loss_3": 4.07215541601181,
"ce_loss_7": 3.7174383282661436,
"epoch": 0.152,
"grad_norm": 560.0,
"kl_loss_10": 131.33350143432617,
"kl_loss_2": 1434.660906982422,
"kl_loss_3": 1017.4268585205078,
"kl_loss_7": 260.6188400268555,
"learning_rate": 0.0009500903065304539,
"loss": 715.3042,
"step": 1520
},
{
"ce_loss_10": 3.683453822135925,
"ce_loss_13": 3.60856169462204,
"ce_loss_2": 4.287684428691864,
"ce_loss_3": 4.0820488929748535,
"ce_loss_7": 3.7489410638809204,
"epoch": 0.153,
"grad_norm": 592.0,
"kl_loss_10": 120.57846107482911,
"kl_loss_2": 1384.860614013672,
"kl_loss_3": 975.0672027587891,
"kl_loss_7": 247.4440773010254,
"learning_rate": 0.0009493970231495835,
"loss": 691.6448,
"step": 1530
},
{
"ce_loss_10": 3.6223431706428526,
"ce_loss_13": 3.55173202753067,
"ce_loss_2": 4.230155563354492,
"ce_loss_3": 4.02554075717926,
"ce_loss_7": 3.6863773345947264,
"epoch": 0.154,
"grad_norm": 490.0,
"kl_loss_10": 119.43844871520996,
"kl_loss_2": 1397.457257080078,
"kl_loss_3": 991.8431060791015,
"kl_loss_7": 243.42232818603514,
"learning_rate": 0.0009486992143456792,
"loss": 686.1227,
"step": 1540
},
{
"ce_loss_10": 3.6514541625976564,
"ce_loss_13": 3.571796643733978,
"ce_loss_2": 4.304909610748291,
"ce_loss_3": 4.0922522187232975,
"ce_loss_7": 3.7216905117034913,
"epoch": 0.155,
"grad_norm": 396.0,
"kl_loss_10": 128.10715980529784,
"kl_loss_2": 1491.158837890625,
"kl_loss_3": 1056.2210266113282,
"kl_loss_7": 262.1390213012695,
"learning_rate": 0.0009479968871456679,
"loss": 716.6379,
"step": 1550
},
{
"ce_loss_10": 3.6170923829078676,
"ce_loss_13": 3.542399287223816,
"ce_loss_2": 4.252330017089844,
"ce_loss_3": 4.045702540874482,
"ce_loss_7": 3.685628616809845,
"epoch": 0.156,
"grad_norm": 454.0,
"kl_loss_10": 121.63033790588379,
"kl_loss_2": 1463.288525390625,
"kl_loss_3": 1026.1065032958984,
"kl_loss_7": 254.98253784179687,
"learning_rate": 0.0009472900486219768,
"loss": 702.4742,
"step": 1560
},
{
"ce_loss_10": 3.6025954604148867,
"ce_loss_13": 3.5303670883178713,
"ce_loss_2": 4.232356917858124,
"ce_loss_3": 4.022554993629456,
"ce_loss_7": 3.6709616661071776,
"epoch": 0.157,
"grad_norm": 520.0,
"kl_loss_10": 118.88864822387696,
"kl_loss_2": 1434.4180419921875,
"kl_loss_3": 1021.8371948242187,
"kl_loss_7": 253.59476776123046,
"learning_rate": 0.000946578705892462,
"loss": 706.9224,
"step": 1570
},
{
"ce_loss_10": 3.6455034971237184,
"ce_loss_13": 3.5725855112075804,
"ce_loss_2": 4.251002633571625,
"ce_loss_3": 4.075614416599274,
"ce_loss_7": 3.712466835975647,
"epoch": 0.158,
"grad_norm": 520.0,
"kl_loss_10": 115.86212844848633,
"kl_loss_2": 1388.911444091797,
"kl_loss_3": 1008.0096618652344,
"kl_loss_7": 249.38579559326172,
"learning_rate": 0.0009458628661203367,
"loss": 702.1684,
"step": 1580
},
{
"ce_loss_10": 3.6394161105155947,
"ce_loss_13": 3.571541059017181,
"ce_loss_2": 4.284844183921814,
"ce_loss_3": 4.076776087284088,
"ce_loss_7": 3.7110044956207275,
"epoch": 0.159,
"grad_norm": 494.0,
"kl_loss_10": 113.66581001281739,
"kl_loss_2": 1444.869854736328,
"kl_loss_3": 1032.262744140625,
"kl_loss_7": 253.4554000854492,
"learning_rate": 0.0009451425365140996,
"loss": 688.5467,
"step": 1590
},
{
"ce_loss_10": 3.7211164236068726,
"ce_loss_13": 3.649132215976715,
"ce_loss_2": 4.325118780136108,
"ce_loss_3": 4.128993570804596,
"ce_loss_7": 3.7914722681045534,
"epoch": 0.16,
"grad_norm": 456.0,
"kl_loss_10": 117.80431632995605,
"kl_loss_2": 1373.1717468261718,
"kl_loss_3": 981.5717620849609,
"kl_loss_7": 253.6373489379883,
"learning_rate": 0.0009444177243274617,
"loss": 681.3762,
"step": 1600
},
{
"ce_loss_10": 3.574730896949768,
"ce_loss_13": 3.498664665222168,
"ce_loss_2": 4.200723135471344,
"ce_loss_3": 4.009881269931793,
"ce_loss_7": 3.6463570594787598,
"epoch": 0.161,
"grad_norm": 480.0,
"kl_loss_10": 122.87367897033691,
"kl_loss_2": 1430.2820068359374,
"kl_loss_3": 1037.6371978759767,
"kl_loss_7": 260.55384521484376,
"learning_rate": 0.0009436884368592739,
"loss": 706.6845,
"step": 1610
},
{
"ce_loss_10": 3.6286559462547303,
"ce_loss_13": 3.555983376502991,
"ce_loss_2": 4.232067906856537,
"ce_loss_3": 4.041269278526306,
"ce_loss_7": 3.6985298871994017,
"epoch": 0.162,
"grad_norm": 498.0,
"kl_loss_10": 118.67424545288085,
"kl_loss_2": 1385.5154724121094,
"kl_loss_3": 999.56015625,
"kl_loss_7": 250.75928268432617,
"learning_rate": 0.0009429546814534529,
"loss": 699.0302,
"step": 1620
},
{
"ce_loss_10": 3.639633226394653,
"ce_loss_13": 3.5706356167793274,
"ce_loss_2": 4.241236877441406,
"ce_loss_3": 4.056409633159637,
"ce_loss_7": 3.708655667304993,
"epoch": 0.163,
"grad_norm": 384.0,
"kl_loss_10": 117.12662200927734,
"kl_loss_2": 1374.89609375,
"kl_loss_3": 989.9520751953125,
"kl_loss_7": 248.8686378479004,
"learning_rate": 0.0009422164654989072,
"loss": 676.7936,
"step": 1630
},
{
"ce_loss_10": 3.7635043978691103,
"ce_loss_13": 3.687360870838165,
"ce_loss_2": 4.338383412361145,
"ce_loss_3": 4.1611551403999325,
"ce_loss_7": 3.828977358341217,
"epoch": 0.164,
"grad_norm": 424.0,
"kl_loss_10": 119.46500015258789,
"kl_loss_2": 1362.1967407226562,
"kl_loss_3": 990.5942932128906,
"kl_loss_7": 248.94699325561524,
"learning_rate": 0.0009414737964294635,
"loss": 685.8197,
"step": 1640
},
{
"ce_loss_10": 3.678090500831604,
"ce_loss_13": 3.6101470470428465,
"ce_loss_2": 4.259114742279053,
"ce_loss_3": 4.078064382076263,
"ce_loss_7": 3.7428590416908265,
"epoch": 0.165,
"grad_norm": 444.0,
"kl_loss_10": 112.9244888305664,
"kl_loss_2": 1333.3570129394532,
"kl_loss_3": 969.1624145507812,
"kl_loss_7": 238.38165054321288,
"learning_rate": 0.000940726681723791,
"loss": 682.7061,
"step": 1650
},
{
"ce_loss_10": 3.512197470664978,
"ce_loss_13": 3.4408557653427123,
"ce_loss_2": 4.148977339267731,
"ce_loss_3": 3.9514773368835447,
"ce_loss_7": 3.5815786600112913,
"epoch": 0.166,
"grad_norm": 488.0,
"kl_loss_10": 117.70020294189453,
"kl_loss_2": 1442.5229797363281,
"kl_loss_3": 1035.8803924560548,
"kl_loss_7": 256.4058250427246,
"learning_rate": 0.0009399751289053266,
"loss": 690.3188,
"step": 1660
},
{
"ce_loss_10": 3.742681550979614,
"ce_loss_13": 3.671311604976654,
"ce_loss_2": 4.328805279731751,
"ce_loss_3": 4.135993778705597,
"ce_loss_7": 3.809998023509979,
"epoch": 0.167,
"grad_norm": 478.0,
"kl_loss_10": 116.78232650756836,
"kl_loss_2": 1366.6806213378907,
"kl_loss_3": 967.4927185058593,
"kl_loss_7": 249.3471366882324,
"learning_rate": 0.0009392191455421988,
"loss": 682.1736,
"step": 1670
},
{
"ce_loss_10": 3.7067878365516664,
"ce_loss_13": 3.6276530623435974,
"ce_loss_2": 4.298012292385101,
"ce_loss_3": 4.104024171829224,
"ce_loss_7": 3.7697168350219727,
"epoch": 0.168,
"grad_norm": 490.0,
"kl_loss_10": 123.8147029876709,
"kl_loss_2": 1386.5616271972656,
"kl_loss_3": 990.9451782226563,
"kl_loss_7": 260.7920967102051,
"learning_rate": 0.0009384587392471515,
"loss": 679.4555,
"step": 1680
},
{
"ce_loss_10": 3.7010039329528808,
"ce_loss_13": 3.629232919216156,
"ce_loss_2": 4.288461661338806,
"ce_loss_3": 4.104441356658936,
"ce_loss_7": 3.773091959953308,
"epoch": 0.169,
"grad_norm": 494.0,
"kl_loss_10": 117.7159465789795,
"kl_loss_2": 1349.4280029296874,
"kl_loss_3": 968.4034729003906,
"kl_loss_7": 251.4811584472656,
"learning_rate": 0.0009376939176774678,
"loss": 675.85,
"step": 1690
},
{
"ce_loss_10": 3.678883969783783,
"ce_loss_13": 3.602108871936798,
"ce_loss_2": 4.274775016307831,
"ce_loss_3": 4.074398016929626,
"ce_loss_7": 3.7432108521461487,
"epoch": 0.17,
"grad_norm": 540.0,
"kl_loss_10": 124.26388397216797,
"kl_loss_2": 1371.6622314453125,
"kl_loss_3": 974.3467742919922,
"kl_loss_7": 252.39389877319337,
"learning_rate": 0.0009369246885348925,
"loss": 687.5515,
"step": 1700
},
{
"ce_loss_10": 3.6718587994575502,
"ce_loss_13": 3.591440510749817,
"ce_loss_2": 4.303124558925629,
"ce_loss_3": 4.095115387439728,
"ce_loss_7": 3.7370152711868285,
"epoch": 0.171,
"grad_norm": 548.0,
"kl_loss_10": 130.6899742126465,
"kl_loss_2": 1433.3515563964843,
"kl_loss_3": 1016.4502960205078,
"kl_loss_7": 255.15539627075196,
"learning_rate": 0.0009361510595655545,
"loss": 695.4526,
"step": 1710
},
{
"ce_loss_10": 3.6283922672271727,
"ce_loss_13": 3.5495692014694216,
"ce_loss_2": 4.237072479724884,
"ce_loss_3": 4.041323733329773,
"ce_loss_7": 3.6956202745437623,
"epoch": 0.172,
"grad_norm": 466.0,
"kl_loss_10": 127.2772174835205,
"kl_loss_2": 1409.4606872558593,
"kl_loss_3": 1009.6889221191407,
"kl_loss_7": 256.7372299194336,
"learning_rate": 0.0009353730385598887,
"loss": 691.3762,
"step": 1720
},
{
"ce_loss_10": 3.54926735162735,
"ce_loss_13": 3.4769670009613036,
"ce_loss_2": 4.178456115722656,
"ce_loss_3": 3.9720041275024416,
"ce_loss_7": 3.6176819682121275,
"epoch": 0.173,
"grad_norm": 436.0,
"kl_loss_10": 118.23514060974121,
"kl_loss_2": 1418.4845947265626,
"kl_loss_3": 998.5092987060547,
"kl_loss_7": 249.0585678100586,
"learning_rate": 0.0009345906333525581,
"loss": 697.0381,
"step": 1730
},
{
"ce_loss_10": 3.5872240900993346,
"ce_loss_13": 3.5136430621147157,
"ce_loss_2": 4.2012934923172,
"ce_loss_3": 3.9967063546180723,
"ce_loss_7": 3.65671169757843,
"epoch": 0.174,
"grad_norm": 408.0,
"kl_loss_10": 122.21610031127929,
"kl_loss_2": 1418.8342651367188,
"kl_loss_3": 1007.5152191162109,
"kl_loss_7": 254.60486221313477,
"learning_rate": 0.0009338038518223745,
"loss": 687.4246,
"step": 1740
},
{
"ce_loss_10": 3.657099163532257,
"ce_loss_13": 3.5811222553253175,
"ce_loss_2": 4.272738003730774,
"ce_loss_3": 4.0657650351524355,
"ce_loss_7": 3.7293712973594664,
"epoch": 0.175,
"grad_norm": 424.0,
"kl_loss_10": 122.57909774780273,
"kl_loss_2": 1418.8521423339844,
"kl_loss_3": 1004.8310028076172,
"kl_loss_7": 258.8813926696777,
"learning_rate": 0.0009330127018922195,
"loss": 709.7155,
"step": 1750
},
{
"ce_loss_10": 3.60728679895401,
"ce_loss_13": 3.5332212805747987,
"ce_loss_2": 4.2153865694999695,
"ce_loss_3": 4.016399335861206,
"ce_loss_7": 3.6759958028793336,
"epoch": 0.176,
"grad_norm": 446.0,
"kl_loss_10": 117.00486183166504,
"kl_loss_2": 1406.6310485839845,
"kl_loss_3": 989.8223937988281,
"kl_loss_7": 252.39801330566405,
"learning_rate": 0.0009322171915289634,
"loss": 689.0163,
"step": 1760
},
{
"ce_loss_10": 3.640791046619415,
"ce_loss_13": 3.5716994404792786,
"ce_loss_2": 4.240784847736359,
"ce_loss_3": 4.040842926502227,
"ce_loss_7": 3.7066658616065977,
"epoch": 0.177,
"grad_norm": 504.0,
"kl_loss_10": 114.7268009185791,
"kl_loss_2": 1384.9641479492188,
"kl_loss_3": 983.0798065185547,
"kl_loss_7": 249.6033966064453,
"learning_rate": 0.0009314173287433873,
"loss": 677.7067,
"step": 1770
},
{
"ce_loss_10": 3.6371870756149294,
"ce_loss_13": 3.565370166301727,
"ce_loss_2": 4.248478496074677,
"ce_loss_3": 4.042388367652893,
"ce_loss_7": 3.7081421256065368,
"epoch": 0.178,
"grad_norm": 544.0,
"kl_loss_10": 117.58927421569824,
"kl_loss_2": 1410.781103515625,
"kl_loss_3": 995.2456298828125,
"kl_loss_7": 252.9298988342285,
"learning_rate": 0.0009306131215901003,
"loss": 681.0704,
"step": 1780
},
{
"ce_loss_10": 3.6657212376594543,
"ce_loss_13": 3.5942596793174744,
"ce_loss_2": 4.267256224155426,
"ce_loss_3": 4.070630991458893,
"ce_loss_7": 3.736851954460144,
"epoch": 0.179,
"grad_norm": 608.0,
"kl_loss_10": 117.64482841491699,
"kl_loss_2": 1382.4147399902345,
"kl_loss_3": 974.7222351074219,
"kl_loss_7": 254.88860549926758,
"learning_rate": 0.0009298045781674596,
"loss": 674.4276,
"step": 1790
},
{
"ce_loss_10": 3.6482357382774353,
"ce_loss_13": 3.577735936641693,
"ce_loss_2": 4.238207507133484,
"ce_loss_3": 4.047251141071319,
"ce_loss_7": 3.7247403979301454,
"epoch": 0.18,
"grad_norm": 584.0,
"kl_loss_10": 113.15109825134277,
"kl_loss_2": 1356.1355346679688,
"kl_loss_3": 966.8513641357422,
"kl_loss_7": 260.6168983459473,
"learning_rate": 0.0009289917066174886,
"loss": 687.0212,
"step": 1800
},
{
"ce_loss_10": 3.6436230182647704,
"ce_loss_13": 3.573524606227875,
"ce_loss_2": 4.205891370773315,
"ce_loss_3": 4.035960531234741,
"ce_loss_7": 3.713192844390869,
"epoch": 0.181,
"grad_norm": 644.0,
"kl_loss_10": 111.37209777832031,
"kl_loss_2": 1312.0460144042968,
"kl_loss_3": 951.6195190429687,
"kl_loss_7": 248.27648315429687,
"learning_rate": 0.0009281745151257945,
"loss": 665.2686,
"step": 1810
},
{
"ce_loss_10": 3.6573068499565125,
"ce_loss_13": 3.5899064898490907,
"ce_loss_2": 4.263534939289093,
"ce_loss_3": 4.073390209674836,
"ce_loss_7": 3.725028562545776,
"epoch": 0.182,
"grad_norm": 496.0,
"kl_loss_10": 112.47701683044434,
"kl_loss_2": 1362.4901000976563,
"kl_loss_3": 982.173095703125,
"kl_loss_7": 248.95221557617188,
"learning_rate": 0.0009273530119214868,
"loss": 681.1132,
"step": 1820
},
{
"ce_loss_10": 3.7659960746765138,
"ce_loss_13": 3.6931302428245543,
"ce_loss_2": 4.335440850257873,
"ce_loss_3": 4.146318483352661,
"ce_loss_7": 3.831969678401947,
"epoch": 0.183,
"grad_norm": 460.0,
"kl_loss_10": 115.37424812316894,
"kl_loss_2": 1332.8465270996094,
"kl_loss_3": 945.53359375,
"kl_loss_7": 244.2625930786133,
"learning_rate": 0.0009265272052770935,
"loss": 653.1528,
"step": 1830
},
{
"ce_loss_10": 3.573833405971527,
"ce_loss_13": 3.504916477203369,
"ce_loss_2": 4.191505336761475,
"ce_loss_3": 4.003697621822357,
"ce_loss_7": 3.6443989157676695,
"epoch": 0.184,
"grad_norm": 524.0,
"kl_loss_10": 110.15304069519043,
"kl_loss_2": 1378.9180969238282,
"kl_loss_3": 997.8934539794922,
"kl_loss_7": 241.1827133178711,
"learning_rate": 0.0009256971035084784,
"loss": 679.6828,
"step": 1840
},
{
"ce_loss_10": 3.5141557097434997,
"ce_loss_13": 3.4410739183425902,
"ce_loss_2": 4.1375454545021055,
"ce_loss_3": 3.934183955192566,
"ce_loss_7": 3.588452696800232,
"epoch": 0.185,
"grad_norm": 528.0,
"kl_loss_10": 114.25855445861816,
"kl_loss_2": 1412.8881896972657,
"kl_loss_3": 1019.4745208740235,
"kl_loss_7": 253.67017517089843,
"learning_rate": 0.0009248627149747573,
"loss": 690.3363,
"step": 1850
},
{
"ce_loss_10": 3.7252640962600707,
"ce_loss_13": 3.653822290897369,
"ce_loss_2": 4.3040543556213375,
"ce_loss_3": 4.132464408874512,
"ce_loss_7": 3.793966567516327,
"epoch": 0.186,
"grad_norm": 564.0,
"kl_loss_10": 115.14772300720215,
"kl_loss_2": 1340.048565673828,
"kl_loss_3": 980.2122955322266,
"kl_loss_7": 244.52606735229492,
"learning_rate": 0.0009240240480782129,
"loss": 674.7569,
"step": 1860
},
{
"ce_loss_10": 3.635197627544403,
"ce_loss_13": 3.561525750160217,
"ce_loss_2": 4.234712994098663,
"ce_loss_3": 4.036596286296844,
"ce_loss_7": 3.7000155448913574,
"epoch": 0.187,
"grad_norm": 442.0,
"kl_loss_10": 116.40899467468262,
"kl_loss_2": 1366.7482482910157,
"kl_loss_3": 985.8397155761719,
"kl_loss_7": 245.4348571777344,
"learning_rate": 0.0009231811112642122,
"loss": 670.6495,
"step": 1870
},
{
"ce_loss_10": 3.680171477794647,
"ce_loss_13": 3.607526624202728,
"ce_loss_2": 4.242461228370667,
"ce_loss_3": 4.0597851276397705,
"ce_loss_7": 3.7417822241783143,
"epoch": 0.188,
"grad_norm": 462.0,
"kl_loss_10": 115.97058601379395,
"kl_loss_2": 1329.476806640625,
"kl_loss_3": 944.8101593017578,
"kl_loss_7": 240.48009033203124,
"learning_rate": 0.0009223339130211192,
"loss": 656.504,
"step": 1880
},
{
"ce_loss_10": 3.527233564853668,
"ce_loss_13": 3.456979143619537,
"ce_loss_2": 4.1365337610244755,
"ce_loss_3": 3.9334477186203003,
"ce_loss_7": 3.5922507286071776,
"epoch": 0.189,
"grad_norm": 492.0,
"kl_loss_10": 120.61857757568359,
"kl_loss_2": 1391.3780578613282,
"kl_loss_3": 981.7018951416015,
"kl_loss_7": 240.8455017089844,
"learning_rate": 0.0009214824618802108,
"loss": 678.3247,
"step": 1890
},
{
"ce_loss_10": 3.715823400020599,
"ce_loss_13": 3.639923906326294,
"ce_loss_2": 4.3134965896606445,
"ce_loss_3": 4.111241257190704,
"ce_loss_7": 3.779346799850464,
"epoch": 0.19,
"grad_norm": 456.0,
"kl_loss_10": 127.23999710083008,
"kl_loss_2": 1364.8941589355468,
"kl_loss_3": 960.9680725097656,
"kl_loss_7": 248.77009048461915,
"learning_rate": 0.0009206267664155906,
"loss": 685.2967,
"step": 1900
},
{
"ce_loss_10": 3.6354769825935365,
"ce_loss_13": 3.556038224697113,
"ce_loss_2": 4.224778318405152,
"ce_loss_3": 4.022096812725067,
"ce_loss_7": 3.690963363647461,
"epoch": 0.191,
"grad_norm": 524.0,
"kl_loss_10": 125.37596015930175,
"kl_loss_2": 1371.057391357422,
"kl_loss_3": 969.1447021484375,
"kl_loss_7": 243.19865951538085,
"learning_rate": 0.0009197668352441024,
"loss": 678.1597,
"step": 1910
},
{
"ce_loss_10": 3.6849255323410035,
"ce_loss_13": 3.6070022225379943,
"ce_loss_2": 4.272895455360413,
"ce_loss_3": 4.0722639799118046,
"ce_loss_7": 3.741350519657135,
"epoch": 0.192,
"grad_norm": 512.0,
"kl_loss_10": 128.65237312316896,
"kl_loss_2": 1349.729931640625,
"kl_loss_3": 949.7346984863282,
"kl_loss_7": 242.41346817016603,
"learning_rate": 0.0009189026770252437,
"loss": 671.3585,
"step": 1920
},
{
"ce_loss_10": 3.7201656699180603,
"ce_loss_13": 3.6394999861717223,
"ce_loss_2": 4.302338600158691,
"ce_loss_3": 4.1032923579216005,
"ce_loss_7": 3.7764319658279417,
"epoch": 0.193,
"grad_norm": 458.0,
"kl_loss_10": 133.32494201660157,
"kl_loss_2": 1342.450421142578,
"kl_loss_3": 949.4765075683594,
"kl_loss_7": 246.09827728271483,
"learning_rate": 0.000918034300461078,
"loss": 688.7368,
"step": 1930
},
{
"ce_loss_10": 3.747203004360199,
"ce_loss_13": 3.6689361929893494,
"ce_loss_2": 4.312567710876465,
"ce_loss_3": 4.122261881828308,
"ce_loss_7": 3.8043017029762267,
"epoch": 0.194,
"grad_norm": 446.0,
"kl_loss_10": 129.00423164367675,
"kl_loss_2": 1325.102197265625,
"kl_loss_3": 931.5237396240234,
"kl_loss_7": 241.53863983154298,
"learning_rate": 0.0009171617142961477,
"loss": 661.2737,
"step": 1940
},
{
"ce_loss_10": 3.699472951889038,
"ce_loss_13": 3.6279419898986816,
"ce_loss_2": 4.281847763061523,
"ce_loss_3": 4.083541011810302,
"ce_loss_7": 3.7648919463157653,
"epoch": 0.195,
"grad_norm": 434.0,
"kl_loss_10": 121.35350723266602,
"kl_loss_2": 1352.1436584472656,
"kl_loss_3": 952.5272399902344,
"kl_loss_7": 240.53460235595702,
"learning_rate": 0.0009162849273173857,
"loss": 665.7376,
"step": 1950
},
{
"ce_loss_10": 3.632410800457001,
"ce_loss_13": 3.5614632248878477,
"ce_loss_2": 4.223404765129089,
"ce_loss_3": 4.023203945159912,
"ce_loss_7": 3.700835573673248,
"epoch": 0.196,
"grad_norm": 470.0,
"kl_loss_10": 118.8283805847168,
"kl_loss_2": 1344.0367370605468,
"kl_loss_3": 944.1380065917969,
"kl_loss_7": 251.04139099121093,
"learning_rate": 0.0009154039483540273,
"loss": 672.422,
"step": 1960
},
{
"ce_loss_10": 3.6197654128074648,
"ce_loss_13": 3.546481454372406,
"ce_loss_2": 4.201360607147217,
"ce_loss_3": 4.004413700103759,
"ce_loss_7": 3.682723355293274,
"epoch": 0.197,
"grad_norm": 406.0,
"kl_loss_10": 120.08837623596192,
"kl_loss_2": 1349.5242309570312,
"kl_loss_3": 942.7119018554688,
"kl_loss_7": 243.98333435058595,
"learning_rate": 0.0009145187862775209,
"loss": 667.6594,
"step": 1970
},
{
"ce_loss_10": 3.6506257891654967,
"ce_loss_13": 3.5804669737815855,
"ce_loss_2": 4.243929970264435,
"ce_loss_3": 4.034862732887268,
"ce_loss_7": 3.7135458827018737,
"epoch": 0.198,
"grad_norm": 620.0,
"kl_loss_10": 117.48477897644042,
"kl_loss_2": 1377.6264770507812,
"kl_loss_3": 958.5087188720703,
"kl_loss_7": 243.4327537536621,
"learning_rate": 0.0009136294500014386,
"loss": 665.5496,
"step": 1980
},
{
"ce_loss_10": 3.599961686134338,
"ce_loss_13": 3.528086531162262,
"ce_loss_2": 4.217166924476624,
"ce_loss_3": 4.008637738227844,
"ce_loss_7": 3.6669308066368105,
"epoch": 0.199,
"grad_norm": 616.0,
"kl_loss_10": 115.34629516601562,
"kl_loss_2": 1399.4097900390625,
"kl_loss_3": 983.5843353271484,
"kl_loss_7": 244.90453720092773,
"learning_rate": 0.000912735948481387,
"loss": 681.3188,
"step": 1990
},
{
"ce_loss_10": 3.6347181677818297,
"ce_loss_13": 3.560755395889282,
"ce_loss_2": 4.230910205841065,
"ce_loss_3": 4.03362866640091,
"ce_loss_7": 3.700130546092987,
"epoch": 0.2,
"grad_norm": 492.0,
"kl_loss_10": 115.55288009643554,
"kl_loss_2": 1372.0876098632812,
"kl_loss_3": 976.6782318115235,
"kl_loss_7": 248.47412033081054,
"learning_rate": 0.0009118382907149164,
"loss": 666.3086,
"step": 2000
},
{
"ce_loss_10": 3.6592599511146546,
"ce_loss_13": 3.5870088934898376,
"ce_loss_2": 4.23843743801117,
"ce_loss_3": 4.045619630813599,
"ce_loss_7": 3.722980320453644,
"epoch": 0.201,
"grad_norm": 492.0,
"kl_loss_10": 114.64969139099121,
"kl_loss_2": 1351.183447265625,
"kl_loss_3": 956.2785675048829,
"kl_loss_7": 247.1648811340332,
"learning_rate": 0.0009109364857414306,
"loss": 658.4385,
"step": 2010
},
{
"ce_loss_10": 3.6247077345848084,
"ce_loss_13": 3.5549973130226133,
"ce_loss_2": 4.192802679538727,
"ce_loss_3": 4.006897258758545,
"ce_loss_7": 3.694356381893158,
"epoch": 0.202,
"grad_norm": 432.0,
"kl_loss_10": 111.96462211608886,
"kl_loss_2": 1332.7575988769531,
"kl_loss_3": 943.8550109863281,
"kl_loss_7": 248.51104660034179,
"learning_rate": 0.0009100305426420956,
"loss": 673.1317,
"step": 2020
},
{
"ce_loss_10": 3.5841406345367433,
"ce_loss_13": 3.5164321780204775,
"ce_loss_2": 4.202693927288055,
"ce_loss_3": 3.9953248143196105,
"ce_loss_7": 3.650038242340088,
"epoch": 0.203,
"grad_norm": 432.0,
"kl_loss_10": 113.3315975189209,
"kl_loss_2": 1413.3973693847656,
"kl_loss_3": 984.508837890625,
"kl_loss_7": 247.24474029541017,
"learning_rate": 0.0009091204705397484,
"loss": 669.0848,
"step": 2030
},
{
"ce_loss_10": 3.585637128353119,
"ce_loss_13": 3.5091155648231505,
"ce_loss_2": 4.185344040393829,
"ce_loss_3": 3.992532753944397,
"ce_loss_7": 3.6510769963264464,
"epoch": 0.204,
"grad_norm": 448.0,
"kl_loss_10": 124.74103927612305,
"kl_loss_2": 1400.9709167480469,
"kl_loss_3": 992.3661834716797,
"kl_loss_7": 250.7290901184082,
"learning_rate": 0.0009082062785988049,
"loss": 681.1268,
"step": 2040
},
{
"ce_loss_10": 3.721049964427948,
"ce_loss_13": 3.6463207244873046,
"ce_loss_2": 4.2808568477630615,
"ce_loss_3": 4.093539321422577,
"ce_loss_7": 3.7847790718078613,
"epoch": 0.205,
"grad_norm": 466.0,
"kl_loss_10": 117.74674758911132,
"kl_loss_2": 1322.8293701171874,
"kl_loss_3": 931.9617309570312,
"kl_loss_7": 242.782958984375,
"learning_rate": 0.0009072879760251679,
"loss": 667.7382,
"step": 2050
},
{
"ce_loss_10": 3.6576568126678466,
"ce_loss_13": 3.5876036405563356,
"ce_loss_2": 4.261196970939636,
"ce_loss_3": 4.065271866321564,
"ce_loss_7": 3.727122116088867,
"epoch": 0.206,
"grad_norm": 510.0,
"kl_loss_10": 116.0688491821289,
"kl_loss_2": 1368.6263488769532,
"kl_loss_3": 974.0030578613281,
"kl_loss_7": 247.75207290649413,
"learning_rate": 0.0009063655720661341,
"loss": 667.9454,
"step": 2060
},
{
"ce_loss_10": 3.7091850519180296,
"ce_loss_13": 3.6359564065933228,
"ce_loss_2": 4.2861632108688354,
"ce_loss_3": 4.091731405258178,
"ce_loss_7": 3.780376970767975,
"epoch": 0.207,
"grad_norm": 756.0,
"kl_loss_10": 117.6738265991211,
"kl_loss_2": 1338.1011291503905,
"kl_loss_3": 948.2550628662109,
"kl_loss_7": 265.19689025878904,
"learning_rate": 0.000905439076010301,
"loss": 666.8086,
"step": 2070
},
{
"ce_loss_10": 3.661241602897644,
"ce_loss_13": 3.5897186398506165,
"ce_loss_2": 4.2563663721084595,
"ce_loss_3": 4.06014586687088,
"ce_loss_7": 3.7477613568305967,
"epoch": 0.208,
"grad_norm": 502.0,
"kl_loss_10": 114.75448532104492,
"kl_loss_2": 1354.8022583007812,
"kl_loss_3": 965.9129913330078,
"kl_loss_7": 280.605126953125,
"learning_rate": 0.0009045084971874737,
"loss": 668.0169,
"step": 2080
},
{
"ce_loss_10": 3.6376566290855408,
"ce_loss_13": 3.568475866317749,
"ce_loss_2": 4.219207537174225,
"ce_loss_3": 4.027460336685181,
"ce_loss_7": 3.7098584175109863,
"epoch": 0.209,
"grad_norm": 476.0,
"kl_loss_10": 112.78402214050293,
"kl_loss_2": 1340.8613708496093,
"kl_loss_3": 948.0414184570312,
"kl_loss_7": 264.3713745117187,
"learning_rate": 0.0009035738449685707,
"loss": 673.0266,
"step": 2090
},
{
"ce_loss_10": 3.5796483874320986,
"ce_loss_13": 3.507876431941986,
"ce_loss_2": 4.179275572299957,
"ce_loss_3": 3.9804170727729797,
"ce_loss_7": 3.6553168416023256,
"epoch": 0.21,
"grad_norm": 576.0,
"kl_loss_10": 116.00268287658692,
"kl_loss_2": 1358.9301879882812,
"kl_loss_3": 960.5301177978515,
"kl_loss_7": 258.80527420043944,
"learning_rate": 0.0009026351287655293,
"loss": 660.1454,
"step": 2100
},
{
"ce_loss_10": 3.7848559260368346,
"ce_loss_13": 3.7130728244781492,
"ce_loss_2": 4.324071049690247,
"ce_loss_3": 4.141572403907776,
"ce_loss_7": 3.8475868105888367,
"epoch": 0.211,
"grad_norm": 410.0,
"kl_loss_10": 115.61974067687989,
"kl_loss_2": 1276.820086669922,
"kl_loss_3": 901.7400726318359,
"kl_loss_7": 240.78678359985352,
"learning_rate": 0.0009016923580312113,
"loss": 636.6335,
"step": 2110
},
{
"ce_loss_10": 3.6301342844963074,
"ce_loss_13": 3.560654580593109,
"ce_loss_2": 4.20012868642807,
"ce_loss_3": 4.0125791192054745,
"ce_loss_7": 3.6946483969688417,
"epoch": 0.212,
"grad_norm": 462.0,
"kl_loss_10": 112.99716110229492,
"kl_loss_2": 1313.61005859375,
"kl_loss_3": 932.8424255371094,
"kl_loss_7": 240.82636260986328,
"learning_rate": 0.0009007455422593077,
"loss": 661.1402,
"step": 2120
},
{
"ce_loss_10": 3.643904185295105,
"ce_loss_13": 3.5732839465141297,
"ce_loss_2": 4.229895269870758,
"ce_loss_3": 4.039864921569825,
"ce_loss_7": 3.7100953698158263,
"epoch": 0.213,
"grad_norm": 544.0,
"kl_loss_10": 113.59500656127929,
"kl_loss_2": 1376.1138916015625,
"kl_loss_3": 982.583627319336,
"kl_loss_7": 245.73143768310547,
"learning_rate": 0.0008997946909842425,
"loss": 673.5951,
"step": 2130
},
{
"ce_loss_10": 3.6592751264572145,
"ce_loss_13": 3.5883899569511413,
"ce_loss_2": 4.269070100784302,
"ce_loss_3": 4.073972117900849,
"ce_loss_7": 3.7271844029426573,
"epoch": 0.214,
"grad_norm": 486.0,
"kl_loss_10": 115.54258766174317,
"kl_loss_2": 1390.4590087890624,
"kl_loss_3": 991.5129302978515,
"kl_loss_7": 248.1077392578125,
"learning_rate": 0.0008988398137810777,
"loss": 666.5385,
"step": 2140
},
{
"ce_loss_10": 3.696693778038025,
"ce_loss_13": 3.6274760484695436,
"ce_loss_2": 4.272996628284455,
"ce_loss_3": 4.077895438671112,
"ce_loss_7": 3.759436583518982,
"epoch": 0.215,
"grad_norm": 410.0,
"kl_loss_10": 109.15078239440918,
"kl_loss_2": 1323.0664428710938,
"kl_loss_3": 929.4178771972656,
"kl_loss_7": 235.6734588623047,
"learning_rate": 0.0008978809202654162,
"loss": 648.7643,
"step": 2150
},
{
"ce_loss_10": 3.674948477745056,
"ce_loss_13": 3.6074150681495665,
"ce_loss_2": 4.254977214336395,
"ce_loss_3": 4.054577016830445,
"ce_loss_7": 3.7380695223808287,
"epoch": 0.216,
"grad_norm": 342.0,
"kl_loss_10": 111.83034286499023,
"kl_loss_2": 1326.1663391113282,
"kl_loss_3": 930.476919555664,
"kl_loss_7": 237.29275283813476,
"learning_rate": 0.0008969180200933046,
"loss": 659.8788,
"step": 2160
},
{
"ce_loss_10": 3.633396315574646,
"ce_loss_13": 3.5632909893989564,
"ce_loss_2": 4.235332405567169,
"ce_loss_3": 4.043020272254944,
"ce_loss_7": 3.700575852394104,
"epoch": 0.217,
"grad_norm": 426.0,
"kl_loss_10": 113.71143455505371,
"kl_loss_2": 1376.4859924316406,
"kl_loss_3": 965.2266693115234,
"kl_loss_7": 244.74252319335938,
"learning_rate": 0.0008959511229611376,
"loss": 671.5447,
"step": 2170
},
{
"ce_loss_10": 3.7160756826400756,
"ce_loss_13": 3.6463282823562624,
"ce_loss_2": 4.29187992811203,
"ce_loss_3": 4.0930745005607605,
"ce_loss_7": 3.77754408121109,
"epoch": 0.218,
"grad_norm": 494.0,
"kl_loss_10": 112.80306968688964,
"kl_loss_2": 1327.3540283203124,
"kl_loss_3": 931.4955108642578,
"kl_loss_7": 236.75977325439453,
"learning_rate": 0.0008949802386055581,
"loss": 652.6458,
"step": 2180
},
{
"ce_loss_10": 3.5766260862350463,
"ce_loss_13": 3.5034859418869018,
"ce_loss_2": 4.159404098987579,
"ce_loss_3": 3.9599217534065247,
"ce_loss_7": 3.6389345288276673,
"epoch": 0.219,
"grad_norm": 466.0,
"kl_loss_10": 111.61733703613281,
"kl_loss_2": 1335.0789672851563,
"kl_loss_3": 936.6266204833985,
"kl_loss_7": 234.00814971923828,
"learning_rate": 0.0008940053768033609,
"loss": 665.0317,
"step": 2190
},
{
"ce_loss_10": 3.65551677942276,
"ce_loss_13": 3.58828284740448,
"ce_loss_2": 4.223571491241455,
"ce_loss_3": 4.04930864572525,
"ce_loss_7": 3.7170740485191347,
"epoch": 0.22,
"grad_norm": 500.0,
"kl_loss_10": 111.12692604064941,
"kl_loss_2": 1315.6073425292968,
"kl_loss_3": 947.5290161132813,
"kl_loss_7": 230.48658447265626,
"learning_rate": 0.0008930265473713938,
"loss": 654.9715,
"step": 2200
},
{
"ce_loss_10": 3.6195818066596983,
"ce_loss_13": 3.54861319065094,
"ce_loss_2": 4.198423433303833,
"ce_loss_3": 4.012469959259033,
"ce_loss_7": 3.681425595283508,
"epoch": 0.221,
"grad_norm": 528.0,
"kl_loss_10": 115.1269718170166,
"kl_loss_2": 1324.1462280273438,
"kl_loss_3": 954.0400299072265,
"kl_loss_7": 233.1149475097656,
"learning_rate": 0.0008920437601664579,
"loss": 648.2547,
"step": 2210
},
{
"ce_loss_10": 3.6091471552848815,
"ce_loss_13": 3.539783036708832,
"ce_loss_2": 4.181177127361297,
"ce_loss_3": 3.993445408344269,
"ce_loss_7": 3.669792366027832,
"epoch": 0.222,
"grad_norm": 410.0,
"kl_loss_10": 115.02236862182617,
"kl_loss_2": 1333.0483093261719,
"kl_loss_3": 948.5272521972656,
"kl_loss_7": 236.40514373779297,
"learning_rate": 0.0008910570250852097,
"loss": 647.6241,
"step": 2220
},
{
"ce_loss_10": 3.7300463914871216,
"ce_loss_13": 3.657940351963043,
"ce_loss_2": 4.2689752101898195,
"ce_loss_3": 4.08460431098938,
"ce_loss_7": 3.786776435375214,
"epoch": 0.223,
"grad_norm": 396.0,
"kl_loss_10": 119.05343589782714,
"kl_loss_2": 1270.7809265136718,
"kl_loss_3": 894.4331726074219,
"kl_loss_7": 233.2676574707031,
"learning_rate": 0.0008900663520640604,
"loss": 634.1773,
"step": 2230
},
{
"ce_loss_10": 3.668592298030853,
"ce_loss_13": 3.5982241868972777,
"ce_loss_2": 4.234554326534271,
"ce_loss_3": 4.045566046237946,
"ce_loss_7": 3.7271997809410093,
"epoch": 0.224,
"grad_norm": 484.0,
"kl_loss_10": 115.87549057006837,
"kl_loss_2": 1305.9935668945313,
"kl_loss_3": 922.4547302246094,
"kl_loss_7": 233.83654251098633,
"learning_rate": 0.0008890717510790764,
"loss": 651.4086,
"step": 2240
},
{
"ce_loss_10": 3.624187970161438,
"ce_loss_13": 3.553602933883667,
"ce_loss_2": 4.209651458263397,
"ce_loss_3": 4.014156377315521,
"ce_loss_7": 3.6865435361862184,
"epoch": 0.225,
"grad_norm": 428.0,
"kl_loss_10": 111.85544166564941,
"kl_loss_2": 1344.305419921875,
"kl_loss_3": 948.2704040527344,
"kl_loss_7": 233.77221450805663,
"learning_rate": 0.0008880732321458784,
"loss": 659.9288,
"step": 2250
},
{
"ce_loss_10": 3.6572599172592164,
"ce_loss_13": 3.589207625389099,
"ce_loss_2": 4.229009163379669,
"ce_loss_3": 4.035797142982483,
"ce_loss_7": 3.720983362197876,
"epoch": 0.226,
"grad_norm": 450.0,
"kl_loss_10": 112.32478141784668,
"kl_loss_2": 1315.7465759277343,
"kl_loss_3": 923.5756164550781,
"kl_loss_7": 233.98969955444335,
"learning_rate": 0.0008870708053195413,
"loss": 656.0779,
"step": 2260
},
{
"ce_loss_10": 3.6822890281677245,
"ce_loss_13": 3.613891136646271,
"ce_loss_2": 4.243361723423004,
"ce_loss_3": 4.054495620727539,
"ce_loss_7": 3.7477814078330995,
"epoch": 0.227,
"grad_norm": 510.0,
"kl_loss_10": 109.18305702209473,
"kl_loss_2": 1298.216571044922,
"kl_loss_3": 915.6232360839844,
"kl_loss_7": 232.2860221862793,
"learning_rate": 0.0008860644806944918,
"loss": 640.8393,
"step": 2270
},
{
"ce_loss_10": 3.6228482365608214,
"ce_loss_13": 3.55364191532135,
"ce_loss_2": 4.206750881671906,
"ce_loss_3": 4.00951054096222,
"ce_loss_7": 3.692072665691376,
"epoch": 0.228,
"grad_norm": 516.0,
"kl_loss_10": 112.56623115539551,
"kl_loss_2": 1346.1175109863282,
"kl_loss_3": 947.8353302001954,
"kl_loss_7": 250.14039154052733,
"learning_rate": 0.0008850542684044079,
"loss": 646.6089,
"step": 2280
},
{
"ce_loss_10": 3.594875121116638,
"ce_loss_13": 3.5233037948608397,
"ce_loss_2": 4.204600942134857,
"ce_loss_3": 3.998581278324127,
"ce_loss_7": 3.665067207813263,
"epoch": 0.229,
"grad_norm": 450.0,
"kl_loss_10": 112.58180122375488,
"kl_loss_2": 1387.7576477050782,
"kl_loss_3": 974.7930572509765,
"kl_loss_7": 248.70092544555663,
"learning_rate": 0.0008840401786221159,
"loss": 661.1442,
"step": 2290
},
{
"ce_loss_10": 3.731163203716278,
"ce_loss_13": 3.6657732129096985,
"ce_loss_2": 4.296657812595368,
"ce_loss_3": 4.099385142326355,
"ce_loss_7": 3.791227328777313,
"epoch": 0.23,
"grad_norm": 480.0,
"kl_loss_10": 108.29386520385742,
"kl_loss_2": 1301.2722473144531,
"kl_loss_3": 910.2853973388671,
"kl_loss_7": 230.31064682006837,
"learning_rate": 0.000883022221559489,
"loss": 636.6557,
"step": 2300
},
{
"ce_loss_10": 3.68310546875,
"ce_loss_13": 3.615295338630676,
"ce_loss_2": 4.257085943222046,
"ce_loss_3": 4.067182207107544,
"ce_loss_7": 3.7437336564064028,
"epoch": 0.231,
"grad_norm": 446.0,
"kl_loss_10": 109.58497161865235,
"kl_loss_2": 1331.8181457519531,
"kl_loss_3": 935.3311614990234,
"kl_loss_7": 230.688321685791,
"learning_rate": 0.0008820004074673434,
"loss": 666.0036,
"step": 2310
},
{
"ce_loss_10": 3.588702178001404,
"ce_loss_13": 3.524743127822876,
"ce_loss_2": 4.165178096294403,
"ce_loss_3": 3.97126362323761,
"ce_loss_7": 3.6520536303520204,
"epoch": 0.232,
"grad_norm": 494.0,
"kl_loss_10": 105.68032302856446,
"kl_loss_2": 1342.763037109375,
"kl_loss_3": 936.0263000488281,
"kl_loss_7": 229.52542724609376,
"learning_rate": 0.0008809747466353355,
"loss": 641.3422,
"step": 2320
},
{
"ce_loss_10": 3.602530860900879,
"ce_loss_13": 3.5331971526145933,
"ce_loss_2": 4.167471373081208,
"ce_loss_3": 3.977475893497467,
"ce_loss_7": 3.6642409324645997,
"epoch": 0.233,
"grad_norm": 458.0,
"kl_loss_10": 110.41828346252441,
"kl_loss_2": 1312.3292907714845,
"kl_loss_3": 919.6944396972656,
"kl_loss_7": 231.60648040771486,
"learning_rate": 0.0008799452493918585,
"loss": 645.3,
"step": 2330
},
{
"ce_loss_10": 3.6880576372146607,
"ce_loss_13": 3.6198028326034546,
"ce_loss_2": 4.254074168205261,
"ce_loss_3": 4.062888276576996,
"ce_loss_7": 3.7507563471794128,
"epoch": 0.234,
"grad_norm": 474.0,
"kl_loss_10": 110.42879600524903,
"kl_loss_2": 1309.2276428222656,
"kl_loss_3": 921.8907257080078,
"kl_loss_7": 233.63690185546875,
"learning_rate": 0.0008789119261039385,
"loss": 662.3614,
"step": 2340
},
{
"ce_loss_10": 3.5944358229637148,
"ce_loss_13": 3.5265544295310973,
"ce_loss_2": 4.165751957893372,
"ce_loss_3": 3.9788960099220274,
"ce_loss_7": 3.656530773639679,
"epoch": 0.235,
"grad_norm": 390.0,
"kl_loss_10": 106.77197875976563,
"kl_loss_2": 1303.9624755859375,
"kl_loss_3": 920.4357025146485,
"kl_loss_7": 233.87101974487305,
"learning_rate": 0.0008778747871771292,
"loss": 636.2241,
"step": 2350
},
{
"ce_loss_10": 3.642832565307617,
"ce_loss_13": 3.5764389514923094,
"ce_loss_2": 4.1962644219398495,
"ce_loss_3": 4.005896735191345,
"ce_loss_7": 3.703742432594299,
"epoch": 0.236,
"grad_norm": 488.0,
"kl_loss_10": 106.56211357116699,
"kl_loss_2": 1280.7797607421876,
"kl_loss_3": 899.6746459960938,
"kl_loss_7": 226.2964195251465,
"learning_rate": 0.0008768338430554083,
"loss": 628.8105,
"step": 2360
},
{
"ce_loss_10": 3.6540219306945803,
"ce_loss_13": 3.5862942576408385,
"ce_loss_2": 4.217021405696869,
"ce_loss_3": 4.0323525190353395,
"ce_loss_7": 3.715273082256317,
"epoch": 0.237,
"grad_norm": 446.0,
"kl_loss_10": 108.96415519714355,
"kl_loss_2": 1303.7487731933593,
"kl_loss_3": 917.9468658447265,
"kl_loss_7": 231.81985321044922,
"learning_rate": 0.0008757891042210713,
"loss": 643.5909,
"step": 2370
},
{
"ce_loss_10": 3.6785866141319277,
"ce_loss_13": 3.6081984996795655,
"ce_loss_2": 4.2465239524841305,
"ce_loss_3": 4.049813544750213,
"ce_loss_7": 3.7435325622558593,
"epoch": 0.238,
"grad_norm": 504.0,
"kl_loss_10": 111.76208610534668,
"kl_loss_2": 1305.791943359375,
"kl_loss_3": 916.3558563232422,
"kl_loss_7": 238.5102066040039,
"learning_rate": 0.0008747405811946271,
"loss": 645.7604,
"step": 2380
},
{
"ce_loss_10": 3.5631368517875672,
"ce_loss_13": 3.4966716051101683,
"ce_loss_2": 4.149629712104797,
"ce_loss_3": 3.960558259487152,
"ce_loss_7": 3.629357707500458,
"epoch": 0.239,
"grad_norm": 466.0,
"kl_loss_10": 110.7029800415039,
"kl_loss_2": 1342.1500549316406,
"kl_loss_3": 952.2329833984375,
"kl_loss_7": 240.75519790649415,
"learning_rate": 0.0008736882845346905,
"loss": 640.1473,
"step": 2390
},
{
"ce_loss_10": 3.6702625513076783,
"ce_loss_13": 3.598974347114563,
"ce_loss_2": 4.235510897636414,
"ce_loss_3": 4.041280698776245,
"ce_loss_7": 3.734322738647461,
"epoch": 0.24,
"grad_norm": 504.0,
"kl_loss_10": 116.23694610595703,
"kl_loss_2": 1302.5589904785156,
"kl_loss_3": 916.9543487548829,
"kl_loss_7": 247.71970977783204,
"learning_rate": 0.0008726322248378774,
"loss": 637.9229,
"step": 2400
},
{
"ce_loss_10": 3.6652265906333925,
"ce_loss_13": 3.595516872406006,
"ce_loss_2": 4.246520745754242,
"ce_loss_3": 4.043909120559692,
"ce_loss_7": 3.7296547532081603,
"epoch": 0.241,
"grad_norm": 450.0,
"kl_loss_10": 113.42751388549804,
"kl_loss_2": 1334.7802490234376,
"kl_loss_3": 932.5218017578125,
"kl_loss_7": 240.9356887817383,
"learning_rate": 0.0008715724127386971,
"loss": 657.7121,
"step": 2410
},
{
"ce_loss_10": 3.7339873194694517,
"ce_loss_13": 3.6643826484680178,
"ce_loss_2": 4.278091061115265,
"ce_loss_3": 4.095656621456146,
"ce_loss_7": 3.791609489917755,
"epoch": 0.242,
"grad_norm": 462.0,
"kl_loss_10": 112.56025924682618,
"kl_loss_2": 1285.8053466796875,
"kl_loss_3": 903.9169921875,
"kl_loss_7": 234.85026092529296,
"learning_rate": 0.0008705088589094458,
"loss": 638.7832,
"step": 2420
},
{
"ce_loss_10": 3.745934009552002,
"ce_loss_13": 3.677195417881012,
"ce_loss_2": 4.306411802768707,
"ce_loss_3": 4.115467298030853,
"ce_loss_7": 3.8079170107841493,
"epoch": 0.243,
"grad_norm": 414.0,
"kl_loss_10": 111.40414924621582,
"kl_loss_2": 1291.0523193359375,
"kl_loss_3": 906.0183563232422,
"kl_loss_7": 231.1467155456543,
"learning_rate": 0.0008694415740600988,
"loss": 640.6179,
"step": 2430
},
{
"ce_loss_10": 3.595633792877197,
"ce_loss_13": 3.5286824345588683,
"ce_loss_2": 4.186046612262726,
"ce_loss_3": 3.9901591181755065,
"ce_loss_7": 3.6573471426963806,
"epoch": 0.244,
"grad_norm": 500.0,
"kl_loss_10": 109.70008354187011,
"kl_loss_2": 1348.1939697265625,
"kl_loss_3": 959.4411895751953,
"kl_loss_7": 231.17713012695313,
"learning_rate": 0.0008683705689382025,
"loss": 654.2107,
"step": 2440
},
{
"ce_loss_10": 3.684832978248596,
"ce_loss_13": 3.616632854938507,
"ce_loss_2": 4.231842195987701,
"ce_loss_3": 4.048018515110016,
"ce_loss_7": 3.740548253059387,
"epoch": 0.245,
"grad_norm": 450.0,
"kl_loss_10": 108.41531753540039,
"kl_loss_2": 1280.763720703125,
"kl_loss_3": 904.3856048583984,
"kl_loss_7": 224.61626434326172,
"learning_rate": 0.0008672958543287666,
"loss": 648.306,
"step": 2450
},
{
"ce_loss_10": 3.6943544864654543,
"ce_loss_13": 3.625825345516205,
"ce_loss_2": 4.246035170555115,
"ce_loss_3": 4.063331556320191,
"ce_loss_7": 3.75509090423584,
"epoch": 0.246,
"grad_norm": 428.0,
"kl_loss_10": 109.8709056854248,
"kl_loss_2": 1283.2248840332031,
"kl_loss_3": 904.9634002685547,
"kl_loss_7": 228.17876358032225,
"learning_rate": 0.0008662174410541554,
"loss": 632.1618,
"step": 2460
},
{
"ce_loss_10": 3.6546427369117738,
"ce_loss_13": 3.587355947494507,
"ce_loss_2": 4.204605233669281,
"ce_loss_3": 4.022764265537262,
"ce_loss_7": 3.717895233631134,
"epoch": 0.247,
"grad_norm": 394.0,
"kl_loss_10": 107.19166374206543,
"kl_loss_2": 1283.1351989746095,
"kl_loss_3": 906.7954956054688,
"kl_loss_7": 227.77373123168945,
"learning_rate": 0.0008651353399739787,
"loss": 642.6704,
"step": 2470
},
{
"ce_loss_10": 3.685038208961487,
"ce_loss_13": 3.6166242718696595,
"ce_loss_2": 4.247460579872131,
"ce_loss_3": 4.056502640247345,
"ce_loss_7": 3.7451204299926757,
"epoch": 0.248,
"grad_norm": 520.0,
"kl_loss_10": 109.50839309692383,
"kl_loss_2": 1297.0845886230468,
"kl_loss_3": 908.0451263427734,
"kl_loss_7": 229.69447708129883,
"learning_rate": 0.0008640495619849821,
"loss": 636.7805,
"step": 2480
},
{
"ce_loss_10": 3.6444509506225584,
"ce_loss_13": 3.5771190404891966,
"ce_loss_2": 4.202155363559723,
"ce_loss_3": 4.00964595079422,
"ce_loss_7": 3.709311318397522,
"epoch": 0.249,
"grad_norm": 492.0,
"kl_loss_10": 107.21613540649415,
"kl_loss_2": 1287.2485595703124,
"kl_loss_3": 903.1389892578125,
"kl_loss_7": 236.67683792114258,
"learning_rate": 0.0008629601180209381,
"loss": 632.6728,
"step": 2490
},
{
"ce_loss_10": 3.640513265132904,
"ce_loss_13": 3.571250784397125,
"ce_loss_2": 4.189491713047028,
"ce_loss_3": 4.000437986850739,
"ce_loss_7": 3.6998685002326965,
"epoch": 0.25,
"grad_norm": 388.0,
"kl_loss_10": 109.32069320678711,
"kl_loss_2": 1269.254248046875,
"kl_loss_3": 889.8983123779296,
"kl_loss_7": 235.86445999145508,
"learning_rate": 0.000861867019052535,
"loss": 634.9495,
"step": 2500
},
{
"ce_loss_10": 3.5532511711120605,
"ce_loss_13": 3.4857767462730407,
"ce_loss_2": 4.138734245300293,
"ce_loss_3": 3.9396822333335875,
"ce_loss_7": 3.6218135476112367,
"epoch": 0.251,
"grad_norm": 454.0,
"kl_loss_10": 108.08290519714356,
"kl_loss_2": 1328.9036437988282,
"kl_loss_3": 931.4583953857422,
"kl_loss_7": 239.18134002685548,
"learning_rate": 0.0008607702760872678,
"loss": 651.695,
"step": 2510
},
{
"ce_loss_10": 3.6758545279502868,
"ce_loss_13": 3.6099536180496217,
"ce_loss_2": 4.224261367321015,
"ce_loss_3": 4.035415709018707,
"ce_loss_7": 3.738192629814148,
"epoch": 0.252,
"grad_norm": 676.0,
"kl_loss_10": 109.95955963134766,
"kl_loss_2": 1264.877392578125,
"kl_loss_3": 890.2998840332032,
"kl_loss_7": 227.99790649414064,
"learning_rate": 0.0008596699001693256,
"loss": 638.9367,
"step": 2520
},
{
"ce_loss_10": 3.695500302314758,
"ce_loss_13": 3.61992267370224,
"ce_loss_2": 4.222428333759308,
"ce_loss_3": 4.035504674911499,
"ce_loss_7": 3.743453121185303,
"epoch": 0.253,
"grad_norm": 548.0,
"kl_loss_10": 127.16191024780274,
"kl_loss_2": 1273.7016052246095,
"kl_loss_3": 884.7771850585938,
"kl_loss_7": 228.30911254882812,
"learning_rate": 0.0008585659023794818,
"loss": 643.5041,
"step": 2530
},
{
"ce_loss_10": 3.637289047241211,
"ce_loss_13": 3.5686028838157653,
"ce_loss_2": 4.218254780769348,
"ce_loss_3": 4.019161069393158,
"ce_loss_7": 3.696817862987518,
"epoch": 0.254,
"grad_norm": 462.0,
"kl_loss_10": 120.60056190490722,
"kl_loss_2": 1330.6954162597656,
"kl_loss_3": 938.699496459961,
"kl_loss_7": 233.61075134277343,
"learning_rate": 0.0008574582938349817,
"loss": 644.681,
"step": 2540
},
{
"ce_loss_10": 3.6420682311058044,
"ce_loss_13": 3.567863130569458,
"ce_loss_2": 4.228188097476959,
"ce_loss_3": 4.029180979728698,
"ce_loss_7": 3.705214190483093,
"epoch": 0.255,
"grad_norm": 372.0,
"kl_loss_10": 117.31230735778809,
"kl_loss_2": 1354.2801513671875,
"kl_loss_3": 952.2808837890625,
"kl_loss_7": 241.81886978149413,
"learning_rate": 0.0008563470856894315,
"loss": 640.0965,
"step": 2550
},
{
"ce_loss_10": 3.6231508612632752,
"ce_loss_13": 3.5556546688079833,
"ce_loss_2": 4.189491558074951,
"ce_loss_3": 3.9987146973609926,
"ce_loss_7": 3.681329298019409,
"epoch": 0.256,
"grad_norm": 472.0,
"kl_loss_10": 108.91358489990235,
"kl_loss_2": 1298.7685607910157,
"kl_loss_3": 917.6602386474609,
"kl_loss_7": 230.57952499389648,
"learning_rate": 0.0008552322891326845,
"loss": 638.6699,
"step": 2560
},
{
"ce_loss_10": 3.5982382774353026,
"ce_loss_13": 3.528933322429657,
"ce_loss_2": 4.162684524059296,
"ce_loss_3": 3.9698683977127076,
"ce_loss_7": 3.656614398956299,
"epoch": 0.257,
"grad_norm": 434.0,
"kl_loss_10": 109.61449127197265,
"kl_loss_2": 1301.0992553710937,
"kl_loss_3": 921.5931274414063,
"kl_loss_7": 231.00868759155273,
"learning_rate": 0.0008541139153907296,
"loss": 634.7393,
"step": 2570
},
{
"ce_loss_10": 3.5526642203330994,
"ce_loss_13": 3.485519516468048,
"ce_loss_2": 4.114146530628204,
"ce_loss_3": 3.924081325531006,
"ce_loss_7": 3.614666759967804,
"epoch": 0.258,
"grad_norm": 548.0,
"kl_loss_10": 107.18747673034667,
"kl_loss_2": 1300.3511169433593,
"kl_loss_3": 919.3837615966797,
"kl_loss_7": 228.61345367431642,
"learning_rate": 0.0008529919757255782,
"loss": 640.0102,
"step": 2580
},
{
"ce_loss_10": 3.591010940074921,
"ce_loss_13": 3.519867956638336,
"ce_loss_2": 4.122671520709991,
"ce_loss_3": 3.9352630972862244,
"ce_loss_7": 3.642985260486603,
"epoch": 0.259,
"grad_norm": 462.0,
"kl_loss_10": 115.52005577087402,
"kl_loss_2": 1261.9588439941406,
"kl_loss_3": 881.7059204101563,
"kl_loss_7": 222.79493560791016,
"learning_rate": 0.0008518664814351503,
"loss": 624.9721,
"step": 2590
},
{
"ce_loss_10": 3.5559722065925596,
"ce_loss_13": 3.4851076006889343,
"ce_loss_2": 4.12727187871933,
"ce_loss_3": 3.9312629222869875,
"ce_loss_7": 3.6141554713249207,
"epoch": 0.26,
"grad_norm": 468.0,
"kl_loss_10": 118.2235237121582,
"kl_loss_2": 1329.9431396484374,
"kl_loss_3": 938.9490844726563,
"kl_loss_7": 229.97386627197267,
"learning_rate": 0.0008507374438531607,
"loss": 664.5543,
"step": 2600
},
{
"ce_loss_10": 3.52994726896286,
"ce_loss_13": 3.460645878314972,
"ce_loss_2": 4.086832702159882,
"ce_loss_3": 3.8989439606666565,
"ce_loss_7": 3.5843619465827943,
"epoch": 0.261,
"grad_norm": 454.0,
"kl_loss_10": 111.8594409942627,
"kl_loss_2": 1283.5894409179687,
"kl_loss_3": 906.6501983642578,
"kl_loss_7": 225.07547607421876,
"learning_rate": 0.0008496048743490053,
"loss": 631.2727,
"step": 2610
},
{
"ce_loss_10": 3.688689887523651,
"ce_loss_13": 3.6171088218688965,
"ce_loss_2": 4.232082653045654,
"ce_loss_3": 4.045144772529602,
"ce_loss_7": 3.7411328554153442,
"epoch": 0.262,
"grad_norm": 498.0,
"kl_loss_10": 112.36735153198242,
"kl_loss_2": 1262.6953674316405,
"kl_loss_3": 890.0498321533203,
"kl_loss_7": 224.3122886657715,
"learning_rate": 0.0008484687843276469,
"loss": 626.552,
"step": 2620
},
{
"ce_loss_10": 3.6142364263534548,
"ce_loss_13": 3.54564208984375,
"ce_loss_2": 4.1653601884841915,
"ce_loss_3": 3.979761373996735,
"ce_loss_7": 3.670048642158508,
"epoch": 0.263,
"grad_norm": 604.0,
"kl_loss_10": 113.79613609313965,
"kl_loss_2": 1291.2943481445313,
"kl_loss_3": 909.2907562255859,
"kl_loss_7": 229.63263626098632,
"learning_rate": 0.0008473291852294987,
"loss": 643.7754,
"step": 2630
},
{
"ce_loss_10": 3.624956822395325,
"ce_loss_13": 3.5516101837158205,
"ce_loss_2": 4.185651910305023,
"ce_loss_3": 3.9955446600914,
"ce_loss_7": 3.682393753528595,
"epoch": 0.264,
"grad_norm": 560.0,
"kl_loss_10": 118.29873504638672,
"kl_loss_2": 1318.540350341797,
"kl_loss_3": 922.2017700195313,
"kl_loss_7": 233.18995971679686,
"learning_rate": 0.0008461860885303114,
"loss": 639.2153,
"step": 2640
},
{
"ce_loss_10": 3.651511311531067,
"ce_loss_13": 3.5835230231285093,
"ce_loss_2": 4.192479598522186,
"ce_loss_3": 4.010076713562012,
"ce_loss_7": 3.705660367012024,
"epoch": 0.265,
"grad_norm": 532.0,
"kl_loss_10": 120.45394592285156,
"kl_loss_2": 1256.7761352539062,
"kl_loss_3": 889.5764984130859,
"kl_loss_7": 224.92701721191406,
"learning_rate": 0.000845039505741056,
"loss": 630.0308,
"step": 2650
},
{
"ce_loss_10": 3.6381911635398865,
"ce_loss_13": 3.567105031013489,
"ce_loss_2": 4.197239780426026,
"ce_loss_3": 4.007714176177979,
"ce_loss_7": 3.694069528579712,
"epoch": 0.266,
"grad_norm": 476.0,
"kl_loss_10": 120.83754425048828,
"kl_loss_2": 1304.788995361328,
"kl_loss_3": 924.8183044433594,
"kl_loss_7": 230.70355300903321,
"learning_rate": 0.0008438894484078086,
"loss": 659.7302,
"step": 2660
},
{
"ce_loss_10": 3.6475989818573,
"ce_loss_13": 3.575591731071472,
"ce_loss_2": 4.190777051448822,
"ce_loss_3": 4.004483807086944,
"ce_loss_7": 3.6995357990264894,
"epoch": 0.267,
"grad_norm": 486.0,
"kl_loss_10": 115.3904426574707,
"kl_loss_2": 1277.0895690917969,
"kl_loss_3": 898.4886535644531,
"kl_loss_7": 228.7105613708496,
"learning_rate": 0.0008427359281116334,
"loss": 634.3606,
"step": 2670
},
{
"ce_loss_10": 3.547777831554413,
"ce_loss_13": 3.4795953035354614,
"ce_loss_2": 4.113273656368255,
"ce_loss_3": 3.9340083956718446,
"ce_loss_7": 3.6087413907051085,
"epoch": 0.268,
"grad_norm": 572.0,
"kl_loss_10": 111.49882125854492,
"kl_loss_2": 1312.4553100585938,
"kl_loss_3": 937.0226959228515,
"kl_loss_7": 228.7356918334961,
"learning_rate": 0.0008415789564684673,
"loss": 643.8098,
"step": 2680
},
{
"ce_loss_10": 3.7991090655326842,
"ce_loss_13": 3.7250254154205322,
"ce_loss_2": 4.329492318630218,
"ce_loss_3": 4.153719592094421,
"ce_loss_7": 3.8572392106056212,
"epoch": 0.269,
"grad_norm": 536.0,
"kl_loss_10": 119.75568199157715,
"kl_loss_2": 1240.7658264160157,
"kl_loss_3": 900.5773712158203,
"kl_loss_7": 236.5658187866211,
"learning_rate": 0.0008404185451290017,
"loss": 621.7949,
"step": 2690
},
{
"ce_loss_10": 3.6571221590042113,
"ce_loss_13": 3.5904589772224424,
"ce_loss_2": 4.20020170211792,
"ce_loss_3": 4.020016396045685,
"ce_loss_7": 3.7206122040748597,
"epoch": 0.27,
"grad_norm": 612.0,
"kl_loss_10": 109.39498329162598,
"kl_loss_2": 1278.1862854003907,
"kl_loss_3": 903.8941192626953,
"kl_loss_7": 233.3107780456543,
"learning_rate": 0.0008392547057785661,
"loss": 629.3908,
"step": 2700
},
{
"ce_loss_10": 3.5812445521354674,
"ce_loss_13": 3.511835253238678,
"ce_loss_2": 4.144945275783539,
"ce_loss_3": 3.964122700691223,
"ce_loss_7": 3.65096001625061,
"epoch": 0.271,
"grad_norm": 536.0,
"kl_loss_10": 110.46146507263184,
"kl_loss_2": 1320.70322265625,
"kl_loss_3": 951.1240264892579,
"kl_loss_7": 251.59460906982423,
"learning_rate": 0.0008380874501370098,
"loss": 636.0247,
"step": 2710
},
{
"ce_loss_10": 3.576056122779846,
"ce_loss_13": 3.5100281834602356,
"ce_loss_2": 4.140222942829132,
"ce_loss_3": 3.9531075954437256,
"ce_loss_7": 3.6410070419311524,
"epoch": 0.272,
"grad_norm": 544.0,
"kl_loss_10": 110.53367462158204,
"kl_loss_2": 1306.970849609375,
"kl_loss_3": 923.5474395751953,
"kl_loss_7": 236.50457839965821,
"learning_rate": 0.0008369167899585841,
"loss": 640.9572,
"step": 2720
},
{
"ce_loss_10": 3.6997987627983093,
"ce_loss_13": 3.635802137851715,
"ce_loss_2": 4.225974369049072,
"ce_loss_3": 4.0530330538749695,
"ce_loss_7": 3.760606610774994,
"epoch": 0.273,
"grad_norm": 700.0,
"kl_loss_10": 106.76014976501465,
"kl_loss_2": 1235.3878662109375,
"kl_loss_3": 873.6434448242187,
"kl_loss_7": 224.53188552856446,
"learning_rate": 0.0008357427370318238,
"loss": 630.9094,
"step": 2730
},
{
"ce_loss_10": 3.6538386702537538,
"ce_loss_13": 3.5854528903961183,
"ce_loss_2": 4.205159163475036,
"ce_loss_3": 4.013937699794769,
"ce_loss_7": 3.712061953544617,
"epoch": 0.274,
"grad_norm": 448.0,
"kl_loss_10": 110.17894134521484,
"kl_loss_2": 1286.5609130859375,
"kl_loss_3": 904.9044860839844,
"kl_loss_7": 229.6865478515625,
"learning_rate": 0.0008345653031794292,
"loss": 635.8903,
"step": 2740
},
{
"ce_loss_10": 3.6535327553749086,
"ce_loss_13": 3.5872610807418823,
"ce_loss_2": 4.199507582187652,
"ce_loss_3": 4.0229881525039675,
"ce_loss_7": 3.712740111351013,
"epoch": 0.275,
"grad_norm": 494.0,
"kl_loss_10": 108.84803733825683,
"kl_loss_2": 1267.8911865234375,
"kl_loss_3": 897.5597198486328,
"kl_loss_7": 226.9661651611328,
"learning_rate": 0.0008333845002581458,
"loss": 628.4583,
"step": 2750
},
{
"ce_loss_10": 3.5756468772888184,
"ce_loss_13": 3.5082659125328064,
"ce_loss_2": 4.145406031608582,
"ce_loss_3": 3.954765808582306,
"ce_loss_7": 3.6372469902038573,
"epoch": 0.276,
"grad_norm": 442.0,
"kl_loss_10": 107.86047019958497,
"kl_loss_2": 1333.4900146484374,
"kl_loss_3": 936.4077697753906,
"kl_loss_7": 230.6979835510254,
"learning_rate": 0.0008322003401586462,
"loss": 647.3615,
"step": 2760
},
{
"ce_loss_10": 3.615983176231384,
"ce_loss_13": 3.5494200587272644,
"ce_loss_2": 4.153625726699829,
"ce_loss_3": 3.9661497712135314,
"ce_loss_7": 3.670648729801178,
"epoch": 0.277,
"grad_norm": 456.0,
"kl_loss_10": 107.68133277893067,
"kl_loss_2": 1252.6388610839845,
"kl_loss_3": 875.4474029541016,
"kl_loss_7": 220.59310073852538,
"learning_rate": 0.0008310128348054094,
"loss": 608.5761,
"step": 2770
},
{
"ce_loss_10": 3.581556737422943,
"ce_loss_13": 3.518260824680328,
"ce_loss_2": 4.13277485370636,
"ce_loss_3": 3.9427122831344605,
"ce_loss_7": 3.6427804708480833,
"epoch": 0.278,
"grad_norm": 556.0,
"kl_loss_10": 107.17420654296875,
"kl_loss_2": 1270.6333923339844,
"kl_loss_3": 894.9070373535156,
"kl_loss_7": 225.04671096801758,
"learning_rate": 0.0008298219961566008,
"loss": 624.6308,
"step": 2780
},
{
"ce_loss_10": 3.5525727391242983,
"ce_loss_13": 3.485328257083893,
"ce_loss_2": 4.1319693446159365,
"ce_loss_3": 3.9388387560844422,
"ce_loss_7": 3.6104352355003355,
"epoch": 0.279,
"grad_norm": 394.0,
"kl_loss_10": 112.5637420654297,
"kl_loss_2": 1333.4249206542968,
"kl_loss_3": 937.83544921875,
"kl_loss_7": 227.29133377075195,
"learning_rate": 0.0008286278362039527,
"loss": 635.7598,
"step": 2790
},
{
"ce_loss_10": 3.587259495258331,
"ce_loss_13": 3.5128440499305724,
"ce_loss_2": 4.159168899059296,
"ce_loss_3": 3.9615533709526063,
"ce_loss_7": 3.6405880570411684,
"epoch": 0.28,
"grad_norm": 402.0,
"kl_loss_10": 114.4114917755127,
"kl_loss_2": 1318.4106079101562,
"kl_loss_3": 924.2900451660156,
"kl_loss_7": 224.0070999145508,
"learning_rate": 0.0008274303669726426,
"loss": 628.2539,
"step": 2800
},
{
"ce_loss_10": 3.479981768131256,
"ce_loss_13": 3.411652183532715,
"ce_loss_2": 4.056045913696289,
"ce_loss_3": 3.866449761390686,
"ce_loss_7": 3.5390130996704103,
"epoch": 0.281,
"grad_norm": 484.0,
"kl_loss_10": 111.04401168823242,
"kl_loss_2": 1325.4802673339843,
"kl_loss_3": 933.757958984375,
"kl_loss_7": 223.9423583984375,
"learning_rate": 0.0008262296005211721,
"loss": 628.6442,
"step": 2810
},
{
"ce_loss_10": 3.6082133769989015,
"ce_loss_13": 3.543479096889496,
"ce_loss_2": 4.177137637138367,
"ce_loss_3": 3.9879695653915403,
"ce_loss_7": 3.667951965332031,
"epoch": 0.282,
"grad_norm": 436.0,
"kl_loss_10": 106.70964050292969,
"kl_loss_2": 1303.7048461914062,
"kl_loss_3": 916.7790283203125,
"kl_loss_7": 222.79779052734375,
"learning_rate": 0.0008250255489412463,
"loss": 627.094,
"step": 2820
},
{
"ce_loss_10": 3.716309094429016,
"ce_loss_13": 3.642316293716431,
"ce_loss_2": 4.261255824565888,
"ce_loss_3": 4.0787659049034115,
"ce_loss_7": 3.7748358964920046,
"epoch": 0.283,
"grad_norm": 604.0,
"kl_loss_10": 114.64575080871582,
"kl_loss_2": 1277.7698669433594,
"kl_loss_3": 901.2324371337891,
"kl_loss_7": 231.05035324096679,
"learning_rate": 0.0008238182243576511,
"loss": 633.2869,
"step": 2830
},
{
"ce_loss_10": 3.682005834579468,
"ce_loss_13": 3.615007734298706,
"ce_loss_2": 4.202854037284851,
"ce_loss_3": 4.023157751560211,
"ce_loss_7": 3.736177396774292,
"epoch": 0.284,
"grad_norm": 548.0,
"kl_loss_10": 110.75144157409667,
"kl_loss_2": 1221.345361328125,
"kl_loss_3": 870.2192474365235,
"kl_loss_7": 222.93451232910155,
"learning_rate": 0.0008226076389281315,
"loss": 611.4373,
"step": 2840
},
{
"ce_loss_10": 3.7233519554138184,
"ce_loss_13": 3.6542891025543214,
"ce_loss_2": 4.248092949390411,
"ce_loss_3": 4.069663691520691,
"ce_loss_7": 3.775057625770569,
"epoch": 0.285,
"grad_norm": 704.0,
"kl_loss_10": 110.19483451843261,
"kl_loss_2": 1255.0005493164062,
"kl_loss_3": 882.4262268066407,
"kl_loss_7": 222.1134048461914,
"learning_rate": 0.0008213938048432696,
"loss": 610.5205,
"step": 2850
},
{
"ce_loss_10": 3.64341379404068,
"ce_loss_13": 3.5780718684196473,
"ce_loss_2": 4.180006468296051,
"ce_loss_3": 3.9996572732925415,
"ce_loss_7": 3.701814925670624,
"epoch": 0.286,
"grad_norm": 442.0,
"kl_loss_10": 108.8284294128418,
"kl_loss_2": 1259.7149841308594,
"kl_loss_3": 887.2555450439453,
"kl_loss_7": 224.16595458984375,
"learning_rate": 0.0008201767343263612,
"loss": 623.8719,
"step": 2860
},
{
"ce_loss_10": 3.580712640285492,
"ce_loss_13": 3.514770495891571,
"ce_loss_2": 4.152428865432739,
"ce_loss_3": 3.9586745381355284,
"ce_loss_7": 3.6420669674873354,
"epoch": 0.287,
"grad_norm": 478.0,
"kl_loss_10": 104.37866973876953,
"kl_loss_2": 1290.6899536132812,
"kl_loss_3": 906.0174041748047,
"kl_loss_7": 219.9403289794922,
"learning_rate": 0.0008189564396332927,
"loss": 611.9311,
"step": 2870
},
{
"ce_loss_10": 3.560059654712677,
"ce_loss_13": 3.4959851503372192,
"ce_loss_2": 4.124033749103546,
"ce_loss_3": 3.9340568661689757,
"ce_loss_7": 3.6201700448989866,
"epoch": 0.288,
"grad_norm": 480.0,
"kl_loss_10": 103.69261512756347,
"kl_loss_2": 1290.621844482422,
"kl_loss_3": 906.7965057373046,
"kl_loss_7": 223.63958206176758,
"learning_rate": 0.0008177329330524181,
"loss": 627.1938,
"step": 2880
},
{
"ce_loss_10": 3.6303093075752257,
"ce_loss_13": 3.5619912266731264,
"ce_loss_2": 4.173935759067535,
"ce_loss_3": 3.9890891432762148,
"ce_loss_7": 3.6870488286018372,
"epoch": 0.289,
"grad_norm": 452.0,
"kl_loss_10": 105.55148658752441,
"kl_loss_2": 1245.1935241699218,
"kl_loss_3": 874.3131744384766,
"kl_loss_7": 225.99310531616212,
"learning_rate": 0.0008165062269044352,
"loss": 620.2292,
"step": 2890
},
{
"ce_loss_10": 3.574904942512512,
"ce_loss_13": 3.5098610281944276,
"ce_loss_2": 4.134270429611206,
"ce_loss_3": 3.941369962692261,
"ce_loss_7": 3.641903018951416,
"epoch": 0.29,
"grad_norm": 394.0,
"kl_loss_10": 109.33544578552247,
"kl_loss_2": 1282.6864868164062,
"kl_loss_3": 899.0664276123047,
"kl_loss_7": 231.66256561279297,
"learning_rate": 0.0008152763335422613,
"loss": 630.6565,
"step": 2900
},
{
"ce_loss_10": 3.5721667885780333,
"ce_loss_13": 3.503155696392059,
"ce_loss_2": 4.123925364017486,
"ce_loss_3": 3.935485672950745,
"ce_loss_7": 3.625267505645752,
"epoch": 0.291,
"grad_norm": 600.0,
"kl_loss_10": 111.11225318908691,
"kl_loss_2": 1285.6910400390625,
"kl_loss_3": 903.9730163574219,
"kl_loss_7": 227.13845748901366,
"learning_rate": 0.0008140432653509088,
"loss": 623.4421,
"step": 2910
},
{
"ce_loss_10": 3.617369520664215,
"ce_loss_13": 3.5511601328849793,
"ce_loss_2": 4.158329248428345,
"ce_loss_3": 3.9670159101486204,
"ce_loss_7": 3.6755479097366335,
"epoch": 0.292,
"grad_norm": 424.0,
"kl_loss_10": 108.97641296386719,
"kl_loss_2": 1271.4477172851562,
"kl_loss_3": 887.2839324951171,
"kl_loss_7": 224.6483947753906,
"learning_rate": 0.0008128070347473608,
"loss": 614.8932,
"step": 2920
},
{
"ce_loss_10": 3.6203887820243836,
"ce_loss_13": 3.5571650743484495,
"ce_loss_2": 4.184931480884552,
"ce_loss_3": 3.988680112361908,
"ce_loss_7": 3.6783168077468873,
"epoch": 0.293,
"grad_norm": 442.0,
"kl_loss_10": 106.56389541625977,
"kl_loss_2": 1309.3880310058594,
"kl_loss_3": 912.7983032226563,
"kl_loss_7": 223.91178283691406,
"learning_rate": 0.0008115676541804455,
"loss": 627.653,
"step": 2930
},
{
"ce_loss_10": 3.631400096416473,
"ce_loss_13": 3.565066874027252,
"ce_loss_2": 4.173557686805725,
"ce_loss_3": 3.9938668727874758,
"ce_loss_7": 3.6861080646514894,
"epoch": 0.294,
"grad_norm": 410.0,
"kl_loss_10": 107.59184074401855,
"kl_loss_2": 1258.5327880859375,
"kl_loss_3": 893.1478424072266,
"kl_loss_7": 221.63349151611328,
"learning_rate": 0.0008103251361307119,
"loss": 625.068,
"step": 2940
},
{
"ce_loss_10": 3.6633550047874452,
"ce_loss_13": 3.5975454568862917,
"ce_loss_2": 4.201860392093659,
"ce_loss_3": 4.0187016248703005,
"ce_loss_7": 3.7217815637588503,
"epoch": 0.295,
"grad_norm": 484.0,
"kl_loss_10": 107.80433006286621,
"kl_loss_2": 1263.7409545898438,
"kl_loss_3": 899.7593811035156,
"kl_loss_7": 224.86621551513673,
"learning_rate": 0.0008090794931103026,
"loss": 620.4886,
"step": 2950
},
{
"ce_loss_10": 3.6508840203285216,
"ce_loss_13": 3.588442325592041,
"ce_loss_2": 4.192799139022827,
"ce_loss_3": 4.012849128246307,
"ce_loss_7": 3.706376481056213,
"epoch": 0.296,
"grad_norm": 560.0,
"kl_loss_10": 104.73687210083008,
"kl_loss_2": 1249.5858154296875,
"kl_loss_3": 882.2532653808594,
"kl_loss_7": 217.49150695800782,
"learning_rate": 0.0008078307376628291,
"loss": 618.8026,
"step": 2960
},
{
"ce_loss_10": 3.714610981941223,
"ce_loss_13": 3.648031437397003,
"ce_loss_2": 4.232832741737366,
"ce_loss_3": 4.05701197385788,
"ce_loss_7": 3.7682809591293336,
"epoch": 0.297,
"grad_norm": 438.0,
"kl_loss_10": 105.18131446838379,
"kl_loss_2": 1206.7602905273438,
"kl_loss_3": 851.9172241210938,
"kl_loss_7": 215.51920394897462,
"learning_rate": 0.000806578882363245,
"loss": 597.2082,
"step": 2970
},
{
"ce_loss_10": 3.6252587914466856,
"ce_loss_13": 3.5611796617507934,
"ce_loss_2": 4.163775825500489,
"ce_loss_3": 3.977950668334961,
"ce_loss_7": 3.6833796977996824,
"epoch": 0.298,
"grad_norm": 648.0,
"kl_loss_10": 103.29475135803223,
"kl_loss_2": 1245.597314453125,
"kl_loss_3": 877.1612213134765,
"kl_loss_7": 219.75524444580077,
"learning_rate": 0.0008053239398177191,
"loss": 627.9783,
"step": 2980
},
{
"ce_loss_10": 3.602860856056213,
"ce_loss_13": 3.538487696647644,
"ce_loss_2": 4.142560148239136,
"ce_loss_3": 3.9584405183792115,
"ce_loss_7": 3.659018313884735,
"epoch": 0.299,
"grad_norm": 502.0,
"kl_loss_10": 104.49293098449706,
"kl_loss_2": 1247.2865417480468,
"kl_loss_3": 873.8518829345703,
"kl_loss_7": 218.60237731933594,
"learning_rate": 0.0008040659226635089,
"loss": 629.0394,
"step": 2990
},
{
"ce_loss_10": 3.737885308265686,
"ce_loss_13": 3.670609879493713,
"ce_loss_2": 4.267246758937835,
"ce_loss_3": 4.084029448032379,
"ce_loss_7": 3.801585829257965,
"epoch": 0.3,
"grad_norm": 474.0,
"kl_loss_10": 109.13075065612793,
"kl_loss_2": 1251.634942626953,
"kl_loss_3": 874.3355682373046,
"kl_loss_7": 234.4466766357422,
"learning_rate": 0.0008028048435688333,
"loss": 617.8753,
"step": 3000
},
{
"ce_loss_10": 3.608117866516113,
"ce_loss_13": 3.5417439699172975,
"ce_loss_2": 4.164859163761139,
"ce_loss_3": 3.9728567838668822,
"ce_loss_7": 3.666444170475006,
"epoch": 0.301,
"grad_norm": 458.0,
"kl_loss_10": 104.67718696594238,
"kl_loss_2": 1290.2385009765626,
"kl_loss_3": 895.6051879882813,
"kl_loss_7": 230.77984161376952,
"learning_rate": 0.0008015407152327448,
"loss": 624.6472,
"step": 3010
},
{
"ce_loss_10": 3.655743646621704,
"ce_loss_13": 3.589059603214264,
"ce_loss_2": 4.197956717014312,
"ce_loss_3": 4.010993158817291,
"ce_loss_7": 3.715561032295227,
"epoch": 0.302,
"grad_norm": 490.0,
"kl_loss_10": 108.86725807189941,
"kl_loss_2": 1260.063540649414,
"kl_loss_3": 888.4078857421875,
"kl_loss_7": 225.77315521240234,
"learning_rate": 0.0008002735503850016,
"loss": 621.0348,
"step": 3020
},
{
"ce_loss_10": 3.5483126521110533,
"ce_loss_13": 3.477071487903595,
"ce_loss_2": 4.1067805051803585,
"ce_loss_3": 3.9177415490150453,
"ce_loss_7": 3.613772678375244,
"epoch": 0.303,
"grad_norm": 442.0,
"kl_loss_10": 113.85512008666993,
"kl_loss_2": 1301.403955078125,
"kl_loss_3": 923.0243713378907,
"kl_loss_7": 243.21601486206055,
"learning_rate": 0.0007990033617859396,
"loss": 643.6124,
"step": 3030
},
{
"ce_loss_10": 3.596233379840851,
"ce_loss_13": 3.527192997932434,
"ce_loss_2": 4.134040641784668,
"ce_loss_3": 3.9505585551261904,
"ce_loss_7": 3.6559112668037415,
"epoch": 0.304,
"grad_norm": 576.0,
"kl_loss_10": 111.53803482055665,
"kl_loss_2": 1246.1058349609375,
"kl_loss_3": 878.1594299316406,
"kl_loss_7": 229.73634643554686,
"learning_rate": 0.000797730162226344,
"loss": 607.6155,
"step": 3040
},
{
"ce_loss_10": 3.6262240767478944,
"ce_loss_13": 3.55741890668869,
"ce_loss_2": 4.167547011375428,
"ce_loss_3": 3.981596386432648,
"ce_loss_7": 3.686588776111603,
"epoch": 0.305,
"grad_norm": 430.0,
"kl_loss_10": 113.34700317382813,
"kl_loss_2": 1258.705908203125,
"kl_loss_3": 888.1617828369141,
"kl_loss_7": 230.86616134643555,
"learning_rate": 0.0007964539645273203,
"loss": 613.4882,
"step": 3050
},
{
"ce_loss_10": 3.6409424901008607,
"ce_loss_13": 3.574270474910736,
"ce_loss_2": 4.167909657955169,
"ce_loss_3": 3.9862082481384276,
"ce_loss_7": 3.6940474629402162,
"epoch": 0.306,
"grad_norm": 486.0,
"kl_loss_10": 106.54784317016602,
"kl_loss_2": 1238.411444091797,
"kl_loss_3": 866.3065307617187,
"kl_loss_7": 220.125154876709,
"learning_rate": 0.000795174781540165,
"loss": 615.3713,
"step": 3060
},
{
"ce_loss_10": 3.721142077445984,
"ce_loss_13": 3.643355393409729,
"ce_loss_2": 4.226944315433502,
"ce_loss_3": 4.049481880664826,
"ce_loss_7": 3.771383452415466,
"epoch": 0.307,
"grad_norm": 418.0,
"kl_loss_10": 122.25658149719239,
"kl_loss_2": 1203.3186645507812,
"kl_loss_3": 851.1368927001953,
"kl_loss_7": 225.69219970703125,
"learning_rate": 0.0007938926261462366,
"loss": 615.3413,
"step": 3070
},
{
"ce_loss_10": 3.6604058384895324,
"ce_loss_13": 3.5915605425834656,
"ce_loss_2": 4.175356435775757,
"ce_loss_3": 3.9981507778167726,
"ce_loss_7": 3.7177716493606567,
"epoch": 0.308,
"grad_norm": 528.0,
"kl_loss_10": 111.06630897521973,
"kl_loss_2": 1238.821875,
"kl_loss_3": 876.2146820068359,
"kl_loss_7": 223.17951583862305,
"learning_rate": 0.0007926075112568258,
"loss": 625.9794,
"step": 3080
},
{
"ce_loss_10": 3.652525985240936,
"ce_loss_13": 3.586830127239227,
"ce_loss_2": 4.18239061832428,
"ce_loss_3": 4.001185369491577,
"ce_loss_7": 3.7117084741592405,
"epoch": 0.309,
"grad_norm": 408.0,
"kl_loss_10": 105.23004531860352,
"kl_loss_2": 1238.4955749511719,
"kl_loss_3": 879.8196685791015,
"kl_loss_7": 219.05570373535156,
"learning_rate": 0.0007913194498130252,
"loss": 606.0291,
"step": 3090
},
{
"ce_loss_10": 3.576726019382477,
"ce_loss_13": 3.5116322517395018,
"ce_loss_2": 4.1267429232597355,
"ce_loss_3": 3.951638638973236,
"ce_loss_7": 3.633882737159729,
"epoch": 0.31,
"grad_norm": 596.0,
"kl_loss_10": 104.83585128784179,
"kl_loss_2": 1271.4552978515626,
"kl_loss_3": 899.7277404785157,
"kl_loss_7": 221.13562393188477,
"learning_rate": 0.0007900284547855992,
"loss": 625.4858,
"step": 3100
},
{
"ce_loss_10": 3.585216796398163,
"ce_loss_13": 3.5201086163520814,
"ce_loss_2": 4.1151411652565,
"ce_loss_3": 3.9415447235107424,
"ce_loss_7": 3.6418359875679016,
"epoch": 0.311,
"grad_norm": 460.0,
"kl_loss_10": 104.5475685119629,
"kl_loss_2": 1231.1286071777345,
"kl_loss_3": 880.9251007080078,
"kl_loss_7": 215.18857421875,
"learning_rate": 0.0007887345391748532,
"loss": 620.3755,
"step": 3110
},
{
"ce_loss_10": 3.7296380400657654,
"ce_loss_13": 3.6599961280822755,
"ce_loss_2": 4.223298215866089,
"ce_loss_3": 4.0600717782974245,
"ce_loss_7": 3.7785526752471923,
"epoch": 0.312,
"grad_norm": 434.0,
"kl_loss_10": 110.47460975646973,
"kl_loss_2": 1200.8911010742188,
"kl_loss_3": 857.2419494628906,
"kl_loss_7": 215.84228134155273,
"learning_rate": 0.0007874377160105036,
"loss": 594.074,
"step": 3120
},
{
"ce_loss_10": 3.647064197063446,
"ce_loss_13": 3.5629413604736326,
"ce_loss_2": 4.20149587392807,
"ce_loss_3": 4.026539087295532,
"ce_loss_7": 3.7087369561195374,
"epoch": 0.313,
"grad_norm": 504.0,
"kl_loss_10": 117.26640319824219,
"kl_loss_2": 1253.9545349121095,
"kl_loss_3": 905.2646728515625,
"kl_loss_7": 235.96448516845703,
"learning_rate": 0.0007861379983515449,
"loss": 636.8147,
"step": 3130
},
{
"ce_loss_10": 3.7007260084152223,
"ce_loss_13": 3.631552994251251,
"ce_loss_2": 4.21818333864212,
"ce_loss_3": 4.040616655349732,
"ce_loss_7": 3.757350814342499,
"epoch": 0.314,
"grad_norm": 466.0,
"kl_loss_10": 112.17713203430176,
"kl_loss_2": 1239.388739013672,
"kl_loss_3": 879.6389221191406,
"kl_loss_7": 227.5748275756836,
"learning_rate": 0.0007848353992861195,
"loss": 608.3133,
"step": 3140
},
{
"ce_loss_10": 3.786464810371399,
"ce_loss_13": 3.710281562805176,
"ce_loss_2": 4.314648783206939,
"ce_loss_3": 4.134762763977051,
"ce_loss_7": 3.843652272224426,
"epoch": 0.315,
"grad_norm": 458.0,
"kl_loss_10": 124.92040100097657,
"kl_loss_2": 1240.0601806640625,
"kl_loss_3": 878.5192687988281,
"kl_loss_7": 241.41039581298827,
"learning_rate": 0.0007835299319313853,
"loss": 620.9426,
"step": 3150
},
{
"ce_loss_10": 3.663742733001709,
"ce_loss_13": 3.5886364459991453,
"ce_loss_2": 4.173808574676514,
"ce_loss_3": 3.994606840610504,
"ce_loss_7": 3.7157713413238525,
"epoch": 0.316,
"grad_norm": 478.0,
"kl_loss_10": 119.08418045043945,
"kl_loss_2": 1222.8418212890624,
"kl_loss_3": 864.8908721923829,
"kl_loss_7": 229.07857666015624,
"learning_rate": 0.0007822216094333848,
"loss": 627.9899,
"step": 3160
},
{
"ce_loss_10": 3.660254752635956,
"ce_loss_13": 3.592539119720459,
"ce_loss_2": 4.196765351295471,
"ce_loss_3": 4.01435557603836,
"ce_loss_7": 3.7205393433570864,
"epoch": 0.317,
"grad_norm": 402.0,
"kl_loss_10": 115.94363555908203,
"kl_loss_2": 1238.9798767089844,
"kl_loss_3": 878.1770599365234,
"kl_loss_7": 235.79936828613282,
"learning_rate": 0.0007809104449669101,
"loss": 611.1889,
"step": 3170
},
{
"ce_loss_10": 3.625234532356262,
"ce_loss_13": 3.5487714052200316,
"ce_loss_2": 4.128973770141601,
"ce_loss_3": 3.9529131054878235,
"ce_loss_7": 3.6731096148490905,
"epoch": 0.318,
"grad_norm": 524.0,
"kl_loss_10": 118.72456016540528,
"kl_loss_2": 1219.5467956542968,
"kl_loss_3": 854.3440185546875,
"kl_loss_7": 228.71927642822266,
"learning_rate": 0.0007795964517353734,
"loss": 608.5358,
"step": 3180
},
{
"ce_loss_10": 3.623099219799042,
"ce_loss_13": 3.54120157957077,
"ce_loss_2": 4.132599997520447,
"ce_loss_3": 3.9547854542732237,
"ce_loss_7": 3.670779359340668,
"epoch": 0.319,
"grad_norm": 438.0,
"kl_loss_10": 145.3066722869873,
"kl_loss_2": 1253.242724609375,
"kl_loss_3": 888.5025939941406,
"kl_loss_7": 249.7946647644043,
"learning_rate": 0.000778279642970672,
"loss": 614.9188,
"step": 3190
},
{
"ce_loss_10": 3.61579008102417,
"ce_loss_13": 3.5464967608451845,
"ce_loss_2": 4.135542809963226,
"ce_loss_3": 3.949288582801819,
"ce_loss_7": 3.6743045926094053,
"epoch": 0.32,
"grad_norm": 580.0,
"kl_loss_10": 120.59939308166504,
"kl_loss_2": 1232.2691650390625,
"kl_loss_3": 859.8254913330078,
"kl_loss_7": 236.18389816284179,
"learning_rate": 0.0007769600319330552,
"loss": 603.041,
"step": 3200
},
{
"ce_loss_10": 3.6462074518203735,
"ce_loss_13": 3.5768035650253296,
"ce_loss_2": 4.193181753158569,
"ce_loss_3": 4.002738869190216,
"ce_loss_7": 3.7033765077590943,
"epoch": 0.321,
"grad_norm": 536.0,
"kl_loss_10": 113.30461883544922,
"kl_loss_2": 1261.3274169921874,
"kl_loss_3": 880.6385803222656,
"kl_loss_7": 233.63602294921876,
"learning_rate": 0.0007756376319109917,
"loss": 615.0137,
"step": 3210
},
{
"ce_loss_10": 3.6983227729797363,
"ce_loss_13": 3.628855359554291,
"ce_loss_2": 4.215565764904023,
"ce_loss_3": 4.036277508735656,
"ce_loss_7": 3.7591845273971556,
"epoch": 0.322,
"grad_norm": 414.0,
"kl_loss_10": 113.85302391052247,
"kl_loss_2": 1215.7693420410155,
"kl_loss_3": 852.6664520263672,
"kl_loss_7": 233.59415435791016,
"learning_rate": 0.0007743124562210351,
"loss": 595.5453,
"step": 3220
},
{
"ce_loss_10": 3.7038231015205385,
"ce_loss_13": 3.636870324611664,
"ce_loss_2": 4.220840120315552,
"ce_loss_3": 4.040867578983307,
"ce_loss_7": 3.759516727924347,
"epoch": 0.323,
"grad_norm": 500.0,
"kl_loss_10": 116.7942398071289,
"kl_loss_2": 1231.380029296875,
"kl_loss_3": 860.8811431884766,
"kl_loss_7": 226.99792938232423,
"learning_rate": 0.0007729845182076895,
"loss": 609.7637,
"step": 3230
},
{
"ce_loss_10": 3.635891842842102,
"ce_loss_13": 3.570459449291229,
"ce_loss_2": 4.146735298633575,
"ce_loss_3": 3.971414268016815,
"ce_loss_7": 3.6916919469833376,
"epoch": 0.324,
"grad_norm": 544.0,
"kl_loss_10": 107.84456443786621,
"kl_loss_2": 1210.0861877441407,
"kl_loss_3": 854.6661926269531,
"kl_loss_7": 223.18558731079102,
"learning_rate": 0.0007716538312432765,
"loss": 613.749,
"step": 3240
},
{
"ce_loss_10": 3.5933796405792235,
"ce_loss_13": 3.5238136887550353,
"ce_loss_2": 4.137728452682495,
"ce_loss_3": 3.9503737330436706,
"ce_loss_7": 3.6502291560173035,
"epoch": 0.325,
"grad_norm": 532.0,
"kl_loss_10": 110.89730453491211,
"kl_loss_2": 1272.4953063964845,
"kl_loss_3": 899.5693664550781,
"kl_loss_7": 234.18169174194335,
"learning_rate": 0.0007703204087277988,
"loss": 621.0721,
"step": 3250
},
{
"ce_loss_10": 3.691065728664398,
"ce_loss_13": 3.625254142284393,
"ce_loss_2": 4.195106828212738,
"ce_loss_3": 4.023638522624969,
"ce_loss_7": 3.744424653053284,
"epoch": 0.326,
"grad_norm": 480.0,
"kl_loss_10": 108.84702529907227,
"kl_loss_2": 1187.3806762695312,
"kl_loss_3": 834.2469482421875,
"kl_loss_7": 219.25809020996093,
"learning_rate": 0.0007689842640888063,
"loss": 594.9809,
"step": 3260
},
{
"ce_loss_10": 3.6937523603439333,
"ce_loss_13": 3.6257047772407534,
"ce_loss_2": 4.207961022853851,
"ce_loss_3": 4.029592931270599,
"ce_loss_7": 3.7506144404411317,
"epoch": 0.327,
"grad_norm": 432.0,
"kl_loss_10": 109.73489418029786,
"kl_loss_2": 1197.2553649902343,
"kl_loss_3": 845.9240936279297,
"kl_loss_7": 224.3518325805664,
"learning_rate": 0.0007676454107812607,
"loss": 600.9104,
"step": 3270
},
{
"ce_loss_10": 3.6202093243598936,
"ce_loss_13": 3.556860589981079,
"ce_loss_2": 4.152219152450561,
"ce_loss_3": 3.972454571723938,
"ce_loss_7": 3.6772433161735534,
"epoch": 0.328,
"grad_norm": 552.0,
"kl_loss_10": 107.7342628479004,
"kl_loss_2": 1234.4693603515625,
"kl_loss_3": 866.5982177734375,
"kl_loss_7": 224.09054641723634,
"learning_rate": 0.0007663038622873999,
"loss": 600.4109,
"step": 3280
},
{
"ce_loss_10": 3.6624753713607787,
"ce_loss_13": 3.5959082007408143,
"ce_loss_2": 4.186812722682953,
"ce_loss_3": 4.007313239574432,
"ce_loss_7": 3.7183284163475037,
"epoch": 0.329,
"grad_norm": 416.0,
"kl_loss_10": 107.99775848388671,
"kl_loss_2": 1235.7617919921875,
"kl_loss_3": 865.4350341796875,
"kl_loss_7": 219.93520736694336,
"learning_rate": 0.0007649596321166025,
"loss": 596.3813,
"step": 3290
},
{
"ce_loss_10": 3.5629011154174806,
"ce_loss_13": 3.500445473194122,
"ce_loss_2": 4.090505909919739,
"ce_loss_3": 3.9089764833450316,
"ce_loss_7": 3.619310712814331,
"epoch": 0.33,
"grad_norm": 448.0,
"kl_loss_10": 101.5875473022461,
"kl_loss_2": 1220.246160888672,
"kl_loss_3": 856.5614715576172,
"kl_loss_7": 215.10712509155275,
"learning_rate": 0.0007636127338052513,
"loss": 603.8148,
"step": 3300
},
{
"ce_loss_10": 3.670552396774292,
"ce_loss_13": 3.6016101121902464,
"ce_loss_2": 4.213171231746673,
"ce_loss_3": 4.018688130378723,
"ce_loss_7": 3.727590525150299,
"epoch": 0.331,
"grad_norm": 374.0,
"kl_loss_10": 108.33710594177246,
"kl_loss_2": 1257.856024169922,
"kl_loss_3": 874.112905883789,
"kl_loss_7": 224.637939453125,
"learning_rate": 0.0007622631809165971,
"loss": 604.7203,
"step": 3310
},
{
"ce_loss_10": 3.671126115322113,
"ce_loss_13": 3.6092859148979186,
"ce_loss_2": 4.177222061157226,
"ce_loss_3": 3.9993849992752075,
"ce_loss_7": 3.722568082809448,
"epoch": 0.332,
"grad_norm": 414.0,
"kl_loss_10": 101.74094352722167,
"kl_loss_2": 1180.6327026367187,
"kl_loss_3": 821.7367553710938,
"kl_loss_7": 208.3566993713379,
"learning_rate": 0.000760910987040623,
"loss": 588.4586,
"step": 3320
},
{
"ce_loss_10": 3.64985990524292,
"ce_loss_13": 3.585498571395874,
"ce_loss_2": 4.191103303432465,
"ce_loss_3": 4.004770576953888,
"ce_loss_7": 3.7059614300727843,
"epoch": 0.333,
"grad_norm": 346.0,
"kl_loss_10": 102.83302307128906,
"kl_loss_2": 1259.7546875,
"kl_loss_3": 881.3513031005859,
"kl_loss_7": 217.63404388427733,
"learning_rate": 0.000759556165793906,
"loss": 599.8207,
"step": 3330
},
{
"ce_loss_10": 3.676869213581085,
"ce_loss_13": 3.610471022129059,
"ce_loss_2": 4.2084539294242855,
"ce_loss_3": 4.019213974475861,
"ce_loss_7": 3.7275768160820006,
"epoch": 0.334,
"grad_norm": 502.0,
"kl_loss_10": 104.88800392150878,
"kl_loss_2": 1223.232958984375,
"kl_loss_3": 852.1926971435547,
"kl_loss_7": 215.24551544189453,
"learning_rate": 0.000758198730819481,
"loss": 604.6092,
"step": 3340
},
{
"ce_loss_10": 3.616540086269379,
"ce_loss_13": 3.553786301612854,
"ce_loss_2": 4.152189195156097,
"ce_loss_3": 3.9668321132659914,
"ce_loss_7": 3.6709399580955506,
"epoch": 0.335,
"grad_norm": 488.0,
"kl_loss_10": 102.31456336975097,
"kl_loss_2": 1251.2591918945313,
"kl_loss_3": 875.474462890625,
"kl_loss_7": 214.77994079589843,
"learning_rate": 0.0007568386957867032,
"loss": 608.125,
"step": 3350
},
{
"ce_loss_10": 3.695429575443268,
"ce_loss_13": 3.6276296377182007,
"ce_loss_2": 4.209121763706207,
"ce_loss_3": 4.032123720645904,
"ce_loss_7": 3.749704658985138,
"epoch": 0.336,
"grad_norm": 664.0,
"kl_loss_10": 107.0846736907959,
"kl_loss_2": 1209.7884765625,
"kl_loss_3": 853.7374877929688,
"kl_loss_7": 220.54676055908203,
"learning_rate": 0.0007554760743911103,
"loss": 605.0996,
"step": 3360
},
{
"ce_loss_10": 3.5890319466590883,
"ce_loss_13": 3.5283274173736574,
"ce_loss_2": 4.114323127269745,
"ce_loss_3": 3.932225775718689,
"ce_loss_7": 3.644662916660309,
"epoch": 0.337,
"grad_norm": 398.0,
"kl_loss_10": 101.10566368103028,
"kl_loss_2": 1236.1671508789063,
"kl_loss_3": 865.7673828125,
"kl_loss_7": 212.85166015625,
"learning_rate": 0.0007541108803542846,
"loss": 613.867,
"step": 3370
},
{
"ce_loss_10": 3.6427289605140687,
"ce_loss_13": 3.576077425479889,
"ce_loss_2": 4.166507577896118,
"ce_loss_3": 3.9814778923988343,
"ce_loss_7": 3.6960788011550902,
"epoch": 0.338,
"grad_norm": 420.0,
"kl_loss_10": 106.68134155273438,
"kl_loss_2": 1229.0040222167968,
"kl_loss_3": 856.9913909912109,
"kl_loss_7": 213.85500411987306,
"learning_rate": 0.0007527431274237149,
"loss": 624.6923,
"step": 3380
},
{
"ce_loss_10": 3.611558997631073,
"ce_loss_13": 3.549490749835968,
"ce_loss_2": 4.114035534858703,
"ce_loss_3": 3.942946660518646,
"ce_loss_7": 3.662776732444763,
"epoch": 0.339,
"grad_norm": 406.0,
"kl_loss_10": 102.27137718200683,
"kl_loss_2": 1206.6684020996095,
"kl_loss_3": 846.7297576904297,
"kl_loss_7": 210.38721313476563,
"learning_rate": 0.0007513728293726579,
"loss": 594.8909,
"step": 3390
},
{
"ce_loss_10": 3.737028419971466,
"ce_loss_13": 3.669820773601532,
"ce_loss_2": 4.24596471786499,
"ce_loss_3": 4.065989923477173,
"ce_loss_7": 3.7901018500328063,
"epoch": 0.34,
"grad_norm": 456.0,
"kl_loss_10": 106.7515941619873,
"kl_loss_2": 1213.6457214355469,
"kl_loss_3": 848.0824188232422,
"kl_loss_7": 217.41063537597657,
"learning_rate": 0.00075,
"loss": 593.8513,
"step": 3400
},
{
"ce_loss_10": 3.719330894947052,
"ce_loss_13": 3.6538206934928894,
"ce_loss_2": 4.25202556848526,
"ce_loss_3": 4.069514441490173,
"ce_loss_7": 3.7754390835762024,
"epoch": 0.341,
"grad_norm": 442.0,
"kl_loss_10": 105.26911506652831,
"kl_loss_2": 1229.2578063964843,
"kl_loss_3": 857.8241027832031,
"kl_loss_7": 215.74853057861327,
"learning_rate": 0.0007486246531301177,
"loss": 595.3941,
"step": 3410
},
{
"ce_loss_10": 3.5200854897499085,
"ce_loss_13": 3.457200789451599,
"ce_loss_2": 4.057665538787842,
"ce_loss_3": 3.8753583312034605,
"ce_loss_7": 3.575985038280487,
"epoch": 0.342,
"grad_norm": 388.0,
"kl_loss_10": 101.49059600830078,
"kl_loss_2": 1229.5487548828125,
"kl_loss_3": 867.5537567138672,
"kl_loss_7": 212.1739074707031,
"learning_rate": 0.0007472468026127384,
"loss": 593.475,
"step": 3420
},
{
"ce_loss_10": 3.6591346502304076,
"ce_loss_13": 3.5927812099456786,
"ce_loss_2": 4.209147357940674,
"ce_loss_3": 4.019513976573944,
"ce_loss_7": 3.7172008395195006,
"epoch": 0.343,
"grad_norm": 442.0,
"kl_loss_10": 106.34202499389649,
"kl_loss_2": 1270.0667724609375,
"kl_loss_3": 890.6144561767578,
"kl_loss_7": 221.5020393371582,
"learning_rate": 0.000745866462322802,
"loss": 614.0497,
"step": 3430
},
{
"ce_loss_10": 3.647415816783905,
"ce_loss_13": 3.5850081205368043,
"ce_loss_2": 4.1631152629852295,
"ce_loss_3": 3.980070149898529,
"ce_loss_7": 3.7022210240364073,
"epoch": 0.344,
"grad_norm": 428.0,
"kl_loss_10": 103.86195526123046,
"kl_loss_2": 1198.3542846679688,
"kl_loss_3": 835.6711212158203,
"kl_loss_7": 208.45360870361327,
"learning_rate": 0.0007444836461603195,
"loss": 592.3941,
"step": 3440
},
{
"ce_loss_10": 3.7135616302490235,
"ce_loss_13": 3.6434731125831603,
"ce_loss_2": 4.233828973770142,
"ce_loss_3": 4.05616340637207,
"ce_loss_7": 3.762986993789673,
"epoch": 0.345,
"grad_norm": 548.0,
"kl_loss_10": 110.37765045166016,
"kl_loss_2": 1249.6877746582031,
"kl_loss_3": 880.3564361572265,
"kl_loss_7": 216.23881912231445,
"learning_rate": 0.0007430983680502344,
"loss": 610.9966,
"step": 3450
},
{
"ce_loss_10": 3.5541942715644836,
"ce_loss_13": 3.4891390204429626,
"ce_loss_2": 4.090934145450592,
"ce_loss_3": 3.908629584312439,
"ce_loss_7": 3.606754219532013,
"epoch": 0.346,
"grad_norm": 432.0,
"kl_loss_10": 110.62757797241211,
"kl_loss_2": 1245.3806091308593,
"kl_loss_3": 869.5422088623047,
"kl_loss_7": 211.6188102722168,
"learning_rate": 0.0007417106419422819,
"loss": 606.0509,
"step": 3460
},
{
"ce_loss_10": 3.6656701445579527,
"ce_loss_13": 3.596804141998291,
"ce_loss_2": 4.186310410499573,
"ce_loss_3": 4.003709590435028,
"ce_loss_7": 3.716957890987396,
"epoch": 0.347,
"grad_norm": 432.0,
"kl_loss_10": 110.30144805908203,
"kl_loss_2": 1208.0226745605469,
"kl_loss_3": 843.9369232177735,
"kl_loss_7": 210.9572967529297,
"learning_rate": 0.0007403204818108486,
"loss": 597.1857,
"step": 3470
},
{
"ce_loss_10": 3.6337965607643126,
"ce_loss_13": 3.5606253027915953,
"ce_loss_2": 4.153940236568451,
"ce_loss_3": 3.970261514186859,
"ce_loss_7": 3.680176484584808,
"epoch": 0.348,
"grad_norm": 380.0,
"kl_loss_10": 122.88734741210938,
"kl_loss_2": 1235.673895263672,
"kl_loss_3": 863.5119903564453,
"kl_loss_7": 214.55614318847657,
"learning_rate": 0.0007389279016548316,
"loss": 589.7067,
"step": 3480
},
{
"ce_loss_10": 3.6385215759277343,
"ce_loss_13": 3.5720754146575926,
"ce_loss_2": 4.187943410873413,
"ce_loss_3": 3.9984039187431337,
"ce_loss_7": 3.692442834377289,
"epoch": 0.349,
"grad_norm": 540.0,
"kl_loss_10": 110.95368614196778,
"kl_loss_2": 1266.4402160644531,
"kl_loss_3": 881.5294525146485,
"kl_loss_7": 217.94278945922852,
"learning_rate": 0.0007375329154974975,
"loss": 613.9418,
"step": 3490
},
{
"ce_loss_10": 3.5970895290374756,
"ce_loss_13": 3.5335337281227113,
"ce_loss_2": 4.117660129070282,
"ce_loss_3": 3.938844621181488,
"ce_loss_7": 3.6496007084846496,
"epoch": 0.35,
"grad_norm": 364.0,
"kl_loss_10": 106.09449501037598,
"kl_loss_2": 1217.6699768066405,
"kl_loss_3": 855.84267578125,
"kl_loss_7": 211.2824508666992,
"learning_rate": 0.0007361355373863414,
"loss": 604.2842,
"step": 3500
},
{
"ce_loss_10": 3.6508504867553713,
"ce_loss_13": 3.5859110236167906,
"ce_loss_2": 4.1644844770431515,
"ce_loss_3": 3.989104926586151,
"ce_loss_7": 3.7059740304946898,
"epoch": 0.351,
"grad_norm": 420.0,
"kl_loss_10": 105.65600318908692,
"kl_loss_2": 1192.6789306640626,
"kl_loss_3": 837.2236511230469,
"kl_loss_7": 210.62101364135742,
"learning_rate": 0.0007347357813929454,
"loss": 605.2478,
"step": 3510
},
{
"ce_loss_10": 3.5983325362205507,
"ce_loss_13": 3.5318838000297545,
"ce_loss_2": 4.108148908615112,
"ce_loss_3": 3.935304307937622,
"ce_loss_7": 3.6479654192924498,
"epoch": 0.352,
"grad_norm": 500.0,
"kl_loss_10": 106.45629920959473,
"kl_loss_2": 1190.6948181152343,
"kl_loss_3": 837.8225341796875,
"kl_loss_7": 210.1330581665039,
"learning_rate": 0.0007333336616128369,
"loss": 599.2653,
"step": 3520
},
{
"ce_loss_10": 3.570793068408966,
"ce_loss_13": 3.507152056694031,
"ce_loss_2": 4.106606543064117,
"ce_loss_3": 3.9213356494903566,
"ce_loss_7": 3.624741232395172,
"epoch": 0.353,
"grad_norm": 468.0,
"kl_loss_10": 102.9274845123291,
"kl_loss_2": 1231.522442626953,
"kl_loss_3": 864.720751953125,
"kl_loss_7": 214.17628860473633,
"learning_rate": 0.0007319291921653463,
"loss": 605.1452,
"step": 3530
},
{
"ce_loss_10": 3.6573350191116334,
"ce_loss_13": 3.591005003452301,
"ce_loss_2": 4.190282225608826,
"ce_loss_3": 4.010705304145813,
"ce_loss_7": 3.713829779624939,
"epoch": 0.354,
"grad_norm": 480.0,
"kl_loss_10": 105.38732643127442,
"kl_loss_2": 1246.1359802246093,
"kl_loss_3": 875.5277282714844,
"kl_loss_7": 217.63313064575195,
"learning_rate": 0.0007305223871934656,
"loss": 597.4614,
"step": 3540
},
{
"ce_loss_10": 3.6225136160850524,
"ce_loss_13": 3.556077516078949,
"ce_loss_2": 4.138617634773254,
"ce_loss_3": 3.9633963227272035,
"ce_loss_7": 3.678558957576752,
"epoch": 0.355,
"grad_norm": 502.0,
"kl_loss_10": 104.04609298706055,
"kl_loss_2": 1205.1107055664063,
"kl_loss_3": 845.5688415527344,
"kl_loss_7": 210.7905143737793,
"learning_rate": 0.0007291132608637052,
"loss": 595.3202,
"step": 3550
},
{
"ce_loss_10": 3.585705029964447,
"ce_loss_13": 3.52364000082016,
"ce_loss_2": 4.140194058418274,
"ce_loss_3": 3.939319980144501,
"ce_loss_7": 3.637845540046692,
"epoch": 0.356,
"grad_norm": 612.0,
"kl_loss_10": 100.68717575073242,
"kl_loss_2": 1272.5315246582031,
"kl_loss_3": 866.628369140625,
"kl_loss_7": 206.60951766967773,
"learning_rate": 0.0007277018273659516,
"loss": 612.2947,
"step": 3560
},
{
"ce_loss_10": 3.708829402923584,
"ce_loss_13": 3.6439966320991517,
"ce_loss_2": 4.2357800006866455,
"ce_loss_3": 4.058422148227692,
"ce_loss_7": 3.7655990600585936,
"epoch": 0.357,
"grad_norm": 400.0,
"kl_loss_10": 105.25033149719238,
"kl_loss_2": 1234.6828186035157,
"kl_loss_3": 864.7261169433593,
"kl_loss_7": 215.20211639404297,
"learning_rate": 0.0007262881009133242,
"loss": 605.0631,
"step": 3570
},
{
"ce_loss_10": 3.6265846729278564,
"ce_loss_13": 3.5641749501228333,
"ce_loss_2": 4.144611585140228,
"ce_loss_3": 3.9691020011901856,
"ce_loss_7": 3.6797274351119995,
"epoch": 0.358,
"grad_norm": 422.0,
"kl_loss_10": 101.45686912536621,
"kl_loss_2": 1216.0844970703124,
"kl_loss_3": 849.7874267578125,
"kl_loss_7": 208.09806137084962,
"learning_rate": 0.0007248720957420329,
"loss": 589.5256,
"step": 3580
},
{
"ce_loss_10": 3.6416075587272645,
"ce_loss_13": 3.5768683552742004,
"ce_loss_2": 4.156981098651886,
"ce_loss_3": 3.9762784600257874,
"ce_loss_7": 3.690297317504883,
"epoch": 0.359,
"grad_norm": 374.0,
"kl_loss_10": 104.18233222961426,
"kl_loss_2": 1196.5406433105468,
"kl_loss_3": 831.4658630371093,
"kl_loss_7": 209.4309959411621,
"learning_rate": 0.0007234538261112341,
"loss": 608.9998,
"step": 3590
},
{
"ce_loss_10": 3.6725340247154237,
"ce_loss_13": 3.6092687249183655,
"ce_loss_2": 4.202276730537415,
"ce_loss_3": 4.014237463474274,
"ce_loss_7": 3.7282424688339235,
"epoch": 0.36,
"grad_norm": 400.0,
"kl_loss_10": 101.90313911437988,
"kl_loss_2": 1228.7942749023437,
"kl_loss_3": 851.1504791259765,
"kl_loss_7": 214.15290603637695,
"learning_rate": 0.0007220333063028871,
"loss": 593.6457,
"step": 3600
},
{
"ce_loss_10": 3.7029056310653687,
"ce_loss_13": 3.6388812899589538,
"ce_loss_2": 4.263094091415406,
"ce_loss_3": 4.055423867702484,
"ce_loss_7": 3.7583480000495912,
"epoch": 0.361,
"grad_norm": 406.0,
"kl_loss_10": 103.6033935546875,
"kl_loss_2": 1316.5648254394532,
"kl_loss_3": 896.4495971679687,
"kl_loss_7": 217.90971908569335,
"learning_rate": 0.0007206105506216106,
"loss": 621.4246,
"step": 3610
},
{
"ce_loss_10": 3.582909846305847,
"ce_loss_13": 3.5207375407218935,
"ce_loss_2": 4.105194330215454,
"ce_loss_3": 3.92072172164917,
"ce_loss_7": 3.6367709159851076,
"epoch": 0.362,
"grad_norm": 488.0,
"kl_loss_10": 100.51245307922363,
"kl_loss_2": 1208.4382385253907,
"kl_loss_3": 842.719369506836,
"kl_loss_7": 209.43429107666014,
"learning_rate": 0.0007191855733945387,
"loss": 586.8207,
"step": 3620
},
{
"ce_loss_10": 3.6772588729858398,
"ce_loss_13": 3.611865592002869,
"ce_loss_2": 4.192759323120117,
"ce_loss_3": 4.0132176041603085,
"ce_loss_7": 3.7312068581581115,
"epoch": 0.363,
"grad_norm": 482.0,
"kl_loss_10": 103.05736274719239,
"kl_loss_2": 1206.339794921875,
"kl_loss_3": 840.5841491699218,
"kl_loss_7": 209.33160095214845,
"learning_rate": 0.0007177583889711762,
"loss": 590.5756,
"step": 3630
},
{
"ce_loss_10": 3.5943727612495424,
"ce_loss_13": 3.5278201699256897,
"ce_loss_2": 4.115126085281372,
"ce_loss_3": 3.9359707951545717,
"ce_loss_7": 3.64764518737793,
"epoch": 0.364,
"grad_norm": 474.0,
"kl_loss_10": 104.63778533935547,
"kl_loss_2": 1232.7115539550782,
"kl_loss_3": 867.7350891113281,
"kl_loss_7": 215.38798904418945,
"learning_rate": 0.0007163290117232541,
"loss": 602.1762,
"step": 3640
},
{
"ce_loss_10": 3.719394052028656,
"ce_loss_13": 3.6543713212013245,
"ce_loss_2": 4.207157838344574,
"ce_loss_3": 4.033388280868531,
"ce_loss_7": 3.766360378265381,
"epoch": 0.365,
"grad_norm": 516.0,
"kl_loss_10": 106.55956001281739,
"kl_loss_2": 1177.5490844726562,
"kl_loss_3": 820.275503540039,
"kl_loss_7": 210.7781494140625,
"learning_rate": 0.0007148974560445859,
"loss": 585.3312,
"step": 3650
},
{
"ce_loss_10": 3.63283451795578,
"ce_loss_13": 3.569260811805725,
"ce_loss_2": 4.140059876441955,
"ce_loss_3": 3.9612114429473877,
"ce_loss_7": 3.68426718711853,
"epoch": 0.366,
"grad_norm": 446.0,
"kl_loss_10": 101.39652633666992,
"kl_loss_2": 1181.2005432128906,
"kl_loss_3": 826.3975830078125,
"kl_loss_7": 208.74162216186522,
"learning_rate": 0.0007134637363509209,
"loss": 580.396,
"step": 3660
},
{
"ce_loss_10": 3.740837073326111,
"ce_loss_13": 3.676628518104553,
"ce_loss_2": 4.238210546970367,
"ce_loss_3": 4.064305305480957,
"ce_loss_7": 3.7917707443237303,
"epoch": 0.367,
"grad_norm": 374.0,
"kl_loss_10": 102.68134994506836,
"kl_loss_2": 1165.9671203613282,
"kl_loss_3": 815.8925506591797,
"kl_loss_7": 205.73183975219726,
"learning_rate": 0.0007120278670798009,
"loss": 586.6874,
"step": 3670
},
{
"ce_loss_10": 3.530075693130493,
"ce_loss_13": 3.467638063430786,
"ce_loss_2": 4.08873633146286,
"ce_loss_3": 3.8983967661857606,
"ce_loss_7": 3.590684974193573,
"epoch": 0.368,
"grad_norm": 504.0,
"kl_loss_10": 102.20494270324707,
"kl_loss_2": 1276.5897247314454,
"kl_loss_3": 894.699105834961,
"kl_loss_7": 217.834383392334,
"learning_rate": 0.0007105898626904133,
"loss": 620.3519,
"step": 3680
},
{
"ce_loss_10": 3.6397287964820864,
"ce_loss_13": 3.576084387302399,
"ce_loss_2": 4.165349864959717,
"ce_loss_3": 3.9844519972801207,
"ce_loss_7": 3.6932525277137755,
"epoch": 0.369,
"grad_norm": 548.0,
"kl_loss_10": 103.31561088562012,
"kl_loss_2": 1214.6401062011719,
"kl_loss_3": 850.1350677490234,
"kl_loss_7": 211.8514373779297,
"learning_rate": 0.0007091497376634463,
"loss": 587.3888,
"step": 3690
},
{
"ce_loss_10": 3.580397891998291,
"ce_loss_13": 3.518483591079712,
"ce_loss_2": 4.098948669433594,
"ce_loss_3": 3.9198103308677674,
"ce_loss_7": 3.633430314064026,
"epoch": 0.37,
"grad_norm": 462.0,
"kl_loss_10": 102.7860034942627,
"kl_loss_2": 1196.8778686523438,
"kl_loss_3": 839.7853210449218,
"kl_loss_7": 210.37151184082032,
"learning_rate": 0.0007077075065009433,
"loss": 599.0922,
"step": 3700
},
{
"ce_loss_10": 3.6922479033470155,
"ce_loss_13": 3.6247249126434324,
"ce_loss_2": 4.215528225898742,
"ce_loss_3": 4.034583401679993,
"ce_loss_7": 3.7439934253692626,
"epoch": 0.371,
"grad_norm": 436.0,
"kl_loss_10": 107.0543056488037,
"kl_loss_2": 1234.6434143066406,
"kl_loss_3": 869.9170135498047,
"kl_loss_7": 215.78035430908204,
"learning_rate": 0.0007062631837261557,
"loss": 601.1125,
"step": 3710
},
{
"ce_loss_10": 3.558840346336365,
"ce_loss_13": 3.4976505637168884,
"ce_loss_2": 4.082807242870331,
"ce_loss_3": 3.90502552986145,
"ce_loss_7": 3.611116898059845,
"epoch": 0.372,
"grad_norm": 418.0,
"kl_loss_10": 102.55169563293457,
"kl_loss_2": 1217.97548828125,
"kl_loss_3": 855.1094757080078,
"kl_loss_7": 209.0750946044922,
"learning_rate": 0.0007048167838833977,
"loss": 602.8635,
"step": 3720
},
{
"ce_loss_10": 3.6581831574440002,
"ce_loss_13": 3.593174624443054,
"ce_loss_2": 4.162305021286011,
"ce_loss_3": 3.9847410321235657,
"ce_loss_7": 3.7109787225723267,
"epoch": 0.373,
"grad_norm": 536.0,
"kl_loss_10": 103.06450958251953,
"kl_loss_2": 1197.146795654297,
"kl_loss_3": 834.3573669433594,
"kl_loss_7": 209.46187515258788,
"learning_rate": 0.0007033683215379002,
"loss": 588.3938,
"step": 3730
},
{
"ce_loss_10": 3.6515901923179626,
"ce_loss_13": 3.586732280254364,
"ce_loss_2": 4.166427576541901,
"ce_loss_3": 3.9861610412597654,
"ce_loss_7": 3.703124833106995,
"epoch": 0.374,
"grad_norm": 384.0,
"kl_loss_10": 101.91668891906738,
"kl_loss_2": 1196.090036010742,
"kl_loss_3": 834.7775848388671,
"kl_loss_7": 206.9270217895508,
"learning_rate": 0.0007019178112756625,
"loss": 596.7028,
"step": 3740
},
{
"ce_loss_10": 3.5998276591300966,
"ce_loss_13": 3.539226603507996,
"ce_loss_2": 4.120503497123718,
"ce_loss_3": 3.938064229488373,
"ce_loss_7": 3.6514668703079223,
"epoch": 0.375,
"grad_norm": 484.0,
"kl_loss_10": 101.7071418762207,
"kl_loss_2": 1206.4351013183593,
"kl_loss_3": 842.5018493652344,
"kl_loss_7": 207.55127868652343,
"learning_rate": 0.0007004652677033068,
"loss": 596.7216,
"step": 3750
},
{
"ce_loss_10": 3.6823023438453673,
"ce_loss_13": 3.6218234419822695,
"ce_loss_2": 4.1750637769699095,
"ce_loss_3": 4.004729413986206,
"ce_loss_7": 3.732503056526184,
"epoch": 0.376,
"grad_norm": 388.0,
"kl_loss_10": 99.9868221282959,
"kl_loss_2": 1168.4398498535156,
"kl_loss_3": 816.7180572509766,
"kl_loss_7": 201.70328750610352,
"learning_rate": 0.0006990107054479312,
"loss": 584.5167,
"step": 3760
},
{
"ce_loss_10": 3.667929840087891,
"ce_loss_13": 3.6051357984542847,
"ce_loss_2": 4.166206574440002,
"ce_loss_3": 3.9985297203063963,
"ce_loss_7": 3.719240057468414,
"epoch": 0.377,
"grad_norm": 496.0,
"kl_loss_10": 102.5582088470459,
"kl_loss_2": 1182.1695739746094,
"kl_loss_3": 832.6118957519532,
"kl_loss_7": 206.43120498657225,
"learning_rate": 0.000697554139156961,
"loss": 586.6759,
"step": 3770
},
{
"ce_loss_10": 3.648312306404114,
"ce_loss_13": 3.5864667892456055,
"ce_loss_2": 4.165168154239654,
"ce_loss_3": 3.980992519855499,
"ce_loss_7": 3.703998303413391,
"epoch": 0.378,
"grad_norm": 532.0,
"kl_loss_10": 102.77268753051757,
"kl_loss_2": 1217.0308044433593,
"kl_loss_3": 845.4426635742187,
"kl_loss_7": 211.65556106567382,
"learning_rate": 0.0006960955834980027,
"loss": 586.9333,
"step": 3780
},
{
"ce_loss_10": 3.624769401550293,
"ce_loss_13": 3.559871160984039,
"ce_loss_2": 4.141481828689575,
"ce_loss_3": 3.9655247926712036,
"ce_loss_7": 3.681060993671417,
"epoch": 0.379,
"grad_norm": 402.0,
"kl_loss_10": 104.66882057189942,
"kl_loss_2": 1194.9725402832032,
"kl_loss_3": 840.5210388183593,
"kl_loss_7": 214.32746124267578,
"learning_rate": 0.0006946350531586958,
"loss": 591.0428,
"step": 3790
},
{
"ce_loss_10": 3.6484233260154726,
"ce_loss_13": 3.5856125354766846,
"ce_loss_2": 4.168078374862671,
"ce_loss_3": 3.984307587146759,
"ce_loss_7": 3.7046299457550047,
"epoch": 0.38,
"grad_norm": 494.0,
"kl_loss_10": 102.10320167541504,
"kl_loss_2": 1202.4750549316407,
"kl_loss_3": 836.4282287597656,
"kl_loss_7": 215.46153411865234,
"learning_rate": 0.0006931725628465643,
"loss": 600.8652,
"step": 3800
},
{
"ce_loss_10": 3.669872498512268,
"ce_loss_13": 3.606708490848541,
"ce_loss_2": 4.190654408931732,
"ce_loss_3": 4.012761104106903,
"ce_loss_7": 3.725092887878418,
"epoch": 0.381,
"grad_norm": 462.0,
"kl_loss_10": 105.94147644042968,
"kl_loss_2": 1198.5632446289062,
"kl_loss_3": 842.3563995361328,
"kl_loss_7": 216.23879013061523,
"learning_rate": 0.0006917081272888696,
"loss": 594.3836,
"step": 3810
},
{
"ce_loss_10": 3.5702871322631835,
"ce_loss_13": 3.503624665737152,
"ce_loss_2": 4.083241939544678,
"ce_loss_3": 3.9013825416564942,
"ce_loss_7": 3.6281121611595153,
"epoch": 0.382,
"grad_norm": 430.0,
"kl_loss_10": 104.559330368042,
"kl_loss_2": 1205.9051391601563,
"kl_loss_3": 846.9787689208985,
"kl_loss_7": 214.2649803161621,
"learning_rate": 0.0006902417612324615,
"loss": 588.9565,
"step": 3820
},
{
"ce_loss_10": 3.705217492580414,
"ce_loss_13": 3.6370500326156616,
"ce_loss_2": 4.2347581624984745,
"ce_loss_3": 4.056124079227447,
"ce_loss_7": 3.7589930057525636,
"epoch": 0.383,
"grad_norm": 418.0,
"kl_loss_10": 107.22665023803711,
"kl_loss_2": 1242.482080078125,
"kl_loss_3": 871.0590393066407,
"kl_loss_7": 218.71700134277344,
"learning_rate": 0.00068877347944363,
"loss": 600.3775,
"step": 3830
},
{
"ce_loss_10": 3.6945597529411316,
"ce_loss_13": 3.6302199006080627,
"ce_loss_2": 4.190360188484192,
"ce_loss_3": 4.017442071437836,
"ce_loss_7": 3.74516099691391,
"epoch": 0.384,
"grad_norm": 460.0,
"kl_loss_10": 105.2132453918457,
"kl_loss_2": 1180.0169799804687,
"kl_loss_3": 825.1839294433594,
"kl_loss_7": 210.17990188598634,
"learning_rate": 0.0006873032967079561,
"loss": 592.1172,
"step": 3840
},
{
"ce_loss_10": 3.6860820412635804,
"ce_loss_13": 3.622925412654877,
"ce_loss_2": 4.173858499526977,
"ce_loss_3": 4.0060118436813354,
"ce_loss_7": 3.7361050128936766,
"epoch": 0.385,
"grad_norm": 444.0,
"kl_loss_10": 102.31974792480469,
"kl_loss_2": 1169.402410888672,
"kl_loss_3": 819.6500732421875,
"kl_loss_7": 207.8970947265625,
"learning_rate": 0.0006858312278301637,
"loss": 578.5368,
"step": 3850
},
{
"ce_loss_10": 3.724821174144745,
"ce_loss_13": 3.6599106669425963,
"ce_loss_2": 4.216867661476135,
"ce_loss_3": 4.043015420436859,
"ce_loss_7": 3.7741833090782166,
"epoch": 0.386,
"grad_norm": 628.0,
"kl_loss_10": 105.45792541503906,
"kl_loss_2": 1182.8445251464843,
"kl_loss_3": 827.4248168945312,
"kl_loss_7": 208.66201171875,
"learning_rate": 0.0006843572876339704,
"loss": 581.9299,
"step": 3860
},
{
"ce_loss_10": 3.639630389213562,
"ce_loss_13": 3.578851103782654,
"ce_loss_2": 4.1167685151100155,
"ce_loss_3": 3.953296732902527,
"ce_loss_7": 3.6866363167762755,
"epoch": 0.387,
"grad_norm": 402.0,
"kl_loss_10": 101.30325736999512,
"kl_loss_2": 1144.7853637695312,
"kl_loss_3": 802.1904113769531,
"kl_loss_7": 201.72076492309571,
"learning_rate": 0.0006828814909619373,
"loss": 586.7184,
"step": 3870
},
{
"ce_loss_10": 3.7647191643714906,
"ce_loss_13": 3.697379672527313,
"ce_loss_2": 4.260519480705261,
"ce_loss_3": 4.083593368530273,
"ce_loss_7": 3.813764202594757,
"epoch": 0.388,
"grad_norm": 350.0,
"kl_loss_10": 106.36605720520019,
"kl_loss_2": 1172.6269104003907,
"kl_loss_3": 820.4572174072266,
"kl_loss_7": 210.88503875732422,
"learning_rate": 0.0006814038526753205,
"loss": 576.9886,
"step": 3880
},
{
"ce_loss_10": 3.6557364583015444,
"ce_loss_13": 3.5924967169761657,
"ce_loss_2": 4.160025131702423,
"ce_loss_3": 3.984356963634491,
"ce_loss_7": 3.7067020535469055,
"epoch": 0.389,
"grad_norm": 330.0,
"kl_loss_10": 102.68659782409668,
"kl_loss_2": 1186.152001953125,
"kl_loss_3": 826.8501800537109,
"kl_loss_7": 206.71521759033203,
"learning_rate": 0.0006799243876539213,
"loss": 580.4666,
"step": 3890
},
{
"ce_loss_10": 3.5759631991386414,
"ce_loss_13": 3.5127877712249758,
"ce_loss_2": 4.105723321437836,
"ce_loss_3": 3.9167493343353272,
"ce_loss_7": 3.6288220643997193,
"epoch": 0.39,
"grad_norm": 536.0,
"kl_loss_10": 103.75163269042969,
"kl_loss_2": 1215.1460266113281,
"kl_loss_3": 839.8725982666016,
"kl_loss_7": 208.5065475463867,
"learning_rate": 0.0006784431107959359,
"loss": 592.4442,
"step": 3900
},
{
"ce_loss_10": 3.639443838596344,
"ce_loss_13": 3.5752380013465883,
"ce_loss_2": 4.170507109165191,
"ce_loss_3": 3.9816882967948914,
"ce_loss_7": 3.694754195213318,
"epoch": 0.391,
"grad_norm": 510.0,
"kl_loss_10": 103.07575302124023,
"kl_loss_2": 1237.5377136230468,
"kl_loss_3": 858.0287719726563,
"kl_loss_7": 214.26128845214845,
"learning_rate": 0.0006769600370178059,
"loss": 594.2272,
"step": 3910
},
{
"ce_loss_10": 3.607291209697723,
"ce_loss_13": 3.5426042318344115,
"ce_loss_2": 4.134967279434204,
"ce_loss_3": 3.9495469093322755,
"ce_loss_7": 3.6644778490066527,
"epoch": 0.392,
"grad_norm": 348.0,
"kl_loss_10": 100.81994514465332,
"kl_loss_2": 1201.7113891601562,
"kl_loss_3": 841.3645660400391,
"kl_loss_7": 207.30770874023438,
"learning_rate": 0.0006754751812540679,
"loss": 578.4809,
"step": 3920
},
{
"ce_loss_10": 3.6542662262916563,
"ce_loss_13": 3.5899597883224486,
"ce_loss_2": 4.172767472267151,
"ce_loss_3": 3.9909741401672365,
"ce_loss_7": 3.706152844429016,
"epoch": 0.393,
"grad_norm": 440.0,
"kl_loss_10": 104.03220100402832,
"kl_loss_2": 1209.6233947753906,
"kl_loss_3": 843.8147003173829,
"kl_loss_7": 210.7646583557129,
"learning_rate": 0.0006739885584572025,
"loss": 592.3653,
"step": 3930
},
{
"ce_loss_10": 3.685343015193939,
"ce_loss_13": 3.619848680496216,
"ce_loss_2": 4.199707639217377,
"ce_loss_3": 4.017499768733979,
"ce_loss_7": 3.734171211719513,
"epoch": 0.394,
"grad_norm": 564.0,
"kl_loss_10": 107.80731964111328,
"kl_loss_2": 1232.0240844726563,
"kl_loss_3": 850.9272064208984,
"kl_loss_7": 211.88618087768555,
"learning_rate": 0.0006725001835974853,
"loss": 590.3288,
"step": 3940
},
{
"ce_loss_10": 3.671092712879181,
"ce_loss_13": 3.6061443567276,
"ce_loss_2": 4.189756679534912,
"ce_loss_3": 4.005955624580383,
"ce_loss_7": 3.7217952370643617,
"epoch": 0.395,
"grad_norm": 472.0,
"kl_loss_10": 105.94960823059083,
"kl_loss_2": 1209.6172180175781,
"kl_loss_3": 848.8837646484375,
"kl_loss_7": 211.4744026184082,
"learning_rate": 0.0006710100716628344,
"loss": 581.9217,
"step": 3950
},
{
"ce_loss_10": 3.6513510942459106,
"ce_loss_13": 3.586063766479492,
"ce_loss_2": 4.175520932674408,
"ce_loss_3": 3.992800068855286,
"ce_loss_7": 3.7037784814834596,
"epoch": 0.396,
"grad_norm": 556.0,
"kl_loss_10": 102.45261993408204,
"kl_loss_2": 1202.025439453125,
"kl_loss_3": 843.4705932617187,
"kl_loss_7": 207.75647506713867,
"learning_rate": 0.0006695182376586602,
"loss": 594.7452,
"step": 3960
},
{
"ce_loss_10": 3.6946488857269286,
"ce_loss_13": 3.6310433030128477,
"ce_loss_2": 4.180384719371796,
"ce_loss_3": 4.00883582830429,
"ce_loss_7": 3.739116144180298,
"epoch": 0.397,
"grad_norm": 484.0,
"kl_loss_10": 100.45674743652344,
"kl_loss_2": 1141.924838256836,
"kl_loss_3": 795.2099151611328,
"kl_loss_7": 201.57386474609376,
"learning_rate": 0.000668024696607715,
"loss": 581.8865,
"step": 3970
},
{
"ce_loss_10": 3.63701788187027,
"ce_loss_13": 3.5759130001068113,
"ce_loss_2": 4.141798782348633,
"ce_loss_3": 3.965423548221588,
"ce_loss_7": 3.691797506809235,
"epoch": 0.398,
"grad_norm": 402.0,
"kl_loss_10": 99.83709602355957,
"kl_loss_2": 1189.6253723144532,
"kl_loss_3": 836.8567596435547,
"kl_loss_7": 210.05224533081054,
"learning_rate": 0.0006665294635499404,
"loss": 585.3059,
"step": 3980
},
{
"ce_loss_10": 3.645500433444977,
"ce_loss_13": 3.5827003002166746,
"ce_loss_2": 4.174324834346772,
"ce_loss_3": 3.992855429649353,
"ce_loss_7": 3.7015270590782166,
"epoch": 0.399,
"grad_norm": 438.0,
"kl_loss_10": 103.66120948791504,
"kl_loss_2": 1245.642510986328,
"kl_loss_3": 869.6440063476563,
"kl_loss_7": 216.26355361938477,
"learning_rate": 0.0006650325535423167,
"loss": 596.3225,
"step": 3990
},
{
"ce_loss_10": 3.6747123122215273,
"ce_loss_13": 3.6138512253761292,
"ce_loss_2": 4.168187916278839,
"ce_loss_3": 3.993897998332977,
"ce_loss_7": 3.725596582889557,
"epoch": 0.4,
"grad_norm": 520.0,
"kl_loss_10": 96.3211498260498,
"kl_loss_2": 1152.9211303710938,
"kl_loss_3": 801.8546081542969,
"kl_loss_7": 200.72928695678712,
"learning_rate": 0.0006635339816587109,
"loss": 575.9933,
"step": 4000
},
{
"ce_loss_10": 3.6128929018974305,
"ce_loss_13": 3.548132801055908,
"ce_loss_2": 4.128501725196839,
"ce_loss_3": 3.945591115951538,
"ce_loss_7": 3.6652005195617674,
"epoch": 0.401,
"grad_norm": 430.0,
"kl_loss_10": 103.19527244567871,
"kl_loss_2": 1214.8156677246093,
"kl_loss_3": 840.3229400634766,
"kl_loss_7": 210.74479904174805,
"learning_rate": 0.0006620337629897252,
"loss": 583.2822,
"step": 4010
},
{
"ce_loss_10": 3.619123613834381,
"ce_loss_13": 3.5573631048202516,
"ce_loss_2": 4.140160727500915,
"ce_loss_3": 3.958257591724396,
"ce_loss_7": 3.674074041843414,
"epoch": 0.402,
"grad_norm": 432.0,
"kl_loss_10": 100.38173408508301,
"kl_loss_2": 1207.5167907714845,
"kl_loss_3": 837.2485626220703,
"kl_loss_7": 208.48973083496094,
"learning_rate": 0.0006605319126425454,
"loss": 597.1898,
"step": 4020
},
{
"ce_loss_10": 3.5208260893821715,
"ce_loss_13": 3.4589377880096435,
"ce_loss_2": 4.050716698169708,
"ce_loss_3": 3.8632638931274412,
"ce_loss_7": 3.5759450912475588,
"epoch": 0.403,
"grad_norm": 420.0,
"kl_loss_10": 100.48741989135742,
"kl_loss_2": 1233.5194946289062,
"kl_loss_3": 854.4578369140625,
"kl_loss_7": 208.70274200439454,
"learning_rate": 0.0006590284457407876,
"loss": 593.5098,
"step": 4030
},
{
"ce_loss_10": 3.6270558714866636,
"ce_loss_13": 3.5626144886016844,
"ce_loss_2": 4.136511921882629,
"ce_loss_3": 3.957785797119141,
"ce_loss_7": 3.6768479347229004,
"epoch": 0.404,
"grad_norm": 392.0,
"kl_loss_10": 101.69999923706055,
"kl_loss_2": 1185.4601745605469,
"kl_loss_3": 821.0296905517578,
"kl_loss_7": 206.82139434814454,
"learning_rate": 0.0006575233774243465,
"loss": 582.2525,
"step": 4040
},
{
"ce_loss_10": 3.612906110286713,
"ce_loss_13": 3.550376224517822,
"ce_loss_2": 4.1283538222312925,
"ce_loss_3": 3.951547086238861,
"ce_loss_7": 3.667691433429718,
"epoch": 0.405,
"grad_norm": 464.0,
"kl_loss_10": 100.57203559875488,
"kl_loss_2": 1203.0161071777343,
"kl_loss_3": 838.8151794433594,
"kl_loss_7": 210.55067977905273,
"learning_rate": 0.0006560167228492435,
"loss": 587.686,
"step": 4050
},
{
"ce_loss_10": 3.6582042455673216,
"ce_loss_13": 3.597072696685791,
"ce_loss_2": 4.15371550321579,
"ce_loss_3": 3.9819828867912292,
"ce_loss_7": 3.7127379179000854,
"epoch": 0.406,
"grad_norm": 396.0,
"kl_loss_10": 97.44431228637696,
"kl_loss_2": 1157.4290466308594,
"kl_loss_3": 807.0505889892578,
"kl_loss_7": 202.94429702758788,
"learning_rate": 0.0006545084971874737,
"loss": 580.7177,
"step": 4060
},
{
"ce_loss_10": 3.6273567199707033,
"ce_loss_13": 3.564158725738525,
"ce_loss_2": 4.158101809024811,
"ce_loss_3": 3.9733991026878357,
"ce_loss_7": 3.685515010356903,
"epoch": 0.407,
"grad_norm": 372.0,
"kl_loss_10": 103.08215293884277,
"kl_loss_2": 1230.8001892089844,
"kl_loss_3": 853.4359588623047,
"kl_loss_7": 216.80452346801758,
"learning_rate": 0.0006529987156268526,
"loss": 583.8351,
"step": 4070
},
{
"ce_loss_10": 3.5464280128479,
"ce_loss_13": 3.481638014316559,
"ce_loss_2": 4.076263022422791,
"ce_loss_3": 3.8974447727203367,
"ce_loss_7": 3.6043801426887514,
"epoch": 0.408,
"grad_norm": 350.0,
"kl_loss_10": 102.87330780029296,
"kl_loss_2": 1214.2586059570312,
"kl_loss_3": 851.9112091064453,
"kl_loss_7": 211.73340759277343,
"learning_rate": 0.0006514873933708637,
"loss": 602.7298,
"step": 4080
},
{
"ce_loss_10": 3.6543262004852295,
"ce_loss_13": 3.5908489346504213,
"ce_loss_2": 4.153554606437683,
"ce_loss_3": 3.9771866679191588,
"ce_loss_7": 3.703446090221405,
"epoch": 0.409,
"grad_norm": 378.0,
"kl_loss_10": 100.85495872497559,
"kl_loss_2": 1179.416357421875,
"kl_loss_3": 822.3047607421875,
"kl_loss_7": 207.08517990112304,
"learning_rate": 0.0006499745456385053,
"loss": 579.5981,
"step": 4090
},
{
"ce_loss_10": 3.622114622592926,
"ce_loss_13": 3.5604026079177857,
"ce_loss_2": 4.138943600654602,
"ce_loss_3": 3.9601905822753904,
"ce_loss_7": 3.6786248087882996,
"epoch": 0.41,
"grad_norm": 460.0,
"kl_loss_10": 101.49279441833497,
"kl_loss_2": 1187.613018798828,
"kl_loss_3": 832.265737915039,
"kl_loss_7": 211.90668182373048,
"learning_rate": 0.0006484601876641375,
"loss": 591.7443,
"step": 4100
},
{
"ce_loss_10": 3.6106685280799864,
"ce_loss_13": 3.5491909265518187,
"ce_loss_2": 4.104636693000794,
"ce_loss_3": 3.9329436659812926,
"ce_loss_7": 3.6641584396362306,
"epoch": 0.411,
"grad_norm": 378.0,
"kl_loss_10": 101.25703315734863,
"kl_loss_2": 1168.0580017089844,
"kl_loss_3": 813.8080810546875,
"kl_loss_7": 212.12922592163085,
"learning_rate": 0.000646944334697328,
"loss": 577.3537,
"step": 4110
},
{
"ce_loss_10": 3.7338776111602785,
"ce_loss_13": 3.665091943740845,
"ce_loss_2": 4.2223006844520565,
"ce_loss_3": 4.049113523960114,
"ce_loss_7": 3.799789845943451,
"epoch": 0.412,
"grad_norm": 450.0,
"kl_loss_10": 109.65744743347167,
"kl_loss_2": 1151.4740142822266,
"kl_loss_3": 801.2218536376953,
"kl_loss_7": 236.72526626586915,
"learning_rate": 0.0006454270020026995,
"loss": 574.9525,
"step": 4120
},
{
"ce_loss_10": 3.69082772731781,
"ce_loss_13": 3.6286051154136656,
"ce_loss_2": 4.175914537906647,
"ce_loss_3": 4.002845597267151,
"ce_loss_7": 3.7393308877944946,
"epoch": 0.413,
"grad_norm": 580.0,
"kl_loss_10": 104.95364952087402,
"kl_loss_2": 1127.3133270263672,
"kl_loss_3": 788.5207000732422,
"kl_loss_7": 214.98480072021485,
"learning_rate": 0.0006439082048597755,
"loss": 564.7141,
"step": 4130
},
{
"ce_loss_10": 3.683094894886017,
"ce_loss_13": 3.61643271446228,
"ce_loss_2": 4.181109619140625,
"ce_loss_3": 4.005432403087616,
"ce_loss_7": 3.745869052410126,
"epoch": 0.414,
"grad_norm": 520.0,
"kl_loss_10": 111.28029708862304,
"kl_loss_2": 1178.55703125,
"kl_loss_3": 823.4579254150391,
"kl_loss_7": 238.62436599731444,
"learning_rate": 0.0006423879585628261,
"loss": 585.353,
"step": 4140
},
{
"ce_loss_10": 3.648063910007477,
"ce_loss_13": 3.579416477680206,
"ce_loss_2": 4.166888773441315,
"ce_loss_3": 3.98115758895874,
"ce_loss_7": 3.7089965462684633,
"epoch": 0.415,
"grad_norm": 402.0,
"kl_loss_10": 109.57027854919434,
"kl_loss_2": 1214.0814270019532,
"kl_loss_3": 843.1505004882813,
"kl_loss_7": 233.17276763916016,
"learning_rate": 0.0006408662784207149,
"loss": 596.7986,
"step": 4150
},
{
"ce_loss_10": 3.596502733230591,
"ce_loss_13": 3.5327386379241945,
"ce_loss_2": 4.09819370508194,
"ce_loss_3": 3.9237332344055176,
"ce_loss_7": 3.654523158073425,
"epoch": 0.416,
"grad_norm": 544.0,
"kl_loss_10": 99.90503120422363,
"kl_loss_2": 1189.1891540527345,
"kl_loss_3": 823.6777069091797,
"kl_loss_7": 211.67333221435547,
"learning_rate": 0.0006393431797567439,
"loss": 583.1826,
"step": 4160
},
{
"ce_loss_10": 3.6853842735290527,
"ce_loss_13": 3.622405004501343,
"ce_loss_2": 4.1561102867126465,
"ce_loss_3": 3.9865566968917845,
"ce_loss_7": 3.7344152450561525,
"epoch": 0.417,
"grad_norm": 384.0,
"kl_loss_10": 103.1281753540039,
"kl_loss_2": 1144.869805908203,
"kl_loss_3": 800.3423767089844,
"kl_loss_7": 211.40862579345702,
"learning_rate": 0.0006378186779084996,
"loss": 557.4173,
"step": 4170
},
{
"ce_loss_10": 3.5140963315963747,
"ce_loss_13": 3.452511179447174,
"ce_loss_2": 4.041843056678772,
"ce_loss_3": 3.857197344303131,
"ce_loss_7": 3.571711480617523,
"epoch": 0.418,
"grad_norm": 464.0,
"kl_loss_10": 100.09027862548828,
"kl_loss_2": 1203.0338989257812,
"kl_loss_3": 838.9081939697265,
"kl_loss_7": 213.11346130371095,
"learning_rate": 0.0006362927882276989,
"loss": 588.2966,
"step": 4180
},
{
"ce_loss_10": 3.7188942313194273,
"ce_loss_13": 3.6518460750579833,
"ce_loss_2": 4.204531168937683,
"ce_loss_3": 4.025935411453247,
"ce_loss_7": 3.7728618144989015,
"epoch": 0.419,
"grad_norm": 426.0,
"kl_loss_10": 103.15027618408203,
"kl_loss_2": 1156.1428161621093,
"kl_loss_3": 794.2856292724609,
"kl_loss_7": 211.89537048339844,
"learning_rate": 0.000634765526080034,
"loss": 562.2326,
"step": 4190
},
{
"ce_loss_10": 3.717780148983002,
"ce_loss_13": 3.6511818051338194,
"ce_loss_2": 4.210239946842194,
"ce_loss_3": 4.0393988490104675,
"ce_loss_7": 3.7724336862564085,
"epoch": 0.42,
"grad_norm": 456.0,
"kl_loss_10": 104.51988563537597,
"kl_loss_2": 1161.7059631347656,
"kl_loss_3": 818.392855834961,
"kl_loss_7": 219.07965316772462,
"learning_rate": 0.0006332369068450174,
"loss": 570.1012,
"step": 4200
},
{
"ce_loss_10": 3.648071753978729,
"ce_loss_13": 3.5840353846549986,
"ce_loss_2": 4.147714996337891,
"ce_loss_3": 3.972030484676361,
"ce_loss_7": 3.7039226770401,
"epoch": 0.421,
"grad_norm": 426.0,
"kl_loss_10": 101.72255935668946,
"kl_loss_2": 1175.2358459472657,
"kl_loss_3": 821.6455657958984,
"kl_loss_7": 216.67398834228516,
"learning_rate": 0.0006317069459158283,
"loss": 576.074,
"step": 4210
},
{
"ce_loss_10": 3.766611933708191,
"ce_loss_13": 3.7019524574279785,
"ce_loss_2": 4.238518404960632,
"ce_loss_3": 4.070182096958161,
"ce_loss_7": 3.818829393386841,
"epoch": 0.422,
"grad_norm": 404.0,
"kl_loss_10": 102.42731742858886,
"kl_loss_2": 1134.2777221679687,
"kl_loss_3": 793.5420806884765,
"kl_loss_7": 214.86822509765625,
"learning_rate": 0.0006301756586991561,
"loss": 572.4437,
"step": 4220
},
{
"ce_loss_10": 3.538297724723816,
"ce_loss_13": 3.4769801259040833,
"ce_loss_2": 4.051598787307739,
"ce_loss_3": 3.8692006349563597,
"ce_loss_7": 3.592081093788147,
"epoch": 0.423,
"grad_norm": 524.0,
"kl_loss_10": 100.02308959960938,
"kl_loss_2": 1219.534228515625,
"kl_loss_3": 847.8958953857422,
"kl_loss_7": 217.3907485961914,
"learning_rate": 0.0006286430606150459,
"loss": 590.4341,
"step": 4230
},
{
"ce_loss_10": 3.732722854614258,
"ce_loss_13": 3.670178234577179,
"ce_loss_2": 4.228793060779571,
"ce_loss_3": 4.055911266803742,
"ce_loss_7": 3.7854557275772094,
"epoch": 0.424,
"grad_norm": 440.0,
"kl_loss_10": 101.63710746765136,
"kl_loss_2": 1171.4819213867188,
"kl_loss_3": 815.24853515625,
"kl_loss_7": 212.84099502563475,
"learning_rate": 0.0006271091670967436,
"loss": 572.0026,
"step": 4240
},
{
"ce_loss_10": 3.64589341878891,
"ce_loss_13": 3.579445707798004,
"ce_loss_2": 4.168534743785858,
"ce_loss_3": 3.9873276472091677,
"ce_loss_7": 3.7041419625282286,
"epoch": 0.425,
"grad_norm": 436.0,
"kl_loss_10": 105.33321189880371,
"kl_loss_2": 1223.9686584472656,
"kl_loss_3": 856.7900268554688,
"kl_loss_7": 219.8565589904785,
"learning_rate": 0.0006255739935905395,
"loss": 587.2729,
"step": 4250
},
{
"ce_loss_10": 3.684093916416168,
"ce_loss_13": 3.622530627250671,
"ce_loss_2": 4.176068413257599,
"ce_loss_3": 4.005461478233338,
"ce_loss_7": 3.73612722158432,
"epoch": 0.426,
"grad_norm": 444.0,
"kl_loss_10": 101.16957168579101,
"kl_loss_2": 1151.114599609375,
"kl_loss_3": 804.5711151123047,
"kl_loss_7": 206.51019058227538,
"learning_rate": 0.0006240375555556145,
"loss": 584.5814,
"step": 4260
},
{
"ce_loss_10": 3.694865620136261,
"ce_loss_13": 3.6328345060348513,
"ce_loss_2": 4.216705179214477,
"ce_loss_3": 4.035941934585571,
"ce_loss_7": 3.7489806532859804,
"epoch": 0.427,
"grad_norm": 544.0,
"kl_loss_10": 102.23134536743164,
"kl_loss_2": 1200.0044555664062,
"kl_loss_3": 832.4086944580079,
"kl_loss_7": 208.58624954223632,
"learning_rate": 0.000622499868463882,
"loss": 581.1191,
"step": 4270
},
{
"ce_loss_10": 3.6664886713027953,
"ce_loss_13": 3.6031296968460085,
"ce_loss_2": 4.138775157928467,
"ce_loss_3": 3.968552088737488,
"ce_loss_7": 3.716127264499664,
"epoch": 0.428,
"grad_norm": 442.0,
"kl_loss_10": 102.83601112365723,
"kl_loss_2": 1148.9752075195313,
"kl_loss_3": 798.4193389892578,
"kl_loss_7": 204.8626609802246,
"learning_rate": 0.0006209609477998338,
"loss": 570.8694,
"step": 4280
},
{
"ce_loss_10": 3.7170133352279664,
"ce_loss_13": 3.6512863278388976,
"ce_loss_2": 4.214985513687134,
"ce_loss_3": 4.041373360157013,
"ce_loss_7": 3.76862713098526,
"epoch": 0.429,
"grad_norm": 492.0,
"kl_loss_10": 105.98460693359375,
"kl_loss_2": 1171.2547790527344,
"kl_loss_3": 819.7431121826172,
"kl_loss_7": 209.78300704956055,
"learning_rate": 0.0006194208090603844,
"loss": 582.6892,
"step": 4290
},
{
"ce_loss_10": 3.636822462081909,
"ce_loss_13": 3.572554814815521,
"ce_loss_2": 4.128273499011994,
"ce_loss_3": 3.9540862798690797,
"ce_loss_7": 3.6845338463783266,
"epoch": 0.43,
"grad_norm": 384.0,
"kl_loss_10": 104.19713554382324,
"kl_loss_2": 1158.2531616210938,
"kl_loss_3": 808.0290679931641,
"kl_loss_7": 201.06265716552736,
"learning_rate": 0.0006178794677547138,
"loss": 566.7275,
"step": 4300
},
{
"ce_loss_10": 3.669668412208557,
"ce_loss_13": 3.6048370003700256,
"ce_loss_2": 4.167822825908661,
"ce_loss_3": 3.990470898151398,
"ce_loss_7": 3.7204079270362853,
"epoch": 0.431,
"grad_norm": 462.0,
"kl_loss_10": 105.12696495056153,
"kl_loss_2": 1189.7153015136719,
"kl_loss_3": 827.7414642333985,
"kl_loss_7": 209.76073608398437,
"learning_rate": 0.0006163369394041111,
"loss": 578.5617,
"step": 4310
},
{
"ce_loss_10": 3.603849542140961,
"ce_loss_13": 3.540567708015442,
"ce_loss_2": 4.114995861053467,
"ce_loss_3": 3.93278226852417,
"ce_loss_7": 3.6533514499664306,
"epoch": 0.432,
"grad_norm": 524.0,
"kl_loss_10": 103.23071632385253,
"kl_loss_2": 1199.0398742675782,
"kl_loss_3": 837.4948120117188,
"kl_loss_7": 206.72886505126954,
"learning_rate": 0.0006147932395418205,
"loss": 593.6705,
"step": 4320
},
{
"ce_loss_10": 3.6318950057029724,
"ce_loss_13": 3.5694007515907287,
"ce_loss_2": 4.121479880809784,
"ce_loss_3": 3.9539971709251405,
"ce_loss_7": 3.6812774300575257,
"epoch": 0.433,
"grad_norm": 372.0,
"kl_loss_10": 101.08283462524415,
"kl_loss_2": 1163.6617614746094,
"kl_loss_3": 814.8068634033203,
"kl_loss_7": 204.31798858642577,
"learning_rate": 0.0006132483837128823,
"loss": 570.1899,
"step": 4330
},
{
"ce_loss_10": 3.6211368441581726,
"ce_loss_13": 3.5578442931175234,
"ce_loss_2": 4.120713996887207,
"ce_loss_3": 3.9408787965774534,
"ce_loss_7": 3.6715193152427674,
"epoch": 0.434,
"grad_norm": 380.0,
"kl_loss_10": 102.18530006408692,
"kl_loss_2": 1181.1154479980469,
"kl_loss_3": 821.5291748046875,
"kl_loss_7": 205.94673614501954,
"learning_rate": 0.0006117023874739772,
"loss": 579.966,
"step": 4340
},
{
"ce_loss_10": 3.606392514705658,
"ce_loss_13": 3.542631506919861,
"ce_loss_2": 4.1229788064956665,
"ce_loss_3": 3.943661665916443,
"ce_loss_7": 3.660093939304352,
"epoch": 0.435,
"grad_norm": 366.0,
"kl_loss_10": 101.41253623962402,
"kl_loss_2": 1198.5234008789062,
"kl_loss_3": 836.8120849609375,
"kl_loss_7": 206.9767189025879,
"learning_rate": 0.0006101552663932703,
"loss": 586.1095,
"step": 4350
},
{
"ce_loss_10": 3.6401270270347594,
"ce_loss_13": 3.5747036576271056,
"ce_loss_2": 4.133774304389954,
"ce_loss_3": 3.9579702854156493,
"ce_loss_7": 3.689171576499939,
"epoch": 0.436,
"grad_norm": 432.0,
"kl_loss_10": 103.28445014953613,
"kl_loss_2": 1170.830484008789,
"kl_loss_3": 821.6876098632813,
"kl_loss_7": 207.47048645019532,
"learning_rate": 0.0006086070360502539,
"loss": 578.1617,
"step": 4360
},
{
"ce_loss_10": 3.6478831648826597,
"ce_loss_13": 3.5829063415527345,
"ce_loss_2": 4.140194344520569,
"ce_loss_3": 3.9674217224121096,
"ce_loss_7": 3.6954386711120604,
"epoch": 0.437,
"grad_norm": 324.0,
"kl_loss_10": 102.49744033813477,
"kl_loss_2": 1182.2726196289063,
"kl_loss_3": 820.302099609375,
"kl_loss_7": 202.6822937011719,
"learning_rate": 0.0006070577120355903,
"loss": 585.725,
"step": 4370
},
{
"ce_loss_10": 3.6493834018707276,
"ce_loss_13": 3.585710608959198,
"ce_loss_2": 4.1475905418396,
"ce_loss_3": 3.9780289769172668,
"ce_loss_7": 3.6994438648223875,
"epoch": 0.438,
"grad_norm": 464.0,
"kl_loss_10": 99.22572135925293,
"kl_loss_2": 1158.4001525878907,
"kl_loss_3": 817.9062316894531,
"kl_loss_7": 200.7786117553711,
"learning_rate": 0.0006055073099509549,
"loss": 570.4337,
"step": 4380
},
{
"ce_loss_10": 3.7072151064872743,
"ce_loss_13": 3.6444019198417665,
"ce_loss_2": 4.1913762331008915,
"ce_loss_3": 4.024674141407013,
"ce_loss_7": 3.755181634426117,
"epoch": 0.439,
"grad_norm": 414.0,
"kl_loss_10": 101.21295433044433,
"kl_loss_2": 1155.983868408203,
"kl_loss_3": 813.5707092285156,
"kl_loss_7": 201.68513870239258,
"learning_rate": 0.0006039558454088796,
"loss": 578.4039,
"step": 4390
},
{
"ce_loss_10": 3.6866373896598814,
"ce_loss_13": 3.6209323048591613,
"ce_loss_2": 4.190221071243286,
"ce_loss_3": 4.017517447471619,
"ce_loss_7": 3.736443567276001,
"epoch": 0.44,
"grad_norm": 388.0,
"kl_loss_10": 103.66101570129395,
"kl_loss_2": 1179.7899597167968,
"kl_loss_3": 831.9971649169922,
"kl_loss_7": 206.1973434448242,
"learning_rate": 0.0006024033340325954,
"loss": 572.2276,
"step": 4400
},
{
"ce_loss_10": 3.7494076251983643,
"ce_loss_13": 3.6860761404037476,
"ce_loss_2": 4.22088440656662,
"ce_loss_3": 4.061302840709686,
"ce_loss_7": 3.7976527214050293,
"epoch": 0.441,
"grad_norm": 384.0,
"kl_loss_10": 100.95717124938965,
"kl_loss_2": 1117.0268005371095,
"kl_loss_3": 788.523080444336,
"kl_loss_7": 197.15192718505858,
"learning_rate": 0.0006008497914558743,
"loss": 559.696,
"step": 4410
},
{
"ce_loss_10": 3.689165186882019,
"ce_loss_13": 3.6250773549079893,
"ce_loss_2": 4.1833924651145935,
"ce_loss_3": 4.016273534297943,
"ce_loss_7": 3.738771951198578,
"epoch": 0.442,
"grad_norm": 476.0,
"kl_loss_10": 105.19830055236817,
"kl_loss_2": 1174.740167236328,
"kl_loss_3": 830.987890625,
"kl_loss_7": 209.00811996459962,
"learning_rate": 0.0005992952333228728,
"loss": 576.4588,
"step": 4420
},
{
"ce_loss_10": 3.620419418811798,
"ce_loss_13": 3.5588944792747497,
"ce_loss_2": 4.125707459449768,
"ce_loss_3": 3.9479523420333864,
"ce_loss_7": 3.6681005358695984,
"epoch": 0.443,
"grad_norm": 464.0,
"kl_loss_10": 100.17966499328614,
"kl_loss_2": 1181.0232360839843,
"kl_loss_3": 829.0245361328125,
"kl_loss_7": 201.25574188232423,
"learning_rate": 0.0005977396752879741,
"loss": 577.6452,
"step": 4430
},
{
"ce_loss_10": 3.5535963416099547,
"ce_loss_13": 3.4911730885505676,
"ce_loss_2": 4.057285642623901,
"ce_loss_3": 3.882522702217102,
"ce_loss_7": 3.603209447860718,
"epoch": 0.444,
"grad_norm": 450.0,
"kl_loss_10": 96.56860618591308,
"kl_loss_2": 1184.1321594238282,
"kl_loss_3": 827.8955352783203,
"kl_loss_7": 199.06893157958984,
"learning_rate": 0.0005961831330156305,
"loss": 569.2716,
"step": 4440
},
{
"ce_loss_10": 3.697277545928955,
"ce_loss_13": 3.6338467955589295,
"ce_loss_2": 4.1992070317268375,
"ce_loss_3": 4.02395384311676,
"ce_loss_7": 3.747213661670685,
"epoch": 0.445,
"grad_norm": 392.0,
"kl_loss_10": 101.60056228637696,
"kl_loss_2": 1189.420147705078,
"kl_loss_3": 827.8122314453125,
"kl_loss_7": 205.08227157592773,
"learning_rate": 0.0005946256221802051,
"loss": 584.411,
"step": 4450
},
{
"ce_loss_10": 3.679532468318939,
"ce_loss_13": 3.6183473825454713,
"ce_loss_2": 4.146489477157592,
"ce_loss_3": 3.9755648136138917,
"ce_loss_7": 3.7207812786102297,
"epoch": 0.446,
"grad_norm": 494.0,
"kl_loss_10": 101.10317420959473,
"kl_loss_2": 1119.8320098876952,
"kl_loss_3": 779.770297241211,
"kl_loss_7": 198.91878814697264,
"learning_rate": 0.0005930671584658151,
"loss": 578.7685,
"step": 4460
},
{
"ce_loss_10": 3.674864172935486,
"ce_loss_13": 3.6118743062019347,
"ce_loss_2": 4.166282546520233,
"ce_loss_3": 3.9925308227539062,
"ce_loss_7": 3.7198517322540283,
"epoch": 0.447,
"grad_norm": 364.0,
"kl_loss_10": 100.75155410766601,
"kl_loss_2": 1165.5830871582032,
"kl_loss_3": 814.2670196533203,
"kl_loss_7": 201.9087059020996,
"learning_rate": 0.0005915077575661722,
"loss": 579.8401,
"step": 4470
},
{
"ce_loss_10": 3.694182288646698,
"ce_loss_13": 3.628465461730957,
"ce_loss_2": 4.190526556968689,
"ce_loss_3": 4.015213489532471,
"ce_loss_7": 3.7417189121246337,
"epoch": 0.448,
"grad_norm": 520.0,
"kl_loss_10": 105.40261840820312,
"kl_loss_2": 1179.2632690429687,
"kl_loss_3": 825.197119140625,
"kl_loss_7": 209.67544021606446,
"learning_rate": 0.000589947435184427,
"loss": 569.8479,
"step": 4480
},
{
"ce_loss_10": 3.7602591633796694,
"ce_loss_13": 3.6975467801094055,
"ce_loss_2": 4.231885468959808,
"ce_loss_3": 4.062859082221985,
"ce_loss_7": 3.8065670251846315,
"epoch": 0.449,
"grad_norm": 406.0,
"kl_loss_10": 104.7243579864502,
"kl_loss_2": 1147.1027252197266,
"kl_loss_3": 795.4058624267578,
"kl_loss_7": 203.6425910949707,
"learning_rate": 0.0005883862070330078,
"loss": 568.9265,
"step": 4490
},
{
"ce_loss_10": 3.6874640941619874,
"ce_loss_13": 3.6227025985717773,
"ce_loss_2": 4.18091858625412,
"ce_loss_3": 4.004498326778412,
"ce_loss_7": 3.7389190554618836,
"epoch": 0.45,
"grad_norm": 342.0,
"kl_loss_10": 102.03626098632813,
"kl_loss_2": 1166.0193176269531,
"kl_loss_3": 811.4805572509765,
"kl_loss_7": 204.2785285949707,
"learning_rate": 0.0005868240888334653,
"loss": 567.3452,
"step": 4500
},
{
"ce_loss_10": 3.570815551280975,
"ce_loss_13": 3.508398413658142,
"ce_loss_2": 4.096131467819214,
"ce_loss_3": 3.9093389391899107,
"ce_loss_7": 3.625988078117371,
"epoch": 0.451,
"grad_norm": 616.0,
"kl_loss_10": 100.9030990600586,
"kl_loss_2": 1212.356463623047,
"kl_loss_3": 839.7065948486328,
"kl_loss_7": 207.68597564697265,
"learning_rate": 0.0005852610963163119,
"loss": 584.0681,
"step": 4510
},
{
"ce_loss_10": 3.5951132655143736,
"ce_loss_13": 3.5340840578079225,
"ce_loss_2": 4.088473439216614,
"ce_loss_3": 3.9123128294944762,
"ce_loss_7": 3.6418415188789366,
"epoch": 0.452,
"grad_norm": 440.0,
"kl_loss_10": 97.94427604675293,
"kl_loss_2": 1155.4515991210938,
"kl_loss_3": 802.8143249511719,
"kl_loss_7": 198.15041809082032,
"learning_rate": 0.0005836972452208654,
"loss": 560.779,
"step": 4520
},
{
"ce_loss_10": 3.6001816511154177,
"ce_loss_13": 3.540806245803833,
"ce_loss_2": 4.105304884910583,
"ce_loss_3": 3.9283313751220703,
"ce_loss_7": 3.6497029066085815,
"epoch": 0.453,
"grad_norm": 470.0,
"kl_loss_10": 99.28575630187989,
"kl_loss_2": 1176.1295288085937,
"kl_loss_3": 817.2998046875,
"kl_loss_7": 202.73690338134764,
"learning_rate": 0.0005821325512950885,
"loss": 572.314,
"step": 4530
},
{
"ce_loss_10": 3.629274320602417,
"ce_loss_13": 3.5687419891357424,
"ce_loss_2": 4.1162322640419005,
"ce_loss_3": 3.9458845138549803,
"ce_loss_7": 3.680540406703949,
"epoch": 0.454,
"grad_norm": 368.0,
"kl_loss_10": 96.52360496520996,
"kl_loss_2": 1136.2307861328125,
"kl_loss_3": 790.6944702148437,
"kl_loss_7": 197.31127700805663,
"learning_rate": 0.0005805670302954321,
"loss": 568.0196,
"step": 4540
},
{
"ce_loss_10": 3.6337098717689513,
"ce_loss_13": 3.5753876209259032,
"ce_loss_2": 4.115709042549133,
"ce_loss_3": 3.9439353704452516,
"ce_loss_7": 3.6809528470039368,
"epoch": 0.455,
"grad_norm": 434.0,
"kl_loss_10": 95.89570465087891,
"kl_loss_2": 1140.969873046875,
"kl_loss_3": 792.410400390625,
"kl_loss_7": 194.6849395751953,
"learning_rate": 0.000579000697986675,
"loss": 559.3398,
"step": 4550
},
{
"ce_loss_10": 3.5949880719184875,
"ce_loss_13": 3.5312354803085326,
"ce_loss_2": 4.110612523555756,
"ce_loss_3": 3.9363887429237367,
"ce_loss_7": 3.6481791853904726,
"epoch": 0.456,
"grad_norm": 398.0,
"kl_loss_10": 102.14065132141113,
"kl_loss_2": 1200.508935546875,
"kl_loss_3": 844.4349182128906,
"kl_loss_7": 207.93037872314454,
"learning_rate": 0.0005774335701417662,
"loss": 577.7247,
"step": 4560
},
{
"ce_loss_10": 3.578439974784851,
"ce_loss_13": 3.5177830338478087,
"ce_loss_2": 4.086728799343109,
"ce_loss_3": 3.9092958092689516,
"ce_loss_7": 3.628882908821106,
"epoch": 0.457,
"grad_norm": 438.0,
"kl_loss_10": 98.15573539733887,
"kl_loss_2": 1190.6679321289062,
"kl_loss_3": 827.183969116211,
"kl_loss_7": 201.49042510986328,
"learning_rate": 0.0005758656625416658,
"loss": 579.3393,
"step": 4570
},
{
"ce_loss_10": 3.6351425409317017,
"ce_loss_13": 3.5740421295166014,
"ce_loss_2": 4.13430563211441,
"ce_loss_3": 3.9581828236579897,
"ce_loss_7": 3.685711920261383,
"epoch": 0.458,
"grad_norm": 378.0,
"kl_loss_10": 98.59328498840333,
"kl_loss_2": 1165.538037109375,
"kl_loss_3": 813.1740905761719,
"kl_loss_7": 200.91252059936522,
"learning_rate": 0.0005742969909751859,
"loss": 562.4629,
"step": 4580
},
{
"ce_loss_10": 3.6438634276390074,
"ce_loss_13": 3.5822227597236633,
"ce_loss_2": 4.139957237243652,
"ce_loss_3": 3.96221022605896,
"ce_loss_7": 3.692858374118805,
"epoch": 0.459,
"grad_norm": 396.0,
"kl_loss_10": 100.12554626464843,
"kl_loss_2": 1167.3160705566406,
"kl_loss_3": 805.8544036865235,
"kl_loss_7": 201.26202087402345,
"learning_rate": 0.0005727275712388318,
"loss": 570.0833,
"step": 4590
},
{
"ce_loss_10": 3.681215536594391,
"ce_loss_13": 3.620731198787689,
"ce_loss_2": 4.155962944030762,
"ce_loss_3": 3.984270441532135,
"ce_loss_7": 3.7283701658248902,
"epoch": 0.46,
"grad_norm": 568.0,
"kl_loss_10": 98.76027946472168,
"kl_loss_2": 1132.1197998046875,
"kl_loss_3": 792.0047241210938,
"kl_loss_7": 197.17216033935546,
"learning_rate": 0.0005711574191366427,
"loss": 562.7997,
"step": 4600
},
{
"ce_loss_10": 3.6236431002616882,
"ce_loss_13": 3.565703308582306,
"ce_loss_2": 4.114531934261322,
"ce_loss_3": 3.93969669342041,
"ce_loss_7": 3.671102833747864,
"epoch": 0.461,
"grad_norm": 372.0,
"kl_loss_10": 98.42190704345703,
"kl_loss_2": 1170.4917938232422,
"kl_loss_3": 808.7791198730469,
"kl_loss_7": 199.0694892883301,
"learning_rate": 0.0005695865504800327,
"loss": 564.0159,
"step": 4610
},
{
"ce_loss_10": 3.562722647190094,
"ce_loss_13": 3.500598740577698,
"ce_loss_2": 4.109580218791962,
"ce_loss_3": 3.9190361380577086,
"ce_loss_7": 3.6191172361373902,
"epoch": 0.462,
"grad_norm": 480.0,
"kl_loss_10": 100.51305274963379,
"kl_loss_2": 1233.0393005371093,
"kl_loss_3": 860.259619140625,
"kl_loss_7": 208.89999542236328,
"learning_rate": 0.0005680149810876322,
"loss": 581.488,
"step": 4620
},
{
"ce_loss_10": 3.6198580145835875,
"ce_loss_13": 3.5573437213897705,
"ce_loss_2": 4.117598211765289,
"ce_loss_3": 3.94056499004364,
"ce_loss_7": 3.667776870727539,
"epoch": 0.463,
"grad_norm": 560.0,
"kl_loss_10": 99.44257354736328,
"kl_loss_2": 1160.7040802001952,
"kl_loss_3": 809.362094116211,
"kl_loss_7": 201.12859268188475,
"learning_rate": 0.0005664427267851271,
"loss": 565.3629,
"step": 4630
},
{
"ce_loss_10": 3.534971606731415,
"ce_loss_13": 3.47266343832016,
"ce_loss_2": 4.036073172092438,
"ce_loss_3": 3.857685387134552,
"ce_loss_7": 3.5870521306991576,
"epoch": 0.464,
"grad_norm": 498.0,
"kl_loss_10": 97.52345237731933,
"kl_loss_2": 1167.1843322753907,
"kl_loss_3": 810.5214752197265,
"kl_loss_7": 199.60354309082032,
"learning_rate": 0.0005648698034051009,
"loss": 562.6416,
"step": 4640
},
{
"ce_loss_10": 3.6570662021636964,
"ce_loss_13": 3.594506525993347,
"ce_loss_2": 4.158554673194885,
"ce_loss_3": 3.980504941940308,
"ce_loss_7": 3.7062572717666624,
"epoch": 0.465,
"grad_norm": 412.0,
"kl_loss_10": 99.88166885375976,
"kl_loss_2": 1173.9357055664063,
"kl_loss_3": 818.5712066650391,
"kl_loss_7": 200.30800857543946,
"learning_rate": 0.0005632962267868747,
"loss": 561.8186,
"step": 4650
},
{
"ce_loss_10": 3.5903021335601806,
"ce_loss_13": 3.5294329643249513,
"ce_loss_2": 4.08318532705307,
"ce_loss_3": 3.9098427176475523,
"ce_loss_7": 3.6388569593429567,
"epoch": 0.466,
"grad_norm": 464.0,
"kl_loss_10": 95.17009468078614,
"kl_loss_2": 1143.232162475586,
"kl_loss_3": 798.761831665039,
"kl_loss_7": 195.75977783203126,
"learning_rate": 0.0005617220127763474,
"loss": 567.0608,
"step": 4660
},
{
"ce_loss_10": 3.669221520423889,
"ce_loss_13": 3.607930314540863,
"ce_loss_2": 4.160642421245575,
"ce_loss_3": 3.9847203373908995,
"ce_loss_7": 3.717066395282745,
"epoch": 0.467,
"grad_norm": 412.0,
"kl_loss_10": 98.76815719604492,
"kl_loss_2": 1153.8832275390625,
"kl_loss_3": 803.9543914794922,
"kl_loss_7": 198.99397354125978,
"learning_rate": 0.0005601471772258368,
"loss": 567.3518,
"step": 4670
},
{
"ce_loss_10": 3.6542641162872314,
"ce_loss_13": 3.593363094329834,
"ce_loss_2": 4.133442676067352,
"ce_loss_3": 3.96450389623642,
"ce_loss_7": 3.7022117972373962,
"epoch": 0.468,
"grad_norm": 384.0,
"kl_loss_10": 98.04742546081543,
"kl_loss_2": 1118.5282470703125,
"kl_loss_3": 784.399691772461,
"kl_loss_7": 197.338858795166,
"learning_rate": 0.0005585717359939192,
"loss": 565.1176,
"step": 4680
},
{
"ce_loss_10": 3.56116144657135,
"ce_loss_13": 3.4993683457374574,
"ce_loss_2": 4.055442547798156,
"ce_loss_3": 3.887247931957245,
"ce_loss_7": 3.6099945425987245,
"epoch": 0.469,
"grad_norm": 490.0,
"kl_loss_10": 97.45741577148438,
"kl_loss_2": 1149.7481964111328,
"kl_loss_3": 806.3391754150391,
"kl_loss_7": 197.63161849975586,
"learning_rate": 0.0005569957049452703,
"loss": 571.714,
"step": 4690
},
{
"ce_loss_10": 3.6181132555007935,
"ce_loss_13": 3.558199667930603,
"ce_loss_2": 4.1229860305786135,
"ce_loss_3": 3.9408149838447573,
"ce_loss_7": 3.668530523777008,
"epoch": 0.47,
"grad_norm": 458.0,
"kl_loss_10": 98.11741218566894,
"kl_loss_2": 1179.65732421875,
"kl_loss_3": 819.0914672851562,
"kl_loss_7": 202.21502075195312,
"learning_rate": 0.0005554190999505056,
"loss": 572.5331,
"step": 4700
},
{
"ce_loss_10": 3.7477443337440492,
"ce_loss_13": 3.6823888421058655,
"ce_loss_2": 4.236353850364685,
"ce_loss_3": 4.064246296882629,
"ce_loss_7": 3.7983964323997497,
"epoch": 0.471,
"grad_norm": 376.0,
"kl_loss_10": 101.09743614196778,
"kl_loss_2": 1167.4985229492188,
"kl_loss_3": 813.3948120117187,
"kl_loss_7": 205.17110900878907,
"learning_rate": 0.0005538419368860196,
"loss": 552.1318,
"step": 4710
},
{
"ce_loss_10": 3.670793604850769,
"ce_loss_13": 3.6081652998924256,
"ce_loss_2": 4.154720652103424,
"ce_loss_3": 3.986761474609375,
"ce_loss_7": 3.7201395988464356,
"epoch": 0.472,
"grad_norm": 416.0,
"kl_loss_10": 100.02058029174805,
"kl_loss_2": 1152.6582946777344,
"kl_loss_3": 806.7274566650391,
"kl_loss_7": 202.40063400268554,
"learning_rate": 0.0005522642316338268,
"loss": 576.1212,
"step": 4720
},
{
"ce_loss_10": 3.673479509353638,
"ce_loss_13": 3.613760471343994,
"ce_loss_2": 4.150910186767578,
"ce_loss_3": 3.981798696517944,
"ce_loss_7": 3.721827840805054,
"epoch": 0.473,
"grad_norm": 478.0,
"kl_loss_10": 99.9439712524414,
"kl_loss_2": 1142.4451599121094,
"kl_loss_3": 795.6325531005859,
"kl_loss_7": 199.72487106323243,
"learning_rate": 0.0005506860000814017,
"loss": 573.0671,
"step": 4730
},
{
"ce_loss_10": 3.700618231296539,
"ce_loss_13": 3.638905906677246,
"ce_loss_2": 4.180734276771545,
"ce_loss_3": 4.006302297115326,
"ce_loss_7": 3.7447570085525514,
"epoch": 0.474,
"grad_norm": 372.0,
"kl_loss_10": 99.73388938903808,
"kl_loss_2": 1127.7213500976563,
"kl_loss_3": 793.5628936767578,
"kl_loss_7": 197.02488555908204,
"learning_rate": 0.0005491072581215186,
"loss": 565.0697,
"step": 4740
},
{
"ce_loss_10": 3.706625771522522,
"ce_loss_13": 3.6401172399520876,
"ce_loss_2": 4.184090709686279,
"ce_loss_3": 4.019766807556152,
"ce_loss_7": 3.754279363155365,
"epoch": 0.475,
"grad_norm": 516.0,
"kl_loss_10": 103.58124504089355,
"kl_loss_2": 1159.682275390625,
"kl_loss_3": 813.5887573242187,
"kl_loss_7": 204.05538330078124,
"learning_rate": 0.0005475280216520913,
"loss": 556.0086,
"step": 4750
},
{
"ce_loss_10": 3.617805337905884,
"ce_loss_13": 3.5573843002319334,
"ce_loss_2": 4.093091154098511,
"ce_loss_3": 3.926499140262604,
"ce_loss_7": 3.664002466201782,
"epoch": 0.476,
"grad_norm": 438.0,
"kl_loss_10": 97.125687789917,
"kl_loss_2": 1118.9559478759766,
"kl_loss_3": 784.6352722167969,
"kl_loss_7": 196.01404037475587,
"learning_rate": 0.0005459483065760138,
"loss": 565.9596,
"step": 4760
},
{
"ce_loss_10": 3.552186381816864,
"ce_loss_13": 3.4902740478515626,
"ce_loss_2": 4.07539484500885,
"ce_loss_3": 3.891750192642212,
"ce_loss_7": 3.601547920703888,
"epoch": 0.477,
"grad_norm": 584.0,
"kl_loss_10": 97.89878273010254,
"kl_loss_2": 1199.7971740722655,
"kl_loss_3": 836.253662109375,
"kl_loss_7": 197.98745880126953,
"learning_rate": 0.0005443681288009991,
"loss": 568.1693,
"step": 4770
},
{
"ce_loss_10": 3.6120885968208314,
"ce_loss_13": 3.5525715351104736,
"ce_loss_2": 4.106596338748932,
"ce_loss_3": 3.932267451286316,
"ce_loss_7": 3.6594039678573607,
"epoch": 0.478,
"grad_norm": 430.0,
"kl_loss_10": 98.81552238464356,
"kl_loss_2": 1169.4871887207032,
"kl_loss_3": 816.0136047363281,
"kl_loss_7": 198.91362609863282,
"learning_rate": 0.0005427875042394199,
"loss": 570.9199,
"step": 4780
},
{
"ce_loss_10": 3.6413972973823547,
"ce_loss_13": 3.5771793842315676,
"ce_loss_2": 4.133682417869568,
"ce_loss_3": 3.9594278573989867,
"ce_loss_7": 3.6885754466056824,
"epoch": 0.479,
"grad_norm": 396.0,
"kl_loss_10": 102.98994331359863,
"kl_loss_2": 1166.8763580322266,
"kl_loss_3": 812.8268646240234,
"kl_loss_7": 201.2046257019043,
"learning_rate": 0.0005412064488081482,
"loss": 576.3787,
"step": 4790
},
{
"ce_loss_10": 3.6483134269714355,
"ce_loss_13": 3.5873068809509276,
"ce_loss_2": 4.13967661857605,
"ce_loss_3": 3.9646928787231444,
"ce_loss_7": 3.697467315196991,
"epoch": 0.48,
"grad_norm": 370.0,
"kl_loss_10": 99.1940761566162,
"kl_loss_2": 1147.4876434326172,
"kl_loss_3": 791.1785400390625,
"kl_loss_7": 197.28219909667968,
"learning_rate": 0.0005396249784283942,
"loss": 558.8872,
"step": 4800
},
{
"ce_loss_10": 3.675038015842438,
"ce_loss_13": 3.605392372608185,
"ce_loss_2": 4.173114275932312,
"ce_loss_3": 3.99793621301651,
"ce_loss_7": 3.719290328025818,
"epoch": 0.481,
"grad_norm": 424.0,
"kl_loss_10": 109.17574653625488,
"kl_loss_2": 1186.1647857666017,
"kl_loss_3": 827.1956359863282,
"kl_loss_7": 205.77321548461913,
"learning_rate": 0.0005380431090255476,
"loss": 574.2385,
"step": 4810
},
{
"ce_loss_10": 3.6580063104629517,
"ce_loss_13": 3.600323748588562,
"ce_loss_2": 4.138371276855469,
"ce_loss_3": 3.968156564235687,
"ce_loss_7": 3.705365836620331,
"epoch": 0.482,
"grad_norm": 368.0,
"kl_loss_10": 96.444384765625,
"kl_loss_2": 1126.6424652099608,
"kl_loss_3": 782.5597137451172,
"kl_loss_7": 192.58789978027343,
"learning_rate": 0.0005364608565290155,
"loss": 556.892,
"step": 4820
},
{
"ce_loss_10": 3.66942412853241,
"ce_loss_13": 3.6059840083122254,
"ce_loss_2": 4.159801661968231,
"ce_loss_3": 3.985584008693695,
"ce_loss_7": 3.7178696751594544,
"epoch": 0.483,
"grad_norm": 528.0,
"kl_loss_10": 101.1554500579834,
"kl_loss_2": 1154.6878021240234,
"kl_loss_3": 803.7564819335937,
"kl_loss_7": 199.8929084777832,
"learning_rate": 0.0005348782368720626,
"loss": 563.005,
"step": 4830
},
{
"ce_loss_10": 3.596053886413574,
"ce_loss_13": 3.5365728974342345,
"ce_loss_2": 4.080682539939881,
"ce_loss_3": 3.9060325980186463,
"ce_loss_7": 3.6432487964630127,
"epoch": 0.484,
"grad_norm": 520.0,
"kl_loss_10": 96.21514892578125,
"kl_loss_2": 1134.5447143554688,
"kl_loss_3": 787.3878204345704,
"kl_loss_7": 194.21477661132812,
"learning_rate": 0.000533295265991652,
"loss": 564.2112,
"step": 4840
},
{
"ce_loss_10": 3.6783321022987367,
"ce_loss_13": 3.6159629583358766,
"ce_loss_2": 4.154437899589539,
"ce_loss_3": 3.9877618312835694,
"ce_loss_7": 3.727357840538025,
"epoch": 0.485,
"grad_norm": 434.0,
"kl_loss_10": 97.2699405670166,
"kl_loss_2": 1128.611801147461,
"kl_loss_3": 786.6338958740234,
"kl_loss_7": 195.64030685424805,
"learning_rate": 0.0005317119598282822,
"loss": 554.8634,
"step": 4850
},
{
"ce_loss_10": 3.6783334612846375,
"ce_loss_13": 3.6158772826194765,
"ce_loss_2": 4.161105620861053,
"ce_loss_3": 3.9936763644218445,
"ce_loss_7": 3.726669430732727,
"epoch": 0.486,
"grad_norm": 500.0,
"kl_loss_10": 99.51188240051269,
"kl_loss_2": 1139.204409790039,
"kl_loss_3": 796.6284942626953,
"kl_loss_7": 197.98922119140624,
"learning_rate": 0.0005301283343258293,
"loss": 559.5733,
"step": 4860
},
{
"ce_loss_10": 3.739852726459503,
"ce_loss_13": 3.679302477836609,
"ce_loss_2": 4.207214975357056,
"ce_loss_3": 4.046137988567352,
"ce_loss_7": 3.7877432465553285,
"epoch": 0.487,
"grad_norm": 434.0,
"kl_loss_10": 98.4985725402832,
"kl_loss_2": 1115.5814056396484,
"kl_loss_3": 781.7784240722656,
"kl_loss_7": 195.47981796264648,
"learning_rate": 0.000528544405431384,
"loss": 548.517,
"step": 4870
},
{
"ce_loss_10": 3.617240381240845,
"ce_loss_13": 3.555454957485199,
"ce_loss_2": 4.122074174880981,
"ce_loss_3": 3.944024980068207,
"ce_loss_7": 3.668628621101379,
"epoch": 0.488,
"grad_norm": 432.0,
"kl_loss_10": 98.9582015991211,
"kl_loss_2": 1175.8768676757813,
"kl_loss_3": 814.3092010498046,
"kl_loss_7": 202.09591979980468,
"learning_rate": 0.000526960189095093,
"loss": 569.4682,
"step": 4880
},
{
"ce_loss_10": 3.5905461430549623,
"ce_loss_13": 3.5317755937576294,
"ce_loss_2": 4.075044083595276,
"ce_loss_3": 3.9047257542610168,
"ce_loss_7": 3.637452006340027,
"epoch": 0.489,
"grad_norm": 406.0,
"kl_loss_10": 95.30788230895996,
"kl_loss_2": 1125.9373596191406,
"kl_loss_3": 783.2284423828125,
"kl_loss_7": 192.63981170654296,
"learning_rate": 0.0005253757012699972,
"loss": 553.6164,
"step": 4890
},
{
"ce_loss_10": 3.680708420276642,
"ce_loss_13": 3.621255648136139,
"ce_loss_2": 4.161669278144837,
"ce_loss_3": 3.9898939728736877,
"ce_loss_7": 3.726882266998291,
"epoch": 0.49,
"grad_norm": 436.0,
"kl_loss_10": 98.59705772399903,
"kl_loss_2": 1136.0980651855468,
"kl_loss_3": 790.2571563720703,
"kl_loss_7": 197.4443115234375,
"learning_rate": 0.0005237909579118712,
"loss": 568.0026,
"step": 4900
},
{
"ce_loss_10": 3.6435038447380066,
"ce_loss_13": 3.581137490272522,
"ce_loss_2": 4.134112453460693,
"ce_loss_3": 3.9640262126922607,
"ce_loss_7": 3.6911675453186037,
"epoch": 0.491,
"grad_norm": 520.0,
"kl_loss_10": 99.66703796386719,
"kl_loss_2": 1167.6467651367188,
"kl_loss_3": 818.6171966552735,
"kl_loss_7": 200.65354614257814,
"learning_rate": 0.0005222059749790631,
"loss": 568.3183,
"step": 4910
},
{
"ce_loss_10": 3.7152050852775576,
"ce_loss_13": 3.652708613872528,
"ce_loss_2": 4.176082861423493,
"ce_loss_3": 4.013521981239319,
"ce_loss_7": 3.759286069869995,
"epoch": 0.492,
"grad_norm": 394.0,
"kl_loss_10": 100.0508934020996,
"kl_loss_2": 1112.6296081542969,
"kl_loss_3": 774.64658203125,
"kl_loss_7": 196.3288688659668,
"learning_rate": 0.0005206207684323337,
"loss": 544.9011,
"step": 4920
},
{
"ce_loss_10": 3.689722108840942,
"ce_loss_13": 3.6289564847946165,
"ce_loss_2": 4.170908105373383,
"ce_loss_3": 3.9987404584884643,
"ce_loss_7": 3.7391751527786257,
"epoch": 0.493,
"grad_norm": 368.0,
"kl_loss_10": 100.77400093078613,
"kl_loss_2": 1140.2743774414062,
"kl_loss_3": 795.368798828125,
"kl_loss_7": 200.2589553833008,
"learning_rate": 0.000519035354234695,
"loss": 567.6383,
"step": 4930
},
{
"ce_loss_10": 3.666009783744812,
"ce_loss_13": 3.603765845298767,
"ce_loss_2": 4.156714332103729,
"ce_loss_3": 3.9840614438056945,
"ce_loss_7": 3.7159415602684023,
"epoch": 0.494,
"grad_norm": 516.0,
"kl_loss_10": 99.73322830200195,
"kl_loss_2": 1144.6152709960938,
"kl_loss_3": 797.5058837890625,
"kl_loss_7": 199.84856643676758,
"learning_rate": 0.0005174497483512506,
"loss": 551.5833,
"step": 4940
},
{
"ce_loss_10": 3.715251398086548,
"ce_loss_13": 3.6532492995262147,
"ce_loss_2": 4.190750586986542,
"ce_loss_3": 4.017711067199707,
"ce_loss_7": 3.760482394695282,
"epoch": 0.495,
"grad_norm": 404.0,
"kl_loss_10": 99.74794273376465,
"kl_loss_2": 1135.6743072509767,
"kl_loss_3": 788.5320007324219,
"kl_loss_7": 197.0201416015625,
"learning_rate": 0.0005158639667490339,
"loss": 559.5508,
"step": 4950
},
{
"ce_loss_10": 3.60677056312561,
"ce_loss_13": 3.545226526260376,
"ce_loss_2": 4.091673123836517,
"ce_loss_3": 3.921009349822998,
"ce_loss_7": 3.6560636878013613,
"epoch": 0.496,
"grad_norm": 380.0,
"kl_loss_10": 97.61143035888672,
"kl_loss_2": 1146.4500457763672,
"kl_loss_3": 801.032958984375,
"kl_loss_7": 198.76946029663085,
"learning_rate": 0.0005142780253968481,
"loss": 559.3498,
"step": 4960
},
{
"ce_loss_10": 3.558833396434784,
"ce_loss_13": 3.498770594596863,
"ce_loss_2": 4.029934275150299,
"ce_loss_3": 3.8623911499977113,
"ce_loss_7": 3.605703389644623,
"epoch": 0.497,
"grad_norm": 404.0,
"kl_loss_10": 95.15658073425293,
"kl_loss_2": 1120.565899658203,
"kl_loss_3": 776.6140930175782,
"kl_loss_7": 192.40693054199218,
"learning_rate": 0.0005126919402651053,
"loss": 541.1446,
"step": 4970
},
{
"ce_loss_10": 3.6243564009666445,
"ce_loss_13": 3.562463808059692,
"ce_loss_2": 4.122486090660095,
"ce_loss_3": 3.9518114924430847,
"ce_loss_7": 3.6740434527397157,
"epoch": 0.498,
"grad_norm": 500.0,
"kl_loss_10": 98.81732482910157,
"kl_loss_2": 1158.3788116455078,
"kl_loss_3": 805.2687072753906,
"kl_loss_7": 198.79998626708985,
"learning_rate": 0.0005111057273256647,
"loss": 562.34,
"step": 4980
},
{
"ce_loss_10": 3.736222839355469,
"ce_loss_13": 3.676733374595642,
"ce_loss_2": 4.189973556995392,
"ce_loss_3": 4.022429513931274,
"ce_loss_7": 3.7769731283187866,
"epoch": 0.499,
"grad_norm": 396.0,
"kl_loss_10": 98.13356437683106,
"kl_loss_2": 1078.4886474609375,
"kl_loss_3": 748.3916412353516,
"kl_loss_7": 191.23028793334962,
"learning_rate": 0.0005095194025516733,
"loss": 536.8887,
"step": 4990
},
{
"ce_loss_10": 3.6507428646087647,
"ce_loss_13": 3.592644715309143,
"ce_loss_2": 4.122073376178742,
"ce_loss_3": 3.9521225333213805,
"ce_loss_7": 3.697298324108124,
"epoch": 0.5,
"grad_norm": 378.0,
"kl_loss_10": 95.96725730895996,
"kl_loss_2": 1110.4840362548828,
"kl_loss_3": 769.709603881836,
"kl_loss_7": 192.08199310302734,
"learning_rate": 0.000507932981917404,
"loss": 562.5593,
"step": 5000
},
{
"ce_loss_10": 3.609897780418396,
"ce_loss_13": 3.5468419432640075,
"ce_loss_2": 4.115197873115539,
"ce_loss_3": 3.9347579956054686,
"ce_loss_7": 3.6594788432121277,
"epoch": 0.501,
"grad_norm": 496.0,
"kl_loss_10": 102.02307662963867,
"kl_loss_2": 1185.6702362060546,
"kl_loss_3": 822.8478820800781,
"kl_loss_7": 202.77078170776366,
"learning_rate": 0.0005063464813980949,
"loss": 576.005,
"step": 5010
},
{
"ce_loss_10": 3.595167326927185,
"ce_loss_13": 3.534419858455658,
"ce_loss_2": 4.08291003704071,
"ce_loss_3": 3.910551607608795,
"ce_loss_7": 3.6416044354438784,
"epoch": 0.502,
"grad_norm": 366.0,
"kl_loss_10": 98.82206382751465,
"kl_loss_2": 1157.9163513183594,
"kl_loss_3": 802.2986022949219,
"kl_loss_7": 196.4967498779297,
"learning_rate": 0.0005047599169697884,
"loss": 557.0335,
"step": 5020
},
{
"ce_loss_10": 3.5276883602142335,
"ce_loss_13": 3.469167137145996,
"ce_loss_2": 4.028472435474396,
"ce_loss_3": 3.8497302412986754,
"ce_loss_7": 3.5778237104415895,
"epoch": 0.503,
"grad_norm": 544.0,
"kl_loss_10": 95.17037048339844,
"kl_loss_2": 1142.5230926513673,
"kl_loss_3": 789.8021270751954,
"kl_loss_7": 195.37155456542968,
"learning_rate": 0.000503173304609171,
"loss": 545.4258,
"step": 5030
},
{
"ce_loss_10": 3.6576398611068726,
"ce_loss_13": 3.5950983643531798,
"ce_loss_2": 4.14467431306839,
"ce_loss_3": 3.9757012486457826,
"ce_loss_7": 3.7055052399635313,
"epoch": 0.504,
"grad_norm": 482.0,
"kl_loss_10": 98.4008186340332,
"kl_loss_2": 1135.7276794433594,
"kl_loss_3": 789.9247985839844,
"kl_loss_7": 196.23304824829103,
"learning_rate": 0.0005015866602934111,
"loss": 552.1605,
"step": 5040
},
{
"ce_loss_10": 3.621449387073517,
"ce_loss_13": 3.5583016514778136,
"ce_loss_2": 4.125820016860962,
"ce_loss_3": 3.9470208525657653,
"ce_loss_7": 3.6696552276611327,
"epoch": 0.505,
"grad_norm": 386.0,
"kl_loss_10": 101.05188751220703,
"kl_loss_2": 1170.8730712890624,
"kl_loss_3": 822.174462890625,
"kl_loss_7": 203.6134246826172,
"learning_rate": 0.0005,
"loss": 564.1666,
"step": 5050
},
{
"ce_loss_10": 3.608661472797394,
"ce_loss_13": 3.549720525741577,
"ce_loss_2": 4.094336903095245,
"ce_loss_3": 3.921399199962616,
"ce_loss_7": 3.6561817049980165,
"epoch": 0.506,
"grad_norm": 532.0,
"kl_loss_10": 97.96763725280762,
"kl_loss_2": 1147.6109741210937,
"kl_loss_3": 799.4344543457031,
"kl_loss_7": 197.70511016845703,
"learning_rate": 0.0004984133397065889,
"loss": 551.9219,
"step": 5060
},
{
"ce_loss_10": 3.619631803035736,
"ce_loss_13": 3.5591482758522033,
"ce_loss_2": 4.1191855549812315,
"ce_loss_3": 3.947730815410614,
"ce_loss_7": 3.671154284477234,
"epoch": 0.507,
"grad_norm": 420.0,
"kl_loss_10": 98.14169616699219,
"kl_loss_2": 1152.0039337158203,
"kl_loss_3": 803.1968292236328,
"kl_loss_7": 198.87692565917968,
"learning_rate": 0.0004968266953908291,
"loss": 554.0305,
"step": 5070
},
{
"ce_loss_10": 3.6628435134887694,
"ce_loss_13": 3.6024859309196473,
"ce_loss_2": 4.145783054828644,
"ce_loss_3": 3.972540259361267,
"ce_loss_7": 3.7080691695213317,
"epoch": 0.508,
"grad_norm": 532.0,
"kl_loss_10": 98.82306175231933,
"kl_loss_2": 1137.6268676757813,
"kl_loss_3": 795.5397338867188,
"kl_loss_7": 194.52870864868163,
"learning_rate": 0.0004952400830302117,
"loss": 554.9051,
"step": 5080
},
{
"ce_loss_10": 3.585409712791443,
"ce_loss_13": 3.525643265247345,
"ce_loss_2": 4.091677510738373,
"ce_loss_3": 3.9131953358650207,
"ce_loss_7": 3.6364392280578612,
"epoch": 0.509,
"grad_norm": 412.0,
"kl_loss_10": 98.62568626403808,
"kl_loss_2": 1168.942919921875,
"kl_loss_3": 811.3192687988281,
"kl_loss_7": 199.42913665771485,
"learning_rate": 0.0004936535186019053,
"loss": 559.6511,
"step": 5090
},
{
"ce_loss_10": 3.6907896161079408,
"ce_loss_13": 3.62961208820343,
"ce_loss_2": 4.153078198432922,
"ce_loss_3": 3.9874324560165406,
"ce_loss_7": 3.735322892665863,
"epoch": 0.51,
"grad_norm": 376.0,
"kl_loss_10": 97.42878112792968,
"kl_loss_2": 1101.246890258789,
"kl_loss_3": 771.5801239013672,
"kl_loss_7": 192.14101791381836,
"learning_rate": 0.000492067018082596,
"loss": 549.3435,
"step": 5100
},
{
"ce_loss_10": 3.6234113693237306,
"ce_loss_13": 3.55826051235199,
"ce_loss_2": 4.134788942337036,
"ce_loss_3": 3.9512638211250306,
"ce_loss_7": 3.673302376270294,
"epoch": 0.511,
"grad_norm": 358.0,
"kl_loss_10": 100.71795692443848,
"kl_loss_2": 1184.7957580566406,
"kl_loss_3": 822.3129302978516,
"kl_loss_7": 201.37216567993164,
"learning_rate": 0.0004904805974483267,
"loss": 578.112,
"step": 5110
},
{
"ce_loss_10": 3.73909273147583,
"ce_loss_13": 3.6729060292243956,
"ce_loss_2": 4.232535266876221,
"ce_loss_3": 4.064092624187469,
"ce_loss_7": 3.78980005979538,
"epoch": 0.512,
"grad_norm": 418.0,
"kl_loss_10": 103.6674789428711,
"kl_loss_2": 1170.0532684326172,
"kl_loss_3": 824.2820373535156,
"kl_loss_7": 206.52156982421874,
"learning_rate": 0.0004888942726743353,
"loss": 580.3403,
"step": 5120
},
{
"ce_loss_10": 3.6079283952713013,
"ce_loss_13": 3.5456172823905945,
"ce_loss_2": 4.103336191177368,
"ce_loss_3": 3.9267752170562744,
"ce_loss_7": 3.655103015899658,
"epoch": 0.513,
"grad_norm": 378.0,
"kl_loss_10": 97.65564994812011,
"kl_loss_2": 1156.2654846191406,
"kl_loss_3": 800.4834381103516,
"kl_loss_7": 198.76654281616212,
"learning_rate": 0.0004873080597348947,
"loss": 561.8108,
"step": 5130
},
{
"ce_loss_10": 3.492985022068024,
"ce_loss_13": 3.433611583709717,
"ce_loss_2": 4.009678089618683,
"ce_loss_3": 3.82467257976532,
"ce_loss_7": 3.543225371837616,
"epoch": 0.514,
"grad_norm": 440.0,
"kl_loss_10": 96.83905181884765,
"kl_loss_2": 1194.322329711914,
"kl_loss_3": 828.9491943359375,
"kl_loss_7": 198.22924575805663,
"learning_rate": 0.0004857219746031519,
"loss": 567.8251,
"step": 5140
},
{
"ce_loss_10": 3.6722797036170958,
"ce_loss_13": 3.6109776854515077,
"ce_loss_2": 4.149738478660583,
"ce_loss_3": 3.975986909866333,
"ce_loss_7": 3.7163102626800537,
"epoch": 0.515,
"grad_norm": 430.0,
"kl_loss_10": 99.9472442626953,
"kl_loss_2": 1140.7201843261719,
"kl_loss_3": 787.3806091308594,
"kl_loss_7": 197.54812469482422,
"learning_rate": 0.0004841360332509663,
"loss": 556.8349,
"step": 5150
},
{
"ce_loss_10": 3.6183668613433837,
"ce_loss_13": 3.5591975688934325,
"ce_loss_2": 4.100240254402161,
"ce_loss_3": 3.9269237518310547,
"ce_loss_7": 3.6642425417900086,
"epoch": 0.516,
"grad_norm": 366.0,
"kl_loss_10": 93.92010688781738,
"kl_loss_2": 1122.7465362548828,
"kl_loss_3": 778.0984069824219,
"kl_loss_7": 191.03939056396484,
"learning_rate": 0.0004825502516487497,
"loss": 537.9487,
"step": 5160
},
{
"ce_loss_10": 3.5835310339927675,
"ce_loss_13": 3.523791456222534,
"ce_loss_2": 4.082003366947174,
"ce_loss_3": 3.908873450756073,
"ce_loss_7": 3.634874391555786,
"epoch": 0.517,
"grad_norm": 608.0,
"kl_loss_10": 99.05728721618652,
"kl_loss_2": 1155.0127502441405,
"kl_loss_3": 805.5277587890625,
"kl_loss_7": 198.6641098022461,
"learning_rate": 0.00048096464576530507,
"loss": 561.8511,
"step": 5170
},
{
"ce_loss_10": 3.6886157989501953,
"ce_loss_13": 3.628003740310669,
"ce_loss_2": 4.146280741691589,
"ce_loss_3": 3.9846285343170167,
"ce_loss_7": 3.731534945964813,
"epoch": 0.518,
"grad_norm": 390.0,
"kl_loss_10": 98.92878913879395,
"kl_loss_2": 1103.851336669922,
"kl_loss_3": 767.5214813232421,
"kl_loss_7": 193.13973236083984,
"learning_rate": 0.00047937923156766646,
"loss": 544.8563,
"step": 5180
},
{
"ce_loss_10": 3.737223446369171,
"ce_loss_13": 3.6758363366127016,
"ce_loss_2": 4.200218558311462,
"ce_loss_3": 4.037039196491241,
"ce_loss_7": 3.7829922437667847,
"epoch": 0.519,
"grad_norm": 428.0,
"kl_loss_10": 102.72743797302246,
"kl_loss_2": 1108.4752288818358,
"kl_loss_3": 772.8697265625,
"kl_loss_7": 198.5632797241211,
"learning_rate": 0.00047779402502093696,
"loss": 549.91,
"step": 5190
},
{
"ce_loss_10": 3.703013610839844,
"ce_loss_13": 3.640911114215851,
"ce_loss_2": 4.174945414066315,
"ce_loss_3": 4.009368169307709,
"ce_loss_7": 3.7497113823890684,
"epoch": 0.52,
"grad_norm": 478.0,
"kl_loss_10": 99.68995170593261,
"kl_loss_2": 1110.2117858886718,
"kl_loss_3": 777.3010894775391,
"kl_loss_7": 196.47792434692383,
"learning_rate": 0.0004762090420881289,
"loss": 553.7422,
"step": 5200
},
{
"ce_loss_10": 3.6182032585144044,
"ce_loss_13": 3.5570725202560425,
"ce_loss_2": 4.098654413223267,
"ce_loss_3": 3.916290044784546,
"ce_loss_7": 3.665347421169281,
"epoch": 0.521,
"grad_norm": 426.0,
"kl_loss_10": 98.28518867492676,
"kl_loss_2": 1126.3521606445313,
"kl_loss_3": 772.9946044921875,
"kl_loss_7": 193.74108428955077,
"learning_rate": 0.00047462429873000296,
"loss": 544.104,
"step": 5210
},
{
"ce_loss_10": 3.7033097624778746,
"ce_loss_13": 3.6430840730667113,
"ce_loss_2": 4.168367850780487,
"ce_loss_3": 3.9993362069129943,
"ce_loss_7": 3.74978985786438,
"epoch": 0.522,
"grad_norm": 412.0,
"kl_loss_10": 98.88156356811524,
"kl_loss_2": 1115.6398986816407,
"kl_loss_3": 774.026156616211,
"kl_loss_7": 195.32233123779298,
"learning_rate": 0.0004730398109049071,
"loss": 547.7821,
"step": 5220
},
{
"ce_loss_10": 3.633508253097534,
"ce_loss_13": 3.5716773152351378,
"ce_loss_2": 4.128389453887939,
"ce_loss_3": 3.9533074378967283,
"ce_loss_7": 3.6823344349861147,
"epoch": 0.523,
"grad_norm": 396.0,
"kl_loss_10": 98.93126792907715,
"kl_loss_2": 1163.846746826172,
"kl_loss_3": 810.2734771728516,
"kl_loss_7": 200.85460052490234,
"learning_rate": 0.000471455594568616,
"loss": 558.1328,
"step": 5230
},
{
"ce_loss_10": 3.707250881195068,
"ce_loss_13": 3.6447718501091004,
"ce_loss_2": 4.174321246147156,
"ce_loss_3": 4.004381275177002,
"ce_loss_7": 3.753636956214905,
"epoch": 0.524,
"grad_norm": 394.0,
"kl_loss_10": 100.72676544189453,
"kl_loss_2": 1114.3457427978515,
"kl_loss_3": 768.1556701660156,
"kl_loss_7": 195.28340759277344,
"learning_rate": 0.00046987166567417086,
"loss": 552.4388,
"step": 5240
},
{
"ce_loss_10": 3.6187984108924867,
"ce_loss_13": 3.5605034112930296,
"ce_loss_2": 4.1001020789146425,
"ce_loss_3": 3.9256922364234925,
"ce_loss_7": 3.664110267162323,
"epoch": 0.525,
"grad_norm": 380.0,
"kl_loss_10": 95.83710632324218,
"kl_loss_2": 1120.3159301757812,
"kl_loss_3": 775.0560852050781,
"kl_loss_7": 192.1679656982422,
"learning_rate": 0.00046828804017171776,
"loss": 536.3316,
"step": 5250
},
{
"ce_loss_10": 3.6720359563827514,
"ce_loss_13": 3.6088499784469605,
"ce_loss_2": 4.162907612323761,
"ce_loss_3": 3.9896105885505677,
"ce_loss_7": 3.722712779045105,
"epoch": 0.526,
"grad_norm": 394.0,
"kl_loss_10": 98.17714996337891,
"kl_loss_2": 1138.502374267578,
"kl_loss_3": 789.8116973876953,
"kl_loss_7": 197.40582656860352,
"learning_rate": 0.00046670473400834805,
"loss": 559.8189,
"step": 5260
},
{
"ce_loss_10": 3.597737526893616,
"ce_loss_13": 3.5393651485443116,
"ce_loss_2": 4.074982023239135,
"ce_loss_3": 3.9021154403686524,
"ce_loss_7": 3.644618010520935,
"epoch": 0.527,
"grad_norm": 436.0,
"kl_loss_10": 95.52880744934082,
"kl_loss_2": 1111.367953491211,
"kl_loss_3": 768.6636322021484,
"kl_loss_7": 191.67658157348632,
"learning_rate": 0.00046512176312793734,
"loss": 559.1187,
"step": 5270
},
{
"ce_loss_10": 3.5923956394195558,
"ce_loss_13": 3.5312567353248596,
"ce_loss_2": 4.0659032464027405,
"ce_loss_3": 3.9041757225990295,
"ce_loss_7": 3.638344919681549,
"epoch": 0.528,
"grad_norm": 382.0,
"kl_loss_10": 95.8816967010498,
"kl_loss_2": 1131.7323181152344,
"kl_loss_3": 788.8408813476562,
"kl_loss_7": 193.95931167602538,
"learning_rate": 0.00046353914347098467,
"loss": 557.7083,
"step": 5280
},
{
"ce_loss_10": 3.688094747066498,
"ce_loss_13": 3.626521134376526,
"ce_loss_2": 4.17344571352005,
"ce_loss_3": 3.9936492323875425,
"ce_loss_7": 3.7344411969184876,
"epoch": 0.529,
"grad_norm": 438.0,
"kl_loss_10": 99.97393112182617,
"kl_loss_2": 1136.7248291015626,
"kl_loss_3": 780.4773040771485,
"kl_loss_7": 194.1311233520508,
"learning_rate": 0.0004619568909744524,
"loss": 554.6544,
"step": 5290
},
{
"ce_loss_10": 3.6992242336273193,
"ce_loss_13": 3.6374841570854186,
"ce_loss_2": 4.173903214931488,
"ce_loss_3": 4.004681324958801,
"ce_loss_7": 3.7441007494926453,
"epoch": 0.53,
"grad_norm": 496.0,
"kl_loss_10": 100.66301612854004,
"kl_loss_2": 1118.1583740234375,
"kl_loss_3": 778.1599609375,
"kl_loss_7": 195.17978057861328,
"learning_rate": 0.00046037502157160573,
"loss": 555.7068,
"step": 5300
},
{
"ce_loss_10": 3.5648537158966063,
"ce_loss_13": 3.50801477432251,
"ce_loss_2": 4.0505608201026915,
"ce_loss_3": 3.885770845413208,
"ce_loss_7": 3.614854156970978,
"epoch": 0.531,
"grad_norm": 392.0,
"kl_loss_10": 95.29824142456054,
"kl_loss_2": 1148.0569580078125,
"kl_loss_3": 803.5360778808594,
"kl_loss_7": 195.23088302612305,
"learning_rate": 0.00045879355119185207,
"loss": 559.6594,
"step": 5310
},
{
"ce_loss_10": 3.6439425349235535,
"ce_loss_13": 3.583683359622955,
"ce_loss_2": 4.135701584815979,
"ce_loss_3": 3.9598298192024233,
"ce_loss_7": 3.692049765586853,
"epoch": 0.532,
"grad_norm": 444.0,
"kl_loss_10": 97.83190078735352,
"kl_loss_2": 1160.7438171386718,
"kl_loss_3": 807.647915649414,
"kl_loss_7": 199.49599685668946,
"learning_rate": 0.0004572124957605803,
"loss": 565.4321,
"step": 5320
},
{
"ce_loss_10": 3.6681848645210264,
"ce_loss_13": 3.607477676868439,
"ce_loss_2": 4.14128270149231,
"ce_loss_3": 3.9746485590934753,
"ce_loss_7": 3.7138744235038756,
"epoch": 0.533,
"grad_norm": 340.0,
"kl_loss_10": 95.41666564941406,
"kl_loss_2": 1136.1244140625,
"kl_loss_3": 793.3468963623047,
"kl_loss_7": 195.33221740722655,
"learning_rate": 0.00045563187119900103,
"loss": 550.4382,
"step": 5330
},
{
"ce_loss_10": 3.5087064266204835,
"ce_loss_13": 3.4494638442993164,
"ce_loss_2": 4.00373204946518,
"ce_loss_3": 3.8344790935516357,
"ce_loss_7": 3.5566913962364195,
"epoch": 0.534,
"grad_norm": 456.0,
"kl_loss_10": 96.30420112609863,
"kl_loss_2": 1145.2862731933594,
"kl_loss_3": 803.7556610107422,
"kl_loss_7": 194.92612915039064,
"learning_rate": 0.00045405169342398633,
"loss": 560.8537,
"step": 5340
},
{
"ce_loss_10": 3.5990882992744444,
"ce_loss_13": 3.535432243347168,
"ce_loss_2": 4.08842386007309,
"ce_loss_3": 3.912948155403137,
"ce_loss_7": 3.6465937376022337,
"epoch": 0.535,
"grad_norm": 422.0,
"kl_loss_10": 99.51773872375489,
"kl_loss_2": 1142.4013549804688,
"kl_loss_3": 795.5528442382813,
"kl_loss_7": 196.72316284179686,
"learning_rate": 0.0004524719783479088,
"loss": 548.8232,
"step": 5350
},
{
"ce_loss_10": 3.552276241779327,
"ce_loss_13": 3.492251825332642,
"ce_loss_2": 4.056445682048798,
"ce_loss_3": 3.8783608794212343,
"ce_loss_7": 3.603780543804169,
"epoch": 0.536,
"grad_norm": 376.0,
"kl_loss_10": 97.24302253723144,
"kl_loss_2": 1164.848809814453,
"kl_loss_3": 811.2062194824218,
"kl_loss_7": 198.37730560302734,
"learning_rate": 0.00045089274187848144,
"loss": 554.2202,
"step": 5360
},
{
"ce_loss_10": 3.6724863052368164,
"ce_loss_13": 3.6130531072616576,
"ce_loss_2": 4.1379453301429745,
"ce_loss_3": 3.968498194217682,
"ce_loss_7": 3.717296040058136,
"epoch": 0.537,
"grad_norm": 536.0,
"kl_loss_10": 96.28798866271973,
"kl_loss_2": 1108.8939270019532,
"kl_loss_3": 770.5279510498046,
"kl_loss_7": 192.69188079833984,
"learning_rate": 0.00044931399991859835,
"loss": 545.4216,
"step": 5370
},
{
"ce_loss_10": 3.5360588788986207,
"ce_loss_13": 3.474487328529358,
"ce_loss_2": 4.018628227710724,
"ce_loss_3": 3.8429470539093016,
"ce_loss_7": 3.5856809496879576,
"epoch": 0.538,
"grad_norm": 446.0,
"kl_loss_10": 97.58423805236816,
"kl_loss_2": 1139.092123413086,
"kl_loss_3": 788.7141876220703,
"kl_loss_7": 196.66349868774415,
"learning_rate": 0.00044773576836617336,
"loss": 546.6951,
"step": 5380
},
{
"ce_loss_10": 3.6238678693771362,
"ce_loss_13": 3.5626631021499633,
"ce_loss_2": 4.120850419998169,
"ce_loss_3": 3.943516790866852,
"ce_loss_7": 3.6712807416915894,
"epoch": 0.539,
"grad_norm": 388.0,
"kl_loss_10": 99.70593795776367,
"kl_loss_2": 1163.2907775878907,
"kl_loss_3": 810.1283508300781,
"kl_loss_7": 199.7040023803711,
"learning_rate": 0.00044615806311398056,
"loss": 569.078,
"step": 5390
},
{
"ce_loss_10": 3.706363093852997,
"ce_loss_13": 3.6457801342010496,
"ce_loss_2": 4.146688032150268,
"ce_loss_3": 3.9897242546081544,
"ce_loss_7": 3.7506498098373413,
"epoch": 0.54,
"grad_norm": 318.0,
"kl_loss_10": 98.56370239257812,
"kl_loss_2": 1084.4558197021483,
"kl_loss_3": 756.0719848632813,
"kl_loss_7": 191.6246208190918,
"learning_rate": 0.00044458090004949454,
"loss": 551.6847,
"step": 5400
},
{
"ce_loss_10": 3.5594072341918945,
"ce_loss_13": 3.4980836510658264,
"ce_loss_2": 4.072906112670898,
"ce_loss_3": 3.8963231086730956,
"ce_loss_7": 3.6096426606178285,
"epoch": 0.541,
"grad_norm": 490.0,
"kl_loss_10": 98.93370399475097,
"kl_loss_2": 1204.406317138672,
"kl_loss_3": 841.533901977539,
"kl_loss_7": 202.28990631103517,
"learning_rate": 0.0004430042950547297,
"loss": 563.3182,
"step": 5410
},
{
"ce_loss_10": 3.656948244571686,
"ce_loss_13": 3.5917163252830506,
"ce_loss_2": 4.146977603435516,
"ce_loss_3": 3.9775506377220156,
"ce_loss_7": 3.7048157334327696,
"epoch": 0.542,
"grad_norm": 472.0,
"kl_loss_10": 100.26595115661621,
"kl_loss_2": 1150.8060424804687,
"kl_loss_3": 803.8866760253907,
"kl_loss_7": 200.08724365234374,
"learning_rate": 0.0004414282640060809,
"loss": 559.1381,
"step": 5420
},
{
"ce_loss_10": 3.7556936740875244,
"ce_loss_13": 3.690820097923279,
"ce_loss_2": 4.2162927985191345,
"ce_loss_3": 4.059760391712189,
"ce_loss_7": 3.7993207812309264,
"epoch": 0.543,
"grad_norm": 466.0,
"kl_loss_10": 100.5603858947754,
"kl_loss_2": 1102.3566284179688,
"kl_loss_3": 774.5157104492188,
"kl_loss_7": 196.8573425292969,
"learning_rate": 0.0004398528227741633,
"loss": 566.5525,
"step": 5430
},
{
"ce_loss_10": 3.6126871943473815,
"ce_loss_13": 3.553126609325409,
"ce_loss_2": 4.1005645275115965,
"ce_loss_3": 3.9280160546302794,
"ce_loss_7": 3.660943078994751,
"epoch": 0.544,
"grad_norm": 458.0,
"kl_loss_10": 97.1538932800293,
"kl_loss_2": 1131.997964477539,
"kl_loss_3": 791.6496276855469,
"kl_loss_7": 198.33607559204103,
"learning_rate": 0.00043827798722365264,
"loss": 560.7217,
"step": 5440
},
{
"ce_loss_10": 3.744398605823517,
"ce_loss_13": 3.681015205383301,
"ce_loss_2": 4.201505517959594,
"ce_loss_3": 4.03549770116806,
"ce_loss_7": 3.788591706752777,
"epoch": 0.545,
"grad_norm": 352.0,
"kl_loss_10": 99.98037643432617,
"kl_loss_2": 1095.4162628173829,
"kl_loss_3": 762.5562530517578,
"kl_loss_7": 196.50249557495118,
"learning_rate": 0.00043670377321312535,
"loss": 539.1079,
"step": 5450
},
{
"ce_loss_10": 3.7459957599639893,
"ce_loss_13": 3.6846879959106444,
"ce_loss_2": 4.2025530457496645,
"ce_loss_3": 4.042814528942108,
"ce_loss_7": 3.789257228374481,
"epoch": 0.546,
"grad_norm": 346.0,
"kl_loss_10": 99.90774993896484,
"kl_loss_2": 1095.3400299072266,
"kl_loss_3": 761.9524017333985,
"kl_loss_7": 193.25130310058594,
"learning_rate": 0.0004351301965948991,
"loss": 550.9912,
"step": 5460
},
{
"ce_loss_10": 3.6544747233390806,
"ce_loss_13": 3.5925102829933167,
"ce_loss_2": 4.1156612753868105,
"ce_loss_3": 3.9492591619491577,
"ce_loss_7": 3.700915348529816,
"epoch": 0.547,
"grad_norm": 446.0,
"kl_loss_10": 99.69101219177246,
"kl_loss_2": 1097.9489288330078,
"kl_loss_3": 763.9795166015625,
"kl_loss_7": 193.2705093383789,
"learning_rate": 0.000433557273214873,
"loss": 548.6603,
"step": 5470
},
{
"ce_loss_10": 3.6407829880714417,
"ce_loss_13": 3.58055636882782,
"ce_loss_2": 4.112010169029236,
"ce_loss_3": 3.9410730838775634,
"ce_loss_7": 3.6900732636451723,
"epoch": 0.548,
"grad_norm": 364.0,
"kl_loss_10": 96.30272674560547,
"kl_loss_2": 1104.9110717773438,
"kl_loss_3": 764.0930358886719,
"kl_loss_7": 193.28277206420898,
"learning_rate": 0.000431985018912368,
"loss": 539.9292,
"step": 5480
},
{
"ce_loss_10": 3.6089709639549254,
"ce_loss_13": 3.5466750621795655,
"ce_loss_2": 4.105600357055664,
"ce_loss_3": 3.9258901715278625,
"ce_loss_7": 3.658845567703247,
"epoch": 0.549,
"grad_norm": 428.0,
"kl_loss_10": 98.85242919921875,
"kl_loss_2": 1163.1305419921875,
"kl_loss_3": 809.7076019287109,
"kl_loss_7": 198.85261154174805,
"learning_rate": 0.0004304134495199674,
"loss": 550.7034,
"step": 5490
},
{
"ce_loss_10": 3.638536274433136,
"ce_loss_13": 3.575793814659119,
"ce_loss_2": 4.123488712310791,
"ce_loss_3": 3.954343330860138,
"ce_loss_7": 3.685023546218872,
"epoch": 0.55,
"grad_norm": 488.0,
"kl_loss_10": 99.10371284484863,
"kl_loss_2": 1163.9283081054687,
"kl_loss_3": 806.7497436523438,
"kl_loss_7": 200.15425338745118,
"learning_rate": 0.0004288425808633575,
"loss": 555.8719,
"step": 5500
},
{
"ce_loss_10": 3.6068961024284363,
"ce_loss_13": 3.5489359140396117,
"ce_loss_2": 4.091926336288452,
"ce_loss_3": 3.914480412006378,
"ce_loss_7": 3.653805840015411,
"epoch": 0.551,
"grad_norm": 482.0,
"kl_loss_10": 95.30807762145996,
"kl_loss_2": 1135.6305114746094,
"kl_loss_3": 782.8162139892578,
"kl_loss_7": 192.36727905273438,
"learning_rate": 0.0004272724287611684,
"loss": 551.1164,
"step": 5510
},
{
"ce_loss_10": 3.5843793511390687,
"ce_loss_13": 3.5220483541488647,
"ce_loss_2": 4.066782796382904,
"ce_loss_3": 3.8880024194717406,
"ce_loss_7": 3.628884470462799,
"epoch": 0.552,
"grad_norm": 472.0,
"kl_loss_10": 98.19914245605469,
"kl_loss_2": 1138.4930938720704,
"kl_loss_3": 792.6924499511719,
"kl_loss_7": 197.34004135131835,
"learning_rate": 0.00042570300902481425,
"loss": 550.9366,
"step": 5520
},
{
"ce_loss_10": 3.6187870144844054,
"ce_loss_13": 3.559086096286774,
"ce_loss_2": 4.0836735486984255,
"ce_loss_3": 3.913509225845337,
"ce_loss_7": 3.662268269062042,
"epoch": 0.553,
"grad_norm": 460.0,
"kl_loss_10": 96.8458236694336,
"kl_loss_2": 1113.29208984375,
"kl_loss_3": 778.9167602539062,
"kl_loss_7": 192.73130722045897,
"learning_rate": 0.00042413433745833423,
"loss": 545.5068,
"step": 5530
},
{
"ce_loss_10": 3.6217783451080323,
"ce_loss_13": 3.5588382482528687,
"ce_loss_2": 4.102611029148102,
"ce_loss_3": 3.9288668751716616,
"ce_loss_7": 3.667692792415619,
"epoch": 0.554,
"grad_norm": 394.0,
"kl_loss_10": 99.64076881408691,
"kl_loss_2": 1129.861962890625,
"kl_loss_3": 781.159780883789,
"kl_loss_7": 194.5426254272461,
"learning_rate": 0.0004225664298582339,
"loss": 538.3319,
"step": 5540
},
{
"ce_loss_10": 3.7008472084999084,
"ce_loss_13": 3.6404882073402405,
"ce_loss_2": 4.157876873016358,
"ce_loss_3": 3.9944301009178163,
"ce_loss_7": 3.7464569926261904,
"epoch": 0.555,
"grad_norm": 352.0,
"kl_loss_10": 98.0084358215332,
"kl_loss_2": 1092.2807312011719,
"kl_loss_3": 758.9974426269531,
"kl_loss_7": 191.41172409057617,
"learning_rate": 0.000420999302013325,
"loss": 539.2247,
"step": 5550
},
{
"ce_loss_10": 3.5973586678504943,
"ce_loss_13": 3.534582734107971,
"ce_loss_2": 4.09981359243393,
"ce_loss_3": 3.9165178179740905,
"ce_loss_7": 3.6474678754806518,
"epoch": 0.556,
"grad_norm": 454.0,
"kl_loss_10": 99.95339088439941,
"kl_loss_2": 1148.3679443359374,
"kl_loss_3": 795.4782531738281,
"kl_loss_7": 199.34042739868164,
"learning_rate": 0.000419432969704568,
"loss": 547.6515,
"step": 5560
},
{
"ce_loss_10": 3.6402106523513793,
"ce_loss_13": 3.580482280254364,
"ce_loss_2": 4.112204611301422,
"ce_loss_3": 3.9463653802871703,
"ce_loss_7": 3.6864510416984557,
"epoch": 0.557,
"grad_norm": 374.0,
"kl_loss_10": 97.21049270629882,
"kl_loss_2": 1103.2306396484375,
"kl_loss_3": 765.6696472167969,
"kl_loss_7": 192.21127700805664,
"learning_rate": 0.00041786744870491154,
"loss": 552.003,
"step": 5570
},
{
"ce_loss_10": 3.5763687014579775,
"ce_loss_13": 3.513793337345123,
"ce_loss_2": 4.059341847896576,
"ce_loss_3": 3.8873541951179504,
"ce_loss_7": 3.6242376923561097,
"epoch": 0.558,
"grad_norm": 496.0,
"kl_loss_10": 99.6470874786377,
"kl_loss_2": 1146.4394836425781,
"kl_loss_3": 799.3714019775391,
"kl_loss_7": 198.99811019897462,
"learning_rate": 0.0004163027547791347,
"loss": 555.3918,
"step": 5580
},
{
"ce_loss_10": 3.550457501411438,
"ce_loss_13": 3.490234684944153,
"ce_loss_2": 4.058210396766663,
"ce_loss_3": 3.8777605056762696,
"ce_loss_7": 3.5981253504753115,
"epoch": 0.559,
"grad_norm": 362.0,
"kl_loss_10": 96.0154800415039,
"kl_loss_2": 1166.6077453613282,
"kl_loss_3": 807.5666870117187,
"kl_loss_7": 196.15278396606445,
"learning_rate": 0.0004147389036836881,
"loss": 556.2604,
"step": 5590
},
{
"ce_loss_10": 3.606854057312012,
"ce_loss_13": 3.545392167568207,
"ce_loss_2": 4.097903311252594,
"ce_loss_3": 3.924593436717987,
"ce_loss_7": 3.652910280227661,
"epoch": 0.56,
"grad_norm": 580.0,
"kl_loss_10": 99.4388584136963,
"kl_loss_2": 1150.4553649902343,
"kl_loss_3": 802.4499359130859,
"kl_loss_7": 196.6334327697754,
"learning_rate": 0.00041317591116653486,
"loss": 563.6437,
"step": 5600
},
{
"ce_loss_10": 3.6449447154998778,
"ce_loss_13": 3.5830300569534304,
"ce_loss_2": 4.1296777606010435,
"ce_loss_3": 3.9580175995826723,
"ce_loss_7": 3.695024287700653,
"epoch": 0.561,
"grad_norm": 528.0,
"kl_loss_10": 100.15715980529785,
"kl_loss_2": 1137.3770324707032,
"kl_loss_3": 786.0429443359375,
"kl_loss_7": 199.2029815673828,
"learning_rate": 0.0004116137929669921,
"loss": 545.8336,
"step": 5610
},
{
"ce_loss_10": 3.6345237135887145,
"ce_loss_13": 3.575796604156494,
"ce_loss_2": 4.1131403088569645,
"ce_loss_3": 3.940467345714569,
"ce_loss_7": 3.6807628154754637,
"epoch": 0.562,
"grad_norm": 388.0,
"kl_loss_10": 95.75808372497559,
"kl_loss_2": 1128.9722564697265,
"kl_loss_3": 784.2019927978515,
"kl_loss_7": 193.04863052368165,
"learning_rate": 0.00041005256481557305,
"loss": 543.754,
"step": 5620
},
{
"ce_loss_10": 3.7401763558387757,
"ce_loss_13": 3.6805962681770326,
"ce_loss_2": 4.185898721218109,
"ce_loss_3": 4.027551281452179,
"ce_loss_7": 3.783228611946106,
"epoch": 0.563,
"grad_norm": 516.0,
"kl_loss_10": 96.21339073181153,
"kl_loss_2": 1061.5840301513672,
"kl_loss_3": 738.3266693115235,
"kl_loss_7": 187.24717712402344,
"learning_rate": 0.00040849224243382767,
"loss": 533.9922,
"step": 5630
},
{
"ce_loss_10": 3.5920221328735353,
"ce_loss_13": 3.5324007272720337,
"ce_loss_2": 4.072316908836365,
"ce_loss_3": 3.8983843684196473,
"ce_loss_7": 3.6374821186065676,
"epoch": 0.564,
"grad_norm": 338.0,
"kl_loss_10": 95.43405532836914,
"kl_loss_2": 1128.149676513672,
"kl_loss_3": 783.0245666503906,
"kl_loss_7": 193.40655746459962,
"learning_rate": 0.000406932841534185,
"loss": 541.5961,
"step": 5640
},
{
"ce_loss_10": 3.5484704256057737,
"ce_loss_13": 3.486864137649536,
"ce_loss_2": 4.036842632293701,
"ce_loss_3": 3.8651990056037904,
"ce_loss_7": 3.597266983985901,
"epoch": 0.565,
"grad_norm": 604.0,
"kl_loss_10": 95.5288932800293,
"kl_loss_2": 1141.9300598144532,
"kl_loss_3": 797.4877136230468,
"kl_loss_7": 194.9025909423828,
"learning_rate": 0.0004053743778197951,
"loss": 559.9006,
"step": 5650
},
{
"ce_loss_10": 3.6602503299713134,
"ce_loss_13": 3.596804344654083,
"ce_loss_2": 4.136269843578338,
"ce_loss_3": 3.967122423648834,
"ce_loss_7": 3.7048738479614256,
"epoch": 0.566,
"grad_norm": 418.0,
"kl_loss_10": 101.36623306274414,
"kl_loss_2": 1114.9331634521484,
"kl_loss_3": 774.6735748291015,
"kl_loss_7": 196.29364929199218,
"learning_rate": 0.0004038168669843697,
"loss": 553.1191,
"step": 5660
},
{
"ce_loss_10": 3.6255574107170103,
"ce_loss_13": 3.5639535069465635,
"ce_loss_2": 4.085369718074799,
"ce_loss_3": 3.919695568084717,
"ce_loss_7": 3.6704380750656127,
"epoch": 0.567,
"grad_norm": 736.0,
"kl_loss_10": 98.19256973266602,
"kl_loss_2": 1100.560809326172,
"kl_loss_3": 765.2613342285156,
"kl_loss_7": 192.50554656982422,
"learning_rate": 0.000402260324712026,
"loss": 547.8535,
"step": 5670
},
{
"ce_loss_10": 3.669494354724884,
"ce_loss_13": 3.60741925239563,
"ce_loss_2": 4.148016309738159,
"ce_loss_3": 3.9792763590812683,
"ce_loss_7": 3.7149499893188476,
"epoch": 0.568,
"grad_norm": 498.0,
"kl_loss_10": 99.65063438415527,
"kl_loss_2": 1126.1991058349608,
"kl_loss_3": 783.0127746582032,
"kl_loss_7": 194.12312698364258,
"learning_rate": 0.00040070476667712743,
"loss": 543.5942,
"step": 5680
},
{
"ce_loss_10": 3.7005011796951295,
"ce_loss_13": 3.6357283353805543,
"ce_loss_2": 4.166275656223297,
"ce_loss_3": 4.000016844272613,
"ce_loss_7": 3.745167064666748,
"epoch": 0.569,
"grad_norm": 356.0,
"kl_loss_10": 100.85004692077636,
"kl_loss_2": 1110.8271209716797,
"kl_loss_3": 770.1773681640625,
"kl_loss_7": 194.5637939453125,
"learning_rate": 0.0003991502085441259,
"loss": 548.6594,
"step": 5690
},
{
"ce_loss_10": 3.729709804058075,
"ce_loss_13": 3.6688971519470215,
"ce_loss_2": 4.18160834312439,
"ce_loss_3": 4.015434455871582,
"ce_loss_7": 3.7735623002052305,
"epoch": 0.57,
"grad_norm": 374.0,
"kl_loss_10": 98.11349868774414,
"kl_loss_2": 1070.740576171875,
"kl_loss_3": 744.3529327392578,
"kl_loss_7": 190.02555770874022,
"learning_rate": 0.0003975966659674047,
"loss": 541.8046,
"step": 5700
},
{
"ce_loss_10": 3.691783332824707,
"ce_loss_13": 3.6318029403686523,
"ce_loss_2": 4.161513650417328,
"ce_loss_3": 3.986249303817749,
"ce_loss_7": 3.73646023273468,
"epoch": 0.571,
"grad_norm": 536.0,
"kl_loss_10": 98.58754501342773,
"kl_loss_2": 1102.3078491210938,
"kl_loss_3": 759.7488586425782,
"kl_loss_7": 191.9359992980957,
"learning_rate": 0.0003960441545911204,
"loss": 538.4236,
"step": 5710
},
{
"ce_loss_10": 3.6897791981697083,
"ce_loss_13": 3.6274471282958984,
"ce_loss_2": 4.157203590869903,
"ce_loss_3": 3.9884847044944762,
"ce_loss_7": 3.736597418785095,
"epoch": 0.572,
"grad_norm": 600.0,
"kl_loss_10": 97.47168769836426,
"kl_loss_2": 1115.5811676025392,
"kl_loss_3": 773.7587646484375,
"kl_loss_7": 193.96655197143554,
"learning_rate": 0.0003944926900490452,
"loss": 541.7897,
"step": 5720
},
{
"ce_loss_10": 3.6022287607192993,
"ce_loss_13": 3.541483187675476,
"ce_loss_2": 4.094926071166992,
"ce_loss_3": 3.9194396138191223,
"ce_loss_7": 3.65008407831192,
"epoch": 0.573,
"grad_norm": 352.0,
"kl_loss_10": 96.51857452392578,
"kl_loss_2": 1147.3706939697265,
"kl_loss_3": 794.91083984375,
"kl_loss_7": 194.98720092773436,
"learning_rate": 0.0003929422879644099,
"loss": 544.8611,
"step": 5730
},
{
"ce_loss_10": 3.6093438267707825,
"ce_loss_13": 3.5497053503990172,
"ce_loss_2": 4.068271553516388,
"ce_loss_3": 3.9011916518211365,
"ce_loss_7": 3.6547187089920046,
"epoch": 0.574,
"grad_norm": 426.0,
"kl_loss_10": 95.6807746887207,
"kl_loss_2": 1107.6688201904296,
"kl_loss_3": 763.7292449951171,
"kl_loss_7": 189.65744247436524,
"learning_rate": 0.0003913929639497462,
"loss": 535.444,
"step": 5740
},
{
"ce_loss_10": 3.5539973855018614,
"ce_loss_13": 3.4933292627334596,
"ce_loss_2": 4.044394338130951,
"ce_loss_3": 3.8677351474761963,
"ce_loss_7": 3.6000022888183594,
"epoch": 0.575,
"grad_norm": 408.0,
"kl_loss_10": 95.82653579711913,
"kl_loss_2": 1130.1885803222656,
"kl_loss_3": 778.0026184082031,
"kl_loss_7": 190.79474563598632,
"learning_rate": 0.00038984473360672965,
"loss": 541.1631,
"step": 5750
},
{
"ce_loss_10": 3.5721747159957884,
"ce_loss_13": 3.5100734710693358,
"ce_loss_2": 4.053931272029876,
"ce_loss_3": 3.883261811733246,
"ce_loss_7": 3.6166505217552185,
"epoch": 0.576,
"grad_norm": 436.0,
"kl_loss_10": 95.3091812133789,
"kl_loss_2": 1128.7456329345703,
"kl_loss_3": 780.4191925048829,
"kl_loss_7": 190.4754554748535,
"learning_rate": 0.0003882976125260229,
"loss": 539.7566,
"step": 5760
},
{
"ce_loss_10": 3.638679492473602,
"ce_loss_13": 3.5770092844963073,
"ce_loss_2": 4.1140677571296695,
"ce_loss_3": 3.9416022896766663,
"ce_loss_7": 3.6866235971450805,
"epoch": 0.577,
"grad_norm": 366.0,
"kl_loss_10": 98.93351516723632,
"kl_loss_2": 1112.5931701660156,
"kl_loss_3": 770.6242248535157,
"kl_loss_7": 191.9038848876953,
"learning_rate": 0.00038675161628711776,
"loss": 545.2976,
"step": 5770
},
{
"ce_loss_10": 3.678569030761719,
"ce_loss_13": 3.616915798187256,
"ce_loss_2": 4.1388965249061584,
"ce_loss_3": 3.9749330997467043,
"ce_loss_7": 3.722931241989136,
"epoch": 0.578,
"grad_norm": 404.0,
"kl_loss_10": 97.5284637451172,
"kl_loss_2": 1093.5021606445312,
"kl_loss_3": 761.3094451904296,
"kl_loss_7": 191.26370391845703,
"learning_rate": 0.0003852067604581794,
"loss": 553.459,
"step": 5780
},
{
"ce_loss_10": 3.6174680829048156,
"ce_loss_13": 3.5550846695899962,
"ce_loss_2": 4.100849425792694,
"ce_loss_3": 3.927929162979126,
"ce_loss_7": 3.665549111366272,
"epoch": 0.579,
"grad_norm": 502.0,
"kl_loss_10": 97.5420696258545,
"kl_loss_2": 1125.1912048339843,
"kl_loss_3": 782.9702056884765,
"kl_loss_7": 193.16246643066407,
"learning_rate": 0.0003836630605958888,
"loss": 543.639,
"step": 5790
},
{
"ce_loss_10": 3.6780447602272033,
"ce_loss_13": 3.616100025177002,
"ce_loss_2": 4.136243522167206,
"ce_loss_3": 3.9725910425186157,
"ce_loss_7": 3.7234076499938964,
"epoch": 0.58,
"grad_norm": 506.0,
"kl_loss_10": 99.15894927978516,
"kl_loss_2": 1117.2952941894532,
"kl_loss_3": 777.6545166015625,
"kl_loss_7": 194.16991271972657,
"learning_rate": 0.0003821205322452863,
"loss": 560.4495,
"step": 5800
},
{
"ce_loss_10": 3.657036304473877,
"ce_loss_13": 3.5961548686027527,
"ce_loss_2": 4.118453872203827,
"ce_loss_3": 3.948525774478912,
"ce_loss_7": 3.7012171149253845,
"epoch": 0.581,
"grad_norm": 438.0,
"kl_loss_10": 98.11412734985352,
"kl_loss_2": 1098.6213439941407,
"kl_loss_3": 759.3198364257812,
"kl_loss_7": 189.98369064331055,
"learning_rate": 0.0003805791909396155,
"loss": 541.5742,
"step": 5810
},
{
"ce_loss_10": 3.6096495151519776,
"ce_loss_13": 3.550210452079773,
"ce_loss_2": 4.077665090560913,
"ce_loss_3": 3.9094552993774414,
"ce_loss_7": 3.654946839809418,
"epoch": 0.582,
"grad_norm": 428.0,
"kl_loss_10": 95.98116798400879,
"kl_loss_2": 1109.6123931884765,
"kl_loss_3": 763.3366668701171,
"kl_loss_7": 189.48765182495117,
"learning_rate": 0.0003790390522001662,
"loss": 547.1139,
"step": 5820
},
{
"ce_loss_10": 3.538465416431427,
"ce_loss_13": 3.4795125126838684,
"ce_loss_2": 4.019526553153992,
"ce_loss_3": 3.8418781757354736,
"ce_loss_7": 3.5831465244293215,
"epoch": 0.583,
"grad_norm": 354.0,
"kl_loss_10": 94.34587249755859,
"kl_loss_2": 1136.918035888672,
"kl_loss_3": 784.7109252929688,
"kl_loss_7": 191.27632827758788,
"learning_rate": 0.0003775001315361183,
"loss": 542.445,
"step": 5830
},
{
"ce_loss_10": 3.659132921695709,
"ce_loss_13": 3.596101534366608,
"ce_loss_2": 4.132727253437042,
"ce_loss_3": 3.958163845539093,
"ce_loss_7": 3.704639720916748,
"epoch": 0.584,
"grad_norm": 298.0,
"kl_loss_10": 98.75731201171875,
"kl_loss_2": 1122.0884033203124,
"kl_loss_3": 776.4772644042969,
"kl_loss_7": 193.22739944458007,
"learning_rate": 0.0003759624444443858,
"loss": 544.9992,
"step": 5840
},
{
"ce_loss_10": 3.6889251112937926,
"ce_loss_13": 3.6282206773757935,
"ce_loss_2": 4.151758980751038,
"ce_loss_3": 3.9822983741760254,
"ce_loss_7": 3.732993245124817,
"epoch": 0.585,
"grad_norm": 346.0,
"kl_loss_10": 99.06045837402344,
"kl_loss_2": 1097.8614471435546,
"kl_loss_3": 758.9134582519531,
"kl_loss_7": 191.27917098999023,
"learning_rate": 0.00037442600640946044,
"loss": 536.17,
"step": 5850
},
{
"ce_loss_10": 3.6461440443992617,
"ce_loss_13": 3.5892478227615356,
"ce_loss_2": 4.105236732959748,
"ce_loss_3": 3.9375507473945617,
"ce_loss_7": 3.692450475692749,
"epoch": 0.586,
"grad_norm": 408.0,
"kl_loss_10": 94.86803092956544,
"kl_loss_2": 1099.2377655029297,
"kl_loss_3": 758.3301605224609,
"kl_loss_7": 189.78098831176757,
"learning_rate": 0.00037289083290325663,
"loss": 531.0057,
"step": 5860
},
{
"ce_loss_10": 3.63515100479126,
"ce_loss_13": 3.574202799797058,
"ce_loss_2": 4.095511162281037,
"ce_loss_3": 3.930715727806091,
"ce_loss_7": 3.6794507265090943,
"epoch": 0.587,
"grad_norm": 540.0,
"kl_loss_10": 97.98805313110351,
"kl_loss_2": 1091.7025299072266,
"kl_loss_3": 757.6223114013671,
"kl_loss_7": 191.85128860473634,
"learning_rate": 0.0003713569393849543,
"loss": 533.4333,
"step": 5870
},
{
"ce_loss_10": 3.6827593207359315,
"ce_loss_13": 3.6205956816673277,
"ce_loss_2": 4.148468089103699,
"ce_loss_3": 3.978341579437256,
"ce_loss_7": 3.7273068189620973,
"epoch": 0.588,
"grad_norm": 398.0,
"kl_loss_10": 98.60938911437988,
"kl_loss_2": 1107.6281311035157,
"kl_loss_3": 765.0102233886719,
"kl_loss_7": 192.96542663574218,
"learning_rate": 0.00036982434130084397,
"loss": 541.5767,
"step": 5880
},
{
"ce_loss_10": 3.589915359020233,
"ce_loss_13": 3.5286367654800417,
"ce_loss_2": 4.061057722568512,
"ce_loss_3": 3.8881011605262756,
"ce_loss_7": 3.6373565912246706,
"epoch": 0.589,
"grad_norm": 506.0,
"kl_loss_10": 97.51137619018554,
"kl_loss_2": 1115.5977966308594,
"kl_loss_3": 775.6395446777344,
"kl_loss_7": 195.47111892700195,
"learning_rate": 0.00036829305408417166,
"loss": 546.8446,
"step": 5890
},
{
"ce_loss_10": 3.5797411799430847,
"ce_loss_13": 3.5188158631324766,
"ce_loss_2": 4.067822527885437,
"ce_loss_3": 3.893584966659546,
"ce_loss_7": 3.6291656494140625,
"epoch": 0.59,
"grad_norm": 364.0,
"kl_loss_10": 96.57020835876465,
"kl_loss_2": 1141.290579223633,
"kl_loss_3": 789.6200988769531,
"kl_loss_7": 196.76195220947267,
"learning_rate": 0.0003667630931549826,
"loss": 548.8211,
"step": 5900
},
{
"ce_loss_10": 3.547331213951111,
"ce_loss_13": 3.4874081373214723,
"ce_loss_2": 4.03765162229538,
"ce_loss_3": 3.8655640482902527,
"ce_loss_7": 3.5946906566619874,
"epoch": 0.591,
"grad_norm": 454.0,
"kl_loss_10": 95.69526252746581,
"kl_loss_2": 1154.8450256347655,
"kl_loss_3": 798.5165588378907,
"kl_loss_7": 194.5025749206543,
"learning_rate": 0.00036523447391996613,
"loss": 552.8163,
"step": 5910
},
{
"ce_loss_10": 3.6425758361816407,
"ce_loss_13": 3.5853498816490172,
"ce_loss_2": 4.10631023645401,
"ce_loss_3": 3.9402198076248167,
"ce_loss_7": 3.690027916431427,
"epoch": 0.592,
"grad_norm": 432.0,
"kl_loss_10": 94.87303581237794,
"kl_loss_2": 1090.1558319091796,
"kl_loss_3": 756.7847717285156,
"kl_loss_7": 189.84710311889648,
"learning_rate": 0.00036370721177230114,
"loss": 533.6673,
"step": 5920
},
{
"ce_loss_10": 3.635672652721405,
"ce_loss_13": 3.577661764621735,
"ce_loss_2": 4.114610862731934,
"ce_loss_3": 3.9419226169586183,
"ce_loss_7": 3.681511878967285,
"epoch": 0.593,
"grad_norm": 326.0,
"kl_loss_10": 95.39519729614258,
"kl_loss_2": 1127.0120056152343,
"kl_loss_3": 780.4901336669922,
"kl_loss_7": 194.04692993164062,
"learning_rate": 0.00036218132209150044,
"loss": 545.1962,
"step": 5930
},
{
"ce_loss_10": 3.593142592906952,
"ce_loss_13": 3.530347979068756,
"ce_loss_2": 4.095171976089477,
"ce_loss_3": 3.920231354236603,
"ce_loss_7": 3.645453596115112,
"epoch": 0.594,
"grad_norm": 378.0,
"kl_loss_10": 99.63440895080566,
"kl_loss_2": 1173.4297882080077,
"kl_loss_3": 813.8213714599609,
"kl_loss_7": 199.65494766235352,
"learning_rate": 0.0003606568202432562,
"loss": 557.0208,
"step": 5940
},
{
"ce_loss_10": 3.665185475349426,
"ce_loss_13": 3.6032612800598143,
"ce_loss_2": 4.14498724937439,
"ce_loss_3": 3.9701961159706114,
"ce_loss_7": 3.7108847856521607,
"epoch": 0.595,
"grad_norm": 528.0,
"kl_loss_10": 99.43977394104004,
"kl_loss_2": 1140.6280212402344,
"kl_loss_3": 787.1899200439453,
"kl_loss_7": 195.35167922973633,
"learning_rate": 0.0003591337215792851,
"loss": 544.2271,
"step": 5950
},
{
"ce_loss_10": 3.706349265575409,
"ce_loss_13": 3.64465993642807,
"ce_loss_2": 4.152172029018402,
"ce_loss_3": 3.9943688988685606,
"ce_loss_7": 3.7489245533943176,
"epoch": 0.596,
"grad_norm": 356.0,
"kl_loss_10": 99.39506378173829,
"kl_loss_2": 1087.233724975586,
"kl_loss_3": 759.1374755859375,
"kl_loss_7": 190.80716857910156,
"learning_rate": 0.00035761204143717383,
"loss": 544.3471,
"step": 5960
},
{
"ce_loss_10": 3.6578794836997988,
"ce_loss_13": 3.5957969784736634,
"ce_loss_2": 4.119996964931488,
"ce_loss_3": 3.9552765846252442,
"ce_loss_7": 3.7025834202766417,
"epoch": 0.597,
"grad_norm": 400.0,
"kl_loss_10": 99.01246032714843,
"kl_loss_2": 1115.1319488525392,
"kl_loss_3": 774.3078552246094,
"kl_loss_7": 193.01641845703125,
"learning_rate": 0.0003560917951402245,
"loss": 556.3752,
"step": 5970
},
{
"ce_loss_10": 3.632036602497101,
"ce_loss_13": 3.5740628480911254,
"ce_loss_2": 4.0921210765838625,
"ce_loss_3": 3.9307610511779787,
"ce_loss_7": 3.6746655702590942,
"epoch": 0.598,
"grad_norm": 412.0,
"kl_loss_10": 95.97110137939453,
"kl_loss_2": 1101.7569305419922,
"kl_loss_3": 768.7692047119141,
"kl_loss_7": 189.95830230712892,
"learning_rate": 0.00035457299799730046,
"loss": 538.1885,
"step": 5980
},
{
"ce_loss_10": 3.69617702960968,
"ce_loss_13": 3.6354240775108337,
"ce_loss_2": 4.163921213150024,
"ce_loss_3": 3.993851900100708,
"ce_loss_7": 3.7415480971336366,
"epoch": 0.599,
"grad_norm": 388.0,
"kl_loss_10": 96.27426452636719,
"kl_loss_2": 1105.9306549072267,
"kl_loss_3": 762.228305053711,
"kl_loss_7": 190.51752395629882,
"learning_rate": 0.0003530556653026721,
"loss": 545.8183,
"step": 5990
},
{
"ce_loss_10": 3.611501228809357,
"ce_loss_13": 3.5530946016311646,
"ce_loss_2": 4.07593857049942,
"ce_loss_3": 3.9016834497451782,
"ce_loss_7": 3.6570339798927307,
"epoch": 0.6,
"grad_norm": 1424.0,
"kl_loss_10": 94.48569107055664,
"kl_loss_2": 1108.4388488769532,
"kl_loss_3": 760.983023071289,
"kl_loss_7": 188.30435333251953,
"learning_rate": 0.00035153981233586274,
"loss": 543.2547,
"step": 6000
},
{
"ce_loss_10": 3.589734137058258,
"ce_loss_13": 3.5291273951530457,
"ce_loss_2": 4.066950809955597,
"ce_loss_3": 3.8936201214790342,
"ce_loss_7": 3.6356727838516236,
"epoch": 0.601,
"grad_norm": 478.0,
"kl_loss_10": 95.43113746643067,
"kl_loss_2": 1117.119808959961,
"kl_loss_3": 769.7344940185546,
"kl_loss_7": 188.8736831665039,
"learning_rate": 0.00035002545436149473,
"loss": 555.4068,
"step": 6010
},
{
"ce_loss_10": 3.603361654281616,
"ce_loss_13": 3.5395719528198244,
"ce_loss_2": 4.084376287460327,
"ce_loss_3": 3.913386416435242,
"ce_loss_7": 3.6495144724845887,
"epoch": 0.602,
"grad_norm": 414.0,
"kl_loss_10": 99.58069725036621,
"kl_loss_2": 1138.4922149658203,
"kl_loss_3": 791.1285461425781,
"kl_loss_7": 196.0400062561035,
"learning_rate": 0.0003485126066291364,
"loss": 543.3661,
"step": 6020
},
{
"ce_loss_10": 3.6472663640975953,
"ce_loss_13": 3.586405074596405,
"ce_loss_2": 4.12690646648407,
"ce_loss_3": 3.9540088891983034,
"ce_loss_7": 3.6910028219223023,
"epoch": 0.603,
"grad_norm": 426.0,
"kl_loss_10": 97.50395317077637,
"kl_loss_2": 1120.6384643554688,
"kl_loss_3": 773.8977966308594,
"kl_loss_7": 189.96464309692382,
"learning_rate": 0.0003470012843731476,
"loss": 547.4742,
"step": 6030
},
{
"ce_loss_10": 3.587485361099243,
"ce_loss_13": 3.527864229679108,
"ce_loss_2": 4.065750408172607,
"ce_loss_3": 3.8930314064025877,
"ce_loss_7": 3.6307687997817992,
"epoch": 0.604,
"grad_norm": 450.0,
"kl_loss_10": 95.93178520202636,
"kl_loss_2": 1125.8798370361328,
"kl_loss_3": 778.0897277832031,
"kl_loss_7": 190.32968826293944,
"learning_rate": 0.00034549150281252633,
"loss": 553.9461,
"step": 6040
},
{
"ce_loss_10": 3.567354416847229,
"ce_loss_13": 3.5087788224220278,
"ce_loss_2": 4.041226005554199,
"ce_loss_3": 3.868573796749115,
"ce_loss_7": 3.613230037689209,
"epoch": 0.605,
"grad_norm": 376.0,
"kl_loss_10": 96.31193771362305,
"kl_loss_2": 1101.1357208251952,
"kl_loss_3": 760.5451019287109,
"kl_loss_7": 190.99923782348634,
"learning_rate": 0.0003439832771507565,
"loss": 537.7418,
"step": 6050
},
{
"ce_loss_10": 3.569633936882019,
"ce_loss_13": 3.5091484904289247,
"ce_loss_2": 4.052746975421906,
"ce_loss_3": 3.8793442845344543,
"ce_loss_7": 3.6145769238471983,
"epoch": 0.606,
"grad_norm": 364.0,
"kl_loss_10": 96.17846641540527,
"kl_loss_2": 1126.9381469726563,
"kl_loss_3": 780.4287139892579,
"kl_loss_7": 191.24787139892578,
"learning_rate": 0.0003424766225756537,
"loss": 539.2611,
"step": 6060
},
{
"ce_loss_10": 3.6349270820617674,
"ce_loss_13": 3.5724891662597655,
"ce_loss_2": 4.110528755187988,
"ce_loss_3": 3.9370043516159057,
"ce_loss_7": 3.679009509086609,
"epoch": 0.607,
"grad_norm": 380.0,
"kl_loss_10": 98.61342163085938,
"kl_loss_2": 1107.0002716064453,
"kl_loss_3": 763.0299987792969,
"kl_loss_7": 192.68891830444335,
"learning_rate": 0.00034097155425921255,
"loss": 535.4806,
"step": 6070
},
{
"ce_loss_10": 3.5260583400726317,
"ce_loss_13": 3.4644631028175352,
"ce_loss_2": 4.0014289021492,
"ce_loss_3": 3.829664409160614,
"ce_loss_7": 3.571485424041748,
"epoch": 0.608,
"grad_norm": 422.0,
"kl_loss_10": 95.72014465332032,
"kl_loss_2": 1128.9732635498046,
"kl_loss_3": 780.0001983642578,
"kl_loss_7": 191.94852294921876,
"learning_rate": 0.0003394680873574546,
"loss": 542.5872,
"step": 6080
},
{
"ce_loss_10": 3.638583517074585,
"ce_loss_13": 3.5754881620407106,
"ce_loss_2": 4.1181090593338014,
"ce_loss_3": 3.9476171731948853,
"ce_loss_7": 3.6838363647460937,
"epoch": 0.609,
"grad_norm": 402.0,
"kl_loss_10": 99.43503112792969,
"kl_loss_2": 1131.3410400390626,
"kl_loss_3": 782.6971099853515,
"kl_loss_7": 192.93393096923828,
"learning_rate": 0.0003379662370102747,
"loss": 542.0118,
"step": 6090
},
{
"ce_loss_10": 3.6437841415405274,
"ce_loss_13": 3.5835014939308167,
"ce_loss_2": 4.107234466075897,
"ce_loss_3": 3.9407611727714538,
"ce_loss_7": 3.689082384109497,
"epoch": 0.61,
"grad_norm": 378.0,
"kl_loss_10": 95.95064582824708,
"kl_loss_2": 1116.5803283691407,
"kl_loss_3": 769.8769500732421,
"kl_loss_7": 190.42120208740235,
"learning_rate": 0.0003364660183412892,
"loss": 543.2468,
"step": 6100
},
{
"ce_loss_10": 3.6229702949523928,
"ce_loss_13": 3.5642863631248476,
"ce_loss_2": 4.082474946975708,
"ce_loss_3": 3.920805549621582,
"ce_loss_7": 3.6692759871482847,
"epoch": 0.611,
"grad_norm": 438.0,
"kl_loss_10": 95.98471641540527,
"kl_loss_2": 1107.3975402832032,
"kl_loss_3": 770.6610443115235,
"kl_loss_7": 191.18293151855468,
"learning_rate": 0.0003349674464576834,
"loss": 547.1137,
"step": 6110
},
{
"ce_loss_10": 3.572301459312439,
"ce_loss_13": 3.5100274682044983,
"ce_loss_2": 4.04880428314209,
"ce_loss_3": 3.87799711227417,
"ce_loss_7": 3.6172243118286134,
"epoch": 0.612,
"grad_norm": 400.0,
"kl_loss_10": 97.55015258789062,
"kl_loss_2": 1121.5612213134766,
"kl_loss_3": 776.7356872558594,
"kl_loss_7": 191.68118591308593,
"learning_rate": 0.00033347053645005966,
"loss": 533.933,
"step": 6120
},
{
"ce_loss_10": 3.6915227651596068,
"ce_loss_13": 3.6307403206825257,
"ce_loss_2": 4.149306988716125,
"ce_loss_3": 3.986075186729431,
"ce_loss_7": 3.7352558612823485,
"epoch": 0.613,
"grad_norm": 456.0,
"kl_loss_10": 97.44704780578613,
"kl_loss_2": 1082.3290954589843,
"kl_loss_3": 751.4226776123047,
"kl_loss_7": 188.24736099243165,
"learning_rate": 0.00033197530339228485,
"loss": 541.4641,
"step": 6130
},
{
"ce_loss_10": 3.6387569904327393,
"ce_loss_13": 3.5774574756622313,
"ce_loss_2": 4.1059521555900576,
"ce_loss_3": 3.9463557958602906,
"ce_loss_7": 3.686212944984436,
"epoch": 0.614,
"grad_norm": 320.0,
"kl_loss_10": 97.79526100158691,
"kl_loss_2": 1105.2626007080078,
"kl_loss_3": 773.3958312988282,
"kl_loss_7": 193.28426208496094,
"learning_rate": 0.00033048176234133967,
"loss": 539.6668,
"step": 6140
},
{
"ce_loss_10": 3.6235718965530395,
"ce_loss_13": 3.563166308403015,
"ce_loss_2": 4.0937678694725035,
"ce_loss_3": 3.9205260276794434,
"ce_loss_7": 3.6674267172813417,
"epoch": 0.615,
"grad_norm": 434.0,
"kl_loss_10": 96.52788619995117,
"kl_loss_2": 1108.4606842041017,
"kl_loss_3": 766.9316375732421,
"kl_loss_7": 191.76514892578126,
"learning_rate": 0.0003289899283371657,
"loss": 545.3005,
"step": 6150
},
{
"ce_loss_10": 3.6545772314071656,
"ce_loss_13": 3.5920246958732607,
"ce_loss_2": 4.122934722900391,
"ce_loss_3": 3.954042661190033,
"ce_loss_7": 3.7002484798431396,
"epoch": 0.616,
"grad_norm": 512.0,
"kl_loss_10": 96.86014366149902,
"kl_loss_2": 1110.978466796875,
"kl_loss_3": 763.7065795898437,
"kl_loss_7": 189.29309463500977,
"learning_rate": 0.0003274998164025148,
"loss": 546.4095,
"step": 6160
},
{
"ce_loss_10": 3.687037003040314,
"ce_loss_13": 3.62383953332901,
"ce_loss_2": 4.151339697837829,
"ce_loss_3": 3.982528305053711,
"ce_loss_7": 3.730109751224518,
"epoch": 0.617,
"grad_norm": 420.0,
"kl_loss_10": 98.5214340209961,
"kl_loss_2": 1105.55556640625,
"kl_loss_3": 765.5938140869141,
"kl_loss_7": 192.1310241699219,
"learning_rate": 0.0003260114415427975,
"loss": 551.3336,
"step": 6170
},
{
"ce_loss_10": 3.6019906878471373,
"ce_loss_13": 3.543228101730347,
"ce_loss_2": 4.074436497688294,
"ce_loss_3": 3.910420286655426,
"ce_loss_7": 3.650412619113922,
"epoch": 0.618,
"grad_norm": 326.0,
"kl_loss_10": 96.38783836364746,
"kl_loss_2": 1118.9971984863282,
"kl_loss_3": 780.3896484375,
"kl_loss_7": 191.8697937011719,
"learning_rate": 0.0003245248187459323,
"loss": 553.7879,
"step": 6180
},
{
"ce_loss_10": 3.5864107251167296,
"ce_loss_13": 3.53016597032547,
"ce_loss_2": 4.042503225803375,
"ce_loss_3": 3.874970889091492,
"ce_loss_7": 3.6281535744667055,
"epoch": 0.619,
"grad_norm": 418.0,
"kl_loss_10": 92.61179161071777,
"kl_loss_2": 1080.5412902832031,
"kl_loss_3": 743.7303924560547,
"kl_loss_7": 185.2653793334961,
"learning_rate": 0.00032303996298219416,
"loss": 531.9591,
"step": 6190
},
{
"ce_loss_10": 3.6777410745620727,
"ce_loss_13": 3.6153058767318726,
"ce_loss_2": 4.135652315616608,
"ce_loss_3": 3.968917655944824,
"ce_loss_7": 3.723153126239777,
"epoch": 0.62,
"grad_norm": 328.0,
"kl_loss_10": 97.44341430664062,
"kl_loss_2": 1081.0255004882813,
"kl_loss_3": 750.4999420166016,
"kl_loss_7": 189.73232498168946,
"learning_rate": 0.00032155688920406414,
"loss": 532.518,
"step": 6200
},
{
"ce_loss_10": 3.587769341468811,
"ce_loss_13": 3.524891209602356,
"ce_loss_2": 4.075685119628906,
"ce_loss_3": 3.896172082424164,
"ce_loss_7": 3.6351929187774656,
"epoch": 0.621,
"grad_norm": 376.0,
"kl_loss_10": 100.48479537963867,
"kl_loss_2": 1141.9940368652344,
"kl_loss_3": 786.613900756836,
"kl_loss_7": 195.81097640991212,
"learning_rate": 0.0003200756123460788,
"loss": 557.093,
"step": 6210
},
{
"ce_loss_10": 3.613728904724121,
"ce_loss_13": 3.5514798045158384,
"ce_loss_2": 4.097208368778229,
"ce_loss_3": 3.922844612598419,
"ce_loss_7": 3.6612853050231933,
"epoch": 0.622,
"grad_norm": 436.0,
"kl_loss_10": 98.99568367004395,
"kl_loss_2": 1137.439712524414,
"kl_loss_3": 786.5532501220703,
"kl_loss_7": 195.3180892944336,
"learning_rate": 0.00031859614732467957,
"loss": 552.2858,
"step": 6220
},
{
"ce_loss_10": 3.668611526489258,
"ce_loss_13": 3.6079549193382263,
"ce_loss_2": 4.123561811447144,
"ce_loss_3": 3.957008695602417,
"ce_loss_7": 3.7130979537963866,
"epoch": 0.623,
"grad_norm": 436.0,
"kl_loss_10": 96.12700805664062,
"kl_loss_2": 1085.7240417480468,
"kl_loss_3": 750.371206665039,
"kl_loss_7": 188.20330352783202,
"learning_rate": 0.00031711850903806275,
"loss": 532.2347,
"step": 6230
},
{
"ce_loss_10": 3.5722012281417848,
"ce_loss_13": 3.5121172070503235,
"ce_loss_2": 4.05529419183731,
"ce_loss_3": 3.8803335189819337,
"ce_loss_7": 3.6196384906768797,
"epoch": 0.624,
"grad_norm": 372.0,
"kl_loss_10": 98.26438941955567,
"kl_loss_2": 1135.1425506591797,
"kl_loss_3": 784.2965637207031,
"kl_loss_7": 195.50869674682616,
"learning_rate": 0.0003156427123660297,
"loss": 544.6104,
"step": 6240
},
{
"ce_loss_10": 3.663820195198059,
"ce_loss_13": 3.6021278977394102,
"ce_loss_2": 4.12639445066452,
"ce_loss_3": 3.9577032327651978,
"ce_loss_7": 3.709599566459656,
"epoch": 0.625,
"grad_norm": 376.0,
"kl_loss_10": 96.6868911743164,
"kl_loss_2": 1095.9533905029298,
"kl_loss_3": 760.994189453125,
"kl_loss_7": 189.45380859375,
"learning_rate": 0.0003141687721698363,
"loss": 542.975,
"step": 6250
},
{
"ce_loss_10": 3.6301703572273256,
"ce_loss_13": 3.5708668351173403,
"ce_loss_2": 4.076251423358917,
"ce_loss_3": 3.9138960361480715,
"ce_loss_7": 3.6724702954292296,
"epoch": 0.626,
"grad_norm": 424.0,
"kl_loss_10": 94.79209213256836,
"kl_loss_2": 1062.4459991455078,
"kl_loss_3": 731.6935089111328,
"kl_loss_7": 183.48223037719725,
"learning_rate": 0.00031269670329204396,
"loss": 531.0972,
"step": 6260
},
{
"ce_loss_10": 3.6652311086654663,
"ce_loss_13": 3.6031481981277467,
"ce_loss_2": 4.122557854652404,
"ce_loss_3": 3.9541366934776305,
"ce_loss_7": 3.707497763633728,
"epoch": 0.627,
"grad_norm": 404.0,
"kl_loss_10": 97.36745681762696,
"kl_loss_2": 1087.3731384277344,
"kl_loss_3": 749.8712615966797,
"kl_loss_7": 189.97913208007813,
"learning_rate": 0.00031122652055637015,
"loss": 536.5034,
"step": 6270
},
{
"ce_loss_10": 3.6263384938240053,
"ce_loss_13": 3.5657132387161257,
"ce_loss_2": 4.101957285404206,
"ce_loss_3": 3.9301799178123473,
"ce_loss_7": 3.671717309951782,
"epoch": 0.628,
"grad_norm": 320.0,
"kl_loss_10": 97.96914176940918,
"kl_loss_2": 1132.4724700927734,
"kl_loss_3": 779.5158935546875,
"kl_loss_7": 193.307218170166,
"learning_rate": 0.0003097582387675385,
"loss": 538.5988,
"step": 6280
},
{
"ce_loss_10": 3.6690368175506594,
"ce_loss_13": 3.608207333087921,
"ce_loss_2": 4.131593143939972,
"ce_loss_3": 3.967103731632233,
"ce_loss_7": 3.714122140407562,
"epoch": 0.629,
"grad_norm": 380.0,
"kl_loss_10": 97.3248161315918,
"kl_loss_2": 1100.8168243408204,
"kl_loss_3": 758.5913757324219,
"kl_loss_7": 190.2446075439453,
"learning_rate": 0.00030829187271113034,
"loss": 533.383,
"step": 6290
},
{
"ce_loss_10": 3.6720826983451844,
"ce_loss_13": 3.6116329789161683,
"ce_loss_2": 4.121181070804596,
"ce_loss_3": 3.958890378475189,
"ce_loss_7": 3.713034725189209,
"epoch": 0.63,
"grad_norm": 474.0,
"kl_loss_10": 95.86663208007812,
"kl_loss_2": 1078.529071044922,
"kl_loss_3": 747.6958526611328,
"kl_loss_7": 186.88264846801758,
"learning_rate": 0.00030682743715343565,
"loss": 538.6207,
"step": 6300
},
{
"ce_loss_10": 3.6168052315711976,
"ce_loss_13": 3.5534343481063844,
"ce_loss_2": 4.1001279830932615,
"ce_loss_3": 3.926764929294586,
"ce_loss_7": 3.6654592990875243,
"epoch": 0.631,
"grad_norm": 352.0,
"kl_loss_10": 98.38105430603028,
"kl_loss_2": 1116.2974884033204,
"kl_loss_3": 769.4165740966797,
"kl_loss_7": 194.41071319580078,
"learning_rate": 0.0003053649468413043,
"loss": 544.2852,
"step": 6310
},
{
"ce_loss_10": 3.728801727294922,
"ce_loss_13": 3.6677038788795473,
"ce_loss_2": 4.186562621593476,
"ce_loss_3": 4.021135902404785,
"ce_loss_7": 3.7726667642593386,
"epoch": 0.632,
"grad_norm": 548.0,
"kl_loss_10": 98.36889610290527,
"kl_loss_2": 1106.3314636230468,
"kl_loss_3": 764.3384338378906,
"kl_loss_7": 193.92676391601563,
"learning_rate": 0.00030390441650199725,
"loss": 534.6711,
"step": 6320
},
{
"ce_loss_10": 3.6225173473358154,
"ce_loss_13": 3.564038324356079,
"ce_loss_2": 4.088936626911163,
"ce_loss_3": 3.9200194835662843,
"ce_loss_7": 3.6701310753822325,
"epoch": 0.633,
"grad_norm": 390.0,
"kl_loss_10": 93.89363708496094,
"kl_loss_2": 1093.413995361328,
"kl_loss_3": 755.4691772460938,
"kl_loss_7": 188.9584762573242,
"learning_rate": 0.00030244586084303903,
"loss": 531.6465,
"step": 6330
},
{
"ce_loss_10": 3.5908933520317077,
"ce_loss_13": 3.530228877067566,
"ce_loss_2": 4.073009943962097,
"ce_loss_3": 3.908068907260895,
"ce_loss_7": 3.6380571484565736,
"epoch": 0.634,
"grad_norm": 362.0,
"kl_loss_10": 96.08535652160644,
"kl_loss_2": 1137.027798461914,
"kl_loss_3": 794.3090057373047,
"kl_loss_7": 193.36979446411132,
"learning_rate": 0.00030098929455206903,
"loss": 541.8852,
"step": 6340
},
{
"ce_loss_10": 3.5973508238792418,
"ce_loss_13": 3.538694751262665,
"ce_loss_2": 4.059111332893371,
"ce_loss_3": 3.8917571187019346,
"ce_loss_7": 3.6398496866226195,
"epoch": 0.635,
"grad_norm": 396.0,
"kl_loss_10": 95.19868698120118,
"kl_loss_2": 1117.9919860839843,
"kl_loss_3": 769.856167602539,
"kl_loss_7": 189.57870178222657,
"learning_rate": 0.00029953473229669324,
"loss": 545.9079,
"step": 6350
},
{
"ce_loss_10": 3.6316630482673644,
"ce_loss_13": 3.5723133206367494,
"ce_loss_2": 4.099796783924103,
"ce_loss_3": 3.9292221426963807,
"ce_loss_7": 3.6748278617858885,
"epoch": 0.636,
"grad_norm": 382.0,
"kl_loss_10": 94.04772453308105,
"kl_loss_2": 1105.0771392822267,
"kl_loss_3": 767.0107574462891,
"kl_loss_7": 189.39691848754882,
"learning_rate": 0.00029808218872433767,
"loss": 534.2105,
"step": 6360
},
{
"ce_loss_10": 3.6887783288955687,
"ce_loss_13": 3.6287707686424255,
"ce_loss_2": 4.1434108257293705,
"ce_loss_3": 3.9780289769172668,
"ce_loss_7": 3.7338571667671205,
"epoch": 0.637,
"grad_norm": 402.0,
"kl_loss_10": 97.2003547668457,
"kl_loss_2": 1086.371304321289,
"kl_loss_3": 753.1467376708985,
"kl_loss_7": 190.29918899536133,
"learning_rate": 0.0002966316784621,
"loss": 530.8481,
"step": 6370
},
{
"ce_loss_10": 3.5995650410652162,
"ce_loss_13": 3.5394855737686157,
"ce_loss_2": 4.081933212280274,
"ce_loss_3": 3.905743455886841,
"ce_loss_7": 3.6461820721626284,
"epoch": 0.638,
"grad_norm": 392.0,
"kl_loss_10": 94.92418899536133,
"kl_loss_2": 1131.0511108398437,
"kl_loss_3": 782.9240203857422,
"kl_loss_7": 192.17471160888672,
"learning_rate": 0.0002951832161166024,
"loss": 537.9302,
"step": 6380
},
{
"ce_loss_10": 3.6817028760910033,
"ce_loss_13": 3.619114363193512,
"ce_loss_2": 4.15013542175293,
"ce_loss_3": 3.980035495758057,
"ce_loss_7": 3.726088798046112,
"epoch": 0.639,
"grad_norm": 284.0,
"kl_loss_10": 99.42742652893067,
"kl_loss_2": 1089.2870971679688,
"kl_loss_3": 758.1006713867188,
"kl_loss_7": 192.03466110229493,
"learning_rate": 0.0002937368162738445,
"loss": 530.5328,
"step": 6390
},
{
"ce_loss_10": 3.6132258057594298,
"ce_loss_13": 3.557306098937988,
"ce_loss_2": 4.071500968933106,
"ce_loss_3": 3.905410099029541,
"ce_loss_7": 3.6560685634613037,
"epoch": 0.64,
"grad_norm": 580.0,
"kl_loss_10": 93.17153434753418,
"kl_loss_2": 1090.426809692383,
"kl_loss_3": 756.628515625,
"kl_loss_7": 185.41258697509767,
"learning_rate": 0.0002922924934990568,
"loss": 537.7791,
"step": 6400
},
{
"ce_loss_10": 3.553709554672241,
"ce_loss_13": 3.495926034450531,
"ce_loss_2": 4.037974917888642,
"ce_loss_3": 3.862057626247406,
"ce_loss_7": 3.5978724122047425,
"epoch": 0.641,
"grad_norm": 316.0,
"kl_loss_10": 94.70829887390137,
"kl_loss_2": 1132.230615234375,
"kl_loss_3": 780.3255004882812,
"kl_loss_7": 189.6028953552246,
"learning_rate": 0.0002908502623365536,
"loss": 541.2746,
"step": 6410
},
{
"ce_loss_10": 3.493143379688263,
"ce_loss_13": 3.4340757846832277,
"ce_loss_2": 3.982888162136078,
"ce_loss_3": 3.8087966442108154,
"ce_loss_7": 3.541613507270813,
"epoch": 0.642,
"grad_norm": 448.0,
"kl_loss_10": 93.92830047607421,
"kl_loss_2": 1141.5694763183594,
"kl_loss_3": 791.2887268066406,
"kl_loss_7": 189.8411407470703,
"learning_rate": 0.0002894101373095867,
"loss": 544.0511,
"step": 6420
},
{
"ce_loss_10": 3.7018409371376038,
"ce_loss_13": 3.641219162940979,
"ce_loss_2": 4.160841226577759,
"ce_loss_3": 3.996344065666199,
"ce_loss_7": 3.7449718475341798,
"epoch": 0.643,
"grad_norm": 444.0,
"kl_loss_10": 98.50596771240234,
"kl_loss_2": 1096.2253509521483,
"kl_loss_3": 759.2389587402344,
"kl_loss_7": 191.72063598632812,
"learning_rate": 0.00028797213292019926,
"loss": 535.7118,
"step": 6430
},
{
"ce_loss_10": 3.679163944721222,
"ce_loss_13": 3.6178041219711305,
"ce_loss_2": 4.137241208553315,
"ce_loss_3": 3.9736143589019775,
"ce_loss_7": 3.7223108887672423,
"epoch": 0.644,
"grad_norm": 316.0,
"kl_loss_10": 96.37056579589844,
"kl_loss_2": 1093.3028533935546,
"kl_loss_3": 763.8056060791016,
"kl_loss_7": 190.55449371337892,
"learning_rate": 0.0002865362636490791,
"loss": 543.9671,
"step": 6440
},
{
"ce_loss_10": 3.689470386505127,
"ce_loss_13": 3.6325947284698485,
"ce_loss_2": 4.151259076595307,
"ce_loss_3": 3.9852967262268066,
"ce_loss_7": 3.7347108364105224,
"epoch": 0.645,
"grad_norm": 422.0,
"kl_loss_10": 95.76711997985839,
"kl_loss_2": 1101.8473754882812,
"kl_loss_3": 757.8740173339844,
"kl_loss_7": 188.20162200927734,
"learning_rate": 0.0002851025439554142,
"loss": 532.7338,
"step": 6450
},
{
"ce_loss_10": 3.6879691004753115,
"ce_loss_13": 3.6268020391464235,
"ce_loss_2": 4.149470102787018,
"ce_loss_3": 3.9827425360679625,
"ce_loss_7": 3.732300865650177,
"epoch": 0.646,
"grad_norm": 432.0,
"kl_loss_10": 96.89583930969238,
"kl_loss_2": 1086.1058197021484,
"kl_loss_3": 754.8961853027344,
"kl_loss_7": 190.88655471801758,
"learning_rate": 0.00028367098827674573,
"loss": 531.1024,
"step": 6460
},
{
"ce_loss_10": 3.613504183292389,
"ce_loss_13": 3.552918183803558,
"ce_loss_2": 4.07694593667984,
"ce_loss_3": 3.9072110176086428,
"ce_loss_7": 3.656181883811951,
"epoch": 0.647,
"grad_norm": 382.0,
"kl_loss_10": 95.70045394897461,
"kl_loss_2": 1088.4426727294922,
"kl_loss_3": 747.3143646240235,
"kl_loss_7": 185.63362350463868,
"learning_rate": 0.00028224161102882397,
"loss": 534.1186,
"step": 6470
},
{
"ce_loss_10": 3.591862881183624,
"ce_loss_13": 3.5325499296188356,
"ce_loss_2": 4.047231125831604,
"ce_loss_3": 3.8850304007530214,
"ce_loss_7": 3.6327146530151366,
"epoch": 0.648,
"grad_norm": 398.0,
"kl_loss_10": 97.32144050598144,
"kl_loss_2": 1084.3862060546876,
"kl_loss_3": 756.0506072998047,
"kl_loss_7": 188.20642013549804,
"learning_rate": 0.00028081442660546124,
"loss": 534.4936,
"step": 6480
},
{
"ce_loss_10": 3.6528772950172423,
"ce_loss_13": 3.593310809135437,
"ce_loss_2": 4.104138958454132,
"ce_loss_3": 3.940169370174408,
"ce_loss_7": 3.6972940802574157,
"epoch": 0.649,
"grad_norm": 442.0,
"kl_loss_10": 96.56869812011719,
"kl_loss_2": 1082.232455444336,
"kl_loss_3": 748.2576446533203,
"kl_loss_7": 188.56612319946288,
"learning_rate": 0.0002793894493783892,
"loss": 535.3609,
"step": 6490
},
{
"ce_loss_10": 3.671093225479126,
"ce_loss_13": 3.6125397443771363,
"ce_loss_2": 4.120749580860138,
"ce_loss_3": 3.957093584537506,
"ce_loss_7": 3.715547430515289,
"epoch": 0.65,
"grad_norm": 340.0,
"kl_loss_10": 95.52767143249511,
"kl_loss_2": 1081.513833618164,
"kl_loss_3": 750.0977233886719,
"kl_loss_7": 185.41107177734375,
"learning_rate": 0.0002779666936971129,
"loss": 530.5015,
"step": 6500
},
{
"ce_loss_10": 3.6747244358062745,
"ce_loss_13": 3.6157574892044066,
"ce_loss_2": 4.147137761116028,
"ce_loss_3": 3.9802316427230835,
"ce_loss_7": 3.7200183868408203,
"epoch": 0.651,
"grad_norm": 388.0,
"kl_loss_10": 96.378706741333,
"kl_loss_2": 1104.2031311035157,
"kl_loss_3": 768.3699279785156,
"kl_loss_7": 190.13947677612305,
"learning_rate": 0.00027654617388876614,
"loss": 540.9622,
"step": 6510
},
{
"ce_loss_10": 3.7085010170936585,
"ce_loss_13": 3.650082528591156,
"ce_loss_2": 4.159732723236084,
"ce_loss_3": 3.9939939975738525,
"ce_loss_7": 3.752064514160156,
"epoch": 0.652,
"grad_norm": 372.0,
"kl_loss_10": 98.8690299987793,
"kl_loss_2": 1084.27646484375,
"kl_loss_3": 749.1016296386719,
"kl_loss_7": 189.19281463623048,
"learning_rate": 0.0002751279042579672,
"loss": 533.7532,
"step": 6520
},
{
"ce_loss_10": 3.6514885902404783,
"ce_loss_13": 3.589630663394928,
"ce_loss_2": 4.104155695438385,
"ce_loss_3": 3.9368098855018614,
"ce_loss_7": 3.696379566192627,
"epoch": 0.653,
"grad_norm": 388.0,
"kl_loss_10": 98.10863304138184,
"kl_loss_2": 1078.5175903320312,
"kl_loss_3": 739.8918975830078,
"kl_loss_7": 187.05665588378906,
"learning_rate": 0.00027371189908667604,
"loss": 535.8568,
"step": 6530
},
{
"ce_loss_10": 3.6950425028800966,
"ce_loss_13": 3.6345377445220945,
"ce_loss_2": 4.172570693492889,
"ce_loss_3": 4.002642476558686,
"ce_loss_7": 3.742088866233826,
"epoch": 0.654,
"grad_norm": 456.0,
"kl_loss_10": 98.50621490478515,
"kl_loss_2": 1120.8493408203126,
"kl_loss_3": 772.4739196777343,
"kl_loss_7": 194.52065811157226,
"learning_rate": 0.00027229817263404863,
"loss": 550.1683,
"step": 6540
},
{
"ce_loss_10": 3.678051483631134,
"ce_loss_13": 3.6163152933120726,
"ce_loss_2": 4.125236618518829,
"ce_loss_3": 3.9632533311843874,
"ce_loss_7": 3.717917835712433,
"epoch": 0.655,
"grad_norm": 354.0,
"kl_loss_10": 97.52188301086426,
"kl_loss_2": 1072.0729919433593,
"kl_loss_3": 745.5059295654297,
"kl_loss_7": 187.41375122070312,
"learning_rate": 0.0002708867391362948,
"loss": 530.4727,
"step": 6550
},
{
"ce_loss_10": 3.659157025814056,
"ce_loss_13": 3.5987429141998293,
"ce_loss_2": 4.098348212242127,
"ce_loss_3": 3.9343943357467652,
"ce_loss_7": 3.69932336807251,
"epoch": 0.656,
"grad_norm": 380.0,
"kl_loss_10": 95.51490859985351,
"kl_loss_2": 1048.09501953125,
"kl_loss_3": 723.2193145751953,
"kl_loss_7": 183.38801651000978,
"learning_rate": 0.0002694776128065345,
"loss": 526.4233,
"step": 6560
},
{
"ce_loss_10": 3.5926573395729067,
"ce_loss_13": 3.5355629920959473,
"ce_loss_2": 4.059596955776215,
"ce_loss_3": 3.8947146415710447,
"ce_loss_7": 3.63899849653244,
"epoch": 0.657,
"grad_norm": 302.0,
"kl_loss_10": 94.25321388244629,
"kl_loss_2": 1108.046826171875,
"kl_loss_3": 769.1714508056641,
"kl_loss_7": 190.54062194824218,
"learning_rate": 0.00026807080783465374,
"loss": 532.2117,
"step": 6570
},
{
"ce_loss_10": 3.7099499464035035,
"ce_loss_13": 3.6470829010009767,
"ce_loss_2": 4.173487448692322,
"ce_loss_3": 4.007464277744293,
"ce_loss_7": 3.753613090515137,
"epoch": 0.658,
"grad_norm": 336.0,
"kl_loss_10": 98.83243751525879,
"kl_loss_2": 1096.7148071289062,
"kl_loss_3": 763.6604827880859,
"kl_loss_7": 191.30890121459962,
"learning_rate": 0.00026666633838716316,
"loss": 542.1623,
"step": 6580
},
{
"ce_loss_10": 3.597714030742645,
"ce_loss_13": 3.5341309905052185,
"ce_loss_2": 4.0741772770881655,
"ce_loss_3": 3.9031991958618164,
"ce_loss_7": 3.64434130191803,
"epoch": 0.659,
"grad_norm": 418.0,
"kl_loss_10": 98.79775390625,
"kl_loss_2": 1119.104165649414,
"kl_loss_3": 772.7665252685547,
"kl_loss_7": 193.75399169921874,
"learning_rate": 0.00026526421860705474,
"loss": 546.4087,
"step": 6590
},
{
"ce_loss_10": 3.6211095809936524,
"ce_loss_13": 3.56248060464859,
"ce_loss_2": 4.090437388420105,
"ce_loss_3": 3.9254501700401305,
"ce_loss_7": 3.669628012180328,
"epoch": 0.66,
"grad_norm": 388.0,
"kl_loss_10": 97.33003234863281,
"kl_loss_2": 1100.579428100586,
"kl_loss_3": 767.1163055419922,
"kl_loss_7": 192.85016250610352,
"learning_rate": 0.0002638644626136587,
"loss": 535.0932,
"step": 6600
},
{
"ce_loss_10": 3.632294547557831,
"ce_loss_13": 3.5736007690429688,
"ce_loss_2": 4.098874115943909,
"ce_loss_3": 3.928848695755005,
"ce_loss_7": 3.6751357674598695,
"epoch": 0.661,
"grad_norm": 370.0,
"kl_loss_10": 95.11613578796387,
"kl_loss_2": 1096.4229095458984,
"kl_loss_3": 759.0542449951172,
"kl_loss_7": 188.92064208984374,
"learning_rate": 0.00026246708450250255,
"loss": 537.9207,
"step": 6610
},
{
"ce_loss_10": 3.6327243566513063,
"ce_loss_13": 3.5709309697151186,
"ce_loss_2": 4.086973357200622,
"ce_loss_3": 3.9239420771598814,
"ce_loss_7": 3.675078272819519,
"epoch": 0.662,
"grad_norm": 450.0,
"kl_loss_10": 97.06436119079589,
"kl_loss_2": 1079.41337890625,
"kl_loss_3": 752.72802734375,
"kl_loss_7": 187.51063842773436,
"learning_rate": 0.00026107209834516854,
"loss": 531.8906,
"step": 6620
},
{
"ce_loss_10": 3.5740899324417112,
"ce_loss_13": 3.5152911067008974,
"ce_loss_2": 4.057041144371032,
"ce_loss_3": 3.8850310802459718,
"ce_loss_7": 3.6205747365951537,
"epoch": 0.663,
"grad_norm": 326.0,
"kl_loss_10": 95.74808731079102,
"kl_loss_2": 1136.7873779296874,
"kl_loss_3": 780.0463623046875,
"kl_loss_7": 190.15955352783203,
"learning_rate": 0.0002596795181891514,
"loss": 547.2686,
"step": 6630
},
{
"ce_loss_10": 3.5901227831840514,
"ce_loss_13": 3.527127909660339,
"ce_loss_2": 4.062633895874024,
"ce_loss_3": 3.8958073616027833,
"ce_loss_7": 3.63388534784317,
"epoch": 0.664,
"grad_norm": 488.0,
"kl_loss_10": 97.48413009643555,
"kl_loss_2": 1119.4189453125,
"kl_loss_3": 774.4207427978515,
"kl_loss_7": 193.8588966369629,
"learning_rate": 0.000258289358057718,
"loss": 556.5954,
"step": 6640
},
{
"ce_loss_10": 3.6630045056343077,
"ce_loss_13": 3.6010705709457396,
"ce_loss_2": 4.126548099517822,
"ce_loss_3": 3.960009717941284,
"ce_loss_7": 3.70961674451828,
"epoch": 0.665,
"grad_norm": 368.0,
"kl_loss_10": 97.2126693725586,
"kl_loss_2": 1116.2655120849608,
"kl_loss_3": 770.7855743408203,
"kl_loss_7": 193.7609016418457,
"learning_rate": 0.0002569016319497657,
"loss": 544.2068,
"step": 6650
},
{
"ce_loss_10": 3.645352327823639,
"ce_loss_13": 3.582920753955841,
"ce_loss_2": 4.116545259952545,
"ce_loss_3": 3.9502077460289002,
"ce_loss_7": 3.6899593830108643,
"epoch": 0.666,
"grad_norm": 324.0,
"kl_loss_10": 98.58149223327636,
"kl_loss_2": 1127.1539520263673,
"kl_loss_3": 778.5697784423828,
"kl_loss_7": 194.4781005859375,
"learning_rate": 0.00025551635383968066,
"loss": 551.8321,
"step": 6660
},
{
"ce_loss_10": 3.5590095281600953,
"ce_loss_13": 3.497633898258209,
"ce_loss_2": 4.0256366491317745,
"ce_loss_3": 3.8563454031944273,
"ce_loss_7": 3.6033952236175537,
"epoch": 0.667,
"grad_norm": 386.0,
"kl_loss_10": 96.00436630249024,
"kl_loss_2": 1115.5439819335938,
"kl_loss_3": 764.8407897949219,
"kl_loss_7": 191.15278091430665,
"learning_rate": 0.00025413353767719804,
"loss": 541.5643,
"step": 6670
},
{
"ce_loss_10": 3.6135716080665587,
"ce_loss_13": 3.556279420852661,
"ce_loss_2": 4.074564230442047,
"ce_loss_3": 3.9083084225654603,
"ce_loss_7": 3.6589901089668273,
"epoch": 0.668,
"grad_norm": 404.0,
"kl_loss_10": 95.40520133972169,
"kl_loss_2": 1103.0668395996095,
"kl_loss_3": 766.21494140625,
"kl_loss_7": 187.07973251342773,
"learning_rate": 0.0002527531973872617,
"loss": 541.5821,
"step": 6680
},
{
"ce_loss_10": 3.630588722229004,
"ce_loss_13": 3.5716015577316282,
"ce_loss_2": 4.09862619638443,
"ce_loss_3": 3.9337419509887694,
"ce_loss_7": 3.6740004658699035,
"epoch": 0.669,
"grad_norm": 376.0,
"kl_loss_10": 94.05056571960449,
"kl_loss_2": 1104.580502319336,
"kl_loss_3": 767.1347503662109,
"kl_loss_7": 187.80085144042968,
"learning_rate": 0.0002513753468698826,
"loss": 536.7451,
"step": 6690
},
{
"ce_loss_10": 3.6005271077156067,
"ce_loss_13": 3.538683819770813,
"ce_loss_2": 4.075844824314117,
"ce_loss_3": 3.901875948905945,
"ce_loss_7": 3.6449614763259888,
"epoch": 0.67,
"grad_norm": 392.0,
"kl_loss_10": 97.46344718933105,
"kl_loss_2": 1117.6306915283203,
"kl_loss_3": 769.393521118164,
"kl_loss_7": 191.83680877685546,
"learning_rate": 0.0002500000000000001,
"loss": 543.8447,
"step": 6700
},
{
"ce_loss_10": 3.7194844245910645,
"ce_loss_13": 3.6591498017311097,
"ce_loss_2": 4.157877945899964,
"ce_loss_3": 3.9965709686279296,
"ce_loss_7": 3.7608611464500425,
"epoch": 0.671,
"grad_norm": 388.0,
"kl_loss_10": 96.12382774353027,
"kl_loss_2": 1059.211587524414,
"kl_loss_3": 732.8135711669922,
"kl_loss_7": 185.53207092285157,
"learning_rate": 0.0002486271706273421,
"loss": 540.9632,
"step": 6710
},
{
"ce_loss_10": 3.652998185157776,
"ce_loss_13": 3.5960669040679933,
"ce_loss_2": 4.096874964237213,
"ce_loss_3": 3.930626368522644,
"ce_loss_7": 3.694219136238098,
"epoch": 0.672,
"grad_norm": 370.0,
"kl_loss_10": 96.1414752960205,
"kl_loss_2": 1060.9839447021484,
"kl_loss_3": 732.6356231689454,
"kl_loss_7": 184.73310241699218,
"learning_rate": 0.0002472568725762853,
"loss": 531.8145,
"step": 6720
},
{
"ce_loss_10": 3.644508719444275,
"ce_loss_13": 3.585316574573517,
"ce_loss_2": 4.077662718296051,
"ce_loss_3": 3.923126482963562,
"ce_loss_7": 3.6880379915237427,
"epoch": 0.673,
"grad_norm": 536.0,
"kl_loss_10": 95.44480400085449,
"kl_loss_2": 1060.1810028076172,
"kl_loss_3": 734.1040588378906,
"kl_loss_7": 183.89718780517578,
"learning_rate": 0.00024588911964571554,
"loss": 524.9737,
"step": 6730
},
{
"ce_loss_10": 3.6595176219940186,
"ce_loss_13": 3.5960793495178223,
"ce_loss_2": 4.141416406631469,
"ce_loss_3": 3.971626269817352,
"ce_loss_7": 3.706479799747467,
"epoch": 0.674,
"grad_norm": 370.0,
"kl_loss_10": 101.08820152282715,
"kl_loss_2": 1123.6421142578124,
"kl_loss_3": 779.8745697021484,
"kl_loss_7": 196.79359664916993,
"learning_rate": 0.00024452392560888974,
"loss": 538.6094,
"step": 6740
},
{
"ce_loss_10": 3.5484472513198853,
"ce_loss_13": 3.4903222799301146,
"ce_loss_2": 4.00926810503006,
"ce_loss_3": 3.837252104282379,
"ce_loss_7": 3.5929391860961912,
"epoch": 0.675,
"grad_norm": 376.0,
"kl_loss_10": 94.44077377319336,
"kl_loss_2": 1104.6140991210937,
"kl_loss_3": 759.8463775634766,
"kl_loss_7": 187.49753799438477,
"learning_rate": 0.00024316130421329695,
"loss": 531.6798,
"step": 6750
},
{
"ce_loss_10": 3.63141074180603,
"ce_loss_13": 3.5704286813735964,
"ce_loss_2": 4.089890336990356,
"ce_loss_3": 3.9222849130630495,
"ce_loss_7": 3.6722644567489624,
"epoch": 0.676,
"grad_norm": 320.0,
"kl_loss_10": 96.4859691619873,
"kl_loss_2": 1072.7287811279298,
"kl_loss_3": 740.4257781982421,
"kl_loss_7": 185.37494659423828,
"learning_rate": 0.00024180126918051909,
"loss": 528.9844,
"step": 6760
},
{
"ce_loss_10": 3.6748690009117126,
"ce_loss_13": 3.6154377579689028,
"ce_loss_2": 4.126313555240631,
"ce_loss_3": 3.959956741333008,
"ce_loss_7": 3.719127857685089,
"epoch": 0.677,
"grad_norm": 494.0,
"kl_loss_10": 95.71767883300781,
"kl_loss_2": 1071.3604461669922,
"kl_loss_3": 739.3463531494141,
"kl_loss_7": 186.98586730957032,
"learning_rate": 0.00024044383420609406,
"loss": 526.4402,
"step": 6770
},
{
"ce_loss_10": 3.6849735140800477,
"ce_loss_13": 3.6251555919647216,
"ce_loss_2": 4.126254045963288,
"ce_loss_3": 3.9655120730400086,
"ce_loss_7": 3.7277087569236755,
"epoch": 0.678,
"grad_norm": 406.0,
"kl_loss_10": 96.21127319335938,
"kl_loss_2": 1065.4650268554688,
"kl_loss_3": 737.1611053466797,
"kl_loss_7": 186.31879425048828,
"learning_rate": 0.00023908901295937712,
"loss": 532.375,
"step": 6780
},
{
"ce_loss_10": 3.6866431832313538,
"ce_loss_13": 3.621911180019379,
"ce_loss_2": 4.138471448421479,
"ce_loss_3": 3.9692311763763426,
"ce_loss_7": 3.727970468997955,
"epoch": 0.679,
"grad_norm": 520.0,
"kl_loss_10": 97.46222076416015,
"kl_loss_2": 1075.2411163330078,
"kl_loss_3": 742.8502899169922,
"kl_loss_7": 187.16495361328126,
"learning_rate": 0.00023773681908340283,
"loss": 541.7315,
"step": 6790
},
{
"ce_loss_10": 3.6525588750839235,
"ce_loss_13": 3.590035092830658,
"ce_loss_2": 4.125091111660003,
"ce_loss_3": 3.955258107185364,
"ce_loss_7": 3.6996394038200378,
"epoch": 0.68,
"grad_norm": 448.0,
"kl_loss_10": 100.11968383789062,
"kl_loss_2": 1120.372329711914,
"kl_loss_3": 775.7205535888672,
"kl_loss_7": 195.07009201049806,
"learning_rate": 0.00023638726619474876,
"loss": 550.8879,
"step": 6800
},
{
"ce_loss_10": 3.6433764457702638,
"ce_loss_13": 3.581800138950348,
"ce_loss_2": 4.1252215027809145,
"ce_loss_3": 3.95204918384552,
"ce_loss_7": 3.68941251039505,
"epoch": 0.681,
"grad_norm": 380.0,
"kl_loss_10": 94.89226531982422,
"kl_loss_2": 1121.6464782714843,
"kl_loss_3": 776.2536529541015,
"kl_loss_7": 190.19580459594727,
"learning_rate": 0.0002350403678833976,
"loss": 540.7707,
"step": 6810
},
{
"ce_loss_10": 3.5702003121376036,
"ce_loss_13": 3.509978950023651,
"ce_loss_2": 4.041775238513947,
"ce_loss_3": 3.871393322944641,
"ce_loss_7": 3.6151094794273377,
"epoch": 0.682,
"grad_norm": 316.0,
"kl_loss_10": 94.982954788208,
"kl_loss_2": 1118.5872802734375,
"kl_loss_3": 772.0714935302734,
"kl_loss_7": 188.55085983276368,
"learning_rate": 0.00023369613771260007,
"loss": 536.8643,
"step": 6820
},
{
"ce_loss_10": 3.688840866088867,
"ce_loss_13": 3.6270360946655273,
"ce_loss_2": 4.156035900115967,
"ce_loss_3": 3.9860677838325502,
"ce_loss_7": 3.7326239466667177,
"epoch": 0.683,
"grad_norm": 410.0,
"kl_loss_10": 97.82878112792969,
"kl_loss_2": 1106.3897888183594,
"kl_loss_3": 766.803921508789,
"kl_loss_7": 191.37064056396486,
"learning_rate": 0.00023235458921873925,
"loss": 544.207,
"step": 6830
},
{
"ce_loss_10": 3.63765789270401,
"ce_loss_13": 3.5765843272209166,
"ce_loss_2": 4.12269172668457,
"ce_loss_3": 3.953417754173279,
"ce_loss_7": 3.6850703358650208,
"epoch": 0.684,
"grad_norm": 676.0,
"kl_loss_10": 97.75669631958007,
"kl_loss_2": 1147.8291046142579,
"kl_loss_3": 799.1194305419922,
"kl_loss_7": 195.58543319702147,
"learning_rate": 0.0002310157359111938,
"loss": 555.1555,
"step": 6840
},
{
"ce_loss_10": 3.526192367076874,
"ce_loss_13": 3.4662320494651793,
"ce_loss_2": 4.027907514572144,
"ce_loss_3": 3.8482834458351136,
"ce_loss_7": 3.574409317970276,
"epoch": 0.685,
"grad_norm": 660.0,
"kl_loss_10": 96.51494178771972,
"kl_loss_2": 1163.1898101806642,
"kl_loss_3": 802.0491363525391,
"kl_loss_7": 194.50169296264647,
"learning_rate": 0.0002296795912722014,
"loss": 551.9703,
"step": 6850
},
{
"ce_loss_10": 3.6707953572273255,
"ce_loss_13": 3.6116589188575743,
"ce_loss_2": 4.125709581375122,
"ce_loss_3": 3.957431602478027,
"ce_loss_7": 3.716504216194153,
"epoch": 0.686,
"grad_norm": 328.0,
"kl_loss_10": 96.6977554321289,
"kl_loss_2": 1086.6772430419921,
"kl_loss_3": 747.0762786865234,
"kl_loss_7": 188.86367645263672,
"learning_rate": 0.0002283461687567236,
"loss": 527.8294,
"step": 6860
},
{
"ce_loss_10": 3.727082335948944,
"ce_loss_13": 3.664930725097656,
"ce_loss_2": 4.172837960720062,
"ce_loss_3": 4.010821652412415,
"ce_loss_7": 3.7691392421722414,
"epoch": 0.687,
"grad_norm": 334.0,
"kl_loss_10": 97.53575859069824,
"kl_loss_2": 1058.4923736572266,
"kl_loss_3": 731.6483947753907,
"kl_loss_7": 186.02228698730468,
"learning_rate": 0.00022701548179231045,
"loss": 535.9072,
"step": 6870
},
{
"ce_loss_10": 3.6793978810310364,
"ce_loss_13": 3.6168754935264587,
"ce_loss_2": 4.133899199962616,
"ce_loss_3": 3.9700045347213746,
"ce_loss_7": 3.7239136338233947,
"epoch": 0.688,
"grad_norm": 382.0,
"kl_loss_10": 98.03768157958984,
"kl_loss_2": 1087.3397521972656,
"kl_loss_3": 753.5451507568359,
"kl_loss_7": 189.21656646728516,
"learning_rate": 0.00022568754377896516,
"loss": 530.6016,
"step": 6880
},
{
"ce_loss_10": 3.669530212879181,
"ce_loss_13": 3.611078381538391,
"ce_loss_2": 4.122839629650116,
"ce_loss_3": 3.9565317392349244,
"ce_loss_7": 3.7144492745399473,
"epoch": 0.689,
"grad_norm": 482.0,
"kl_loss_10": 93.94465446472168,
"kl_loss_2": 1092.5764556884765,
"kl_loss_3": 757.9043579101562,
"kl_loss_7": 189.06216201782226,
"learning_rate": 0.00022436236808900844,
"loss": 532.0287,
"step": 6890
},
{
"ce_loss_10": 3.563220775127411,
"ce_loss_13": 3.505044734477997,
"ce_loss_2": 4.028258430957794,
"ce_loss_3": 3.860454273223877,
"ce_loss_7": 3.6083375453948974,
"epoch": 0.69,
"grad_norm": 402.0,
"kl_loss_10": 95.30224533081055,
"kl_loss_2": 1114.9274475097657,
"kl_loss_3": 768.1644836425781,
"kl_loss_7": 189.04213485717773,
"learning_rate": 0.00022303996806694487,
"loss": 534.7889,
"step": 6900
},
{
"ce_loss_10": 3.646312749385834,
"ce_loss_13": 3.5865816950798033,
"ce_loss_2": 4.1086891174316404,
"ce_loss_3": 3.9399857401847838,
"ce_loss_7": 3.69192236661911,
"epoch": 0.691,
"grad_norm": 392.0,
"kl_loss_10": 95.77762832641602,
"kl_loss_2": 1094.2582000732423,
"kl_loss_3": 756.6770172119141,
"kl_loss_7": 187.92616500854493,
"learning_rate": 0.00022172035702932823,
"loss": 534.246,
"step": 6910
},
{
"ce_loss_10": 3.685254919528961,
"ce_loss_13": 3.6261175990104677,
"ce_loss_2": 4.142215931415558,
"ce_loss_3": 3.9721115231513977,
"ce_loss_7": 3.7271186470985413,
"epoch": 0.692,
"grad_norm": 430.0,
"kl_loss_10": 94.89179420471191,
"kl_loss_2": 1075.7997589111328,
"kl_loss_3": 742.8703857421875,
"kl_loss_7": 186.23582077026367,
"learning_rate": 0.00022040354826462666,
"loss": 530.2491,
"step": 6920
},
{
"ce_loss_10": 3.62452495098114,
"ce_loss_13": 3.563087892532349,
"ce_loss_2": 4.079807507991791,
"ce_loss_3": 3.913197338581085,
"ce_loss_7": 3.6697877049446106,
"epoch": 0.693,
"grad_norm": 410.0,
"kl_loss_10": 96.51725845336914,
"kl_loss_2": 1085.478707885742,
"kl_loss_3": 750.6873352050782,
"kl_loss_7": 187.0568748474121,
"learning_rate": 0.0002190895550330899,
"loss": 535.6979,
"step": 6930
},
{
"ce_loss_10": 3.547420835494995,
"ce_loss_13": 3.488833248615265,
"ce_loss_2": 4.036989772319794,
"ce_loss_3": 3.85961799621582,
"ce_loss_7": 3.598125493526459,
"epoch": 0.694,
"grad_norm": 406.0,
"kl_loss_10": 96.3628433227539,
"kl_loss_2": 1128.3786254882812,
"kl_loss_3": 778.4836669921875,
"kl_loss_7": 192.30058898925782,
"learning_rate": 0.00021777839056661552,
"loss": 534.9962,
"step": 6940
},
{
"ce_loss_10": 3.636169970035553,
"ce_loss_13": 3.576909136772156,
"ce_loss_2": 4.093018388748169,
"ce_loss_3": 3.9319678425788878,
"ce_loss_7": 3.682026994228363,
"epoch": 0.695,
"grad_norm": 380.0,
"kl_loss_10": 95.15358619689941,
"kl_loss_2": 1086.1379272460938,
"kl_loss_3": 753.724154663086,
"kl_loss_7": 185.95790100097656,
"learning_rate": 0.0002164700680686147,
"loss": 526.2859,
"step": 6950
},
{
"ce_loss_10": 3.6809890270233154,
"ce_loss_13": 3.6225372910499574,
"ce_loss_2": 4.135282206535339,
"ce_loss_3": 3.9695199608802794,
"ce_loss_7": 3.7249368906021116,
"epoch": 0.696,
"grad_norm": 400.0,
"kl_loss_10": 96.4394718170166,
"kl_loss_2": 1074.3540649414062,
"kl_loss_3": 743.0920288085938,
"kl_loss_7": 188.12129898071288,
"learning_rate": 0.0002151646007138806,
"loss": 527.0223,
"step": 6960
},
{
"ce_loss_10": 3.55483934879303,
"ce_loss_13": 3.493414306640625,
"ce_loss_2": 4.029416286945343,
"ce_loss_3": 3.8593334913253785,
"ce_loss_7": 3.5997050285339354,
"epoch": 0.697,
"grad_norm": 324.0,
"kl_loss_10": 98.1744327545166,
"kl_loss_2": 1119.6888793945313,
"kl_loss_3": 776.6419464111328,
"kl_loss_7": 191.90652236938476,
"learning_rate": 0.00021386200164845526,
"loss": 540.4315,
"step": 6970
},
{
"ce_loss_10": 3.7494669914245606,
"ce_loss_13": 3.6868221879005434,
"ce_loss_2": 4.180894982814789,
"ce_loss_3": 4.02219043970108,
"ce_loss_7": 3.790766155719757,
"epoch": 0.698,
"grad_norm": 386.0,
"kl_loss_10": 98.89772605895996,
"kl_loss_2": 1061.9671813964844,
"kl_loss_3": 736.8194549560546,
"kl_loss_7": 189.14059829711914,
"learning_rate": 0.0002125622839894964,
"loss": 526.3207,
"step": 6980
},
{
"ce_loss_10": 3.6859158158302305,
"ce_loss_13": 3.626417076587677,
"ce_loss_2": 4.136348474025726,
"ce_loss_3": 3.974168133735657,
"ce_loss_7": 3.7279628992080687,
"epoch": 0.699,
"grad_norm": 406.0,
"kl_loss_10": 97.57818336486817,
"kl_loss_2": 1081.921697998047,
"kl_loss_3": 746.1339630126953,
"kl_loss_7": 188.19551315307618,
"learning_rate": 0.00021126546082514663,
"loss": 529.5254,
"step": 6990
},
{
"ce_loss_10": 3.704355037212372,
"ce_loss_13": 3.643582081794739,
"ce_loss_2": 4.151243126392364,
"ce_loss_3": 3.9851069808006288,
"ce_loss_7": 3.747806203365326,
"epoch": 0.7,
"grad_norm": 394.0,
"kl_loss_10": 97.80472221374512,
"kl_loss_2": 1074.9452331542968,
"kl_loss_3": 745.385775756836,
"kl_loss_7": 188.936759185791,
"learning_rate": 0.00020997154521440098,
"loss": 526.4211,
"step": 7000
},
{
"ce_loss_10": 3.6455201506614685,
"ce_loss_13": 3.586948239803314,
"ce_loss_2": 4.104578590393066,
"ce_loss_3": 3.9375877380371094,
"ce_loss_7": 3.68754506111145,
"epoch": 0.701,
"grad_norm": 322.0,
"kl_loss_10": 93.82002601623535,
"kl_loss_2": 1085.8826141357422,
"kl_loss_3": 746.0692993164063,
"kl_loss_7": 184.4355583190918,
"learning_rate": 0.0002086805501869749,
"loss": 524.1356,
"step": 7010
},
{
"ce_loss_10": 3.6133246064186095,
"ce_loss_13": 3.554938244819641,
"ce_loss_2": 4.0853543996810915,
"ce_loss_3": 3.918412721157074,
"ce_loss_7": 3.6612335562705995,
"epoch": 0.702,
"grad_norm": 398.0,
"kl_loss_10": 95.29999237060547,
"kl_loss_2": 1131.5339111328126,
"kl_loss_3": 781.2637298583984,
"kl_loss_7": 192.70318984985352,
"learning_rate": 0.0002073924887431744,
"loss": 542.1648,
"step": 7020
},
{
"ce_loss_10": 3.619812881946564,
"ce_loss_13": 3.561210036277771,
"ce_loss_2": 4.088810133934021,
"ce_loss_3": 3.9195892930030825,
"ce_loss_7": 3.667060124874115,
"epoch": 0.703,
"grad_norm": 396.0,
"kl_loss_10": 95.14918022155761,
"kl_loss_2": 1112.4185638427734,
"kl_loss_3": 769.590234375,
"kl_loss_7": 188.17913894653321,
"learning_rate": 0.00020610737385376348,
"loss": 545.7339,
"step": 7030
},
{
"ce_loss_10": 3.689952182769775,
"ce_loss_13": 3.629777657985687,
"ce_loss_2": 4.127048587799072,
"ce_loss_3": 3.9679968118667603,
"ce_loss_7": 3.7309682607650756,
"epoch": 0.704,
"grad_norm": 480.0,
"kl_loss_10": 96.72987632751465,
"kl_loss_2": 1060.028268432617,
"kl_loss_3": 736.1820068359375,
"kl_loss_7": 185.3560775756836,
"learning_rate": 0.00020482521845983521,
"loss": 531.1421,
"step": 7040
},
{
"ce_loss_10": 3.681384038925171,
"ce_loss_13": 3.6203475475311278,
"ce_loss_2": 4.1394176363945006,
"ce_loss_3": 3.9727881073951723,
"ce_loss_7": 3.725051200389862,
"epoch": 0.705,
"grad_norm": 482.0,
"kl_loss_10": 100.69121513366699,
"kl_loss_2": 1089.9848724365233,
"kl_loss_3": 754.3679351806641,
"kl_loss_7": 192.38913803100587,
"learning_rate": 0.00020354603547267987,
"loss": 542.1912,
"step": 7050
},
{
"ce_loss_10": 3.667348313331604,
"ce_loss_13": 3.605680251121521,
"ce_loss_2": 4.1402019739151,
"ce_loss_3": 3.971468675136566,
"ce_loss_7": 3.712887394428253,
"epoch": 0.706,
"grad_norm": 364.0,
"kl_loss_10": 97.05326614379882,
"kl_loss_2": 1105.346597290039,
"kl_loss_3": 773.4686828613281,
"kl_loss_7": 191.13608169555664,
"learning_rate": 0.00020226983777365604,
"loss": 548.4642,
"step": 7060
},
{
"ce_loss_10": 3.563067603111267,
"ce_loss_13": 3.504194128513336,
"ce_loss_2": 4.040568280220032,
"ce_loss_3": 3.8677730679512026,
"ce_loss_7": 3.6067102789878844,
"epoch": 0.707,
"grad_norm": 338.0,
"kl_loss_10": 92.14009590148926,
"kl_loss_2": 1122.7782775878907,
"kl_loss_3": 765.3169036865235,
"kl_loss_7": 183.86895446777345,
"learning_rate": 0.00020099663821406056,
"loss": 534.7408,
"step": 7070
},
{
"ce_loss_10": 3.669863748550415,
"ce_loss_13": 3.6097553610801696,
"ce_loss_2": 4.117836952209473,
"ce_loss_3": 3.955933165550232,
"ce_loss_7": 3.7124067664146425,
"epoch": 0.708,
"grad_norm": 528.0,
"kl_loss_10": 95.14625968933106,
"kl_loss_2": 1064.9939758300782,
"kl_loss_3": 737.6582244873047,
"kl_loss_7": 184.71611633300782,
"learning_rate": 0.00019972644961499853,
"loss": 531.3339,
"step": 7080
},
{
"ce_loss_10": 3.635360848903656,
"ce_loss_13": 3.575283741950989,
"ce_loss_2": 4.107546412944794,
"ce_loss_3": 3.9376320004463197,
"ce_loss_7": 3.6813616275787355,
"epoch": 0.709,
"grad_norm": 454.0,
"kl_loss_10": 95.76384582519532,
"kl_loss_2": 1112.5157043457032,
"kl_loss_3": 768.9019195556641,
"kl_loss_7": 190.37624435424806,
"learning_rate": 0.00019845928476725522,
"loss": 537.9877,
"step": 7090
},
{
"ce_loss_10": 3.7167228937149046,
"ce_loss_13": 3.654716455936432,
"ce_loss_2": 4.171470665931702,
"ce_loss_3": 4.006917369365692,
"ce_loss_7": 3.763367462158203,
"epoch": 0.71,
"grad_norm": 402.0,
"kl_loss_10": 97.96182098388672,
"kl_loss_2": 1088.9804382324219,
"kl_loss_3": 752.4143249511719,
"kl_loss_7": 190.0522773742676,
"learning_rate": 0.00019719515643116677,
"loss": 545.6708,
"step": 7100
},
{
"ce_loss_10": 3.657674491405487,
"ce_loss_13": 3.595584750175476,
"ce_loss_2": 4.113815677165985,
"ce_loss_3": 3.9436608791351317,
"ce_loss_7": 3.700818693637848,
"epoch": 0.711,
"grad_norm": 354.0,
"kl_loss_10": 97.26274185180664,
"kl_loss_2": 1084.9519958496094,
"kl_loss_3": 745.9836975097656,
"kl_loss_7": 187.7238555908203,
"learning_rate": 0.0001959340773364911,
"loss": 535.516,
"step": 7110
},
{
"ce_loss_10": 3.6742369413375853,
"ce_loss_13": 3.613626217842102,
"ce_loss_2": 4.1353883981704715,
"ce_loss_3": 3.9663340568542482,
"ce_loss_7": 3.715994417667389,
"epoch": 0.712,
"grad_norm": 414.0,
"kl_loss_10": 97.77620887756348,
"kl_loss_2": 1094.1240295410157,
"kl_loss_3": 755.1257873535156,
"kl_loss_7": 188.97418975830078,
"learning_rate": 0.0001946760601822809,
"loss": 526.0803,
"step": 7120
},
{
"ce_loss_10": 3.724298870563507,
"ce_loss_13": 3.6654844999313356,
"ce_loss_2": 4.171249413490296,
"ce_loss_3": 4.011460411548614,
"ce_loss_7": 3.770449674129486,
"epoch": 0.713,
"grad_norm": 328.0,
"kl_loss_10": 95.51984024047852,
"kl_loss_2": 1076.5175323486328,
"kl_loss_3": 741.9749359130859,
"kl_loss_7": 187.3384910583496,
"learning_rate": 0.00019342111763675512,
"loss": 520.2061,
"step": 7130
},
{
"ce_loss_10": 3.730803680419922,
"ce_loss_13": 3.6689053654670714,
"ce_loss_2": 4.169312536716461,
"ce_loss_3": 4.00542528629303,
"ce_loss_7": 3.7727373957633974,
"epoch": 0.714,
"grad_norm": 418.0,
"kl_loss_10": 99.5161979675293,
"kl_loss_2": 1071.9742065429687,
"kl_loss_3": 743.7749084472656,
"kl_loss_7": 189.85234451293945,
"learning_rate": 0.00019216926233717085,
"loss": 525.6779,
"step": 7140
},
{
"ce_loss_10": 3.6117329597473145,
"ce_loss_13": 3.5528572678565977,
"ce_loss_2": 4.092064487934112,
"ce_loss_3": 3.914566385746002,
"ce_loss_7": 3.653049111366272,
"epoch": 0.715,
"grad_norm": 342.0,
"kl_loss_10": 95.1452823638916,
"kl_loss_2": 1135.5553619384766,
"kl_loss_3": 779.0962982177734,
"kl_loss_7": 185.5459442138672,
"learning_rate": 0.00019092050688969737,
"loss": 540.4428,
"step": 7150
},
{
"ce_loss_10": 3.6794282674789427,
"ce_loss_13": 3.619647240638733,
"ce_loss_2": 4.124801588058472,
"ce_loss_3": 3.9644263625144958,
"ce_loss_7": 3.7204025983810425,
"epoch": 0.716,
"grad_norm": 458.0,
"kl_loss_10": 95.73797454833985,
"kl_loss_2": 1075.4539825439454,
"kl_loss_3": 743.3267883300781,
"kl_loss_7": 186.0149787902832,
"learning_rate": 0.00018967486386928817,
"loss": 525.8811,
"step": 7160
},
{
"ce_loss_10": 3.5499155521392822,
"ce_loss_13": 3.4892677664756775,
"ce_loss_2": 4.026895833015442,
"ce_loss_3": 3.8540278673171997,
"ce_loss_7": 3.594646680355072,
"epoch": 0.717,
"grad_norm": 458.0,
"kl_loss_10": 93.14333381652833,
"kl_loss_2": 1122.9288635253906,
"kl_loss_3": 776.9212982177735,
"kl_loss_7": 188.66815719604492,
"learning_rate": 0.00018843234581955443,
"loss": 552.9929,
"step": 7170
},
{
"ce_loss_10": 3.574516201019287,
"ce_loss_13": 3.512941229343414,
"ce_loss_2": 4.049459004402161,
"ce_loss_3": 3.871944236755371,
"ce_loss_7": 3.6209982872009276,
"epoch": 0.718,
"grad_norm": 364.0,
"kl_loss_10": 96.56784629821777,
"kl_loss_2": 1129.7979248046875,
"kl_loss_3": 775.4145477294921,
"kl_loss_7": 190.88178558349608,
"learning_rate": 0.00018719296525263924,
"loss": 541.6241,
"step": 7180
},
{
"ce_loss_10": 3.6690776705741883,
"ce_loss_13": 3.6084558844566343,
"ce_loss_2": 4.109010553359985,
"ce_loss_3": 3.944419741630554,
"ce_loss_7": 3.7104645013809203,
"epoch": 0.719,
"grad_norm": 472.0,
"kl_loss_10": 96.92717056274414,
"kl_loss_2": 1058.7910217285157,
"kl_loss_3": 728.63525390625,
"kl_loss_7": 186.22266235351563,
"learning_rate": 0.0001859567346490913,
"loss": 525.3373,
"step": 7190
},
{
"ce_loss_10": 3.6438188314437867,
"ce_loss_13": 3.5840962886810304,
"ce_loss_2": 4.113380300998688,
"ce_loss_3": 3.9464030385017397,
"ce_loss_7": 3.690832197666168,
"epoch": 0.72,
"grad_norm": 372.0,
"kl_loss_10": 96.38097648620605,
"kl_loss_2": 1109.4217742919923,
"kl_loss_3": 771.3218353271484,
"kl_loss_7": 191.62188110351562,
"learning_rate": 0.0001847236664577389,
"loss": 531.0333,
"step": 7200
},
{
"ce_loss_10": 3.673705244064331,
"ce_loss_13": 3.614805054664612,
"ce_loss_2": 4.117141389846802,
"ce_loss_3": 3.954344153404236,
"ce_loss_7": 3.717172992229462,
"epoch": 0.721,
"grad_norm": 342.0,
"kl_loss_10": 96.93136787414551,
"kl_loss_2": 1071.9077087402343,
"kl_loss_3": 737.0366821289062,
"kl_loss_7": 186.3966079711914,
"learning_rate": 0.00018349377309556487,
"loss": 518.4113,
"step": 7210
},
{
"ce_loss_10": 3.609177756309509,
"ce_loss_13": 3.5509839773178102,
"ce_loss_2": 4.084410285949707,
"ce_loss_3": 3.911909210681915,
"ce_loss_7": 3.6546399116516115,
"epoch": 0.722,
"grad_norm": 436.0,
"kl_loss_10": 94.82120094299316,
"kl_loss_2": 1119.1944885253906,
"kl_loss_3": 772.7051483154297,
"kl_loss_7": 190.29311599731446,
"learning_rate": 0.00018226706694758193,
"loss": 539.7223,
"step": 7220
},
{
"ce_loss_10": 3.6862050175666807,
"ce_loss_13": 3.6256973266601564,
"ce_loss_2": 4.135941016674042,
"ce_loss_3": 3.9752198338508604,
"ce_loss_7": 3.7262799024581907,
"epoch": 0.723,
"grad_norm": 386.0,
"kl_loss_10": 96.04033012390137,
"kl_loss_2": 1079.298776245117,
"kl_loss_3": 752.3606506347656,
"kl_loss_7": 187.0266014099121,
"learning_rate": 0.0001810435603667075,
"loss": 540.3036,
"step": 7230
},
{
"ce_loss_10": 3.5322535395622254,
"ce_loss_13": 3.4715150594711304,
"ce_loss_2": 4.000997626781464,
"ce_loss_3": 3.8283260583877565,
"ce_loss_7": 3.5753297805786133,
"epoch": 0.724,
"grad_norm": 348.0,
"kl_loss_10": 92.0587100982666,
"kl_loss_2": 1101.032977294922,
"kl_loss_3": 757.3754730224609,
"kl_loss_7": 184.87646255493163,
"learning_rate": 0.0001798232656736389,
"loss": 539.9771,
"step": 7240
},
{
"ce_loss_10": 3.7180214405059813,
"ce_loss_13": 3.6561784505844117,
"ce_loss_2": 4.153665316104889,
"ce_loss_3": 3.994912326335907,
"ce_loss_7": 3.759207808971405,
"epoch": 0.725,
"grad_norm": 388.0,
"kl_loss_10": 97.47655296325684,
"kl_loss_2": 1060.039584350586,
"kl_loss_3": 729.7286529541016,
"kl_loss_7": 185.7909019470215,
"learning_rate": 0.0001786061951567303,
"loss": 527.9849,
"step": 7250
},
{
"ce_loss_10": 3.630312275886536,
"ce_loss_13": 3.5694428086280823,
"ce_loss_2": 4.091070818901062,
"ce_loss_3": 3.92718985080719,
"ce_loss_7": 3.675185751914978,
"epoch": 0.726,
"grad_norm": 382.0,
"kl_loss_10": 97.81040573120117,
"kl_loss_2": 1091.2934509277343,
"kl_loss_3": 755.8922180175781,
"kl_loss_7": 189.30439071655275,
"learning_rate": 0.00017739236107186857,
"loss": 537.2411,
"step": 7260
},
{
"ce_loss_10": 3.711188280582428,
"ce_loss_13": 3.6525003552436828,
"ce_loss_2": 4.142853522300721,
"ce_loss_3": 3.981386995315552,
"ce_loss_7": 3.7502527594566346,
"epoch": 0.727,
"grad_norm": 374.0,
"kl_loss_10": 93.90410652160645,
"kl_loss_2": 1048.1178436279297,
"kl_loss_3": 725.0591918945313,
"kl_loss_7": 182.22721328735352,
"learning_rate": 0.00017618177564234904,
"loss": 519.2631,
"step": 7270
},
{
"ce_loss_10": 3.693279492855072,
"ce_loss_13": 3.6356263041496275,
"ce_loss_2": 4.13202931880951,
"ce_loss_3": 3.9750990748405455,
"ce_loss_7": 3.7332441210746765,
"epoch": 0.728,
"grad_norm": 318.0,
"kl_loss_10": 95.86821098327637,
"kl_loss_2": 1048.5098999023437,
"kl_loss_3": 724.8844573974609,
"kl_loss_7": 182.79603576660156,
"learning_rate": 0.00017497445105875377,
"loss": 523.0468,
"step": 7280
},
{
"ce_loss_10": 3.595864677429199,
"ce_loss_13": 3.5371819376945495,
"ce_loss_2": 4.073407852649689,
"ce_loss_3": 3.904122495651245,
"ce_loss_7": 3.6426048040390016,
"epoch": 0.729,
"grad_norm": 442.0,
"kl_loss_10": 95.08332710266113,
"kl_loss_2": 1130.4158264160155,
"kl_loss_3": 780.6070220947265,
"kl_loss_7": 189.8589889526367,
"learning_rate": 0.000173770399478828,
"loss": 538.7581,
"step": 7290
},
{
"ce_loss_10": 3.5191142082214357,
"ce_loss_13": 3.461543416976929,
"ce_loss_2": 3.977211833000183,
"ce_loss_3": 3.8103960871696474,
"ce_loss_7": 3.564071011543274,
"epoch": 0.73,
"grad_norm": 438.0,
"kl_loss_10": 93.54008331298829,
"kl_loss_2": 1093.509115600586,
"kl_loss_3": 757.6209930419922,
"kl_loss_7": 186.89632568359374,
"learning_rate": 0.0001725696330273575,
"loss": 540.4559,
"step": 7300
},
{
"ce_loss_10": 3.714753472805023,
"ce_loss_13": 3.6550832748413087,
"ce_loss_2": 4.150299251079559,
"ce_loss_3": 3.9939948439598085,
"ce_loss_7": 3.757344377040863,
"epoch": 0.731,
"grad_norm": 486.0,
"kl_loss_10": 93.61467895507812,
"kl_loss_2": 1050.2083618164063,
"kl_loss_3": 726.4699127197266,
"kl_loss_7": 182.62665328979492,
"learning_rate": 0.00017137216379604724,
"loss": 517.0194,
"step": 7310
},
{
"ce_loss_10": 3.590583050251007,
"ce_loss_13": 3.5311309576034544,
"ce_loss_2": 4.051425302028656,
"ce_loss_3": 3.8829818606376647,
"ce_loss_7": 3.632352864742279,
"epoch": 0.732,
"grad_norm": 340.0,
"kl_loss_10": 95.8599407196045,
"kl_loss_2": 1085.3143981933595,
"kl_loss_3": 747.0916809082031,
"kl_loss_7": 186.49290466308594,
"learning_rate": 0.00017017800384339925,
"loss": 528.4002,
"step": 7320
},
{
"ce_loss_10": 3.540472662448883,
"ce_loss_13": 3.4801993131637574,
"ce_loss_2": 4.017971241474152,
"ce_loss_3": 3.8469355702400208,
"ce_loss_7": 3.586536169052124,
"epoch": 0.733,
"grad_norm": 316.0,
"kl_loss_10": 95.24363555908204,
"kl_loss_2": 1121.9350006103516,
"kl_loss_3": 775.7258972167969,
"kl_loss_7": 189.4253242492676,
"learning_rate": 0.00016898716519459073,
"loss": 528.2626,
"step": 7330
},
{
"ce_loss_10": 3.6674713015556337,
"ce_loss_13": 3.608376145362854,
"ce_loss_2": 4.144577407836914,
"ce_loss_3": 3.9727422475814818,
"ce_loss_7": 3.712773549556732,
"epoch": 0.734,
"grad_norm": 330.0,
"kl_loss_10": 96.16988220214844,
"kl_loss_2": 1116.4668975830077,
"kl_loss_3": 767.9603485107422,
"kl_loss_7": 191.9127670288086,
"learning_rate": 0.00016779965984135375,
"loss": 536.6205,
"step": 7340
},
{
"ce_loss_10": 3.5673499703407288,
"ce_loss_13": 3.5097331523895265,
"ce_loss_2": 4.023692965507507,
"ce_loss_3": 3.8575591087341308,
"ce_loss_7": 3.6114558935165406,
"epoch": 0.735,
"grad_norm": 478.0,
"kl_loss_10": 92.66586227416992,
"kl_loss_2": 1079.1628143310547,
"kl_loss_3": 740.6051025390625,
"kl_loss_7": 182.72610321044922,
"learning_rate": 0.00016661549974185424,
"loss": 528.04,
"step": 7350
},
{
"ce_loss_10": 3.612525475025177,
"ce_loss_13": 3.5525230765342712,
"ce_loss_2": 4.068535602092743,
"ce_loss_3": 3.9024940848350527,
"ce_loss_7": 3.6558452367782595,
"epoch": 0.736,
"grad_norm": 390.0,
"kl_loss_10": 97.4712890625,
"kl_loss_2": 1087.6514739990234,
"kl_loss_3": 751.1584289550781,
"kl_loss_7": 188.87088012695312,
"learning_rate": 0.00016543469682057105,
"loss": 524.4483,
"step": 7360
},
{
"ce_loss_10": 3.6394684672355653,
"ce_loss_13": 3.579529356956482,
"ce_loss_2": 4.096106541156769,
"ce_loss_3": 3.930702245235443,
"ce_loss_7": 3.6828288197517396,
"epoch": 0.737,
"grad_norm": 332.0,
"kl_loss_10": 96.63297386169434,
"kl_loss_2": 1092.361489868164,
"kl_loss_3": 752.7328277587891,
"kl_loss_7": 189.77932739257812,
"learning_rate": 0.00016425726296817632,
"loss": 533.2087,
"step": 7370
},
{
"ce_loss_10": 3.6602264523506163,
"ce_loss_13": 3.6020756483078005,
"ce_loss_2": 4.102893972396851,
"ce_loss_3": 3.9389352679252623,
"ce_loss_7": 3.702047073841095,
"epoch": 0.738,
"grad_norm": 604.0,
"kl_loss_10": 95.1510066986084,
"kl_loss_2": 1066.6962097167968,
"kl_loss_3": 734.1273132324219,
"kl_loss_7": 185.05731124877929,
"learning_rate": 0.00016308321004141607,
"loss": 524.9394,
"step": 7380
},
{
"ce_loss_10": 3.6052220940589903,
"ce_loss_13": 3.544311022758484,
"ce_loss_2": 4.074472200870514,
"ce_loss_3": 3.905838668346405,
"ce_loss_7": 3.6499088406562805,
"epoch": 0.739,
"grad_norm": 414.0,
"kl_loss_10": 98.00579032897949,
"kl_loss_2": 1091.213427734375,
"kl_loss_3": 753.4314147949219,
"kl_loss_7": 190.15870666503906,
"learning_rate": 0.00016191254986299043,
"loss": 528.1322,
"step": 7390
},
{
"ce_loss_10": 3.665621018409729,
"ce_loss_13": 3.606168735027313,
"ce_loss_2": 4.110114741325378,
"ce_loss_3": 3.9419893980026246,
"ce_loss_7": 3.7061524271965025,
"epoch": 0.74,
"grad_norm": 380.0,
"kl_loss_10": 95.95707778930664,
"kl_loss_2": 1084.0633728027344,
"kl_loss_3": 743.1361236572266,
"kl_loss_7": 184.58063583374025,
"learning_rate": 0.00016074529422143398,
"loss": 534.7291,
"step": 7400
},
{
"ce_loss_10": 3.5971511721611025,
"ce_loss_13": 3.540179669857025,
"ce_loss_2": 4.063095271587372,
"ce_loss_3": 3.8901899337768553,
"ce_loss_7": 3.6407782435417175,
"epoch": 0.741,
"grad_norm": 672.0,
"kl_loss_10": 95.23762931823731,
"kl_loss_2": 1107.8231140136718,
"kl_loss_3": 756.5032379150391,
"kl_loss_7": 187.1332000732422,
"learning_rate": 0.0001595814548709983,
"loss": 535.9396,
"step": 7410
},
{
"ce_loss_10": 3.6745630502700806,
"ce_loss_13": 3.613684153556824,
"ce_loss_2": 4.1425374269485475,
"ce_loss_3": 3.9706888437271117,
"ce_loss_7": 3.7216501116752623,
"epoch": 0.742,
"grad_norm": 372.0,
"kl_loss_10": 97.69215469360351,
"kl_loss_2": 1104.6529205322265,
"kl_loss_3": 761.8667907714844,
"kl_loss_7": 191.12793655395507,
"learning_rate": 0.00015842104353153285,
"loss": 536.9469,
"step": 7420
},
{
"ce_loss_10": 3.6906041502952576,
"ce_loss_13": 3.6308916926383974,
"ce_loss_2": 4.145760095119476,
"ce_loss_3": 3.981899178028107,
"ce_loss_7": 3.7335981249809267,
"epoch": 0.743,
"grad_norm": 418.0,
"kl_loss_10": 97.18793029785157,
"kl_loss_2": 1097.6078674316407,
"kl_loss_3": 759.1226196289062,
"kl_loss_7": 189.25363845825194,
"learning_rate": 0.0001572640718883667,
"loss": 543.1555,
"step": 7430
},
{
"ce_loss_10": 3.6231363296508787,
"ce_loss_13": 3.564767360687256,
"ce_loss_2": 4.071314561367035,
"ce_loss_3": 3.91090784072876,
"ce_loss_7": 3.664019286632538,
"epoch": 0.744,
"grad_norm": 320.0,
"kl_loss_10": 94.90192832946778,
"kl_loss_2": 1067.2735595703125,
"kl_loss_3": 738.7796752929687,
"kl_loss_7": 183.48248062133788,
"learning_rate": 0.0001561105515921915,
"loss": 533.524,
"step": 7440
},
{
"ce_loss_10": 3.463870346546173,
"ce_loss_13": 3.4067335724830627,
"ce_loss_2": 3.9477816224098206,
"ce_loss_3": 3.7779128670692446,
"ce_loss_7": 3.51077561378479,
"epoch": 0.745,
"grad_norm": 300.0,
"kl_loss_10": 92.0508991241455,
"kl_loss_2": 1123.8193664550781,
"kl_loss_3": 770.2350646972657,
"kl_loss_7": 184.63360900878905,
"learning_rate": 0.0001549604942589441,
"loss": 530.4723,
"step": 7450
},
{
"ce_loss_10": 3.6651261687278747,
"ce_loss_13": 3.6062275648117064,
"ce_loss_2": 4.092234718799591,
"ce_loss_3": 3.9361136317253114,
"ce_loss_7": 3.7055254936218263,
"epoch": 0.746,
"grad_norm": 366.0,
"kl_loss_10": 93.61905822753906,
"kl_loss_2": 1028.498812866211,
"kl_loss_3": 711.0323303222656,
"kl_loss_7": 180.76227340698242,
"learning_rate": 0.00015381391146968864,
"loss": 518.9042,
"step": 7460
},
{
"ce_loss_10": 3.6343637704849243,
"ce_loss_13": 3.5772631406784057,
"ce_loss_2": 4.097551655769348,
"ce_loss_3": 3.9294708490371706,
"ce_loss_7": 3.6792303323745728,
"epoch": 0.747,
"grad_norm": 348.0,
"kl_loss_10": 93.67252769470215,
"kl_loss_2": 1075.3313690185546,
"kl_loss_3": 736.6811370849609,
"kl_loss_7": 182.92275466918946,
"learning_rate": 0.00015267081477050133,
"loss": 529.2104,
"step": 7470
},
{
"ce_loss_10": 3.737002635002136,
"ce_loss_13": 3.6760261058807373,
"ce_loss_2": 4.184408628940583,
"ce_loss_3": 4.020549094676971,
"ce_loss_7": 3.779319405555725,
"epoch": 0.748,
"grad_norm": 314.0,
"kl_loss_10": 97.9722526550293,
"kl_loss_2": 1074.686865234375,
"kl_loss_3": 738.4016967773438,
"kl_loss_7": 188.9453094482422,
"learning_rate": 0.00015153121567235335,
"loss": 521.3269,
"step": 7480
},
{
"ce_loss_10": 3.627355396747589,
"ce_loss_13": 3.566980814933777,
"ce_loss_2": 4.087941682338714,
"ce_loss_3": 3.9189056277275087,
"ce_loss_7": 3.671427834033966,
"epoch": 0.749,
"grad_norm": 362.0,
"kl_loss_10": 95.86229972839355,
"kl_loss_2": 1099.5704315185546,
"kl_loss_3": 757.0687835693359,
"kl_loss_7": 188.21585922241212,
"learning_rate": 0.00015039512565099468,
"loss": 520.7597,
"step": 7490
},
{
"ce_loss_10": 3.6923457860946653,
"ce_loss_13": 3.6337139129638674,
"ce_loss_2": 4.142465770244598,
"ce_loss_3": 3.9779353976249694,
"ce_loss_7": 3.7360000610351562,
"epoch": 0.75,
"grad_norm": 400.0,
"kl_loss_10": 96.83558921813965,
"kl_loss_2": 1084.189535522461,
"kl_loss_3": 748.4331817626953,
"kl_loss_7": 188.31302337646486,
"learning_rate": 0.00014926255614683932,
"loss": 542.3,
"step": 7500
},
{
"ce_loss_10": 3.63236540555954,
"ce_loss_13": 3.5743218302726745,
"ce_loss_2": 4.084986460208893,
"ce_loss_3": 3.9159162759780886,
"ce_loss_7": 3.6776034474372863,
"epoch": 0.751,
"grad_norm": 356.0,
"kl_loss_10": 95.49623985290528,
"kl_loss_2": 1074.4479522705078,
"kl_loss_3": 737.1827270507813,
"kl_loss_7": 185.40160522460937,
"learning_rate": 0.0001481335185648498,
"loss": 533.0406,
"step": 7510
},
{
"ce_loss_10": 3.6419626474380493,
"ce_loss_13": 3.583856701850891,
"ce_loss_2": 4.0939129114151,
"ce_loss_3": 3.9313414216041567,
"ce_loss_7": 3.686499559879303,
"epoch": 0.752,
"grad_norm": 406.0,
"kl_loss_10": 93.70109405517579,
"kl_loss_2": 1078.2966064453126,
"kl_loss_3": 747.9822265625,
"kl_loss_7": 186.15133514404297,
"learning_rate": 0.0001470080242744218,
"loss": 523.242,
"step": 7520
},
{
"ce_loss_10": 3.638762640953064,
"ce_loss_13": 3.5817859530448914,
"ce_loss_2": 4.096928322315216,
"ce_loss_3": 3.925868511199951,
"ce_loss_7": 3.6821122765541077,
"epoch": 0.753,
"grad_norm": 304.0,
"kl_loss_10": 92.91362495422364,
"kl_loss_2": 1078.3225189208983,
"kl_loss_3": 744.8880218505859,
"kl_loss_7": 183.9945556640625,
"learning_rate": 0.0001458860846092705,
"loss": 532.4821,
"step": 7530
},
{
"ce_loss_10": 3.6720047116279604,
"ce_loss_13": 3.6128148198127747,
"ce_loss_2": 4.114215791225433,
"ce_loss_3": 3.9525702714920046,
"ce_loss_7": 3.7135818719863893,
"epoch": 0.754,
"grad_norm": 322.0,
"kl_loss_10": 94.26252975463868,
"kl_loss_2": 1064.2825866699218,
"kl_loss_3": 735.7307800292969,
"kl_loss_7": 183.32004623413087,
"learning_rate": 0.00014476771086731566,
"loss": 517.3908,
"step": 7540
},
{
"ce_loss_10": 3.7847033739089966,
"ce_loss_13": 3.7219197750091553,
"ce_loss_2": 4.230582165718078,
"ce_loss_3": 4.065666139125824,
"ce_loss_7": 3.827774000167847,
"epoch": 0.755,
"grad_norm": 430.0,
"kl_loss_10": 99.63549118041992,
"kl_loss_2": 1067.45849609375,
"kl_loss_3": 732.3584259033203,
"kl_loss_7": 187.05829620361328,
"learning_rate": 0.00014365291431056872,
"loss": 535.3279,
"step": 7550
},
{
"ce_loss_10": 3.6090814113616942,
"ce_loss_13": 3.5493834733963014,
"ce_loss_2": 4.077333819866181,
"ce_loss_3": 3.906116855144501,
"ce_loss_7": 3.652885007858276,
"epoch": 0.756,
"grad_norm": 460.0,
"kl_loss_10": 97.59222984313965,
"kl_loss_2": 1117.932635498047,
"kl_loss_3": 769.9259338378906,
"kl_loss_7": 192.52491149902343,
"learning_rate": 0.00014254170616501827,
"loss": 534.983,
"step": 7560
},
{
"ce_loss_10": 3.535455918312073,
"ce_loss_13": 3.47601797580719,
"ce_loss_2": 4.02073061466217,
"ce_loss_3": 3.852824592590332,
"ce_loss_7": 3.582290494441986,
"epoch": 0.757,
"grad_norm": 544.0,
"kl_loss_10": 94.12142906188964,
"kl_loss_2": 1137.807843017578,
"kl_loss_3": 793.451205444336,
"kl_loss_7": 191.15487823486328,
"learning_rate": 0.0001414340976205183,
"loss": 552.139,
"step": 7570
},
{
"ce_loss_10": 3.554329538345337,
"ce_loss_13": 3.495155191421509,
"ce_loss_2": 4.028338003158569,
"ce_loss_3": 3.860016918182373,
"ce_loss_7": 3.6010040402412415,
"epoch": 0.758,
"grad_norm": 392.0,
"kl_loss_10": 94.82050590515136,
"kl_loss_2": 1103.2837646484375,
"kl_loss_3": 760.9699432373047,
"kl_loss_7": 186.93385314941406,
"learning_rate": 0.00014033009983067452,
"loss": 536.1902,
"step": 7580
},
{
"ce_loss_10": 3.7230227828025817,
"ce_loss_13": 3.663366961479187,
"ce_loss_2": 4.157820415496826,
"ce_loss_3": 3.9997562408447265,
"ce_loss_7": 3.766176974773407,
"epoch": 0.759,
"grad_norm": 366.0,
"kl_loss_10": 95.41510429382325,
"kl_loss_2": 1045.2667877197266,
"kl_loss_3": 721.8115600585937,
"kl_loss_7": 183.35761260986328,
"learning_rate": 0.00013922972391273224,
"loss": 521.7405,
"step": 7590
},
{
"ce_loss_10": 3.726309287548065,
"ce_loss_13": 3.666226303577423,
"ce_loss_2": 4.173925065994263,
"ce_loss_3": 4.007045650482178,
"ce_loss_7": 3.7666892886161802,
"epoch": 0.76,
"grad_norm": 396.0,
"kl_loss_10": 96.0021198272705,
"kl_loss_2": 1064.7614837646483,
"kl_loss_3": 734.0039642333984,
"kl_loss_7": 185.6392059326172,
"learning_rate": 0.0001381329809474649,
"loss": 528.3375,
"step": 7600
},
{
"ce_loss_10": 3.621905469894409,
"ce_loss_13": 3.561663830280304,
"ce_loss_2": 4.098969185352326,
"ce_loss_3": 3.925651717185974,
"ce_loss_7": 3.6682042717933654,
"epoch": 0.761,
"grad_norm": 370.0,
"kl_loss_10": 96.61415328979493,
"kl_loss_2": 1119.761654663086,
"kl_loss_3": 769.7445831298828,
"kl_loss_7": 190.59917831420898,
"learning_rate": 0.0001370398819790621,
"loss": 540.338,
"step": 7610
},
{
"ce_loss_10": 3.7644327759742735,
"ce_loss_13": 3.704228925704956,
"ce_loss_2": 4.202455806732178,
"ce_loss_3": 4.041428947448731,
"ce_loss_7": 3.8075477123260497,
"epoch": 0.762,
"grad_norm": 424.0,
"kl_loss_10": 97.06539382934571,
"kl_loss_2": 1046.6303649902343,
"kl_loss_3": 720.9399322509765,
"kl_loss_7": 185.40132827758788,
"learning_rate": 0.00013595043801501794,
"loss": 512.6931,
"step": 7620
},
{
"ce_loss_10": 3.5539215803146362,
"ce_loss_13": 3.4973302245140077,
"ce_loss_2": 4.044493949413299,
"ce_loss_3": 3.8687676310539247,
"ce_loss_7": 3.602993667125702,
"epoch": 0.763,
"grad_norm": 468.0,
"kl_loss_10": 92.99364700317383,
"kl_loss_2": 1138.1223754882812,
"kl_loss_3": 782.3796447753906,
"kl_loss_7": 188.2281280517578,
"learning_rate": 0.00013486466002602133,
"loss": 539.5471,
"step": 7630
},
{
"ce_loss_10": 3.680443322658539,
"ce_loss_13": 3.6184515833854674,
"ce_loss_2": 4.119498157501221,
"ce_loss_3": 3.9600594878196715,
"ce_loss_7": 3.7241831541061403,
"epoch": 0.764,
"grad_norm": 376.0,
"kl_loss_10": 97.24503707885742,
"kl_loss_2": 1061.5150573730468,
"kl_loss_3": 737.4153533935547,
"kl_loss_7": 187.22406005859375,
"learning_rate": 0.00013378255894584462,
"loss": 537.8561,
"step": 7640
},
{
"ce_loss_10": 3.60829781293869,
"ce_loss_13": 3.5466054916381835,
"ce_loss_2": 4.072665071487426,
"ce_loss_3": 3.9038659572601317,
"ce_loss_7": 3.6548298835754394,
"epoch": 0.765,
"grad_norm": 380.0,
"kl_loss_10": 95.1153465270996,
"kl_loss_2": 1096.5119873046874,
"kl_loss_3": 758.0185302734375,
"kl_loss_7": 188.7285140991211,
"learning_rate": 0.0001327041456712334,
"loss": 535.4322,
"step": 7650
},
{
"ce_loss_10": 3.649807059764862,
"ce_loss_13": 3.588579738140106,
"ce_loss_2": 4.103657793998718,
"ce_loss_3": 3.9434640645980834,
"ce_loss_7": 3.6960434794425963,
"epoch": 0.766,
"grad_norm": 410.0,
"kl_loss_10": 95.99581718444824,
"kl_loss_2": 1095.5443603515625,
"kl_loss_3": 758.2011474609375,
"kl_loss_7": 189.6258804321289,
"learning_rate": 0.00013162943106179747,
"loss": 538.4857,
"step": 7660
},
{
"ce_loss_10": 3.627143681049347,
"ce_loss_13": 3.5671829342842103,
"ce_loss_2": 4.08168009519577,
"ce_loss_3": 3.9202899813652037,
"ce_loss_7": 3.6696593165397644,
"epoch": 0.767,
"grad_norm": 372.0,
"kl_loss_10": 97.96165161132812,
"kl_loss_2": 1080.916067504883,
"kl_loss_3": 746.2386291503906,
"kl_loss_7": 187.8828155517578,
"learning_rate": 0.00013055842593990132,
"loss": 529.1405,
"step": 7670
},
{
"ce_loss_10": 3.571021115779877,
"ce_loss_13": 3.5149319171905518,
"ce_loss_2": 4.027233076095581,
"ce_loss_3": 3.864198935031891,
"ce_loss_7": 3.6173386335372926,
"epoch": 0.768,
"grad_norm": 372.0,
"kl_loss_10": 92.48302154541015,
"kl_loss_2": 1072.3523834228515,
"kl_loss_3": 740.25439453125,
"kl_loss_7": 183.08441925048828,
"learning_rate": 0.00012949114109055414,
"loss": 533.8078,
"step": 7680
},
{
"ce_loss_10": 3.6176257848739626,
"ce_loss_13": 3.5594166994094847,
"ce_loss_2": 4.078605031967163,
"ce_loss_3": 3.918487286567688,
"ce_loss_7": 3.6636170506477357,
"epoch": 0.769,
"grad_norm": 422.0,
"kl_loss_10": 94.60773849487305,
"kl_loss_2": 1089.138235473633,
"kl_loss_3": 757.3290557861328,
"kl_loss_7": 187.67217483520508,
"learning_rate": 0.00012842758726130281,
"loss": 537.3952,
"step": 7690
},
{
"ce_loss_10": 3.655508840084076,
"ce_loss_13": 3.5946714520454406,
"ce_loss_2": 4.117365610599518,
"ce_loss_3": 3.9561346530914308,
"ce_loss_7": 3.7002153038978576,
"epoch": 0.77,
"grad_norm": 432.0,
"kl_loss_10": 94.65040473937988,
"kl_loss_2": 1092.9069885253907,
"kl_loss_3": 757.1287445068359,
"kl_loss_7": 189.29573440551758,
"learning_rate": 0.00012736777516212267,
"loss": 528.3388,
"step": 7700
},
{
"ce_loss_10": 3.65016793012619,
"ce_loss_13": 3.5914124608039857,
"ce_loss_2": 4.1151956677436825,
"ce_loss_3": 3.947415459156036,
"ce_loss_7": 3.6969300508499146,
"epoch": 0.771,
"grad_norm": 404.0,
"kl_loss_10": 94.72591972351074,
"kl_loss_2": 1095.3469024658202,
"kl_loss_3": 757.773715209961,
"kl_loss_7": 189.3510871887207,
"learning_rate": 0.00012631171546530968,
"loss": 527.5062,
"step": 7710
},
{
"ce_loss_10": 3.6695477604866027,
"ce_loss_13": 3.6066803336143494,
"ce_loss_2": 4.130255508422851,
"ce_loss_3": 3.9629722952842714,
"ce_loss_7": 3.7124558687210083,
"epoch": 0.772,
"grad_norm": 400.0,
"kl_loss_10": 99.19231147766114,
"kl_loss_2": 1089.8547271728517,
"kl_loss_3": 754.8526977539062,
"kl_loss_7": 189.7204719543457,
"learning_rate": 0.00012525941880537307,
"loss": 538.339,
"step": 7720
},
{
"ce_loss_10": 3.7045652866363525,
"ce_loss_13": 3.6435051798820495,
"ce_loss_2": 4.150338041782379,
"ce_loss_3": 3.9872673988342284,
"ce_loss_7": 3.7454243421554567,
"epoch": 0.773,
"grad_norm": 398.0,
"kl_loss_10": 95.61402626037598,
"kl_loss_2": 1061.4443786621093,
"kl_loss_3": 733.3583831787109,
"kl_loss_7": 185.768399810791,
"learning_rate": 0.00012421089577892869,
"loss": 524.5635,
"step": 7730
},
{
"ce_loss_10": 3.645431864261627,
"ce_loss_13": 3.584313917160034,
"ce_loss_2": 4.109975218772888,
"ce_loss_3": 3.9383553504943847,
"ce_loss_7": 3.6912776827812195,
"epoch": 0.774,
"grad_norm": 440.0,
"kl_loss_10": 96.41397132873536,
"kl_loss_2": 1098.874331665039,
"kl_loss_3": 755.1620544433594,
"kl_loss_7": 190.60089797973632,
"learning_rate": 0.0001231661569445919,
"loss": 536.2486,
"step": 7740
},
{
"ce_loss_10": 3.501088798046112,
"ce_loss_13": 3.443252968788147,
"ce_loss_2": 3.9620775461196898,
"ce_loss_3": 3.795079970359802,
"ce_loss_7": 3.5464309573173525,
"epoch": 0.775,
"grad_norm": 346.0,
"kl_loss_10": 93.47399139404297,
"kl_loss_2": 1090.8283447265626,
"kl_loss_3": 754.4031158447266,
"kl_loss_7": 186.22638092041015,
"learning_rate": 0.00012212521282287093,
"loss": 538.4937,
"step": 7750
},
{
"ce_loss_10": 3.6629942655563354,
"ce_loss_13": 3.601106119155884,
"ce_loss_2": 4.111618340015411,
"ce_loss_3": 3.951659619808197,
"ce_loss_7": 3.7078338623046876,
"epoch": 0.776,
"grad_norm": 364.0,
"kl_loss_10": 98.37307014465333,
"kl_loss_2": 1080.0280029296875,
"kl_loss_3": 745.148388671875,
"kl_loss_7": 190.13256072998047,
"learning_rate": 0.00012108807389606158,
"loss": 538.7029,
"step": 7760
},
{
"ce_loss_10": 3.659121203422546,
"ce_loss_13": 3.6007887601852415,
"ce_loss_2": 4.112268555164337,
"ce_loss_3": 3.9502876162528993,
"ce_loss_7": 3.7037811279296875,
"epoch": 0.777,
"grad_norm": 364.0,
"kl_loss_10": 93.70635108947754,
"kl_loss_2": 1072.3641204833984,
"kl_loss_3": 740.7109130859375,
"kl_loss_7": 182.99172821044922,
"learning_rate": 0.00012005475060814159,
"loss": 525.026,
"step": 7770
},
{
"ce_loss_10": 3.5951377630233763,
"ce_loss_13": 3.5359464406967165,
"ce_loss_2": 4.060847020149231,
"ce_loss_3": 3.891322433948517,
"ce_loss_7": 3.63891544342041,
"epoch": 0.778,
"grad_norm": 384.0,
"kl_loss_10": 97.0392059326172,
"kl_loss_2": 1106.707992553711,
"kl_loss_3": 763.6160034179687,
"kl_loss_7": 188.94908752441407,
"learning_rate": 0.00011902525336466464,
"loss": 535.4202,
"step": 7780
},
{
"ce_loss_10": 3.5829373240470885,
"ce_loss_13": 3.5231135487556458,
"ce_loss_2": 4.054291594028473,
"ce_loss_3": 3.888161540031433,
"ce_loss_7": 3.630410146713257,
"epoch": 0.779,
"grad_norm": 384.0,
"kl_loss_10": 95.91268005371094,
"kl_loss_2": 1108.9134979248047,
"kl_loss_3": 768.8667724609375,
"kl_loss_7": 190.86130905151367,
"learning_rate": 0.00011799959253265668,
"loss": 532.9367,
"step": 7790
},
{
"ce_loss_10": 3.646629250049591,
"ce_loss_13": 3.584940028190613,
"ce_loss_2": 4.100114536285401,
"ce_loss_3": 3.9342658519744873,
"ce_loss_7": 3.687722849845886,
"epoch": 0.78,
"grad_norm": 426.0,
"kl_loss_10": 98.96642303466797,
"kl_loss_2": 1093.9118621826171,
"kl_loss_3": 757.5971832275391,
"kl_loss_7": 190.95031204223633,
"learning_rate": 0.00011697777844051105,
"loss": 534.9413,
"step": 7800
},
{
"ce_loss_10": 3.6246392488479615,
"ce_loss_13": 3.5636275887489317,
"ce_loss_2": 4.0959463000297545,
"ce_loss_3": 3.9209203004837034,
"ce_loss_7": 3.668913960456848,
"epoch": 0.781,
"grad_norm": 394.0,
"kl_loss_10": 96.37951927185058,
"kl_loss_2": 1131.5390258789062,
"kl_loss_3": 774.0704650878906,
"kl_loss_7": 190.10399703979493,
"learning_rate": 0.00011595982137788402,
"loss": 539.5272,
"step": 7810
},
{
"ce_loss_10": 3.601748263835907,
"ce_loss_13": 3.542947518825531,
"ce_loss_2": 4.0462228655815125,
"ce_loss_3": 3.884107196331024,
"ce_loss_7": 3.6427837133407595,
"epoch": 0.782,
"grad_norm": 362.0,
"kl_loss_10": 95.04786491394043,
"kl_loss_2": 1064.3328094482422,
"kl_loss_3": 734.7262878417969,
"kl_loss_7": 183.74214706420898,
"learning_rate": 0.00011494573159559212,
"loss": 528.7992,
"step": 7820
},
{
"ce_loss_10": 3.587358093261719,
"ce_loss_13": 3.5285757184028625,
"ce_loss_2": 4.055095791816711,
"ce_loss_3": 3.8850948452949523,
"ce_loss_7": 3.6320362448692323,
"epoch": 0.783,
"grad_norm": 344.0,
"kl_loss_10": 95.2613368988037,
"kl_loss_2": 1092.221664428711,
"kl_loss_3": 759.4220550537109,
"kl_loss_7": 186.76042938232422,
"learning_rate": 0.00011393551930550828,
"loss": 541.8625,
"step": 7830
},
{
"ce_loss_10": 3.7354641199111938,
"ce_loss_13": 3.6739312171936036,
"ce_loss_2": 4.175600934028625,
"ce_loss_3": 4.019279301166534,
"ce_loss_7": 3.7783281922340395,
"epoch": 0.784,
"grad_norm": 390.0,
"kl_loss_10": 99.59685325622559,
"kl_loss_2": 1064.6414337158203,
"kl_loss_3": 741.2587860107421,
"kl_loss_7": 189.10858612060548,
"learning_rate": 0.00011292919468045875,
"loss": 527.9955,
"step": 7840
},
{
"ce_loss_10": 3.680347263813019,
"ce_loss_13": 3.6196223735809325,
"ce_loss_2": 4.128578865528107,
"ce_loss_3": 3.9644781708717347,
"ce_loss_7": 3.723640871047974,
"epoch": 0.785,
"grad_norm": 326.0,
"kl_loss_10": 95.6224323272705,
"kl_loss_2": 1072.9300354003906,
"kl_loss_3": 746.2864379882812,
"kl_loss_7": 187.85019607543944,
"learning_rate": 0.00011192676785412154,
"loss": 523.3404,
"step": 7850
},
{
"ce_loss_10": 3.622621536254883,
"ce_loss_13": 3.560643196105957,
"ce_loss_2": 4.089509451389313,
"ce_loss_3": 3.9235698699951174,
"ce_loss_7": 3.6674723744392397,
"epoch": 0.786,
"grad_norm": 458.0,
"kl_loss_10": 96.80489120483398,
"kl_loss_2": 1093.20048828125,
"kl_loss_3": 754.2780883789062,
"kl_loss_7": 187.94250411987304,
"learning_rate": 0.00011092824892092374,
"loss": 533.5229,
"step": 7860
},
{
"ce_loss_10": 3.547496974468231,
"ce_loss_13": 3.4892043232917787,
"ce_loss_2": 4.020435309410095,
"ce_loss_3": 3.8508559226989747,
"ce_loss_7": 3.5902876496315,
"epoch": 0.787,
"grad_norm": 322.0,
"kl_loss_10": 94.49787139892578,
"kl_loss_2": 1110.2376556396484,
"kl_loss_3": 767.8008331298828,
"kl_loss_7": 188.15859375,
"learning_rate": 0.0001099336479359398,
"loss": 532.4489,
"step": 7870
},
{
"ce_loss_10": 3.676584839820862,
"ce_loss_13": 3.6199623942375183,
"ce_loss_2": 4.124644804000854,
"ce_loss_3": 3.9601522207260134,
"ce_loss_7": 3.7184366583824158,
"epoch": 0.788,
"grad_norm": 414.0,
"kl_loss_10": 92.98647613525391,
"kl_loss_2": 1076.658267211914,
"kl_loss_3": 737.3064331054687,
"kl_loss_7": 183.75065536499022,
"learning_rate": 0.00010894297491479043,
"loss": 529.369,
"step": 7880
},
{
"ce_loss_10": 3.675907850265503,
"ce_loss_13": 3.615241324901581,
"ce_loss_2": 4.123448085784912,
"ce_loss_3": 3.9602300405502318,
"ce_loss_7": 3.715320038795471,
"epoch": 0.789,
"grad_norm": 370.0,
"kl_loss_10": 97.27086067199707,
"kl_loss_2": 1078.250909423828,
"kl_loss_3": 741.1790222167969,
"kl_loss_7": 186.16854553222657,
"learning_rate": 0.00010795623983354214,
"loss": 523.6978,
"step": 7890
},
{
"ce_loss_10": 3.549619424343109,
"ce_loss_13": 3.492576813697815,
"ce_loss_2": 4.021520948410034,
"ce_loss_3": 3.8529414176940917,
"ce_loss_7": 3.595447373390198,
"epoch": 0.79,
"grad_norm": 428.0,
"kl_loss_10": 93.0215072631836,
"kl_loss_2": 1113.914730834961,
"kl_loss_3": 772.1699676513672,
"kl_loss_7": 189.76142959594728,
"learning_rate": 0.00010697345262860636,
"loss": 533.2417,
"step": 7900
},
{
"ce_loss_10": 3.702609384059906,
"ce_loss_13": 3.6431208491325378,
"ce_loss_2": 4.14087233543396,
"ce_loss_3": 3.9802441716194155,
"ce_loss_7": 3.7457746505737304,
"epoch": 0.791,
"grad_norm": 368.0,
"kl_loss_10": 97.61579055786133,
"kl_loss_2": 1063.5964447021483,
"kl_loss_3": 734.3944030761719,
"kl_loss_7": 187.06654663085936,
"learning_rate": 0.00010599462319663906,
"loss": 520.0625,
"step": 7910
},
{
"ce_loss_10": 3.6748117208480835,
"ce_loss_13": 3.614163410663605,
"ce_loss_2": 4.111383318901062,
"ce_loss_3": 3.951873278617859,
"ce_loss_7": 3.715614116191864,
"epoch": 0.792,
"grad_norm": 382.0,
"kl_loss_10": 94.54501228332519,
"kl_loss_2": 1049.0091613769532,
"kl_loss_3": 722.9781219482422,
"kl_loss_7": 183.01754150390624,
"learning_rate": 0.00010501976139444191,
"loss": 518.3574,
"step": 7920
},
{
"ce_loss_10": 3.7049331426620484,
"ce_loss_13": 3.6438170671463013,
"ce_loss_2": 4.144390141963958,
"ce_loss_3": 3.9876601815223696,
"ce_loss_7": 3.745813262462616,
"epoch": 0.793,
"grad_norm": 370.0,
"kl_loss_10": 97.8447940826416,
"kl_loss_2": 1057.744808959961,
"kl_loss_3": 730.5345703125,
"kl_loss_7": 185.18996047973633,
"learning_rate": 0.0001040488770388625,
"loss": 527.8366,
"step": 7930
},
{
"ce_loss_10": 3.6446168065071105,
"ce_loss_13": 3.5857683539390566,
"ce_loss_2": 4.095709836483001,
"ce_loss_3": 3.92866997718811,
"ce_loss_7": 3.685992920398712,
"epoch": 0.794,
"grad_norm": 426.0,
"kl_loss_10": 95.57501831054688,
"kl_loss_2": 1080.6232208251954,
"kl_loss_3": 746.1043212890625,
"kl_loss_7": 186.66847763061523,
"learning_rate": 0.00010308197990669538,
"loss": 527.0882,
"step": 7940
},
{
"ce_loss_10": 3.7647696137428284,
"ce_loss_13": 3.7019853234291076,
"ce_loss_2": 4.21561850309372,
"ce_loss_3": 4.0513708114624025,
"ce_loss_7": 3.8064971685409548,
"epoch": 0.795,
"grad_norm": 356.0,
"kl_loss_10": 100.9611873626709,
"kl_loss_2": 1084.6148345947265,
"kl_loss_3": 743.2166534423828,
"kl_loss_7": 191.26584091186524,
"learning_rate": 0.0001021190797345839,
"loss": 525.7331,
"step": 7950
},
{
"ce_loss_10": 3.4792375445365904,
"ce_loss_13": 3.4190258502960207,
"ce_loss_2": 3.96710387468338,
"ce_loss_3": 3.7957834005355835,
"ce_loss_7": 3.528597414493561,
"epoch": 0.796,
"grad_norm": 386.0,
"kl_loss_10": 95.0804401397705,
"kl_loss_2": 1137.388375854492,
"kl_loss_3": 792.2215118408203,
"kl_loss_7": 192.50171508789063,
"learning_rate": 0.00010116018621892236,
"loss": 537.4441,
"step": 7960
},
{
"ce_loss_10": 3.6988709568977356,
"ce_loss_13": 3.6362175583839416,
"ce_loss_2": 4.151265692710877,
"ce_loss_3": 3.9912821412086488,
"ce_loss_7": 3.742702007293701,
"epoch": 0.797,
"grad_norm": 444.0,
"kl_loss_10": 99.6129222869873,
"kl_loss_2": 1100.607211303711,
"kl_loss_3": 767.8290985107421,
"kl_loss_7": 194.2897491455078,
"learning_rate": 0.00010020530901575753,
"loss": 526.4385,
"step": 7970
},
{
"ce_loss_10": 3.727276122570038,
"ce_loss_13": 3.664809966087341,
"ce_loss_2": 4.17646723985672,
"ce_loss_3": 4.011640095710755,
"ce_loss_7": 3.7683190941810607,
"epoch": 0.798,
"grad_norm": 334.0,
"kl_loss_10": 98.68130950927734,
"kl_loss_2": 1084.4167602539062,
"kl_loss_3": 747.0828460693359,
"kl_loss_7": 190.09516677856445,
"learning_rate": 9.925445774069231e-05,
"loss": 521.7054,
"step": 7980
},
{
"ce_loss_10": 3.677051067352295,
"ce_loss_13": 3.6162899494171143,
"ce_loss_2": 4.132367658615112,
"ce_loss_3": 3.9699331760406493,
"ce_loss_7": 3.723151159286499,
"epoch": 0.799,
"grad_norm": 340.0,
"kl_loss_10": 97.4996379852295,
"kl_loss_2": 1074.8818054199219,
"kl_loss_3": 740.7804992675781,
"kl_loss_7": 187.78277282714845,
"learning_rate": 9.830764196878872e-05,
"loss": 517.902,
"step": 7990
},
{
"ce_loss_10": 3.6140867948532103,
"ce_loss_13": 3.556562864780426,
"ce_loss_2": 4.0635038137435915,
"ce_loss_3": 3.902656090259552,
"ce_loss_7": 3.6608413100242614,
"epoch": 0.8,
"grad_norm": 410.0,
"kl_loss_10": 94.1772445678711,
"kl_loss_2": 1099.7673645019531,
"kl_loss_3": 761.414794921875,
"kl_loss_7": 186.34807205200195,
"learning_rate": 9.736487123447069e-05,
"loss": 531.4563,
"step": 8000
},
{
"ce_loss_10": 3.559322512149811,
"ce_loss_13": 3.49820739030838,
"ce_loss_2": 4.036343896389008,
"ce_loss_3": 3.8618996500968934,
"ce_loss_7": 3.6017415881156922,
"epoch": 0.801,
"grad_norm": 424.0,
"kl_loss_10": 96.55318107604981,
"kl_loss_2": 1136.456121826172,
"kl_loss_3": 771.9989410400391,
"kl_loss_7": 188.50249938964845,
"learning_rate": 9.642615503142926e-05,
"loss": 541.6381,
"step": 8010
},
{
"ce_loss_10": 3.630905735492706,
"ce_loss_13": 3.5719484210014345,
"ce_loss_2": 4.097460567951202,
"ce_loss_3": 3.9188284277915955,
"ce_loss_7": 3.673666751384735,
"epoch": 0.802,
"grad_norm": 370.0,
"kl_loss_10": 94.45314712524414,
"kl_loss_2": 1090.8831848144532,
"kl_loss_3": 738.8009979248047,
"kl_loss_7": 184.0514343261719,
"learning_rate": 9.549150281252633e-05,
"loss": 524.0769,
"step": 8020
},
{
"ce_loss_10": 3.658740258216858,
"ce_loss_13": 3.598051357269287,
"ce_loss_2": 4.112537753582001,
"ce_loss_3": 3.9440460920333864,
"ce_loss_7": 3.701529622077942,
"epoch": 0.803,
"grad_norm": 354.0,
"kl_loss_10": 97.62285194396972,
"kl_loss_2": 1076.1221923828125,
"kl_loss_3": 742.6418304443359,
"kl_loss_7": 187.46692276000977,
"learning_rate": 9.4560923989699e-05,
"loss": 531.6947,
"step": 8030
},
{
"ce_loss_10": 3.6491722106933593,
"ce_loss_13": 3.5902853846549987,
"ce_loss_2": 4.109341251850128,
"ce_loss_3": 3.942945408821106,
"ce_loss_7": 3.696093666553497,
"epoch": 0.804,
"grad_norm": 382.0,
"kl_loss_10": 96.87751007080078,
"kl_loss_2": 1089.1260498046875,
"kl_loss_3": 751.9404052734375,
"kl_loss_7": 188.3861946105957,
"learning_rate": 9.363442793386607e-05,
"loss": 538.5806,
"step": 8040
},
{
"ce_loss_10": 3.6259461641311646,
"ce_loss_13": 3.5652650475502012,
"ce_loss_2": 4.09434745311737,
"ce_loss_3": 3.9288868069648744,
"ce_loss_7": 3.670744836330414,
"epoch": 0.805,
"grad_norm": 436.0,
"kl_loss_10": 96.23310775756836,
"kl_loss_2": 1102.4481658935547,
"kl_loss_3": 766.5739196777344,
"kl_loss_7": 189.9322036743164,
"learning_rate": 9.271202397483213e-05,
"loss": 525.3384,
"step": 8050
},
{
"ce_loss_10": 3.64525443315506,
"ce_loss_13": 3.587091565132141,
"ce_loss_2": 4.088842356204987,
"ce_loss_3": 3.926717495918274,
"ce_loss_7": 3.6877028584480285,
"epoch": 0.806,
"grad_norm": 462.0,
"kl_loss_10": 95.10493888854981,
"kl_loss_2": 1064.438558959961,
"kl_loss_3": 734.5970611572266,
"kl_loss_7": 184.7579719543457,
"learning_rate": 9.179372140119524e-05,
"loss": 530.6901,
"step": 8060
},
{
"ce_loss_10": 3.59020277261734,
"ce_loss_13": 3.531452512741089,
"ce_loss_2": 4.036340653896332,
"ce_loss_3": 3.8760047912597657,
"ce_loss_7": 3.6337902188301086,
"epoch": 0.807,
"grad_norm": 432.0,
"kl_loss_10": 94.00482330322265,
"kl_loss_2": 1074.4489135742188,
"kl_loss_3": 739.4833740234375,
"kl_loss_7": 184.7809310913086,
"learning_rate": 9.087952946025175e-05,
"loss": 531.5049,
"step": 8070
},
{
"ce_loss_10": 3.7053560853004455,
"ce_loss_13": 3.6452667355537414,
"ce_loss_2": 4.136937665939331,
"ce_loss_3": 3.9754079580307007,
"ce_loss_7": 3.7457935094833372,
"epoch": 0.808,
"grad_norm": 368.0,
"kl_loss_10": 96.12910385131836,
"kl_loss_2": 1048.5191436767577,
"kl_loss_3": 719.7487762451171,
"kl_loss_7": 183.48829498291016,
"learning_rate": 8.996945735790446e-05,
"loss": 523.2327,
"step": 8080
},
{
"ce_loss_10": 3.602836012840271,
"ce_loss_13": 3.542934799194336,
"ce_loss_2": 4.055256414413452,
"ce_loss_3": 3.8926199197769167,
"ce_loss_7": 3.6462236762046816,
"epoch": 0.809,
"grad_norm": 414.0,
"kl_loss_10": 95.67857933044434,
"kl_loss_2": 1093.489208984375,
"kl_loss_3": 759.0634765625,
"kl_loss_7": 186.64484634399415,
"learning_rate": 8.906351425856951e-05,
"loss": 536.3948,
"step": 8090
},
{
"ce_loss_10": 3.586146354675293,
"ce_loss_13": 3.5270805954933167,
"ce_loss_2": 4.053403818607331,
"ce_loss_3": 3.883652901649475,
"ce_loss_7": 3.6302590370178223,
"epoch": 0.81,
"grad_norm": 328.0,
"kl_loss_10": 96.12913436889649,
"kl_loss_2": 1108.7147094726563,
"kl_loss_3": 762.2885803222656,
"kl_loss_7": 187.99051055908203,
"learning_rate": 8.816170928508365e-05,
"loss": 536.7299,
"step": 8100
},
{
"ce_loss_10": 3.5469899415969848,
"ce_loss_13": 3.487591028213501,
"ce_loss_2": 4.024684643745422,
"ce_loss_3": 3.853050243854523,
"ce_loss_7": 3.5918329834938048,
"epoch": 0.811,
"grad_norm": 424.0,
"kl_loss_10": 95.16791305541992,
"kl_loss_2": 1131.8392974853516,
"kl_loss_3": 782.3692016601562,
"kl_loss_7": 188.51590728759766,
"learning_rate": 8.7264051518613e-05,
"loss": 538.6139,
"step": 8110
},
{
"ce_loss_10": 3.639654815196991,
"ce_loss_13": 3.583385097980499,
"ce_loss_2": 4.081218779087067,
"ce_loss_3": 3.9191540598869326,
"ce_loss_7": 3.680349314212799,
"epoch": 0.812,
"grad_norm": 358.0,
"kl_loss_10": 93.30685958862304,
"kl_loss_2": 1057.4586822509766,
"kl_loss_3": 735.9759002685547,
"kl_loss_7": 182.97039413452148,
"learning_rate": 8.637054999856148e-05,
"loss": 526.1802,
"step": 8120
},
{
"ce_loss_10": 3.6243308544158936,
"ce_loss_13": 3.5630579233169555,
"ce_loss_2": 4.083577620983124,
"ce_loss_3": 3.9160293340682983,
"ce_loss_7": 3.6718581318855286,
"epoch": 0.813,
"grad_norm": 328.0,
"kl_loss_10": 95.2622299194336,
"kl_loss_2": 1086.6508239746095,
"kl_loss_3": 748.3265411376954,
"kl_loss_7": 187.44526748657228,
"learning_rate": 8.548121372247918e-05,
"loss": 536.2552,
"step": 8130
},
{
"ce_loss_10": 3.699293088912964,
"ce_loss_13": 3.641393613815308,
"ce_loss_2": 4.146343159675598,
"ce_loss_3": 3.982176637649536,
"ce_loss_7": 3.7424126982688906,
"epoch": 0.814,
"grad_norm": 420.0,
"kl_loss_10": 97.64918098449706,
"kl_loss_2": 1075.0233795166016,
"kl_loss_3": 745.3918151855469,
"kl_loss_7": 187.1306022644043,
"learning_rate": 8.459605164597267e-05,
"loss": 527.4509,
"step": 8140
},
{
"ce_loss_10": 3.5794180989265443,
"ce_loss_13": 3.521663022041321,
"ce_loss_2": 4.035482859611511,
"ce_loss_3": 3.869397759437561,
"ce_loss_7": 3.6230968952178957,
"epoch": 0.815,
"grad_norm": 322.0,
"kl_loss_10": 93.84382820129395,
"kl_loss_2": 1085.6336395263672,
"kl_loss_3": 749.5215454101562,
"kl_loss_7": 184.3967170715332,
"learning_rate": 8.371507268261436e-05,
"loss": 530.9717,
"step": 8150
},
{
"ce_loss_10": 3.6623859286308287,
"ce_loss_13": 3.603581893444061,
"ce_loss_2": 4.1160969018936155,
"ce_loss_3": 3.9481249690055846,
"ce_loss_7": 3.7034823894500732,
"epoch": 0.816,
"grad_norm": 410.0,
"kl_loss_10": 96.0962978363037,
"kl_loss_2": 1085.8551330566406,
"kl_loss_3": 744.0009185791016,
"kl_loss_7": 187.44638290405274,
"learning_rate": 8.283828570385238e-05,
"loss": 515.8468,
"step": 8160
},
{
"ce_loss_10": 3.6646664142608643,
"ce_loss_13": 3.607030153274536,
"ce_loss_2": 4.124508082866669,
"ce_loss_3": 3.955708396434784,
"ce_loss_7": 3.708679938316345,
"epoch": 0.817,
"grad_norm": 286.0,
"kl_loss_10": 95.48198356628419,
"kl_loss_2": 1068.3529357910156,
"kl_loss_3": 737.6435119628907,
"kl_loss_7": 186.3275260925293,
"learning_rate": 8.196569953892202e-05,
"loss": 525.6566,
"step": 8170
},
{
"ce_loss_10": 3.5752533435821534,
"ce_loss_13": 3.5151426196098328,
"ce_loss_2": 4.039277529716491,
"ce_loss_3": 3.8700820326805117,
"ce_loss_7": 3.6193170666694643,
"epoch": 0.818,
"grad_norm": 392.0,
"kl_loss_10": 95.23657569885253,
"kl_loss_2": 1087.7711944580078,
"kl_loss_3": 748.5086303710938,
"kl_loss_7": 185.79026489257814,
"learning_rate": 8.109732297475635e-05,
"loss": 529.4896,
"step": 8180
},
{
"ce_loss_10": 3.5442301869392394,
"ce_loss_13": 3.48368262052536,
"ce_loss_2": 4.041348910331726,
"ce_loss_3": 3.8620414972305297,
"ce_loss_7": 3.593292236328125,
"epoch": 0.819,
"grad_norm": 508.0,
"kl_loss_10": 94.79218406677246,
"kl_loss_2": 1140.4125610351562,
"kl_loss_3": 788.5256622314453,
"kl_loss_7": 192.41318969726564,
"learning_rate": 8.023316475589754e-05,
"loss": 543.2035,
"step": 8190
},
{
"ce_loss_10": 3.5104150652885435,
"ce_loss_13": 3.44714834690094,
"ce_loss_2": 4.0140674948692325,
"ce_loss_3": 3.8308369159698485,
"ce_loss_7": 3.5589245796203612,
"epoch": 0.82,
"grad_norm": 532.0,
"kl_loss_10": 97.92351608276367,
"kl_loss_2": 1158.8160186767577,
"kl_loss_3": 797.9960662841797,
"kl_loss_7": 195.1374740600586,
"learning_rate": 7.937323358440934e-05,
"loss": 549.9746,
"step": 8200
},
{
"ce_loss_10": 3.637300205230713,
"ce_loss_13": 3.5789112567901613,
"ce_loss_2": 4.087347877025604,
"ce_loss_3": 3.923117625713348,
"ce_loss_7": 3.679899263381958,
"epoch": 0.821,
"grad_norm": 404.0,
"kl_loss_10": 95.01284561157226,
"kl_loss_2": 1074.5766845703124,
"kl_loss_3": 743.3249450683594,
"kl_loss_7": 184.74501190185546,
"learning_rate": 7.851753811978923e-05,
"loss": 530.0879,
"step": 8210
},
{
"ce_loss_10": 3.661479341983795,
"ce_loss_13": 3.6010610818862916,
"ce_loss_2": 4.123578751087189,
"ce_loss_3": 3.9517632484436036,
"ce_loss_7": 3.7047110080718992,
"epoch": 0.822,
"grad_norm": 358.0,
"kl_loss_10": 96.71367454528809,
"kl_loss_2": 1091.6120025634766,
"kl_loss_3": 744.3147399902343,
"kl_loss_7": 186.59919815063478,
"learning_rate": 7.766608697888095e-05,
"loss": 527.9285,
"step": 8220
},
{
"ce_loss_10": 3.672685134410858,
"ce_loss_13": 3.6110698223114013,
"ce_loss_2": 4.123067581653595,
"ce_loss_3": 3.9549397349357607,
"ce_loss_7": 3.7160248041152952,
"epoch": 0.823,
"grad_norm": 428.0,
"kl_loss_10": 99.5799617767334,
"kl_loss_2": 1090.7132843017578,
"kl_loss_3": 754.5721008300782,
"kl_loss_7": 190.94887008666993,
"learning_rate": 7.681888873578785e-05,
"loss": 534.6821,
"step": 8230
},
{
"ce_loss_10": 3.599495697021484,
"ce_loss_13": 3.5377328515052797,
"ce_loss_2": 4.075003004074096,
"ce_loss_3": 3.9027179360389708,
"ce_loss_7": 3.6464317083358764,
"epoch": 0.824,
"grad_norm": 454.0,
"kl_loss_10": 96.61878395080566,
"kl_loss_2": 1113.7870971679688,
"kl_loss_3": 766.2083129882812,
"kl_loss_7": 191.40456848144532,
"learning_rate": 7.597595192178702e-05,
"loss": 531.8756,
"step": 8240
},
{
"ce_loss_10": 3.5937318563461305,
"ce_loss_13": 3.5349967002868654,
"ce_loss_2": 4.069689559936523,
"ce_loss_3": 3.896022927761078,
"ce_loss_7": 3.640897309780121,
"epoch": 0.825,
"grad_norm": 390.0,
"kl_loss_10": 96.6520393371582,
"kl_loss_2": 1123.3416778564454,
"kl_loss_3": 772.9763427734375,
"kl_loss_7": 191.78026428222657,
"learning_rate": 7.513728502524286e-05,
"loss": 540.9631,
"step": 8250
},
{
"ce_loss_10": 3.600663185119629,
"ce_loss_13": 3.543354606628418,
"ce_loss_2": 4.056336843967438,
"ce_loss_3": 3.886526358127594,
"ce_loss_7": 3.644607651233673,
"epoch": 0.826,
"grad_norm": 520.0,
"kl_loss_10": 94.51933555603027,
"kl_loss_2": 1071.4390838623046,
"kl_loss_3": 737.6031066894532,
"kl_loss_7": 182.459228515625,
"learning_rate": 7.430289649152156e-05,
"loss": 532.1943,
"step": 8260
},
{
"ce_loss_10": 3.4964008927345276,
"ce_loss_13": 3.4386809706687926,
"ce_loss_2": 3.979319155216217,
"ce_loss_3": 3.806533432006836,
"ce_loss_7": 3.5424695372581483,
"epoch": 0.827,
"grad_norm": 438.0,
"kl_loss_10": 92.59819717407227,
"kl_loss_2": 1138.140805053711,
"kl_loss_3": 787.0873046875,
"kl_loss_7": 188.89702301025392,
"learning_rate": 7.347279472290646e-05,
"loss": 536.0913,
"step": 8270
},
{
"ce_loss_10": 3.641860234737396,
"ce_loss_13": 3.5819854736328125,
"ce_loss_2": 4.100788974761963,
"ce_loss_3": 3.9369661927223207,
"ce_loss_7": 3.6862512946128847,
"epoch": 0.828,
"grad_norm": 404.0,
"kl_loss_10": 96.73132438659668,
"kl_loss_2": 1085.176287841797,
"kl_loss_3": 756.2387023925781,
"kl_loss_7": 187.64101333618163,
"learning_rate": 7.264698807851328e-05,
"loss": 532.8096,
"step": 8280
},
{
"ce_loss_10": 3.604352295398712,
"ce_loss_13": 3.549103558063507,
"ce_loss_2": 4.042815041542053,
"ce_loss_3": 3.880488729476929,
"ce_loss_7": 3.64413400888443,
"epoch": 0.829,
"grad_norm": 520.0,
"kl_loss_10": 92.21053123474121,
"kl_loss_2": 1057.8690124511718,
"kl_loss_3": 728.9253723144532,
"kl_loss_7": 181.22113647460938,
"learning_rate": 7.182548487420554e-05,
"loss": 524.6575,
"step": 8290
},
{
"ce_loss_10": 3.6577786207199097,
"ce_loss_13": 3.597848916053772,
"ce_loss_2": 4.107566392421722,
"ce_loss_3": 3.947295570373535,
"ce_loss_7": 3.703710603713989,
"epoch": 0.83,
"grad_norm": 286.0,
"kl_loss_10": 96.30242042541504,
"kl_loss_2": 1087.0319366455078,
"kl_loss_3": 748.0092193603516,
"kl_loss_7": 187.4295867919922,
"learning_rate": 7.100829338251146e-05,
"loss": 527.7667,
"step": 8300
},
{
"ce_loss_10": 3.5980669021606446,
"ce_loss_13": 3.5371885776519774,
"ce_loss_2": 4.070665979385376,
"ce_loss_3": 3.8979653000831602,
"ce_loss_7": 3.6431610107421877,
"epoch": 0.831,
"grad_norm": 394.0,
"kl_loss_10": 95.44490776062011,
"kl_loss_2": 1113.3842803955079,
"kl_loss_3": 769.6158874511718,
"kl_loss_7": 189.99929428100586,
"learning_rate": 7.019542183254046e-05,
"loss": 531.0445,
"step": 8310
},
{
"ce_loss_10": 3.6354474306106566,
"ce_loss_13": 3.57179137468338,
"ce_loss_2": 4.082340836524963,
"ce_loss_3": 3.9207422971725463,
"ce_loss_7": 3.6777117967605593,
"epoch": 0.832,
"grad_norm": 474.0,
"kl_loss_10": 100.207564163208,
"kl_loss_2": 1084.2285125732421,
"kl_loss_3": 748.0254974365234,
"kl_loss_7": 190.82402954101562,
"learning_rate": 6.938687840989971e-05,
"loss": 528.8804,
"step": 8320
},
{
"ce_loss_10": 3.5696911811828613,
"ce_loss_13": 3.508439671993256,
"ce_loss_2": 4.0291890621185305,
"ce_loss_3": 3.8622559309005737,
"ce_loss_7": 3.614106321334839,
"epoch": 0.833,
"grad_norm": 600.0,
"kl_loss_10": 96.55842895507813,
"kl_loss_2": 1082.4974243164063,
"kl_loss_3": 748.5556121826172,
"kl_loss_7": 188.75322189331055,
"learning_rate": 6.858267125661271e-05,
"loss": 531.4916,
"step": 8330
},
{
"ce_loss_10": 3.6338680744171143,
"ce_loss_13": 3.575134778022766,
"ce_loss_2": 4.0971689343452455,
"ce_loss_3": 3.930681896209717,
"ce_loss_7": 3.6769707798957825,
"epoch": 0.834,
"grad_norm": 418.0,
"kl_loss_10": 93.3882438659668,
"kl_loss_2": 1085.4937896728516,
"kl_loss_3": 746.0253967285156,
"kl_loss_7": 184.32117233276367,
"learning_rate": 6.778280847103668e-05,
"loss": 538.0241,
"step": 8340
},
{
"ce_loss_10": 3.6449947714805604,
"ce_loss_13": 3.581918466091156,
"ce_loss_2": 4.1008768558502195,
"ce_loss_3": 3.937298035621643,
"ce_loss_7": 3.686388850212097,
"epoch": 0.835,
"grad_norm": 290.0,
"kl_loss_10": 98.43625144958496,
"kl_loss_2": 1102.1855102539062,
"kl_loss_3": 759.7929138183594,
"kl_loss_7": 191.51789016723632,
"learning_rate": 6.698729810778065e-05,
"loss": 532.2951,
"step": 8350
},
{
"ce_loss_10": 3.5478424787521363,
"ce_loss_13": 3.489585447311401,
"ce_loss_2": 4.0140421986579895,
"ce_loss_3": 3.8517470717430116,
"ce_loss_7": 3.592922496795654,
"epoch": 0.836,
"grad_norm": 490.0,
"kl_loss_10": 91.77609100341797,
"kl_loss_2": 1092.1636932373046,
"kl_loss_3": 756.2904968261719,
"kl_loss_7": 183.14143447875978,
"learning_rate": 6.619614817762538e-05,
"loss": 531.3562,
"step": 8360
},
{
"ce_loss_10": 3.509856128692627,
"ce_loss_13": 3.4520259737968444,
"ce_loss_2": 4.005417215824127,
"ce_loss_3": 3.8302616715431212,
"ce_loss_7": 3.56083265542984,
"epoch": 0.837,
"grad_norm": 356.0,
"kl_loss_10": 91.30384330749511,
"kl_loss_2": 1146.0878509521485,
"kl_loss_3": 788.8349487304688,
"kl_loss_7": 189.73513488769532,
"learning_rate": 6.540936664744196e-05,
"loss": 543.0581,
"step": 8370
},
{
"ce_loss_10": 3.6644623279571533,
"ce_loss_13": 3.6040658593177795,
"ce_loss_2": 4.12789534330368,
"ce_loss_3": 3.959988057613373,
"ce_loss_7": 3.7062342405319213,
"epoch": 0.838,
"grad_norm": 366.0,
"kl_loss_10": 97.38574295043945,
"kl_loss_2": 1085.7984375,
"kl_loss_3": 749.598519897461,
"kl_loss_7": 188.30213012695313,
"learning_rate": 6.462696144011149e-05,
"loss": 525.3536,
"step": 8380
},
{
"ce_loss_10": 3.6138532400131225,
"ce_loss_13": 3.5537376523017885,
"ce_loss_2": 4.071477258205414,
"ce_loss_3": 3.910947525501251,
"ce_loss_7": 3.658327579498291,
"epoch": 0.839,
"grad_norm": 556.0,
"kl_loss_10": 98.20170745849609,
"kl_loss_2": 1090.382958984375,
"kl_loss_3": 762.5471374511719,
"kl_loss_7": 191.74814834594727,
"learning_rate": 6.384894043444567e-05,
"loss": 528.8093,
"step": 8390
},
{
"ce_loss_10": 3.644765245914459,
"ce_loss_13": 3.585478734970093,
"ce_loss_2": 4.109920060634613,
"ce_loss_3": 3.9416786313056944,
"ce_loss_7": 3.689965844154358,
"epoch": 0.84,
"grad_norm": 412.0,
"kl_loss_10": 97.19089965820312,
"kl_loss_2": 1101.7069030761718,
"kl_loss_3": 757.5290496826171,
"kl_loss_7": 188.98860778808594,
"learning_rate": 6.307531146510753e-05,
"loss": 529.2157,
"step": 8400
},
{
"ce_loss_10": 3.621027076244354,
"ce_loss_13": 3.5618404507637025,
"ce_loss_2": 4.0682983756065365,
"ce_loss_3": 3.90874502658844,
"ce_loss_7": 3.6661928296089172,
"epoch": 0.841,
"grad_norm": 384.0,
"kl_loss_10": 95.90530738830566,
"kl_loss_2": 1067.8680267333984,
"kl_loss_3": 738.8968048095703,
"kl_loss_7": 187.38672485351563,
"learning_rate": 6.230608232253226e-05,
"loss": 522.0211,
"step": 8410
},
{
"ce_loss_10": 3.5725093245506288,
"ce_loss_13": 3.5133079648017884,
"ce_loss_2": 4.052767491340637,
"ce_loss_3": 3.8865469098091125,
"ce_loss_7": 3.617572808265686,
"epoch": 0.842,
"grad_norm": 420.0,
"kl_loss_10": 93.54998550415038,
"kl_loss_2": 1118.0941436767578,
"kl_loss_3": 779.2841003417968,
"kl_loss_7": 188.06975250244142,
"learning_rate": 6.154126075284855e-05,
"loss": 530.6581,
"step": 8420
},
{
"ce_loss_10": 3.6709149718284606,
"ce_loss_13": 3.610610318183899,
"ce_loss_2": 4.11589070558548,
"ce_loss_3": 3.958199071884155,
"ce_loss_7": 3.7119770526885985,
"epoch": 0.843,
"grad_norm": 360.0,
"kl_loss_10": 93.72929344177246,
"kl_loss_2": 1052.0708984375,
"kl_loss_3": 727.1819213867187,
"kl_loss_7": 182.0021545410156,
"learning_rate": 6.078085445780129e-05,
"loss": 515.5865,
"step": 8430
},
{
"ce_loss_10": 3.678613018989563,
"ce_loss_13": 3.6185575127601624,
"ce_loss_2": 4.138859879970551,
"ce_loss_3": 3.970304036140442,
"ce_loss_7": 3.7233882188796996,
"epoch": 0.844,
"grad_norm": 708.0,
"kl_loss_10": 96.56619453430176,
"kl_loss_2": 1092.8436309814454,
"kl_loss_3": 748.7821746826172,
"kl_loss_7": 187.36514282226562,
"learning_rate": 6.002487109467347e-05,
"loss": 524.9962,
"step": 8440
},
{
"ce_loss_10": 3.681882548332214,
"ce_loss_13": 3.623554539680481,
"ce_loss_2": 4.131060492992401,
"ce_loss_3": 3.969043660163879,
"ce_loss_7": 3.7261468291282656,
"epoch": 0.845,
"grad_norm": 498.0,
"kl_loss_10": 95.19795646667481,
"kl_loss_2": 1083.3428985595704,
"kl_loss_3": 748.7116729736329,
"kl_loss_7": 188.84120330810546,
"learning_rate": 5.927331827620902e-05,
"loss": 524.2234,
"step": 8450
},
{
"ce_loss_10": 3.671555197238922,
"ce_loss_13": 3.6144081234931944,
"ce_loss_2": 4.109152019023895,
"ce_loss_3": 3.957107722759247,
"ce_loss_7": 3.7151288032531737,
"epoch": 0.846,
"grad_norm": 384.0,
"kl_loss_10": 92.54770011901856,
"kl_loss_2": 1047.1174011230469,
"kl_loss_3": 728.4162536621094,
"kl_loss_7": 183.04834442138673,
"learning_rate": 5.852620357053651e-05,
"loss": 522.9391,
"step": 8460
},
{
"ce_loss_10": 3.7129202485084534,
"ce_loss_13": 3.65321398973465,
"ce_loss_2": 4.155979669094085,
"ce_loss_3": 3.9961599469184876,
"ce_loss_7": 3.7558295488357545,
"epoch": 0.847,
"grad_norm": 432.0,
"kl_loss_10": 94.81909484863282,
"kl_loss_2": 1067.3740447998048,
"kl_loss_3": 736.2771881103515,
"kl_loss_7": 184.3846176147461,
"learning_rate": 5.778353450109286e-05,
"loss": 523.3945,
"step": 8470
},
{
"ce_loss_10": 3.7526662349700928,
"ce_loss_13": 3.6899970173835754,
"ce_loss_2": 4.2024567246437075,
"ce_loss_3": 4.037352812290192,
"ce_loss_7": 3.7961275696754457,
"epoch": 0.848,
"grad_norm": 420.0,
"kl_loss_10": 98.8898868560791,
"kl_loss_2": 1083.7428894042969,
"kl_loss_3": 747.87919921875,
"kl_loss_7": 190.12581558227538,
"learning_rate": 5.7045318546547206e-05,
"loss": 528.6064,
"step": 8480
},
{
"ce_loss_10": 3.6435152888298035,
"ce_loss_13": 3.5820479154586793,
"ce_loss_2": 4.10130136013031,
"ce_loss_3": 3.9336646437644958,
"ce_loss_7": 3.6865146279335024,
"epoch": 0.849,
"grad_norm": 476.0,
"kl_loss_10": 97.09412269592285,
"kl_loss_2": 1097.005093383789,
"kl_loss_3": 757.3569030761719,
"kl_loss_7": 187.13169021606444,
"learning_rate": 5.631156314072605e-05,
"loss": 526.7981,
"step": 8490
},
{
"ce_loss_10": 3.6548070907592773,
"ce_loss_13": 3.5959606409072875,
"ce_loss_2": 4.090519487857819,
"ce_loss_3": 3.9302281975746154,
"ce_loss_7": 3.6990505933761595,
"epoch": 0.85,
"grad_norm": 348.0,
"kl_loss_10": 94.60167617797852,
"kl_loss_2": 1058.567938232422,
"kl_loss_3": 726.6986267089844,
"kl_loss_7": 182.6941146850586,
"learning_rate": 5.5582275672538315e-05,
"loss": 518.2773,
"step": 8500
},
{
"ce_loss_10": 3.5718761324882506,
"ce_loss_13": 3.510132133960724,
"ce_loss_2": 4.058491265773773,
"ce_loss_3": 3.8868750095367433,
"ce_loss_7": 3.62018061876297,
"epoch": 0.851,
"grad_norm": 356.0,
"kl_loss_10": 98.47408905029297,
"kl_loss_2": 1129.9293365478516,
"kl_loss_3": 782.1435455322265,
"kl_loss_7": 191.806551361084,
"learning_rate": 5.4857463485900484e-05,
"loss": 540.5649,
"step": 8510
},
{
"ce_loss_10": 3.626720643043518,
"ce_loss_13": 3.5688146710395814,
"ce_loss_2": 4.081609988212586,
"ce_loss_3": 3.9117035031318665,
"ce_loss_7": 3.673699951171875,
"epoch": 0.852,
"grad_norm": 392.0,
"kl_loss_10": 94.4161319732666,
"kl_loss_2": 1082.976022338867,
"kl_loss_3": 743.9283477783204,
"kl_loss_7": 185.5842658996582,
"learning_rate": 5.413713387966329e-05,
"loss": 525.7675,
"step": 8520
},
{
"ce_loss_10": 3.6495197653770446,
"ce_loss_13": 3.5870252728462217,
"ce_loss_2": 4.1089702367782595,
"ce_loss_3": 3.943737292289734,
"ce_loss_7": 3.6925705909729003,
"epoch": 0.853,
"grad_norm": 560.0,
"kl_loss_10": 99.9091007232666,
"kl_loss_2": 1091.3887969970704,
"kl_loss_3": 754.8269989013672,
"kl_loss_7": 190.51073608398437,
"learning_rate": 5.34212941075381e-05,
"loss": 533.712,
"step": 8530
},
{
"ce_loss_10": 3.6638750314712523,
"ce_loss_13": 3.603909599781036,
"ce_loss_2": 4.105106854438782,
"ce_loss_3": 3.939826285839081,
"ce_loss_7": 3.703915464878082,
"epoch": 0.854,
"grad_norm": 324.0,
"kl_loss_10": 94.93586730957031,
"kl_loss_2": 1060.2898712158203,
"kl_loss_3": 729.1602386474609,
"kl_loss_7": 183.2039321899414,
"learning_rate": 5.270995137802315e-05,
"loss": 520.0254,
"step": 8540
},
{
"ce_loss_10": 3.586125075817108,
"ce_loss_13": 3.530829107761383,
"ce_loss_2": 4.0409599304199215,
"ce_loss_3": 3.876398241519928,
"ce_loss_7": 3.6288790106773376,
"epoch": 0.855,
"grad_norm": 390.0,
"kl_loss_10": 92.31447868347168,
"kl_loss_2": 1091.2599792480469,
"kl_loss_3": 750.2804168701172,
"kl_loss_7": 184.4141700744629,
"learning_rate": 5.2003112854332125e-05,
"loss": 530.1402,
"step": 8550
},
{
"ce_loss_10": 3.592084896564484,
"ce_loss_13": 3.5318885922431944,
"ce_loss_2": 4.045030009746552,
"ce_loss_3": 3.8797095656394958,
"ce_loss_7": 3.6342476487159727,
"epoch": 0.856,
"grad_norm": 410.0,
"kl_loss_10": 95.16406364440918,
"kl_loss_2": 1083.518502807617,
"kl_loss_3": 746.9155914306641,
"kl_loss_7": 184.60284118652345,
"learning_rate": 5.130078565432089e-05,
"loss": 519.0631,
"step": 8560
},
{
"ce_loss_10": 3.6698386430740357,
"ce_loss_13": 3.611102557182312,
"ce_loss_2": 4.1041951179504395,
"ce_loss_3": 3.9457595467567446,
"ce_loss_7": 3.714687442779541,
"epoch": 0.857,
"grad_norm": 330.0,
"kl_loss_10": 94.41157264709473,
"kl_loss_2": 1066.6546508789063,
"kl_loss_3": 732.30849609375,
"kl_loss_7": 183.59521484375,
"learning_rate": 5.060297685041659e-05,
"loss": 515.5307,
"step": 8570
},
{
"ce_loss_10": 3.594843864440918,
"ce_loss_13": 3.535090386867523,
"ce_loss_2": 4.058831119537354,
"ce_loss_3": 3.8907560467720033,
"ce_loss_7": 3.6390093684196474,
"epoch": 0.858,
"grad_norm": 396.0,
"kl_loss_10": 97.14489707946777,
"kl_loss_2": 1100.07861328125,
"kl_loss_3": 757.8477020263672,
"kl_loss_7": 190.17505111694337,
"learning_rate": 4.99096934695461e-05,
"loss": 537.0569,
"step": 8580
},
{
"ce_loss_10": 3.655477023124695,
"ce_loss_13": 3.592752683162689,
"ce_loss_2": 4.114116084575653,
"ce_loss_3": 3.950313460826874,
"ce_loss_7": 3.6980414509773256,
"epoch": 0.859,
"grad_norm": 370.0,
"kl_loss_10": 96.66123657226562,
"kl_loss_2": 1076.5634460449219,
"kl_loss_3": 745.2082977294922,
"kl_loss_7": 186.95159301757812,
"learning_rate": 4.922094249306558e-05,
"loss": 520.1718,
"step": 8590
},
{
"ce_loss_10": 3.677726352214813,
"ce_loss_13": 3.6172829270362854,
"ce_loss_2": 4.126979196071625,
"ce_loss_3": 3.9645047903060915,
"ce_loss_7": 3.7215185284614565,
"epoch": 0.86,
"grad_norm": 392.0,
"kl_loss_10": 96.89525718688965,
"kl_loss_2": 1065.1883819580078,
"kl_loss_3": 740.1956573486328,
"kl_loss_7": 187.83882064819335,
"learning_rate": 4.853673085668947e-05,
"loss": 516.6985,
"step": 8600
},
{
"ce_loss_10": 3.707137334346771,
"ce_loss_13": 3.6448033452033997,
"ce_loss_2": 4.162192296981812,
"ce_loss_3": 3.993678319454193,
"ce_loss_7": 3.7496466279029845,
"epoch": 0.861,
"grad_norm": 370.0,
"kl_loss_10": 98.02176780700684,
"kl_loss_2": 1078.1511993408203,
"kl_loss_3": 739.8441162109375,
"kl_loss_7": 186.5592399597168,
"learning_rate": 4.78570654504214e-05,
"loss": 529.6101,
"step": 8610
},
{
"ce_loss_10": 3.6458049774169923,
"ce_loss_13": 3.5854872465133667,
"ce_loss_2": 4.110537803173065,
"ce_loss_3": 3.938798224925995,
"ce_loss_7": 3.6893723726272585,
"epoch": 0.862,
"grad_norm": 414.0,
"kl_loss_10": 94.25516128540039,
"kl_loss_2": 1104.6271423339845,
"kl_loss_3": 758.221337890625,
"kl_loss_7": 185.93933029174804,
"learning_rate": 4.7181953118484556e-05,
"loss": 535.9025,
"step": 8620
},
{
"ce_loss_10": 3.6774216413497927,
"ce_loss_13": 3.6180386185646056,
"ce_loss_2": 4.12672735452652,
"ce_loss_3": 3.962115204334259,
"ce_loss_7": 3.720357131958008,
"epoch": 0.863,
"grad_norm": 356.0,
"kl_loss_10": 95.34017066955566,
"kl_loss_2": 1068.0610900878905,
"kl_loss_3": 737.2169891357422,
"kl_loss_7": 185.36345138549805,
"learning_rate": 4.651140065925269e-05,
"loss": 530.0095,
"step": 8630
},
{
"ce_loss_10": 3.609228265285492,
"ce_loss_13": 3.5492658615112305,
"ce_loss_2": 4.060226953029632,
"ce_loss_3": 3.895670175552368,
"ce_loss_7": 3.6542355179786683,
"epoch": 0.864,
"grad_norm": 360.0,
"kl_loss_10": 96.95414390563965,
"kl_loss_2": 1087.1394622802734,
"kl_loss_3": 748.6742889404297,
"kl_loss_7": 188.45738372802734,
"learning_rate": 4.58454148251814e-05,
"loss": 535.7555,
"step": 8640
},
{
"ce_loss_10": 3.6290027260780335,
"ce_loss_13": 3.566804575920105,
"ce_loss_2": 4.098408913612365,
"ce_loss_3": 3.928418016433716,
"ce_loss_7": 3.673435080051422,
"epoch": 0.865,
"grad_norm": 352.0,
"kl_loss_10": 97.77750358581542,
"kl_loss_2": 1105.780810546875,
"kl_loss_3": 762.838412475586,
"kl_loss_7": 187.93626327514647,
"learning_rate": 4.518400232274078e-05,
"loss": 530.3719,
"step": 8650
},
{
"ce_loss_10": 3.641969549655914,
"ce_loss_13": 3.5785802602767944,
"ce_loss_2": 4.092971992492676,
"ce_loss_3": 3.932430160045624,
"ce_loss_7": 3.6855560064315798,
"epoch": 0.866,
"grad_norm": 320.0,
"kl_loss_10": 100.24152946472168,
"kl_loss_2": 1078.2671875,
"kl_loss_3": 746.3800415039062,
"kl_loss_7": 188.71098556518555,
"learning_rate": 4.452716981234745e-05,
"loss": 518.2875,
"step": 8660
},
{
"ce_loss_10": 3.619352424144745,
"ce_loss_13": 3.5634596943855286,
"ce_loss_2": 4.0641814827919,
"ce_loss_3": 3.9009178042411805,
"ce_loss_7": 3.6601861000061033,
"epoch": 0.867,
"grad_norm": 334.0,
"kl_loss_10": 92.77517395019531,
"kl_loss_2": 1069.4530029296875,
"kl_loss_3": 742.2820404052734,
"kl_loss_7": 183.70159912109375,
"learning_rate": 4.3874923908297335e-05,
"loss": 518.2648,
"step": 8670
},
{
"ce_loss_10": 3.6679449677467346,
"ce_loss_13": 3.605993056297302,
"ce_loss_2": 4.122425937652588,
"ce_loss_3": 3.955815386772156,
"ce_loss_7": 3.710171031951904,
"epoch": 0.868,
"grad_norm": 372.0,
"kl_loss_10": 98.51640739440919,
"kl_loss_2": 1091.1497436523437,
"kl_loss_3": 753.822543334961,
"kl_loss_7": 189.5640121459961,
"learning_rate": 4.322727117869951e-05,
"loss": 527.5021,
"step": 8680
},
{
"ce_loss_10": 3.678618919849396,
"ce_loss_13": 3.61755256652832,
"ce_loss_2": 4.1355063915252686,
"ce_loss_3": 3.9705930352211,
"ce_loss_7": 3.7248330235481264,
"epoch": 0.869,
"grad_norm": 450.0,
"kl_loss_10": 97.55352783203125,
"kl_loss_2": 1094.9813720703125,
"kl_loss_3": 756.694857788086,
"kl_loss_7": 188.98089218139648,
"learning_rate": 4.2584218145409916e-05,
"loss": 526.9053,
"step": 8690
},
{
"ce_loss_10": 3.724055600166321,
"ce_loss_13": 3.6645130157470702,
"ce_loss_2": 4.164188587665558,
"ce_loss_3": 4.006092858314514,
"ce_loss_7": 3.766603982448578,
"epoch": 0.87,
"grad_norm": 368.0,
"kl_loss_10": 97.79985809326172,
"kl_loss_2": 1054.3090911865233,
"kl_loss_3": 727.9592834472656,
"kl_loss_7": 186.32457809448243,
"learning_rate": 4.194577128396521e-05,
"loss": 516.3896,
"step": 8700
},
{
"ce_loss_10": 3.59331738948822,
"ce_loss_13": 3.5345770716667175,
"ce_loss_2": 4.046900963783264,
"ce_loss_3": 3.882276177406311,
"ce_loss_7": 3.636314344406128,
"epoch": 0.871,
"grad_norm": 348.0,
"kl_loss_10": 93.78037185668946,
"kl_loss_2": 1077.3778259277344,
"kl_loss_3": 740.198031616211,
"kl_loss_7": 183.74533233642578,
"learning_rate": 4.1311937023518264e-05,
"loss": 527.0207,
"step": 8710
},
{
"ce_loss_10": 3.6144633054733277,
"ce_loss_13": 3.5550664901733398,
"ce_loss_2": 4.064953672885895,
"ce_loss_3": 3.891311466693878,
"ce_loss_7": 3.653948724269867,
"epoch": 0.872,
"grad_norm": 338.0,
"kl_loss_10": 94.96177291870117,
"kl_loss_2": 1085.5813049316407,
"kl_loss_3": 729.3066223144531,
"kl_loss_7": 181.0632652282715,
"learning_rate": 4.0682721746773344e-05,
"loss": 521.2992,
"step": 8720
},
{
"ce_loss_10": 3.4832905650138857,
"ce_loss_13": 3.4249367475509644,
"ce_loss_2": 3.961899662017822,
"ce_loss_3": 3.788464534282684,
"ce_loss_7": 3.527579641342163,
"epoch": 0.873,
"grad_norm": 370.0,
"kl_loss_10": 91.51293182373047,
"kl_loss_2": 1104.7394775390626,
"kl_loss_3": 759.5037414550782,
"kl_loss_7": 185.07400512695312,
"learning_rate": 4.0058131789920904e-05,
"loss": 521.9289,
"step": 8730
},
{
"ce_loss_10": 3.640140187740326,
"ce_loss_13": 3.57983558177948,
"ce_loss_2": 4.088211476802826,
"ce_loss_3": 3.927894616127014,
"ce_loss_7": 3.6845538139343263,
"epoch": 0.874,
"grad_norm": 438.0,
"kl_loss_10": 95.66121215820313,
"kl_loss_2": 1082.0109283447266,
"kl_loss_3": 751.8433319091797,
"kl_loss_7": 184.97217254638673,
"learning_rate": 3.9438173442575e-05,
"loss": 542.025,
"step": 8740
},
{
"ce_loss_10": 3.668476128578186,
"ce_loss_13": 3.6084399461746215,
"ce_loss_2": 4.114363825321197,
"ce_loss_3": 3.948890733718872,
"ce_loss_7": 3.712895894050598,
"epoch": 0.875,
"grad_norm": 360.0,
"kl_loss_10": 95.13606338500976,
"kl_loss_2": 1069.65205078125,
"kl_loss_3": 736.1352905273437,
"kl_loss_7": 185.31621551513672,
"learning_rate": 3.882285294770937e-05,
"loss": 524.7358,
"step": 8750
},
{
"ce_loss_10": 3.636470365524292,
"ce_loss_13": 3.576250433921814,
"ce_loss_2": 4.081735682487488,
"ce_loss_3": 3.9194202423095703,
"ce_loss_7": 3.6787103533744814,
"epoch": 0.876,
"grad_norm": 372.0,
"kl_loss_10": 97.42237510681153,
"kl_loss_2": 1070.8320678710938,
"kl_loss_3": 736.4440826416015,
"kl_loss_7": 186.42294464111328,
"learning_rate": 3.821217650159453e-05,
"loss": 528.159,
"step": 8760
},
{
"ce_loss_10": 3.501795244216919,
"ce_loss_13": 3.445420837402344,
"ce_loss_2": 3.993399131298065,
"ce_loss_3": 3.819171416759491,
"ce_loss_7": 3.5519042015075684,
"epoch": 0.877,
"grad_norm": 398.0,
"kl_loss_10": 91.19635620117188,
"kl_loss_2": 1126.038784790039,
"kl_loss_3": 777.8552947998047,
"kl_loss_7": 188.21297302246094,
"learning_rate": 3.760615025373543e-05,
"loss": 535.8912,
"step": 8770
},
{
"ce_loss_10": 3.687652599811554,
"ce_loss_13": 3.6275517463684084,
"ce_loss_2": 4.149944150447846,
"ce_loss_3": 3.984694278240204,
"ce_loss_7": 3.7361566066741942,
"epoch": 0.878,
"grad_norm": 426.0,
"kl_loss_10": 98.53735313415527,
"kl_loss_2": 1087.7767242431642,
"kl_loss_3": 754.1841644287109,
"kl_loss_7": 191.66405487060547,
"learning_rate": 3.700478030680987e-05,
"loss": 534.6525,
"step": 8780
},
{
"ce_loss_10": 3.672296917438507,
"ce_loss_13": 3.6126784920692443,
"ce_loss_2": 4.126206862926483,
"ce_loss_3": 3.9555336833000183,
"ce_loss_7": 3.7154035449028013,
"epoch": 0.879,
"grad_norm": 400.0,
"kl_loss_10": 95.93194694519043,
"kl_loss_2": 1067.7572967529297,
"kl_loss_3": 734.3840759277343,
"kl_loss_7": 185.99778594970704,
"learning_rate": 3.6408072716606344e-05,
"loss": 520.9604,
"step": 8790
},
{
"ce_loss_10": 3.5921829104423524,
"ce_loss_13": 3.5314606547355654,
"ce_loss_2": 4.064702832698822,
"ce_loss_3": 3.897125017642975,
"ce_loss_7": 3.639820373058319,
"epoch": 0.88,
"grad_norm": 424.0,
"kl_loss_10": 96.45306243896485,
"kl_loss_2": 1113.6997863769532,
"kl_loss_3": 769.2831970214844,
"kl_loss_7": 189.68171615600585,
"learning_rate": 3.5816033491963716e-05,
"loss": 546.457,
"step": 8800
},
{
"ce_loss_10": 3.4587510585784913,
"ce_loss_13": 3.398640847206116,
"ce_loss_2": 3.9295639514923097,
"ce_loss_3": 3.755736696720123,
"ce_loss_7": 3.502725625038147,
"epoch": 0.881,
"grad_norm": 374.0,
"kl_loss_10": 94.41120719909668,
"kl_loss_2": 1107.7318145751954,
"kl_loss_3": 762.6848449707031,
"kl_loss_7": 185.3354965209961,
"learning_rate": 3.522866859471047e-05,
"loss": 531.675,
"step": 8810
},
{
"ce_loss_10": 3.7003540635108947,
"ce_loss_13": 3.6417059302330017,
"ce_loss_2": 4.134489345550537,
"ce_loss_3": 3.972803270816803,
"ce_loss_7": 3.7418115973472594,
"epoch": 0.882,
"grad_norm": 620.0,
"kl_loss_10": 93.44988250732422,
"kl_loss_2": 1046.9635864257812,
"kl_loss_3": 718.645751953125,
"kl_loss_7": 180.43475570678712,
"learning_rate": 3.46459839396045e-05,
"loss": 519.2549,
"step": 8820
},
{
"ce_loss_10": 3.6235634326934814,
"ce_loss_13": 3.5625478267669677,
"ce_loss_2": 4.090062844753265,
"ce_loss_3": 3.9221726655960083,
"ce_loss_7": 3.6677647113800047,
"epoch": 0.883,
"grad_norm": 392.0,
"kl_loss_10": 97.41650848388672,
"kl_loss_2": 1090.359048461914,
"kl_loss_3": 752.6492370605469,
"kl_loss_7": 188.19114456176757,
"learning_rate": 3.406798539427386e-05,
"loss": 541.4702,
"step": 8830
},
{
"ce_loss_10": 3.6815385699272154,
"ce_loss_13": 3.622318422794342,
"ce_loss_2": 4.134820902347565,
"ce_loss_3": 3.9722886800765993,
"ce_loss_7": 3.7261940598487855,
"epoch": 0.884,
"grad_norm": 458.0,
"kl_loss_10": 95.14997901916504,
"kl_loss_2": 1087.6108123779297,
"kl_loss_3": 753.6235443115235,
"kl_loss_7": 186.09493026733398,
"learning_rate": 3.349467877915746e-05,
"loss": 532.4207,
"step": 8840
},
{
"ce_loss_10": 3.6383310556411743,
"ce_loss_13": 3.578685259819031,
"ce_loss_2": 4.10920352935791,
"ce_loss_3": 3.9395066857337953,
"ce_loss_7": 3.684439957141876,
"epoch": 0.885,
"grad_norm": 346.0,
"kl_loss_10": 94.56938552856445,
"kl_loss_2": 1107.4275299072265,
"kl_loss_3": 766.7192443847656,
"kl_loss_7": 187.05870895385743,
"learning_rate": 3.292606986744667e-05,
"loss": 544.0854,
"step": 8850
},
{
"ce_loss_10": 3.593039667606354,
"ce_loss_13": 3.5363111972808836,
"ce_loss_2": 4.061631453037262,
"ce_loss_3": 3.888974642753601,
"ce_loss_7": 3.6354947090148926,
"epoch": 0.886,
"grad_norm": 312.0,
"kl_loss_10": 94.36025886535644,
"kl_loss_2": 1094.437567138672,
"kl_loss_3": 755.0413787841796,
"kl_loss_7": 185.15854110717774,
"learning_rate": 3.23621643850267e-05,
"loss": 531.352,
"step": 8860
},
{
"ce_loss_10": 3.6675365686416628,
"ce_loss_13": 3.608867907524109,
"ce_loss_2": 4.1205101132392885,
"ce_loss_3": 3.9526678919792175,
"ce_loss_7": 3.71103777885437,
"epoch": 0.887,
"grad_norm": 398.0,
"kl_loss_10": 95.91901359558105,
"kl_loss_2": 1094.978707885742,
"kl_loss_3": 758.2980133056641,
"kl_loss_7": 187.99334793090821,
"learning_rate": 3.180296801041971e-05,
"loss": 525.304,
"step": 8870
},
{
"ce_loss_10": 3.6939959645271303,
"ce_loss_13": 3.6341704607009886,
"ce_loss_2": 4.136724853515625,
"ce_loss_3": 3.976076662540436,
"ce_loss_7": 3.7369011640548706,
"epoch": 0.888,
"grad_norm": 322.0,
"kl_loss_10": 96.13762168884277,
"kl_loss_2": 1061.462728881836,
"kl_loss_3": 731.0939331054688,
"kl_loss_7": 185.31768493652345,
"learning_rate": 3.124848637472688e-05,
"loss": 515.8721,
"step": 8880
},
{
"ce_loss_10": 3.5114728569984437,
"ce_loss_13": 3.452458143234253,
"ce_loss_2": 3.9819056034088134,
"ce_loss_3": 3.8095321655273438,
"ce_loss_7": 3.5549168229103087,
"epoch": 0.889,
"grad_norm": 430.0,
"kl_loss_10": 92.77987136840821,
"kl_loss_2": 1105.7576904296875,
"kl_loss_3": 760.3018249511719,
"kl_loss_7": 183.98031311035157,
"learning_rate": 3.069872506157212e-05,
"loss": 529.9256,
"step": 8890
},
{
"ce_loss_10": 3.6096359133720397,
"ce_loss_13": 3.5530964136123657,
"ce_loss_2": 4.066385662555694,
"ce_loss_3": 3.9037466764450075,
"ce_loss_7": 3.653862941265106,
"epoch": 0.89,
"grad_norm": 414.0,
"kl_loss_10": 94.68969841003418,
"kl_loss_2": 1082.7529907226562,
"kl_loss_3": 748.9955108642578,
"kl_loss_7": 186.7980583190918,
"learning_rate": 3.0153689607045842e-05,
"loss": 522.4292,
"step": 8900
},
{
"ce_loss_10": 3.5076727747917174,
"ce_loss_13": 3.4481669664382935,
"ce_loss_2": 3.998192644119263,
"ce_loss_3": 3.8251919507980348,
"ce_loss_7": 3.5543401718139647,
"epoch": 0.891,
"grad_norm": 462.0,
"kl_loss_10": 96.1771800994873,
"kl_loss_2": 1157.3876403808595,
"kl_loss_3": 799.3413696289062,
"kl_loss_7": 192.33385009765624,
"learning_rate": 2.9613385499648926e-05,
"loss": 537.2502,
"step": 8910
},
{
"ce_loss_10": 3.5617488503456114,
"ce_loss_13": 3.5028850078582763,
"ce_loss_2": 4.028625464439392,
"ce_loss_3": 3.8606330037117003,
"ce_loss_7": 3.60619056224823,
"epoch": 0.892,
"grad_norm": 364.0,
"kl_loss_10": 92.3734031677246,
"kl_loss_2": 1092.7289123535156,
"kl_loss_3": 755.3269073486329,
"kl_loss_7": 183.66201095581056,
"learning_rate": 2.9077818180237692e-05,
"loss": 529.899,
"step": 8920
},
{
"ce_loss_10": 3.611976993083954,
"ce_loss_13": 3.5523295164108277,
"ce_loss_2": 4.088427019119263,
"ce_loss_3": 3.911720395088196,
"ce_loss_7": 3.6568928718566895,
"epoch": 0.893,
"grad_norm": 604.0,
"kl_loss_10": 95.37241554260254,
"kl_loss_2": 1091.7466766357422,
"kl_loss_3": 749.5647033691406,
"kl_loss_7": 185.87219848632813,
"learning_rate": 2.8546993041969172e-05,
"loss": 528.8222,
"step": 8930
},
{
"ce_loss_10": 3.649553382396698,
"ce_loss_13": 3.5936214447021486,
"ce_loss_2": 4.095563900470734,
"ce_loss_3": 3.9343939542770388,
"ce_loss_7": 3.6919458627700807,
"epoch": 0.894,
"grad_norm": 356.0,
"kl_loss_10": 92.16914100646973,
"kl_loss_2": 1065.6531127929688,
"kl_loss_3": 739.0178924560547,
"kl_loss_7": 182.67144699096679,
"learning_rate": 2.802091543024671e-05,
"loss": 525.8132,
"step": 8940
},
{
"ce_loss_10": 3.6456188917160035,
"ce_loss_13": 3.5855357170104982,
"ce_loss_2": 4.1163407325744625,
"ce_loss_3": 3.9452737092971804,
"ce_loss_7": 3.690487289428711,
"epoch": 0.895,
"grad_norm": 376.0,
"kl_loss_10": 94.99068603515624,
"kl_loss_2": 1107.8523712158203,
"kl_loss_3": 763.5164489746094,
"kl_loss_7": 187.85556182861328,
"learning_rate": 2.7499590642665774e-05,
"loss": 543.5269,
"step": 8950
},
{
"ce_loss_10": 3.6521722793579103,
"ce_loss_13": 3.5920722246170045,
"ce_loss_2": 4.112611806392669,
"ce_loss_3": 3.942758357524872,
"ce_loss_7": 3.6924882411956785,
"epoch": 0.896,
"grad_norm": 434.0,
"kl_loss_10": 97.21023635864258,
"kl_loss_2": 1089.4108154296875,
"kl_loss_3": 742.6543731689453,
"kl_loss_7": 186.23975067138673,
"learning_rate": 2.6983023928961405e-05,
"loss": 523.9287,
"step": 8960
},
{
"ce_loss_10": 3.6287880539894104,
"ce_loss_13": 3.569942307472229,
"ce_loss_2": 4.086234021186828,
"ce_loss_3": 3.919290018081665,
"ce_loss_7": 3.6727704763412476,
"epoch": 0.897,
"grad_norm": 428.0,
"kl_loss_10": 96.33384323120117,
"kl_loss_2": 1081.610333251953,
"kl_loss_3": 747.162060546875,
"kl_loss_7": 187.28789825439452,
"learning_rate": 2.6471220490954628e-05,
"loss": 531.8677,
"step": 8970
},
{
"ce_loss_10": 3.6082414865493773,
"ce_loss_13": 3.5503612399101256,
"ce_loss_2": 4.054306983947754,
"ce_loss_3": 3.8875715851783754,
"ce_loss_7": 3.647981250286102,
"epoch": 0.898,
"grad_norm": 402.0,
"kl_loss_10": 93.92480773925782,
"kl_loss_2": 1068.1579833984374,
"kl_loss_3": 736.318814086914,
"kl_loss_7": 183.30384826660156,
"learning_rate": 2.596418548250029e-05,
"loss": 527.9295,
"step": 8980
},
{
"ce_loss_10": 3.6551415085792542,
"ce_loss_13": 3.5952192187309264,
"ce_loss_2": 4.1076843500137326,
"ce_loss_3": 3.944980025291443,
"ce_loss_7": 3.700137984752655,
"epoch": 0.899,
"grad_norm": 396.0,
"kl_loss_10": 97.98623161315918,
"kl_loss_2": 1081.954281616211,
"kl_loss_3": 746.2776489257812,
"kl_loss_7": 188.93777618408203,
"learning_rate": 2.5461924009435368e-05,
"loss": 524.2467,
"step": 8990
},
{
"ce_loss_10": 3.650333786010742,
"ce_loss_13": 3.590772497653961,
"ce_loss_2": 4.109632253646851,
"ce_loss_3": 3.9412980914115905,
"ce_loss_7": 3.6946743369102477,
"epoch": 0.9,
"grad_norm": 410.0,
"kl_loss_10": 96.09890708923339,
"kl_loss_2": 1079.7472290039063,
"kl_loss_3": 745.8318054199219,
"kl_loss_7": 186.11589736938475,
"learning_rate": 2.4964441129527336e-05,
"loss": 536.0899,
"step": 9000
},
{
"ce_loss_10": 3.6510029554367067,
"ce_loss_13": 3.590871715545654,
"ce_loss_2": 4.100390136241913,
"ce_loss_3": 3.932853305339813,
"ce_loss_7": 3.6917531371116636,
"epoch": 0.901,
"grad_norm": 418.0,
"kl_loss_10": 95.55135993957519,
"kl_loss_2": 1061.7380157470702,
"kl_loss_3": 727.2771514892578,
"kl_loss_7": 183.68069381713866,
"learning_rate": 2.4471741852423235e-05,
"loss": 518.1353,
"step": 9010
},
{
"ce_loss_10": 3.695908546447754,
"ce_loss_13": 3.6349289417266846,
"ce_loss_2": 4.151931369304657,
"ce_loss_3": 3.98497998714447,
"ce_loss_7": 3.739882934093475,
"epoch": 0.902,
"grad_norm": 392.0,
"kl_loss_10": 95.51335906982422,
"kl_loss_2": 1066.5906768798827,
"kl_loss_3": 733.3630157470703,
"kl_loss_7": 184.28593063354492,
"learning_rate": 2.3983831139599287e-05,
"loss": 522.8627,
"step": 9020
},
{
"ce_loss_10": 3.617437481880188,
"ce_loss_13": 3.558865213394165,
"ce_loss_2": 4.061969435214996,
"ce_loss_3": 3.8991889357566833,
"ce_loss_7": 3.660116195678711,
"epoch": 0.903,
"grad_norm": 456.0,
"kl_loss_10": 93.39376106262208,
"kl_loss_2": 1059.7717498779298,
"kl_loss_3": 733.3598663330079,
"kl_loss_7": 181.95840148925782,
"learning_rate": 2.3500713904311022e-05,
"loss": 512.7801,
"step": 9030
},
{
"ce_loss_10": 3.659070146083832,
"ce_loss_13": 3.5992442965507507,
"ce_loss_2": 4.08744889497757,
"ce_loss_3": 3.9278596162796022,
"ce_loss_7": 3.700530481338501,
"epoch": 0.904,
"grad_norm": 472.0,
"kl_loss_10": 95.7885025024414,
"kl_loss_2": 1036.0338073730468,
"kl_loss_3": 713.1754333496094,
"kl_loss_7": 181.65938034057618,
"learning_rate": 2.3022395011543685e-05,
"loss": 514.4845,
"step": 9040
},
{
"ce_loss_10": 3.6909992337226867,
"ce_loss_13": 3.630416977405548,
"ce_loss_2": 4.144919979572296,
"ce_loss_3": 3.98409184217453,
"ce_loss_7": 3.735574746131897,
"epoch": 0.905,
"grad_norm": 400.0,
"kl_loss_10": 95.80096397399902,
"kl_loss_2": 1091.1403015136718,
"kl_loss_3": 758.9450408935547,
"kl_loss_7": 188.74431228637695,
"learning_rate": 2.2548879277963063e-05,
"loss": 536.6219,
"step": 9050
},
{
"ce_loss_10": 3.6055094718933107,
"ce_loss_13": 3.5453344702720644,
"ce_loss_2": 4.055747485160827,
"ce_loss_3": 3.8876903891563415,
"ce_loss_7": 3.645590376853943,
"epoch": 0.906,
"grad_norm": 312.0,
"kl_loss_10": 94.81256561279297,
"kl_loss_2": 1081.8126281738282,
"kl_loss_3": 743.9638031005859,
"kl_loss_7": 185.8631164550781,
"learning_rate": 2.208017147186736e-05,
"loss": 517.0646,
"step": 9060
},
{
"ce_loss_10": 3.5984405398368837,
"ce_loss_13": 3.5392195105552675,
"ce_loss_2": 4.055430555343628,
"ce_loss_3": 3.8891077756881716,
"ce_loss_7": 3.643998312950134,
"epoch": 0.907,
"grad_norm": 424.0,
"kl_loss_10": 95.52283592224121,
"kl_loss_2": 1082.7356536865234,
"kl_loss_3": 749.8307952880859,
"kl_loss_7": 186.6390350341797,
"learning_rate": 2.1616276313139227e-05,
"loss": 522.272,
"step": 9070
},
{
"ce_loss_10": 3.6377461314201356,
"ce_loss_13": 3.5757868885993958,
"ce_loss_2": 4.087118625640869,
"ce_loss_3": 3.9254656434059143,
"ce_loss_7": 3.680292618274689,
"epoch": 0.908,
"grad_norm": 362.0,
"kl_loss_10": 96.6335952758789,
"kl_loss_2": 1071.57734375,
"kl_loss_3": 743.0760345458984,
"kl_loss_7": 186.97156448364257,
"learning_rate": 2.1157198473197415e-05,
"loss": 527.4616,
"step": 9080
},
{
"ce_loss_10": 3.7054911255836487,
"ce_loss_13": 3.646452081203461,
"ce_loss_2": 4.16020712852478,
"ce_loss_3": 3.99694961309433,
"ce_loss_7": 3.7527972936630247,
"epoch": 0.909,
"grad_norm": 428.0,
"kl_loss_10": 95.60770835876465,
"kl_loss_2": 1073.3848999023437,
"kl_loss_3": 744.7516662597657,
"kl_loss_7": 188.15945053100586,
"learning_rate": 2.0702942574950812e-05,
"loss": 526.0792,
"step": 9090
},
{
"ce_loss_10": 3.623731589317322,
"ce_loss_13": 3.5640787363052366,
"ce_loss_2": 4.083542311191559,
"ce_loss_3": 3.9220656394958495,
"ce_loss_7": 3.669620490074158,
"epoch": 0.91,
"grad_norm": 302.0,
"kl_loss_10": 95.35622863769531,
"kl_loss_2": 1087.3217651367188,
"kl_loss_3": 752.284033203125,
"kl_loss_7": 187.5697151184082,
"learning_rate": 2.025351319275137e-05,
"loss": 528.1311,
"step": 9100
},
{
"ce_loss_10": 3.761759030818939,
"ce_loss_13": 3.6962865233421325,
"ce_loss_2": 4.2175662279129025,
"ce_loss_3": 4.051000607013703,
"ce_loss_7": 3.8052276611328124,
"epoch": 0.911,
"grad_norm": 420.0,
"kl_loss_10": 101.6547290802002,
"kl_loss_2": 1108.3317321777345,
"kl_loss_3": 765.9157867431641,
"kl_loss_7": 194.34442520141602,
"learning_rate": 1.9808914852347816e-05,
"loss": 545.7752,
"step": 9110
},
{
"ce_loss_10": 3.599123954772949,
"ce_loss_13": 3.539510524272919,
"ce_loss_2": 4.069272911548614,
"ce_loss_3": 3.9009834051132204,
"ce_loss_7": 3.6455170154571532,
"epoch": 0.912,
"grad_norm": 416.0,
"kl_loss_10": 95.14377288818359,
"kl_loss_2": 1095.5253448486328,
"kl_loss_3": 750.8630340576171,
"kl_loss_7": 187.0247688293457,
"learning_rate": 1.9369152030840554e-05,
"loss": 527.6025,
"step": 9120
},
{
"ce_loss_10": 3.6806903958320616,
"ce_loss_13": 3.620557761192322,
"ce_loss_2": 4.135490739345551,
"ce_loss_3": 3.9653069972991943,
"ce_loss_7": 3.723483943939209,
"epoch": 0.913,
"grad_norm": 362.0,
"kl_loss_10": 97.92795066833496,
"kl_loss_2": 1089.1937438964844,
"kl_loss_3": 747.6420379638672,
"kl_loss_7": 187.34563446044922,
"learning_rate": 1.893422915663645e-05,
"loss": 529.2906,
"step": 9130
},
{
"ce_loss_10": 3.5489492774009705,
"ce_loss_13": 3.488741672039032,
"ce_loss_2": 4.032487225532532,
"ce_loss_3": 3.862708866596222,
"ce_loss_7": 3.594150650501251,
"epoch": 0.914,
"grad_norm": 460.0,
"kl_loss_10": 95.81211128234864,
"kl_loss_2": 1122.290625,
"kl_loss_3": 780.3386810302734,
"kl_loss_7": 190.92548141479492,
"learning_rate": 1.850415060940386e-05,
"loss": 539.4046,
"step": 9140
},
{
"ce_loss_10": 3.670183026790619,
"ce_loss_13": 3.611021101474762,
"ce_loss_2": 4.120828151702881,
"ce_loss_3": 3.9584792375564577,
"ce_loss_7": 3.712183046340942,
"epoch": 0.915,
"grad_norm": 418.0,
"kl_loss_10": 95.88972358703613,
"kl_loss_2": 1074.5135314941406,
"kl_loss_3": 738.371826171875,
"kl_loss_7": 185.7539405822754,
"learning_rate": 1.8078920720028978e-05,
"loss": 525.966,
"step": 9150
},
{
"ce_loss_10": 3.600800943374634,
"ce_loss_13": 3.5434103488922117,
"ce_loss_2": 4.046385419368744,
"ce_loss_3": 3.8842490911483765,
"ce_loss_7": 3.6435607194900514,
"epoch": 0.916,
"grad_norm": 468.0,
"kl_loss_10": 94.49675407409669,
"kl_loss_2": 1068.3072998046875,
"kl_loss_3": 736.1623046875,
"kl_loss_7": 182.35257797241212,
"learning_rate": 1.765854377057219e-05,
"loss": 533.5915,
"step": 9160
},
{
"ce_loss_10": 3.579929566383362,
"ce_loss_13": 3.52090607881546,
"ce_loss_2": 4.0303690195083615,
"ce_loss_3": 3.863832104206085,
"ce_loss_7": 3.621261489391327,
"epoch": 0.917,
"grad_norm": 344.0,
"kl_loss_10": 93.69845123291016,
"kl_loss_2": 1076.374838256836,
"kl_loss_3": 739.5320068359375,
"kl_loss_7": 182.73907394409179,
"learning_rate": 1.724302399422456e-05,
"loss": 525.9574,
"step": 9170
},
{
"ce_loss_10": 3.5273375153541564,
"ce_loss_13": 3.469092321395874,
"ce_loss_2": 3.98960462808609,
"ce_loss_3": 3.8235998272895815,
"ce_loss_7": 3.572177302837372,
"epoch": 0.918,
"grad_norm": 328.0,
"kl_loss_10": 94.86108894348145,
"kl_loss_2": 1092.3598358154297,
"kl_loss_3": 757.3310150146484,
"kl_loss_7": 188.48751983642578,
"learning_rate": 1.683236557526574e-05,
"loss": 533.8531,
"step": 9180
},
{
"ce_loss_10": 3.6514230132102967,
"ce_loss_13": 3.59556097984314,
"ce_loss_2": 4.083134496212006,
"ce_loss_3": 3.926029086112976,
"ce_loss_7": 3.693097734451294,
"epoch": 0.919,
"grad_norm": 276.0,
"kl_loss_10": 94.37221069335938,
"kl_loss_2": 1047.5379638671875,
"kl_loss_3": 720.9200286865234,
"kl_loss_7": 181.39565734863282,
"learning_rate": 1.6426572649021475e-05,
"loss": 520.5356,
"step": 9190
},
{
"ce_loss_10": 3.6877851486206055,
"ce_loss_13": 3.6274981617927553,
"ce_loss_2": 4.1144737839698795,
"ce_loss_3": 3.9595839619636535,
"ce_loss_7": 3.7264232993125916,
"epoch": 0.92,
"grad_norm": 430.0,
"kl_loss_10": 99.18587074279785,
"kl_loss_2": 1047.7421783447267,
"kl_loss_3": 721.9292663574219,
"kl_loss_7": 186.34831695556642,
"learning_rate": 1.6025649301821876e-05,
"loss": 520.097,
"step": 9200
},
{
"ce_loss_10": 3.6789560437202455,
"ce_loss_13": 3.6199841260910035,
"ce_loss_2": 4.116438376903534,
"ce_loss_3": 3.95575532913208,
"ce_loss_7": 3.720892333984375,
"epoch": 0.921,
"grad_norm": 430.0,
"kl_loss_10": 95.03273735046386,
"kl_loss_2": 1068.5045623779297,
"kl_loss_3": 740.7460571289063,
"kl_loss_7": 185.96430587768555,
"learning_rate": 1.5629599570960716e-05,
"loss": 522.4428,
"step": 9210
},
{
"ce_loss_10": 3.579318141937256,
"ce_loss_13": 3.5199381947517394,
"ce_loss_2": 4.029832947254181,
"ce_loss_3": 3.865503740310669,
"ce_loss_7": 3.6221681237220764,
"epoch": 0.922,
"grad_norm": 430.0,
"kl_loss_10": 94.97879791259766,
"kl_loss_2": 1084.768603515625,
"kl_loss_3": 748.8800231933594,
"kl_loss_7": 185.368741607666,
"learning_rate": 1.5238427444654367e-05,
"loss": 526.936,
"step": 9220
},
{
"ce_loss_10": 3.642410922050476,
"ce_loss_13": 3.5841264009475706,
"ce_loss_2": 4.090620064735413,
"ce_loss_3": 3.929516541957855,
"ce_loss_7": 3.68586403131485,
"epoch": 0.923,
"grad_norm": 340.0,
"kl_loss_10": 95.43446731567383,
"kl_loss_2": 1061.9394897460938,
"kl_loss_3": 729.8539154052735,
"kl_loss_7": 184.269775390625,
"learning_rate": 1.4852136862001764e-05,
"loss": 521.6809,
"step": 9230
},
{
"ce_loss_10": 3.6022266387939452,
"ce_loss_13": 3.5460850477218626,
"ce_loss_2": 4.056096696853638,
"ce_loss_3": 3.894578981399536,
"ce_loss_7": 3.6445172667503356,
"epoch": 0.924,
"grad_norm": 382.0,
"kl_loss_10": 90.83601989746094,
"kl_loss_2": 1070.5055114746094,
"kl_loss_3": 735.5364959716796,
"kl_loss_7": 180.06712493896484,
"learning_rate": 1.4470731712944884e-05,
"loss": 526.6606,
"step": 9240
},
{
"ce_loss_10": 3.632104980945587,
"ce_loss_13": 3.573563551902771,
"ce_loss_2": 4.086918556690216,
"ce_loss_3": 3.921724486351013,
"ce_loss_7": 3.676921808719635,
"epoch": 0.925,
"grad_norm": 404.0,
"kl_loss_10": 93.8505702972412,
"kl_loss_2": 1076.019464111328,
"kl_loss_3": 742.9348846435547,
"kl_loss_7": 185.7860206604004,
"learning_rate": 1.4094215838229174e-05,
"loss": 532.0963,
"step": 9250
},
{
"ce_loss_10": 3.5902254581451416,
"ce_loss_13": 3.531176710128784,
"ce_loss_2": 4.053838360309601,
"ce_loss_3": 3.8887827515602114,
"ce_loss_7": 3.634320020675659,
"epoch": 0.926,
"grad_norm": 440.0,
"kl_loss_10": 95.00082511901856,
"kl_loss_2": 1108.7564575195313,
"kl_loss_3": 761.1957458496094,
"kl_loss_7": 187.39419326782226,
"learning_rate": 1.372259302936546e-05,
"loss": 548.2919,
"step": 9260
},
{
"ce_loss_10": 3.7115341782569886,
"ce_loss_13": 3.6472853660583495,
"ce_loss_2": 4.159861445426941,
"ce_loss_3": 3.998417854309082,
"ce_loss_7": 3.7543888211250307,
"epoch": 0.927,
"grad_norm": 304.0,
"kl_loss_10": 100.11175384521485,
"kl_loss_2": 1075.1090118408204,
"kl_loss_3": 744.2237152099609,
"kl_loss_7": 190.9360038757324,
"learning_rate": 1.3355867028591206e-05,
"loss": 520.805,
"step": 9270
},
{
"ce_loss_10": 3.6113879919052123,
"ce_loss_13": 3.5496174573898314,
"ce_loss_2": 4.047625136375427,
"ce_loss_3": 3.8916648983955384,
"ce_loss_7": 3.653665769100189,
"epoch": 0.928,
"grad_norm": 334.0,
"kl_loss_10": 94.99486846923828,
"kl_loss_2": 1063.383090209961,
"kl_loss_3": 737.3780670166016,
"kl_loss_7": 184.87188415527345,
"learning_rate": 1.2994041528833267e-05,
"loss": 520.9468,
"step": 9280
},
{
"ce_loss_10": 3.612771439552307,
"ce_loss_13": 3.5519652009010314,
"ce_loss_2": 4.069023680686951,
"ce_loss_3": 3.9033527731895448,
"ce_loss_7": 3.653776025772095,
"epoch": 0.929,
"grad_norm": 394.0,
"kl_loss_10": 94.48731269836426,
"kl_loss_2": 1086.341064453125,
"kl_loss_3": 747.7527069091797,
"kl_loss_7": 184.27003555297853,
"learning_rate": 1.2637120173670358e-05,
"loss": 525.795,
"step": 9290
},
{
"ce_loss_10": 3.6342510104179384,
"ce_loss_13": 3.574049484729767,
"ce_loss_2": 4.097525525093078,
"ce_loss_3": 3.9327287077903748,
"ce_loss_7": 3.6803439974784853,
"epoch": 0.93,
"grad_norm": 492.0,
"kl_loss_10": 94.73881340026855,
"kl_loss_2": 1086.5091583251954,
"kl_loss_3": 750.7861236572265,
"kl_loss_7": 186.8117706298828,
"learning_rate": 1.2285106557296478e-05,
"loss": 526.7854,
"step": 9300
},
{
"ce_loss_10": 3.513438880443573,
"ce_loss_13": 3.453951287269592,
"ce_loss_2": 3.9955971360206606,
"ce_loss_3": 3.8230167746543886,
"ce_loss_7": 3.555509877204895,
"epoch": 0.931,
"grad_norm": 356.0,
"kl_loss_10": 93.80283432006836,
"kl_loss_2": 1116.4696807861328,
"kl_loss_3": 771.7997375488281,
"kl_loss_7": 186.52389373779297,
"learning_rate": 1.1938004224484989e-05,
"loss": 533.0822,
"step": 9310
},
{
"ce_loss_10": 3.7524689197540284,
"ce_loss_13": 3.6876933336257935,
"ce_loss_2": 4.20148618221283,
"ce_loss_3": 4.035860347747803,
"ce_loss_7": 3.7956905245780943,
"epoch": 0.932,
"grad_norm": 418.0,
"kl_loss_10": 99.70074195861817,
"kl_loss_2": 1085.114028930664,
"kl_loss_3": 747.7518859863281,
"kl_loss_7": 189.80009078979492,
"learning_rate": 1.1595816670552429e-05,
"loss": 536.128,
"step": 9320
},
{
"ce_loss_10": 3.6811413764953613,
"ce_loss_13": 3.619305157661438,
"ce_loss_2": 4.1267077088356015,
"ce_loss_3": 3.9628111124038696,
"ce_loss_7": 3.7232463002204894,
"epoch": 0.933,
"grad_norm": 430.0,
"kl_loss_10": 98.55138320922852,
"kl_loss_2": 1066.0611297607422,
"kl_loss_3": 732.6245086669921,
"kl_loss_7": 187.06882858276367,
"learning_rate": 1.1258547341323699e-05,
"loss": 518.9695,
"step": 9330
},
{
"ce_loss_10": 3.706856846809387,
"ce_loss_13": 3.6450837016105653,
"ce_loss_2": 4.152973532676697,
"ce_loss_3": 3.9891764402389525,
"ce_loss_7": 3.7481295585632326,
"epoch": 0.934,
"grad_norm": 394.0,
"kl_loss_10": 96.45535087585449,
"kl_loss_2": 1089.2688110351562,
"kl_loss_3": 747.8073425292969,
"kl_loss_7": 187.34025497436522,
"learning_rate": 1.0926199633097156e-05,
"loss": 527.061,
"step": 9340
},
{
"ce_loss_10": 3.7075893759727476,
"ce_loss_13": 3.6489187121391295,
"ce_loss_2": 4.135252356529236,
"ce_loss_3": 3.976875376701355,
"ce_loss_7": 3.747441065311432,
"epoch": 0.935,
"grad_norm": 428.0,
"kl_loss_10": 94.83727493286133,
"kl_loss_2": 1042.2317810058594,
"kl_loss_3": 718.6920349121094,
"kl_loss_7": 181.23108978271483,
"learning_rate": 1.0598776892610684e-05,
"loss": 526.2413,
"step": 9350
},
{
"ce_loss_10": 3.5169559955596923,
"ce_loss_13": 3.4603365540504454,
"ce_loss_2": 3.9802993655204775,
"ce_loss_3": 3.8121800780296327,
"ce_loss_7": 3.561786246299744,
"epoch": 0.936,
"grad_norm": 334.0,
"kl_loss_10": 92.96564292907715,
"kl_loss_2": 1091.1406646728515,
"kl_loss_3": 747.6543731689453,
"kl_loss_7": 183.7804039001465,
"learning_rate": 1.0276282417007399e-05,
"loss": 521.9861,
"step": 9360
},
{
"ce_loss_10": 3.6849416494369507,
"ce_loss_13": 3.626581645011902,
"ce_loss_2": 4.118964040279389,
"ce_loss_3": 3.9585147976875303,
"ce_loss_7": 3.7237794518470766,
"epoch": 0.937,
"grad_norm": 464.0,
"kl_loss_10": 95.02116394042969,
"kl_loss_2": 1044.2026397705079,
"kl_loss_3": 719.8276824951172,
"kl_loss_7": 182.06821365356444,
"learning_rate": 9.958719453803277e-06,
"loss": 518.1707,
"step": 9370
},
{
"ce_loss_10": 3.6774186968803404,
"ce_loss_13": 3.6149828910827635,
"ce_loss_2": 4.126804637908935,
"ce_loss_3": 3.964286994934082,
"ce_loss_7": 3.7206520080566405,
"epoch": 0.938,
"grad_norm": 364.0,
"kl_loss_10": 96.62460212707519,
"kl_loss_2": 1077.0972625732422,
"kl_loss_3": 746.5920196533203,
"kl_loss_7": 186.96116638183594,
"learning_rate": 9.646091200853802e-06,
"loss": 526.3039,
"step": 9380
},
{
"ce_loss_10": 3.633099365234375,
"ce_loss_13": 3.5745465636253355,
"ce_loss_2": 4.0883647203445435,
"ce_loss_3": 3.9242159128189087,
"ce_loss_7": 3.672658348083496,
"epoch": 0.939,
"grad_norm": 398.0,
"kl_loss_10": 93.04219818115234,
"kl_loss_2": 1075.2075500488281,
"kl_loss_3": 738.6250030517579,
"kl_loss_7": 181.5531784057617,
"learning_rate": 9.338400806321978e-06,
"loss": 512.8155,
"step": 9390
},
{
"ce_loss_10": 3.664756190776825,
"ce_loss_13": 3.603893756866455,
"ce_loss_2": 4.104370522499084,
"ce_loss_3": 3.941502547264099,
"ce_loss_7": 3.7107202291488646,
"epoch": 0.94,
"grad_norm": 330.0,
"kl_loss_10": 96.52969932556152,
"kl_loss_2": 1056.286117553711,
"kl_loss_3": 729.6215881347656,
"kl_loss_7": 186.73142929077147,
"learning_rate": 9.035651368646646e-06,
"loss": 517.5048,
"step": 9400
},
{
"ce_loss_10": 3.6749662160873413,
"ce_loss_13": 3.6150254607200623,
"ce_loss_2": 4.108079397678376,
"ce_loss_3": 3.9502787351608277,
"ce_loss_7": 3.71422598361969,
"epoch": 0.941,
"grad_norm": 368.0,
"kl_loss_10": 95.4813446044922,
"kl_loss_2": 1051.3231384277344,
"kl_loss_3": 730.8897918701172,
"kl_loss_7": 183.71395568847657,
"learning_rate": 8.737845936511335e-06,
"loss": 521.5386,
"step": 9410
},
{
"ce_loss_10": 3.621238374710083,
"ce_loss_13": 3.560182070732117,
"ce_loss_2": 4.075435829162598,
"ce_loss_3": 3.906463932991028,
"ce_loss_7": 3.6651031732559205,
"epoch": 0.942,
"grad_norm": 472.0,
"kl_loss_10": 95.50933799743652,
"kl_loss_2": 1087.418194580078,
"kl_loss_3": 749.9641418457031,
"kl_loss_7": 187.3939208984375,
"learning_rate": 8.444987508813451e-06,
"loss": 524.6778,
"step": 9420
},
{
"ce_loss_10": 3.567629599571228,
"ce_loss_13": 3.5098708271980286,
"ce_loss_2": 4.03240327835083,
"ce_loss_3": 3.868740451335907,
"ce_loss_7": 3.614664590358734,
"epoch": 0.943,
"grad_norm": 452.0,
"kl_loss_10": 95.83200073242188,
"kl_loss_2": 1111.0681640625,
"kl_loss_3": 769.0793914794922,
"kl_loss_7": 188.26431045532226,
"learning_rate": 8.157079034633974e-06,
"loss": 533.1891,
"step": 9430
},
{
"ce_loss_10": 3.5664173483848574,
"ce_loss_13": 3.5061603307724,
"ce_loss_2": 4.02851265668869,
"ce_loss_3": 3.862307035923004,
"ce_loss_7": 3.6107182621955873,
"epoch": 0.944,
"grad_norm": 426.0,
"kl_loss_10": 94.98325424194336,
"kl_loss_2": 1109.4172790527343,
"kl_loss_3": 762.6424713134766,
"kl_loss_7": 186.38191299438478,
"learning_rate": 7.874123413208145e-06,
"loss": 528.958,
"step": 9440
},
{
"ce_loss_10": 3.5382938742637635,
"ce_loss_13": 3.481018900871277,
"ce_loss_2": 4.006192743778229,
"ce_loss_3": 3.8386752605438232,
"ce_loss_7": 3.5831608533859254,
"epoch": 0.945,
"grad_norm": 338.0,
"kl_loss_10": 92.47231903076172,
"kl_loss_2": 1088.9563568115234,
"kl_loss_3": 753.4448974609375,
"kl_loss_7": 184.27166213989258,
"learning_rate": 7.59612349389599e-06,
"loss": 527.5225,
"step": 9450
},
{
"ce_loss_10": 3.633445167541504,
"ce_loss_13": 3.5758827209472654,
"ce_loss_2": 4.075440514087677,
"ce_loss_3": 3.9124983310699464,
"ce_loss_7": 3.6780736327171324,
"epoch": 0.946,
"grad_norm": 356.0,
"kl_loss_10": 91.38598556518555,
"kl_loss_2": 1046.8805053710937,
"kl_loss_3": 718.2211791992188,
"kl_loss_7": 180.72154998779297,
"learning_rate": 7.323082076153509e-06,
"loss": 519.5404,
"step": 9460
},
{
"ce_loss_10": 3.675933361053467,
"ce_loss_13": 3.616945672035217,
"ce_loss_2": 4.116010129451752,
"ce_loss_3": 3.954231834411621,
"ce_loss_7": 3.7195321679115296,
"epoch": 0.947,
"grad_norm": 376.0,
"kl_loss_10": 96.42714042663575,
"kl_loss_2": 1051.1879852294921,
"kl_loss_3": 727.5513549804688,
"kl_loss_7": 186.51647338867187,
"learning_rate": 7.055001909504755e-06,
"loss": 525.7655,
"step": 9470
},
{
"ce_loss_10": 3.7083083152770997,
"ce_loss_13": 3.647673761844635,
"ce_loss_2": 4.157342481613159,
"ce_loss_3": 3.991931939125061,
"ce_loss_7": 3.752028775215149,
"epoch": 0.948,
"grad_norm": 344.0,
"kl_loss_10": 96.79825706481934,
"kl_loss_2": 1084.5101806640625,
"kl_loss_3": 742.6272155761719,
"kl_loss_7": 187.0098518371582,
"learning_rate": 6.791885693514133e-06,
"loss": 528.4126,
"step": 9480
},
{
"ce_loss_10": 3.6131741404533386,
"ce_loss_13": 3.554737401008606,
"ce_loss_2": 4.069884133338928,
"ce_loss_3": 3.910088050365448,
"ce_loss_7": 3.657594072818756,
"epoch": 0.949,
"grad_norm": 444.0,
"kl_loss_10": 95.54262161254883,
"kl_loss_2": 1090.819403076172,
"kl_loss_3": 755.8211273193359,
"kl_loss_7": 187.30291366577148,
"learning_rate": 6.533736077758867e-06,
"loss": 532.407,
"step": 9490
},
{
"ce_loss_10": 3.5753329753875733,
"ce_loss_13": 3.5157718658447266,
"ce_loss_2": 4.050174379348755,
"ce_loss_3": 3.878748118877411,
"ce_loss_7": 3.621631395816803,
"epoch": 0.95,
"grad_norm": 454.0,
"kl_loss_10": 95.78313636779785,
"kl_loss_2": 1112.5021850585938,
"kl_loss_3": 766.8859832763671,
"kl_loss_7": 188.93851776123046,
"learning_rate": 6.2805556618028556e-06,
"loss": 531.8975,
"step": 9500
},
{
"ce_loss_10": 3.6739890694618227,
"ce_loss_13": 3.614563775062561,
"ce_loss_2": 4.105420649051666,
"ce_loss_3": 3.946949827671051,
"ce_loss_7": 3.713826298713684,
"epoch": 0.951,
"grad_norm": 428.0,
"kl_loss_10": 95.29025764465332,
"kl_loss_2": 1035.753839111328,
"kl_loss_3": 718.9863189697265,
"kl_loss_7": 182.34558639526367,
"learning_rate": 6.032346995169968e-06,
"loss": 506.1833,
"step": 9510
},
{
"ce_loss_10": 3.6744378566741944,
"ce_loss_13": 3.6160669803619383,
"ce_loss_2": 4.116178596019745,
"ce_loss_3": 3.952050065994263,
"ce_loss_7": 3.714146387577057,
"epoch": 0.952,
"grad_norm": 350.0,
"kl_loss_10": 95.77439384460449,
"kl_loss_2": 1065.6743865966796,
"kl_loss_3": 734.3932067871094,
"kl_loss_7": 184.87170867919923,
"learning_rate": 5.789112577318789e-06,
"loss": 520.2576,
"step": 9520
},
{
"ce_loss_10": 3.6489309549331663,
"ce_loss_13": 3.5895671963691713,
"ce_loss_2": 4.11376656293869,
"ce_loss_3": 3.946073520183563,
"ce_loss_7": 3.6925018429756165,
"epoch": 0.953,
"grad_norm": 460.0,
"kl_loss_10": 96.73359451293945,
"kl_loss_2": 1111.601629638672,
"kl_loss_3": 771.5278289794921,
"kl_loss_7": 187.8802848815918,
"learning_rate": 5.550854857617194e-06,
"loss": 527.3308,
"step": 9530
},
{
"ce_loss_10": 3.6415695905685426,
"ce_loss_13": 3.579833471775055,
"ce_loss_2": 4.102292227745056,
"ce_loss_3": 3.9383127331733703,
"ce_loss_7": 3.6863919377326964,
"epoch": 0.954,
"grad_norm": 398.0,
"kl_loss_10": 98.16804580688476,
"kl_loss_2": 1097.6046325683594,
"kl_loss_3": 757.5784729003906,
"kl_loss_7": 190.50857543945312,
"learning_rate": 5.317576235317756e-06,
"loss": 527.9396,
"step": 9540
},
{
"ce_loss_10": 3.6651427507400514,
"ce_loss_13": 3.604920470714569,
"ce_loss_2": 4.100248050689697,
"ce_loss_3": 3.94064177274704,
"ce_loss_7": 3.7060970425605775,
"epoch": 0.955,
"grad_norm": 386.0,
"kl_loss_10": 96.45015525817871,
"kl_loss_2": 1031.3038146972656,
"kl_loss_3": 712.4996978759766,
"kl_loss_7": 182.76630401611328,
"learning_rate": 5.089279059533658e-06,
"loss": 524.0002,
"step": 9550
},
{
"ce_loss_10": 3.7266568183898925,
"ce_loss_13": 3.663935911655426,
"ce_loss_2": 4.170814108848572,
"ce_loss_3": 4.006054651737213,
"ce_loss_7": 3.769794237613678,
"epoch": 0.956,
"grad_norm": 386.0,
"kl_loss_10": 100.15878944396972,
"kl_loss_2": 1068.9294799804688,
"kl_loss_3": 738.0209930419921,
"kl_loss_7": 192.08404541015625,
"learning_rate": 4.865965629214819e-06,
"loss": 520.8748,
"step": 9560
},
{
"ce_loss_10": 3.670477032661438,
"ce_loss_13": 3.611146903038025,
"ce_loss_2": 4.115479242801666,
"ce_loss_3": 3.9537983894348145,
"ce_loss_7": 3.7129539370536806,
"epoch": 0.957,
"grad_norm": 496.0,
"kl_loss_10": 96.79973983764648,
"kl_loss_2": 1085.6631072998048,
"kl_loss_3": 749.8902404785156,
"kl_loss_7": 188.74480895996095,
"learning_rate": 4.6476381931251366e-06,
"loss": 519.6521,
"step": 9570
},
{
"ce_loss_10": 3.646716892719269,
"ce_loss_13": 3.5878213763237,
"ce_loss_2": 4.089986479282379,
"ce_loss_3": 3.9314276933670045,
"ce_loss_7": 3.6911307334899903,
"epoch": 0.958,
"grad_norm": 318.0,
"kl_loss_10": 94.01541290283203,
"kl_loss_2": 1067.8105712890624,
"kl_loss_3": 740.1676208496094,
"kl_loss_7": 184.118741607666,
"learning_rate": 4.434298949819449e-06,
"loss": 523.6254,
"step": 9580
},
{
"ce_loss_10": 3.6008993268013,
"ce_loss_13": 3.538570249080658,
"ce_loss_2": 4.069638097286225,
"ce_loss_3": 3.8975520372390746,
"ce_loss_7": 3.6453381776809692,
"epoch": 0.959,
"grad_norm": 440.0,
"kl_loss_10": 97.41343994140625,
"kl_loss_2": 1125.892025756836,
"kl_loss_3": 772.14267578125,
"kl_loss_7": 189.9515396118164,
"learning_rate": 4.2259500476214406e-06,
"loss": 534.6609,
"step": 9590
},
{
"ce_loss_10": 3.58458696603775,
"ce_loss_13": 3.52560031414032,
"ce_loss_2": 4.040603399276733,
"ce_loss_3": 3.8742735624313354,
"ce_loss_7": 3.627805030345917,
"epoch": 0.96,
"grad_norm": 388.0,
"kl_loss_10": 94.08248367309571,
"kl_loss_2": 1083.009814453125,
"kl_loss_3": 746.2331970214843,
"kl_loss_7": 184.85717010498047,
"learning_rate": 4.02259358460233e-06,
"loss": 521.7564,
"step": 9600
},
{
"ce_loss_10": 3.6558929800987245,
"ce_loss_13": 3.5954962849617003,
"ce_loss_2": 4.101473760604859,
"ce_loss_3": 3.9380804181098936,
"ce_loss_7": 3.6987645506858824,
"epoch": 0.961,
"grad_norm": 544.0,
"kl_loss_10": 95.69773292541504,
"kl_loss_2": 1060.7937774658203,
"kl_loss_3": 733.2102172851562,
"kl_loss_7": 185.71547775268556,
"learning_rate": 3.8242316085594916e-06,
"loss": 516.8465,
"step": 9610
},
{
"ce_loss_10": 3.5343406558036805,
"ce_loss_13": 3.4767986059188845,
"ce_loss_2": 4.016193747520447,
"ce_loss_3": 3.8443652629852294,
"ce_loss_7": 3.580942380428314,
"epoch": 0.962,
"grad_norm": 366.0,
"kl_loss_10": 93.89258918762206,
"kl_loss_2": 1123.5916809082032,
"kl_loss_3": 780.3413696289062,
"kl_loss_7": 187.34277801513673,
"learning_rate": 3.630866116995757e-06,
"loss": 546.1011,
"step": 9620
},
{
"ce_loss_10": 3.6960983991622927,
"ce_loss_13": 3.635801446437836,
"ce_loss_2": 4.132487082481385,
"ce_loss_3": 3.9690314412117003,
"ce_loss_7": 3.737609100341797,
"epoch": 0.963,
"grad_norm": 312.0,
"kl_loss_10": 96.57149925231934,
"kl_loss_2": 1044.7675506591797,
"kl_loss_3": 718.4659484863281,
"kl_loss_7": 183.9634048461914,
"learning_rate": 3.4424990570994797e-06,
"loss": 523.2208,
"step": 9630
},
{
"ce_loss_10": 3.685701644420624,
"ce_loss_13": 3.624559962749481,
"ce_loss_2": 4.128798627853394,
"ce_loss_3": 3.968520772457123,
"ce_loss_7": 3.7257295846939087,
"epoch": 0.964,
"grad_norm": 280.0,
"kl_loss_10": 95.63589668273926,
"kl_loss_2": 1068.9191833496093,
"kl_loss_3": 737.6691131591797,
"kl_loss_7": 184.7596893310547,
"learning_rate": 3.2591323257248896e-06,
"loss": 522.5564,
"step": 9640
},
{
"ce_loss_10": 3.5315052390098574,
"ce_loss_13": 3.4732569575309755,
"ce_loss_2": 3.99234676361084,
"ce_loss_3": 3.822614312171936,
"ce_loss_7": 3.5727542638778687,
"epoch": 0.965,
"grad_norm": 338.0,
"kl_loss_10": 93.59828681945801,
"kl_loss_2": 1088.8541290283204,
"kl_loss_3": 750.0034118652344,
"kl_loss_7": 183.51124572753906,
"learning_rate": 3.0807677693729385e-06,
"loss": 528.9641,
"step": 9650
},
{
"ce_loss_10": 3.721923458576202,
"ce_loss_13": 3.6635044693946837,
"ce_loss_2": 4.157553017139435,
"ce_loss_3": 3.9980939745903017,
"ce_loss_7": 3.7649829506874086,
"epoch": 0.966,
"grad_norm": 328.0,
"kl_loss_10": 95.77610893249512,
"kl_loss_2": 1046.733694458008,
"kl_loss_3": 723.9284912109375,
"kl_loss_7": 183.63089752197266,
"learning_rate": 2.9074071841727055e-06,
"loss": 513.6759,
"step": 9660
},
{
"ce_loss_10": 3.6491685032844545,
"ce_loss_13": 3.5898547172546387,
"ce_loss_2": 4.10191251039505,
"ce_loss_3": 3.9377865552902223,
"ce_loss_7": 3.694057583808899,
"epoch": 0.967,
"grad_norm": 410.0,
"kl_loss_10": 94.75908012390137,
"kl_loss_2": 1074.1435485839843,
"kl_loss_3": 739.0172424316406,
"kl_loss_7": 185.9965072631836,
"learning_rate": 2.739052315863355e-06,
"loss": 514.4849,
"step": 9670
},
{
"ce_loss_10": 3.6381678700447084,
"ce_loss_13": 3.5745797991752624,
"ce_loss_2": 4.085923862457276,
"ce_loss_3": 3.9223034262657164,
"ce_loss_7": 3.679898130893707,
"epoch": 0.968,
"grad_norm": 400.0,
"kl_loss_10": 98.94500389099122,
"kl_loss_2": 1071.759048461914,
"kl_loss_3": 742.1754821777344,
"kl_loss_7": 186.4635383605957,
"learning_rate": 2.5757048597765396e-06,
"loss": 520.3133,
"step": 9680
},
{
"ce_loss_10": 3.6451838970184327,
"ce_loss_13": 3.584810471534729,
"ce_loss_2": 4.096628618240357,
"ce_loss_3": 3.9352465867996216,
"ce_loss_7": 3.6861096024513245,
"epoch": 0.969,
"grad_norm": 354.0,
"kl_loss_10": 95.84736633300781,
"kl_loss_2": 1089.9881774902344,
"kl_loss_3": 753.175894165039,
"kl_loss_7": 186.62908554077148,
"learning_rate": 2.417366460819359e-06,
"loss": 527.3621,
"step": 9690
},
{
"ce_loss_10": 3.6515438675880434,
"ce_loss_13": 3.5902424931526182,
"ce_loss_2": 4.121148645877838,
"ce_loss_3": 3.9508870244026184,
"ce_loss_7": 3.6973974823951723,
"epoch": 0.97,
"grad_norm": 378.0,
"kl_loss_10": 97.83438453674316,
"kl_loss_2": 1114.7044860839844,
"kl_loss_3": 766.2058898925782,
"kl_loss_7": 189.7753791809082,
"learning_rate": 2.2640387134577057e-06,
"loss": 528.5559,
"step": 9700
},
{
"ce_loss_10": 3.579375672340393,
"ce_loss_13": 3.5232587337493895,
"ce_loss_2": 4.008558976650238,
"ce_loss_3": 3.853218126296997,
"ce_loss_7": 3.621316111087799,
"epoch": 0.971,
"grad_norm": 346.0,
"kl_loss_10": 89.91974563598633,
"kl_loss_2": 1025.9577575683593,
"kl_loss_3": 709.8189392089844,
"kl_loss_7": 177.37805099487304,
"learning_rate": 2.115723161700278e-06,
"loss": 511.7921,
"step": 9710
},
{
"ce_loss_10": 3.5539788961410523,
"ce_loss_13": 3.493563008308411,
"ce_loss_2": 4.019102883338928,
"ce_loss_3": 3.8513848066329954,
"ce_loss_7": 3.6021719098091127,
"epoch": 0.972,
"grad_norm": 450.0,
"kl_loss_10": 97.08839912414551,
"kl_loss_2": 1102.8951354980468,
"kl_loss_3": 763.9947265625,
"kl_loss_7": 189.97327194213867,
"learning_rate": 1.9724212990830937e-06,
"loss": 534.7647,
"step": 9720
},
{
"ce_loss_10": 3.7055511236190797,
"ce_loss_13": 3.645791494846344,
"ce_loss_2": 4.164284873008728,
"ce_loss_3": 3.998200333118439,
"ce_loss_7": 3.748906970024109,
"epoch": 0.973,
"grad_norm": 306.0,
"kl_loss_10": 97.2132583618164,
"kl_loss_2": 1086.074758911133,
"kl_loss_3": 748.4372924804687,
"kl_loss_7": 187.37096481323243,
"learning_rate": 1.8341345686543331e-06,
"loss": 526.9427,
"step": 9730
},
{
"ce_loss_10": 3.688717949390411,
"ce_loss_13": 3.6282296895980837,
"ce_loss_2": 4.123041558265686,
"ce_loss_3": 3.963315784931183,
"ce_loss_7": 3.731441855430603,
"epoch": 0.974,
"grad_norm": 446.0,
"kl_loss_10": 95.65226020812989,
"kl_loss_2": 1053.469790649414,
"kl_loss_3": 725.7204162597657,
"kl_loss_7": 185.0056625366211,
"learning_rate": 1.7008643629596864e-06,
"loss": 524.4386,
"step": 9740
},
{
"ce_loss_10": 3.674058973789215,
"ce_loss_13": 3.6143284678459167,
"ce_loss_2": 4.119033622741699,
"ce_loss_3": 3.954865837097168,
"ce_loss_7": 3.7161202311515806,
"epoch": 0.975,
"grad_norm": 406.0,
"kl_loss_10": 96.88497962951661,
"kl_loss_2": 1081.9985229492188,
"kl_loss_3": 741.1380645751954,
"kl_loss_7": 186.00157089233397,
"learning_rate": 1.5726120240288633e-06,
"loss": 531.1466,
"step": 9750
},
{
"ce_loss_10": 3.569232928752899,
"ce_loss_13": 3.511077570915222,
"ce_loss_2": 4.014019024372101,
"ce_loss_3": 3.854006791114807,
"ce_loss_7": 3.6116040468215944,
"epoch": 0.976,
"grad_norm": 548.0,
"kl_loss_10": 94.0260025024414,
"kl_loss_2": 1069.765771484375,
"kl_loss_3": 740.6184356689453,
"kl_loss_7": 184.33246154785155,
"learning_rate": 1.4493788433612708e-06,
"loss": 520.0787,
"step": 9760
},
{
"ce_loss_10": 3.6905293107032775,
"ce_loss_13": 3.630939745903015,
"ce_loss_2": 4.1426611065864565,
"ce_loss_3": 3.9766762137413023,
"ce_loss_7": 3.7346643686294554,
"epoch": 0.977,
"grad_norm": 340.0,
"kl_loss_10": 95.88525352478027,
"kl_loss_2": 1083.8150268554687,
"kl_loss_3": 744.1571472167968,
"kl_loss_7": 186.4142593383789,
"learning_rate": 1.3311660619138578e-06,
"loss": 528.903,
"step": 9770
},
{
"ce_loss_10": 3.6836748480796815,
"ce_loss_13": 3.6255483746528627,
"ce_loss_2": 4.109962856769561,
"ce_loss_3": 3.9559788703918457,
"ce_loss_7": 3.7251157641410826,
"epoch": 0.978,
"grad_norm": 358.0,
"kl_loss_10": 94.83126564025879,
"kl_loss_2": 1041.5651123046875,
"kl_loss_3": 719.0739654541015,
"kl_loss_7": 183.43487930297852,
"learning_rate": 1.2179748700879012e-06,
"loss": 517.046,
"step": 9780
},
{
"ce_loss_10": 3.6114102602005005,
"ce_loss_13": 3.553410363197327,
"ce_loss_2": 4.060984289646148,
"ce_loss_3": 3.9008304476737976,
"ce_loss_7": 3.6546178460121155,
"epoch": 0.979,
"grad_norm": 448.0,
"kl_loss_10": 94.51130638122558,
"kl_loss_2": 1070.2796936035156,
"kl_loss_3": 734.1844604492187,
"kl_loss_7": 183.8596923828125,
"learning_rate": 1.1098064077174619e-06,
"loss": 522.2918,
"step": 9790
},
{
"ce_loss_10": 3.6468693137168886,
"ce_loss_13": 3.5864970564842222,
"ce_loss_2": 4.112797820568085,
"ce_loss_3": 3.9430916547775268,
"ce_loss_7": 3.6902678489685057,
"epoch": 0.98,
"grad_norm": 396.0,
"kl_loss_10": 94.27075653076172,
"kl_loss_2": 1089.721176147461,
"kl_loss_3": 749.4085998535156,
"kl_loss_7": 185.25458450317382,
"learning_rate": 1.006661764057837e-06,
"loss": 525.8869,
"step": 9800
},
{
"ce_loss_10": 3.6512062191963195,
"ce_loss_13": 3.591748607158661,
"ce_loss_2": 4.100599420070648,
"ce_loss_3": 3.94116724729538,
"ce_loss_7": 3.6929367065429686,
"epoch": 0.981,
"grad_norm": 370.0,
"kl_loss_10": 95.22041091918945,
"kl_loss_2": 1079.5743225097656,
"kl_loss_3": 744.3837371826172,
"kl_loss_7": 184.29767150878905,
"learning_rate": 9.085419777743465e-07,
"loss": 523.8814,
"step": 9810
},
{
"ce_loss_10": 3.5867176413536073,
"ce_loss_13": 3.5297884702682496,
"ce_loss_2": 4.040095067024231,
"ce_loss_3": 3.876063418388367,
"ce_loss_7": 3.6296881198883058,
"epoch": 0.982,
"grad_norm": 372.0,
"kl_loss_10": 92.43338165283203,
"kl_loss_2": 1068.0833831787108,
"kl_loss_3": 736.9401550292969,
"kl_loss_7": 179.86018447875978,
"learning_rate": 8.15448036932176e-07,
"loss": 515.9226,
"step": 9820
},
{
"ce_loss_10": 3.641873502731323,
"ce_loss_13": 3.5830901145935057,
"ce_loss_2": 4.088828957080841,
"ce_loss_3": 3.9226441621780395,
"ce_loss_7": 3.6845821142196655,
"epoch": 0.983,
"grad_norm": 450.0,
"kl_loss_10": 93.74083061218262,
"kl_loss_2": 1074.023745727539,
"kl_loss_3": 742.9956268310547,
"kl_loss_7": 184.04373016357422,
"learning_rate": 7.273808789862724e-07,
"loss": 527.5683,
"step": 9830
},
{
"ce_loss_10": 3.7283427119255066,
"ce_loss_13": 3.6678077578544617,
"ce_loss_2": 4.168006038665771,
"ce_loss_3": 4.004901099205017,
"ce_loss_7": 3.7688368439674376,
"epoch": 0.984,
"grad_norm": 302.0,
"kl_loss_10": 97.79412803649902,
"kl_loss_2": 1069.5015228271484,
"kl_loss_3": 732.57353515625,
"kl_loss_7": 187.13561325073243,
"learning_rate": 6.443413907720186e-07,
"loss": 519.9878,
"step": 9840
},
{
"ce_loss_10": 3.6514281272888183,
"ce_loss_13": 3.5926932096481323,
"ce_loss_2": 4.092191052436829,
"ce_loss_3": 3.930639326572418,
"ce_loss_7": 3.6929845571517945,
"epoch": 0.985,
"grad_norm": 370.0,
"kl_loss_10": 94.32459564208985,
"kl_loss_2": 1056.6608520507812,
"kl_loss_3": 730.5611633300781,
"kl_loss_7": 184.7391357421875,
"learning_rate": 5.663304084960185e-07,
"loss": 518.7671,
"step": 9850
},
{
"ce_loss_10": 3.5804072976112367,
"ce_loss_13": 3.520645248889923,
"ce_loss_2": 4.040905499458313,
"ce_loss_3": 3.8740663886070252,
"ce_loss_7": 3.6240461707115172,
"epoch": 0.986,
"grad_norm": 364.0,
"kl_loss_10": 96.15405921936035,
"kl_loss_2": 1090.2829711914062,
"kl_loss_3": 753.8562805175782,
"kl_loss_7": 186.33775329589844,
"learning_rate": 4.933487177280482e-07,
"loss": 518.7076,
"step": 9860
},
{
"ce_loss_10": 3.6774788737297057,
"ce_loss_13": 3.6186564683914186,
"ce_loss_2": 4.12188241481781,
"ce_loss_3": 3.958666682243347,
"ce_loss_7": 3.7196821093559267,
"epoch": 0.987,
"grad_norm": 408.0,
"kl_loss_10": 94.4230339050293,
"kl_loss_2": 1058.865460205078,
"kl_loss_3": 734.6985687255859,
"kl_loss_7": 181.19112243652344,
"learning_rate": 4.2539705339295075e-07,
"loss": 516.55,
"step": 9870
},
{
"ce_loss_10": 3.525623691082001,
"ce_loss_13": 3.46755028963089,
"ce_loss_2": 3.986714744567871,
"ce_loss_3": 3.828349435329437,
"ce_loss_7": 3.572091352939606,
"epoch": 0.988,
"grad_norm": 376.0,
"kl_loss_10": 93.51777114868165,
"kl_loss_2": 1095.3638244628905,
"kl_loss_3": 760.6636901855469,
"kl_loss_7": 187.32990493774415,
"learning_rate": 3.6247609976319816e-07,
"loss": 523.6327,
"step": 9880
},
{
"ce_loss_10": 3.6277820110321044,
"ce_loss_13": 3.567537808418274,
"ce_loss_2": 4.088609397411346,
"ce_loss_3": 3.924705386161804,
"ce_loss_7": 3.6743552684783936,
"epoch": 0.989,
"grad_norm": 476.0,
"kl_loss_10": 96.44798164367675,
"kl_loss_2": 1082.7717010498047,
"kl_loss_3": 749.9800903320313,
"kl_loss_7": 188.03020095825195,
"learning_rate": 3.0458649045211895e-07,
"loss": 536.5464,
"step": 9890
},
{
"ce_loss_10": 3.596536934375763,
"ce_loss_13": 3.5354753971099853,
"ce_loss_2": 4.064900302886963,
"ce_loss_3": 3.895892357826233,
"ce_loss_7": 3.6420591354370115,
"epoch": 0.99,
"grad_norm": 354.0,
"kl_loss_10": 95.05449028015137,
"kl_loss_2": 1090.392593383789,
"kl_loss_3": 754.5375762939453,
"kl_loss_7": 188.19856491088868,
"learning_rate": 2.517288084074587e-07,
"loss": 534.519,
"step": 9900
},
{
"ce_loss_10": 3.635116171836853,
"ce_loss_13": 3.574588453769684,
"ce_loss_2": 4.111752784252166,
"ce_loss_3": 3.944272756576538,
"ce_loss_7": 3.682980465888977,
"epoch": 0.991,
"grad_norm": 354.0,
"kl_loss_10": 95.2107322692871,
"kl_loss_2": 1113.8304351806642,
"kl_loss_3": 770.2767669677735,
"kl_loss_7": 189.45772018432618,
"learning_rate": 2.0390358590538505e-07,
"loss": 533.4635,
"step": 9910
},
{
"ce_loss_10": 3.644596242904663,
"ce_loss_13": 3.5852715373039246,
"ce_loss_2": 4.097472989559174,
"ce_loss_3": 3.93781635761261,
"ce_loss_7": 3.6912411212921143,
"epoch": 0.992,
"grad_norm": 360.0,
"kl_loss_10": 95.30287055969238,
"kl_loss_2": 1081.2569366455077,
"kl_loss_3": 748.5147064208984,
"kl_loss_7": 189.18702926635743,
"learning_rate": 1.61111304545436e-07,
"loss": 523.9828,
"step": 9920
},
{
"ce_loss_10": 3.6098355293273925,
"ce_loss_13": 3.5515360593795777,
"ce_loss_2": 4.0586741924285885,
"ce_loss_3": 3.895809698104858,
"ce_loss_7": 3.651991581916809,
"epoch": 0.993,
"grad_norm": 408.0,
"kl_loss_10": 94.64987220764161,
"kl_loss_2": 1077.9339660644532,
"kl_loss_3": 744.9157043457031,
"kl_loss_7": 185.0061477661133,
"learning_rate": 1.2335239524541298e-07,
"loss": 518.6489,
"step": 9930
},
{
"ce_loss_10": 3.5815568804740905,
"ce_loss_13": 3.523807632923126,
"ce_loss_2": 4.032287573814392,
"ce_loss_3": 3.871028816699982,
"ce_loss_7": 3.625322496891022,
"epoch": 0.994,
"grad_norm": 396.0,
"kl_loss_10": 94.44433670043945,
"kl_loss_2": 1070.128958129883,
"kl_loss_3": 738.4679992675781,
"kl_loss_7": 184.16616668701172,
"learning_rate": 9.06272382371065e-08,
"loss": 522.681,
"step": 9940
},
{
"ce_loss_10": 3.649178886413574,
"ce_loss_13": 3.5898856997489927,
"ce_loss_2": 4.107921350002289,
"ce_loss_3": 3.9442641377449035,
"ce_loss_7": 3.6927329182624815,
"epoch": 0.995,
"grad_norm": 366.0,
"kl_loss_10": 97.43391189575195,
"kl_loss_2": 1093.5839904785157,
"kl_loss_3": 755.0986083984375,
"kl_loss_7": 187.86676559448242,
"learning_rate": 6.293616306246586e-08,
"loss": 528.0195,
"step": 9950
},
{
"ce_loss_10": 3.6503029584884645,
"ce_loss_13": 3.5912461996078493,
"ce_loss_2": 4.083380007743836,
"ce_loss_3": 3.9251137375831604,
"ce_loss_7": 3.6914992809295653,
"epoch": 0.996,
"grad_norm": 386.0,
"kl_loss_10": 92.51536598205567,
"kl_loss_2": 1047.152279663086,
"kl_loss_3": 724.7971832275391,
"kl_loss_7": 180.47787857055664,
"learning_rate": 4.027944857032395e-08,
"loss": 508.9252,
"step": 9960
},
{
"ce_loss_10": 3.640582966804504,
"ce_loss_13": 3.5819292664527893,
"ce_loss_2": 4.071112728118896,
"ce_loss_3": 3.9061211466789247,
"ce_loss_7": 3.678541886806488,
"epoch": 0.997,
"grad_norm": 332.0,
"kl_loss_10": 94.77192039489746,
"kl_loss_2": 1031.1441497802734,
"kl_loss_3": 710.4731018066407,
"kl_loss_7": 178.55070953369142,
"learning_rate": 2.265732291356626e-08,
"loss": 508.5261,
"step": 9970
},
{
"ce_loss_10": 3.6857742786407472,
"ce_loss_13": 3.6264986276626585,
"ce_loss_2": 4.119817900657654,
"ce_loss_3": 3.9565661191940307,
"ce_loss_7": 3.7256898403167726,
"epoch": 0.998,
"grad_norm": 354.0,
"kl_loss_10": 95.05257987976074,
"kl_loss_2": 1045.786294555664,
"kl_loss_3": 725.6510559082031,
"kl_loss_7": 184.49017181396485,
"learning_rate": 1.0069963546743833e-08,
"loss": 527.3226,
"step": 9980
},
{
"ce_loss_10": 3.66132515668869,
"ce_loss_13": 3.6029880166053774,
"ce_loss_2": 4.108680582046508,
"ce_loss_3": 3.9468571662902834,
"ce_loss_7": 3.7058345794677736,
"epoch": 0.999,
"grad_norm": 358.0,
"kl_loss_10": 95.48841247558593,
"kl_loss_2": 1072.3943481445312,
"kl_loss_3": 746.4591552734375,
"kl_loss_7": 185.93264389038086,
"learning_rate": 2.517497224463483e-09,
"loss": 522.7165,
"step": 9990
},
{
"ce_loss_10": 3.6195040583610534,
"ce_loss_13": 3.559644305706024,
"ce_loss_2": 4.096893215179444,
"ce_loss_3": 3.9238691568374633,
"ce_loss_7": 3.6661298632621766,
"epoch": 1.0,
"grad_norm": 502.0,
"kl_loss_10": 96.35170402526856,
"kl_loss_2": 1110.4292907714844,
"kl_loss_3": 763.8701965332032,
"kl_loss_7": 189.7416961669922,
"learning_rate": 0.0,
"loss": 533.5189,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.177819035608023e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}