{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.375, "eval_steps": 2000, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00025, "grad_norm": 31.5, "learning_rate": 0.0001, "loss": 7.633, "loss/crossentropy": 2.065455098450184, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.20220321230590343, "step": 10 }, { "epoch": 0.0005, "grad_norm": 35.0, "grad_norm_var": 2.6895182291666666, "learning_rate": 0.0001, "loss": 7.4618, "loss/crossentropy": 1.9399560801684856, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.19191570337861777, "step": 20 }, { "epoch": 0.00075, "grad_norm": 37.5, "grad_norm_var": 6.579622395833334, "learning_rate": 0.0001, "loss": 7.5972, "loss/crossentropy": 2.130601316690445, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.20188977513462306, "step": 30 }, { "epoch": 0.001, "grad_norm": 33.5, "grad_norm_var": 6.253125, "learning_rate": 0.0001, "loss": 7.5917, "loss/crossentropy": 2.2571407079696657, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.19847887996584176, "step": 40 }, { "epoch": 0.00125, "grad_norm": 32.25, "grad_norm_var": 2.1619140625, "learning_rate": 0.0001, "loss": 7.6054, "loss/crossentropy": 2.1717565625905992, "loss/hidden": 3.43359375, "loss/jsd": 0.0, "loss/logits": 0.20264342725276946, "step": 50 }, { "epoch": 0.0015, "grad_norm": 35.5, "grad_norm_var": 15.786393229166666, "learning_rate": 0.0001, "loss": 7.5513, "loss/crossentropy": 2.070718301087618, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.19855907820165158, "step": 60 }, { "epoch": 0.00175, "grad_norm": 31.0, "grad_norm_var": 12.4625, "learning_rate": 0.0001, "loss": 7.5447, "loss/crossentropy": 2.118075390160084, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.20283062420785428, "step": 70 }, { "epoch": 0.002, "grad_norm": 32.25, "grad_norm_var": 1.2643229166666667, "learning_rate": 0.0001, "loss": 7.468, "loss/crossentropy": 2.0006178975105287, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.18958428762853147, "step": 80 }, { "epoch": 0.00225, "grad_norm": 30.625, "grad_norm_var": 3.470572916666667, "learning_rate": 0.0001, "loss": 7.5061, "loss/crossentropy": 1.9605075903236866, "loss/hidden": 3.54375, "loss/jsd": 0.0, "loss/logits": 0.20559987109154462, "step": 90 }, { "epoch": 0.0025, "grad_norm": 31.125, "grad_norm_var": 6.763541666666667, "learning_rate": 0.0001, "loss": 7.4928, "loss/crossentropy": 2.1205389350652695, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.19496036488562823, "step": 100 }, { "epoch": 0.00275, "grad_norm": 31.0, "grad_norm_var": 6.1509765625, "learning_rate": 0.0001, "loss": 7.595, "loss/crossentropy": 2.1240097641944886, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.19564666803926228, "step": 110 }, { "epoch": 0.003, "grad_norm": 31.25, "grad_norm_var": 3.348893229166667, "learning_rate": 0.0001, "loss": 7.5329, "loss/crossentropy": 2.175096944719553, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.21303062327206135, "step": 120 }, { "epoch": 0.00325, "grad_norm": 32.0, "grad_norm_var": 2.8541666666666665, "learning_rate": 0.0001, "loss": 7.5536, "loss/crossentropy": 2.1472502022981645, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.18929538186639547, "step": 130 }, { "epoch": 0.0035, "grad_norm": 29.375, "grad_norm_var": 29.683268229166668, "learning_rate": 0.0001, "loss": 7.5191, "loss/crossentropy": 2.015011890232563, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.20328481420874595, "step": 140 }, { "epoch": 0.00375, "grad_norm": 28.75, "grad_norm_var": 28.74765625, "learning_rate": 0.0001, "loss": 7.4158, "loss/crossentropy": 1.9774167470633983, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.19464388117194176, "step": 150 }, { "epoch": 0.004, "grad_norm": 30.875, "grad_norm_var": 1.3635416666666667, "learning_rate": 0.0001, "loss": 7.6354, "loss/crossentropy": 2.320629420876503, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.20745602920651435, "step": 160 }, { "epoch": 0.00425, "grad_norm": 31.5, "grad_norm_var": 1.0270182291666667, "learning_rate": 0.0001, "loss": 7.4137, "loss/crossentropy": 1.900385806709528, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.16769229620695114, "step": 170 }, { "epoch": 0.0045, "grad_norm": 31.25, "grad_norm_var": 0.9833333333333333, "learning_rate": 0.0001, "loss": 7.5763, "loss/crossentropy": 2.129625543951988, "loss/hidden": 3.5171875, "loss/jsd": 0.0, "loss/logits": 0.2102549459785223, "step": 180 }, { "epoch": 0.00475, "grad_norm": 32.25, "grad_norm_var": 3.05390625, "learning_rate": 0.0001, "loss": 7.6166, "loss/crossentropy": 2.1552532628178596, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.2250068686902523, "step": 190 }, { "epoch": 0.005, "grad_norm": 29.625, "grad_norm_var": 3.8375, "learning_rate": 0.0001, "loss": 7.5745, "loss/crossentropy": 1.9441482461988926, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.195942450594157, "step": 200 }, { "epoch": 0.00525, "grad_norm": 32.5, "grad_norm_var": 18.396875, "learning_rate": 0.0001, "loss": 7.5292, "loss/crossentropy": 1.9941987417638303, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.18264975901693106, "step": 210 }, { "epoch": 0.0055, "grad_norm": 31.75, "grad_norm_var": 20.736393229166666, "learning_rate": 0.0001, "loss": 7.4899, "loss/crossentropy": 2.0191620789468288, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.18100650198757648, "step": 220 }, { "epoch": 0.00575, "grad_norm": 30.375, "grad_norm_var": 2.342643229166667, "learning_rate": 0.0001, "loss": 7.5199, "loss/crossentropy": 2.001779730618, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.17959208656102418, "step": 230 }, { "epoch": 0.006, "grad_norm": 30.75, "grad_norm_var": 1.271875, "learning_rate": 0.0001, "loss": 7.6842, "loss/crossentropy": 2.1846971333026888, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.2059234745800495, "step": 240 }, { "epoch": 0.00625, "grad_norm": 29.5, "grad_norm_var": 5.688541666666667, "learning_rate": 0.0001, "loss": 7.5196, "loss/crossentropy": 2.174124576151371, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.20000722594559192, "step": 250 }, { "epoch": 0.0065, "grad_norm": 28.75, "grad_norm_var": 1.9572265625, "learning_rate": 0.0001, "loss": 7.3875, "loss/crossentropy": 1.9285166233778, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.18449910767376423, "step": 260 }, { "epoch": 0.00675, "grad_norm": 33.5, "grad_norm_var": 2.0999348958333335, "learning_rate": 0.0001, "loss": 7.5877, "loss/crossentropy": 2.0323276594281197, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.19395631980150937, "step": 270 }, { "epoch": 0.007, "grad_norm": 30.5, "grad_norm_var": 2.15390625, "learning_rate": 0.0001, "loss": 7.5791, "loss/crossentropy": 2.126656140387058, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.21661139875650406, "step": 280 }, { "epoch": 0.00725, "grad_norm": 29.5, "grad_norm_var": 3.193489583333333, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.200097793340683, "loss/hidden": 3.529296875, "loss/jsd": 0.0, "loss/logits": 0.21046234332025052, "step": 290 }, { "epoch": 0.0075, "grad_norm": 26.75, "grad_norm_var": 4.27265625, "learning_rate": 0.0001, "loss": 7.5404, "loss/crossentropy": 2.1184144005179406, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.20949590150266886, "step": 300 }, { "epoch": 0.00775, "grad_norm": 33.0, "grad_norm_var": 3.3643229166666666, "learning_rate": 0.0001, "loss": 7.5628, "loss/crossentropy": 1.9984030593186617, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.18789457948878407, "step": 310 }, { "epoch": 0.008, "grad_norm": 32.5, "grad_norm_var": 2.5645182291666666, "learning_rate": 0.0001, "loss": 7.5695, "loss/crossentropy": 2.143594169616699, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.19360470157116652, "step": 320 }, { "epoch": 0.00825, "grad_norm": 29.375, "grad_norm_var": 1.8749348958333334, "learning_rate": 0.0001, "loss": 7.3627, "loss/crossentropy": 2.1077703177928924, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.19771252572536469, "step": 330 }, { "epoch": 0.0085, "grad_norm": 29.75, "grad_norm_var": 1.5978515625, "learning_rate": 0.0001, "loss": 7.4192, "loss/crossentropy": 2.0583472289144993, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.20189273860305548, "step": 340 }, { "epoch": 0.00875, "grad_norm": 29.875, "grad_norm_var": 1.2872395833333334, "learning_rate": 0.0001, "loss": 7.5432, "loss/crossentropy": 2.0804511278867723, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.19735569059848784, "step": 350 }, { "epoch": 0.009, "grad_norm": 30.5, "grad_norm_var": 18.731184895833334, "learning_rate": 0.0001, "loss": 7.4948, "loss/crossentropy": 2.0466629534959795, "loss/hidden": 3.315234375, "loss/jsd": 0.0, "loss/logits": 0.18366040643304588, "step": 360 }, { "epoch": 0.00925, "grad_norm": 30.875, "grad_norm_var": 25.9916015625, "learning_rate": 0.0001, "loss": 7.5081, "loss/crossentropy": 1.9005662694573402, "loss/hidden": 3.501171875, "loss/jsd": 0.0, "loss/logits": 0.1900689721107483, "step": 370 }, { "epoch": 0.0095, "grad_norm": 28.75, "grad_norm_var": 2.451041666666667, "learning_rate": 0.0001, "loss": 7.4305, "loss/crossentropy": 2.0674299761652946, "loss/hidden": 3.517578125, "loss/jsd": 0.0, "loss/logits": 0.21062961965799332, "step": 380 }, { "epoch": 0.00975, "grad_norm": 31.25, "grad_norm_var": 5.645247395833334, "learning_rate": 0.0001, "loss": 7.5168, "loss/crossentropy": 2.0279919117689134, "loss/hidden": 3.503125, "loss/jsd": 0.0, "loss/logits": 0.20519332773983479, "step": 390 }, { "epoch": 0.01, "grad_norm": 31.125, "grad_norm_var": 5.928125, "learning_rate": 0.0001, "loss": 7.4985, "loss/crossentropy": 2.0427632443606853, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.20287631042301654, "step": 400 }, { "epoch": 0.01025, "grad_norm": 38.5, "grad_norm_var": 438.43515625, "learning_rate": 0.0001, "loss": 7.5633, "loss/crossentropy": 2.199043881893158, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.21130343191325665, "step": 410 }, { "epoch": 0.0105, "grad_norm": 30.875, "grad_norm_var": 43.14140625, "learning_rate": 0.0001, "loss": 7.4835, "loss/crossentropy": 1.9102243572473525, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.1895731385797262, "step": 420 }, { "epoch": 0.01075, "grad_norm": 31.75, "grad_norm_var": 5.658268229166667, "learning_rate": 0.0001, "loss": 7.3897, "loss/crossentropy": 2.159160128980875, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.20280379485338926, "step": 430 }, { "epoch": 0.011, "grad_norm": 28.375, "grad_norm_var": 16.3375, "learning_rate": 0.0001, "loss": 7.5463, "loss/crossentropy": 2.1217672407627104, "loss/hidden": 3.545703125, "loss/jsd": 0.0, "loss/logits": 0.23856931366026402, "step": 440 }, { "epoch": 0.01125, "grad_norm": 30.5, "grad_norm_var": 17.098372395833334, "learning_rate": 0.0001, "loss": 7.5225, "loss/crossentropy": 1.969854873791337, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.19548849146813155, "step": 450 }, { "epoch": 0.0115, "grad_norm": 29.875, "grad_norm_var": 2.5677083333333335, "learning_rate": 0.0001, "loss": 7.5046, "loss/crossentropy": 2.121321603655815, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.19364523217082025, "step": 460 }, { "epoch": 0.01175, "grad_norm": 32.25, "grad_norm_var": 8.585416666666667, "learning_rate": 0.0001, "loss": 7.4558, "loss/crossentropy": 1.9360710382461548, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.1893781816586852, "step": 470 }, { "epoch": 0.012, "grad_norm": 29.875, "grad_norm_var": 3.417122395833333, "learning_rate": 0.0001, "loss": 7.531, "loss/crossentropy": 2.082458943128586, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.2220946006476879, "step": 480 }, { "epoch": 0.01225, "grad_norm": 31.0, "grad_norm_var": 48.96640625, "learning_rate": 0.0001, "loss": 7.5651, "loss/crossentropy": 2.1382531195878984, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.20847559962421655, "step": 490 }, { "epoch": 0.0125, "grad_norm": 29.875, "grad_norm_var": 49.2666015625, "learning_rate": 0.0001, "loss": 7.5679, "loss/crossentropy": 2.0875915244221686, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.1850985599681735, "step": 500 }, { "epoch": 0.01275, "grad_norm": 31.875, "grad_norm_var": 1.45, "learning_rate": 0.0001, "loss": 7.5263, "loss/crossentropy": 2.182442346215248, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.19555890336632728, "step": 510 }, { "epoch": 0.013, "grad_norm": 34.0, "grad_norm_var": 1.6931640625, "learning_rate": 0.0001, "loss": 7.5209, "loss/crossentropy": 1.9812136888504028, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.1965757070109248, "step": 520 }, { "epoch": 0.01325, "grad_norm": 31.0, "grad_norm_var": 2.101822916666667, "learning_rate": 0.0001, "loss": 7.6059, "loss/crossentropy": 2.0372241511940956, "loss/hidden": 3.564453125, "loss/jsd": 0.0, "loss/logits": 0.204646560549736, "step": 530 }, { "epoch": 0.0135, "grad_norm": 29.125, "grad_norm_var": 20.071875, "learning_rate": 0.0001, "loss": 7.5725, "loss/crossentropy": 2.155761349201202, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.19602423422038556, "step": 540 }, { "epoch": 0.01375, "grad_norm": 29.125, "grad_norm_var": 20.506705729166665, "learning_rate": 0.0001, "loss": 7.5842, "loss/crossentropy": 1.8869566857814788, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.20957522764801978, "step": 550 }, { "epoch": 0.014, "grad_norm": 30.625, "grad_norm_var": 10.025455729166667, "learning_rate": 0.0001, "loss": 7.4975, "loss/crossentropy": 2.0370677679777147, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.19026046600192786, "step": 560 }, { "epoch": 0.01425, "grad_norm": 33.0, "grad_norm_var": 2.2270833333333333, "learning_rate": 0.0001, "loss": 7.5688, "loss/crossentropy": 2.1931444257497787, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.2036376902833581, "step": 570 }, { "epoch": 0.0145, "grad_norm": 35.0, "grad_norm_var": 3.5681640625, "learning_rate": 0.0001, "loss": 7.478, "loss/crossentropy": 2.061052493005991, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.2282864760607481, "step": 580 }, { "epoch": 0.01475, "grad_norm": 32.5, "grad_norm_var": 2.8705729166666667, "learning_rate": 0.0001, "loss": 7.5957, "loss/crossentropy": 2.0078392371535303, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.19647251404821872, "step": 590 }, { "epoch": 0.015, "grad_norm": 30.25, "grad_norm_var": 31.449934895833334, "learning_rate": 0.0001, "loss": 7.5096, "loss/crossentropy": 2.0417068414390087, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.19782953998073935, "step": 600 }, { "epoch": 0.01525, "grad_norm": 30.5, "grad_norm_var": 26.253059895833335, "learning_rate": 0.0001, "loss": 7.5368, "loss/crossentropy": 2.1738049775362014, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.1996332859620452, "step": 610 }, { "epoch": 0.0155, "grad_norm": 30.125, "grad_norm_var": 2.334375, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 1.7587297886610032, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.18938990794122218, "step": 620 }, { "epoch": 0.01575, "grad_norm": 29.25, "grad_norm_var": 27.393684895833335, "learning_rate": 0.0001, "loss": 7.4833, "loss/crossentropy": 1.9551145888864994, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.20075901364907622, "step": 630 }, { "epoch": 0.016, "grad_norm": 29.75, "grad_norm_var": 29.6947265625, "learning_rate": 0.0001, "loss": 7.4608, "loss/crossentropy": 2.128718316555023, "loss/hidden": 3.3625, "loss/jsd": 0.0, "loss/logits": 0.19077460393309592, "step": 640 }, { "epoch": 0.01625, "grad_norm": 29.75, "grad_norm_var": 27.322330729166666, "learning_rate": 0.0001, "loss": 7.6033, "loss/crossentropy": 1.9678708665072917, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.18875791020691396, "step": 650 }, { "epoch": 0.0165, "grad_norm": 30.375, "grad_norm_var": 3.129622395833333, "learning_rate": 0.0001, "loss": 7.3873, "loss/crossentropy": 1.9582339562475681, "loss/hidden": 3.34765625, "loss/jsd": 0.0, "loss/logits": 0.18309127148240806, "step": 660 }, { "epoch": 0.01675, "grad_norm": 32.75, "grad_norm_var": 2.7009765625, "learning_rate": 0.0001, "loss": 7.4913, "loss/crossentropy": 2.0773802563548087, "loss/hidden": 3.505078125, "loss/jsd": 0.0, "loss/logits": 0.20910798981785775, "step": 670 }, { "epoch": 0.017, "grad_norm": 34.0, "grad_norm_var": 3.3854166666666665, "learning_rate": 0.0001, "loss": 7.4847, "loss/crossentropy": 2.12913373708725, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.201920267008245, "step": 680 }, { "epoch": 0.01725, "grad_norm": 30.75, "grad_norm_var": 1.7176432291666666, "learning_rate": 0.0001, "loss": 7.5065, "loss/crossentropy": 1.9141538538038732, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.1841401271522045, "step": 690 }, { "epoch": 0.0175, "grad_norm": 31.0, "grad_norm_var": 1.6374348958333333, "learning_rate": 0.0001, "loss": 7.5897, "loss/crossentropy": 2.207232800126076, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.21376523859798907, "step": 700 }, { "epoch": 0.01775, "grad_norm": 32.75, "grad_norm_var": 2.3655598958333335, "learning_rate": 0.0001, "loss": 7.5075, "loss/crossentropy": 2.03845998942852, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.1920805646572262, "step": 710 }, { "epoch": 0.018, "grad_norm": 32.5, "grad_norm_var": 1.3893229166666667, "learning_rate": 0.0001, "loss": 7.4669, "loss/crossentropy": 2.054341807588935, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.19716067584231495, "step": 720 }, { "epoch": 0.01825, "grad_norm": 31.625, "grad_norm_var": 3.54140625, "learning_rate": 0.0001, "loss": 7.517, "loss/crossentropy": 2.2111608639359472, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.20262118335813284, "step": 730 }, { "epoch": 0.0185, "grad_norm": 29.125, "grad_norm_var": 4.692122395833334, "learning_rate": 0.0001, "loss": 7.4784, "loss/crossentropy": 2.0551758617162705, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.20378697756677866, "step": 740 }, { "epoch": 0.01875, "grad_norm": 33.0, "grad_norm_var": 4.295572916666667, "learning_rate": 0.0001, "loss": 7.4016, "loss/crossentropy": 2.128055375814438, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.19904747987166047, "step": 750 }, { "epoch": 0.019, "grad_norm": 6106906624.0, "grad_norm_var": 2.3308942582349476e+18, "learning_rate": 0.0001, "loss": 7.4633, "loss/crossentropy": 2.248567137122154, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.19723597317934036, "step": 760 }, { "epoch": 0.01925, "grad_norm": 28.5, "grad_norm_var": 2.330894258158611e+18, "learning_rate": 0.0001, "loss": 7.4542, "loss/crossentropy": 2.132212319970131, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.18174959290772677, "step": 770 }, { "epoch": 0.0195, "grad_norm": 36.5, "grad_norm_var": 4.833333333333333, "learning_rate": 0.0001, "loss": 7.465, "loss/crossentropy": 2.046277052164078, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.21161840241402388, "step": 780 }, { "epoch": 0.01975, "grad_norm": 32.75, "grad_norm_var": 5.137434895833334, "learning_rate": 0.0001, "loss": 7.4171, "loss/crossentropy": 2.058088332414627, "loss/hidden": 3.315234375, "loss/jsd": 0.0, "loss/logits": 0.1815673651173711, "step": 790 }, { "epoch": 0.02, "grad_norm": 30.125, "grad_norm_var": 12.37265625, "learning_rate": 0.0001, "loss": 7.4153, "loss/crossentropy": 2.064726157486439, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.19402222614735365, "step": 800 }, { "epoch": 0.02025, "grad_norm": 32.0, "grad_norm_var": 12.240625, "learning_rate": 0.0001, "loss": 7.3739, "loss/crossentropy": 2.0926051691174505, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.21017331834882497, "step": 810 }, { "epoch": 0.0205, "grad_norm": 31.875, "grad_norm_var": 3.6853515625, "learning_rate": 0.0001, "loss": 7.409, "loss/crossentropy": 2.016859006881714, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.20363395065069198, "step": 820 }, { "epoch": 0.02075, "grad_norm": 34.0, "grad_norm_var": 278.1108723958333, "learning_rate": 0.0001, "loss": 7.6725, "loss/crossentropy": 2.03957434669137, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.19866096526384353, "step": 830 }, { "epoch": 0.021, "grad_norm": 35.75, "grad_norm_var": 281.2239583333333, "learning_rate": 0.0001, "loss": 7.4058, "loss/crossentropy": 2.1190530106425287, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19663097113370895, "step": 840 }, { "epoch": 0.02125, "grad_norm": 32.25, "grad_norm_var": 4.044791666666667, "learning_rate": 0.0001, "loss": 7.4687, "loss/crossentropy": 2.1552326917648315, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19604418501257898, "step": 850 }, { "epoch": 0.0215, "grad_norm": 37.25, "grad_norm_var": 2.7587362193217157e+18, "learning_rate": 0.0001, "loss": 7.5552, "loss/crossentropy": 2.1164004117250443, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.19724889248609542, "step": 860 }, { "epoch": 0.02175, "grad_norm": 35.25, "grad_norm_var": 2.758736219342478e+18, "learning_rate": 0.0001, "loss": 7.5021, "loss/crossentropy": 2.036998500674963, "loss/hidden": 3.298828125, "loss/jsd": 0.0, "loss/logits": 0.18320635841228067, "step": 870 }, { "epoch": 0.022, "grad_norm": 37.0, "grad_norm_var": 16.9541015625, "learning_rate": 0.0001, "loss": 7.5059, "loss/crossentropy": 1.9707016140222549, "loss/hidden": 3.36328125, "loss/jsd": 0.0, "loss/logits": 0.20436920877546072, "step": 880 }, { "epoch": 0.02225, "grad_norm": 31.375, "grad_norm_var": 30.538541666666667, "learning_rate": 0.0001, "loss": 7.4935, "loss/crossentropy": 2.206394499540329, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.20495780408382416, "step": 890 }, { "epoch": 0.0225, "grad_norm": 29.875, "grad_norm_var": 28.020833333333332, "learning_rate": 0.0001, "loss": 7.4823, "loss/crossentropy": 2.091763325035572, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.20592593550682067, "step": 900 }, { "epoch": 0.02275, "grad_norm": 31.875, "grad_norm_var": 3.5645182291666666, "learning_rate": 0.0001, "loss": 7.422, "loss/crossentropy": 1.9740761511027813, "loss/hidden": 3.494921875, "loss/jsd": 0.0, "loss/logits": 0.2015986293554306, "step": 910 }, { "epoch": 0.023, "grad_norm": 32.0, "grad_norm_var": 56.256184895833336, "learning_rate": 0.0001, "loss": 7.4528, "loss/crossentropy": 2.030415116250515, "loss/hidden": 3.205078125, "loss/jsd": 0.0, "loss/logits": 0.1614784031175077, "step": 920 }, { "epoch": 0.02325, "grad_norm": 30.0, "grad_norm_var": 57.1619140625, "learning_rate": 0.0001, "loss": 7.3713, "loss/crossentropy": 2.0250086903572084, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.19023355115205048, "step": 930 }, { "epoch": 0.0235, "grad_norm": 30.625, "grad_norm_var": 1.3830729166666667, "learning_rate": 0.0001, "loss": 7.5277, "loss/crossentropy": 2.222324788570404, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.19078677501529456, "step": 940 }, { "epoch": 0.02375, "grad_norm": 31.0, "grad_norm_var": 3.1455729166666666, "learning_rate": 0.0001, "loss": 7.5086, "loss/crossentropy": 2.1299516543745995, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.21310927756130696, "step": 950 }, { "epoch": 0.024, "grad_norm": 29.875, "grad_norm_var": 8.883072916666666, "learning_rate": 0.0001, "loss": 7.5579, "loss/crossentropy": 2.0535727672278883, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.18507701791822911, "step": 960 }, { "epoch": 0.02425, "grad_norm": 32.75, "grad_norm_var": 2.5916015625, "learning_rate": 0.0001, "loss": 7.537, "loss/crossentropy": 2.1785535484552385, "loss/hidden": 3.309765625, "loss/jsd": 0.0, "loss/logits": 0.1955953363329172, "step": 970 }, { "epoch": 0.0245, "grad_norm": 36.5, "grad_norm_var": 6.852083333333334, "learning_rate": 0.0001, "loss": 7.5091, "loss/crossentropy": 2.0967498391866686, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.2146583067253232, "step": 980 }, { "epoch": 0.02475, "grad_norm": 29.625, "grad_norm_var": 4.325455729166666, "learning_rate": 0.0001, "loss": 7.5901, "loss/crossentropy": 2.1134474128484726, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.19056662563234567, "step": 990 }, { "epoch": 0.025, "grad_norm": 42.0, "grad_norm_var": 4.1552039405313587e+18, "learning_rate": 0.0001, "loss": 7.6082, "loss/crossentropy": 2.0916516482830048, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.19376826155930757, "step": 1000 }, { "epoch": 0.02525, "grad_norm": 29.625, "grad_norm_var": 4.1552039416015355e+18, "learning_rate": 0.0001, "loss": 7.4528, "loss/crossentropy": 2.003750593960285, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.18129821103066207, "step": 1010 }, { "epoch": 0.0255, "grad_norm": 35.25, "grad_norm_var": 24.095572916666665, "learning_rate": 0.0001, "loss": 7.5395, "loss/crossentropy": 2.0453194856643675, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.199107607267797, "step": 1020 }, { "epoch": 0.02575, "grad_norm": 32.25, "grad_norm_var": 19.5259765625, "learning_rate": 0.0001, "loss": 7.31, "loss/crossentropy": 2.1016619503498077, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.184703135676682, "step": 1030 }, { "epoch": 0.026, "grad_norm": 30.75, "grad_norm_var": 1.87890625, "learning_rate": 0.0001, "loss": 7.5425, "loss/crossentropy": 2.1467826470732687, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.20074132941663264, "step": 1040 }, { "epoch": 0.02625, "grad_norm": 30.625, "grad_norm_var": 0.7452473958333333, "learning_rate": 0.0001, "loss": 7.4114, "loss/crossentropy": 2.049474111199379, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.20267941821366547, "step": 1050 }, { "epoch": 0.0265, "grad_norm": 31.75, "grad_norm_var": 3.124739583333333, "learning_rate": 0.0001, "loss": 7.4845, "loss/crossentropy": 2.036583887040615, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.1893632340244949, "step": 1060 }, { "epoch": 0.02675, "grad_norm": 40.75, "grad_norm_var": 3.405847188209664e+18, "learning_rate": 0.0001, "loss": 7.3982, "loss/crossentropy": 2.124411530792713, "loss/hidden": 3.4484375, "loss/jsd": 0.0, "loss/logits": 0.19454579129815103, "step": 1070 }, { "epoch": 0.027, "grad_norm": 28.25, "grad_norm_var": 3.4058471885941417e+18, "learning_rate": 0.0001, "loss": 7.3928, "loss/crossentropy": 2.0034691862761975, "loss/hidden": 3.503515625, "loss/jsd": 0.0, "loss/logits": 0.21349683087319135, "step": 1080 }, { "epoch": 0.02725, "grad_norm": 29.875, "grad_norm_var": 4.88515625, "learning_rate": 0.0001, "loss": 7.5095, "loss/crossentropy": 1.9183670297265052, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.19249978363513948, "step": 1090 }, { "epoch": 0.0275, "grad_norm": 30.5, "grad_norm_var": 3.2728515625, "learning_rate": 0.0001, "loss": 7.37, "loss/crossentropy": 2.145428071916103, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.19729665387421846, "step": 1100 }, { "epoch": 0.02775, "grad_norm": 31.25, "grad_norm_var": 2.34765625, "learning_rate": 0.0001, "loss": 7.4772, "loss/crossentropy": 2.10652961358428, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.19585925145074726, "step": 1110 }, { "epoch": 0.028, "grad_norm": 31.25, "grad_norm_var": 2.434477049308093e+18, "learning_rate": 0.0001, "loss": 7.4016, "loss/crossentropy": 1.9645449101924897, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.19977953620254993, "step": 1120 }, { "epoch": 0.02825, "grad_norm": 32.0, "grad_norm_var": 2.4344770492950907e+18, "learning_rate": 0.0001, "loss": 7.4453, "loss/crossentropy": 2.131172102689743, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.2083016105927527, "step": 1130 }, { "epoch": 0.0285, "grad_norm": 32.75, "grad_norm_var": 3.7080729166666666, "learning_rate": 0.0001, "loss": 7.4009, "loss/crossentropy": 2.003016713261604, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.18665643623098732, "step": 1140 }, { "epoch": 0.02875, "grad_norm": 30.875, "grad_norm_var": 1.34765625, "learning_rate": 0.0001, "loss": 7.5648, "loss/crossentropy": 2.0709651306271555, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.18793081305921078, "step": 1150 }, { "epoch": 0.029, "grad_norm": 32.25, "grad_norm_var": 2.1582682291666666, "learning_rate": 0.0001, "loss": 7.4644, "loss/crossentropy": 2.06434089243412, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.2109043262898922, "step": 1160 }, { "epoch": 0.02925, "grad_norm": 31.375, "grad_norm_var": 2.4010416666666665, "learning_rate": 0.0001, "loss": 7.4403, "loss/crossentropy": 2.0107607185840606, "loss/hidden": 3.498046875, "loss/jsd": 0.0, "loss/logits": 0.20349722560495137, "step": 1170 }, { "epoch": 0.0295, "grad_norm": 33.25, "grad_norm_var": 1.2260416666666667, "learning_rate": 0.0001, "loss": 7.4412, "loss/crossentropy": 2.096436749398708, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.20087064132094384, "step": 1180 }, { "epoch": 0.02975, "grad_norm": 29.75, "grad_norm_var": 1.8046223958333334, "learning_rate": 0.0001, "loss": 7.4458, "loss/crossentropy": 1.972258360683918, "loss/hidden": 3.583984375, "loss/jsd": 0.0, "loss/logits": 0.20998958311975002, "step": 1190 }, { "epoch": 0.03, "grad_norm": 33.75, "grad_norm_var": 3.7395833333333335, "learning_rate": 0.0001, "loss": 7.3931, "loss/crossentropy": 1.8556599006056786, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.19810242671519518, "step": 1200 }, { "epoch": 0.03025, "grad_norm": 29.0, "grad_norm_var": 9.394791666666666, "learning_rate": 0.0001, "loss": 7.5849, "loss/crossentropy": 2.0611833460628985, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.19216072149574756, "step": 1210 }, { "epoch": 0.0305, "grad_norm": 31.75, "grad_norm_var": 3.26640625, "learning_rate": 0.0001, "loss": 7.4844, "loss/crossentropy": 2.0546294137835504, "loss/hidden": 3.58828125, "loss/jsd": 0.0, "loss/logits": 0.21588555499911308, "step": 1220 }, { "epoch": 0.03075, "grad_norm": 31.625, "grad_norm_var": 2.3968098958333335, "learning_rate": 0.0001, "loss": 7.4858, "loss/crossentropy": 2.0615282475948336, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.206529095210135, "step": 1230 }, { "epoch": 0.031, "grad_norm": 32.0, "grad_norm_var": 1.6124348958333334, "learning_rate": 0.0001, "loss": 7.4647, "loss/crossentropy": 1.9786661133170127, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.17899234425276517, "step": 1240 }, { "epoch": 0.03125, "grad_norm": 5838471168.0, "grad_norm_var": 2.1304840753447437e+18, "learning_rate": 0.0001, "loss": 7.4926, "loss/crossentropy": 2.04936410933733, "loss/hidden": 3.714453125, "loss/jsd": 0.0, "loss/logits": 0.1995564555749297, "step": 1250 }, { "epoch": 0.0315, "grad_norm": 31.25, "grad_norm_var": 2.1304840747304878e+18, "learning_rate": 0.0001, "loss": 7.5078, "loss/crossentropy": 2.1189576953649523, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.19967459067702292, "step": 1260 }, { "epoch": 0.03175, "grad_norm": 30.5, "grad_norm_var": 3.178580729166667, "learning_rate": 0.0001, "loss": 7.4255, "loss/crossentropy": 2.163596141338348, "loss/hidden": 3.4546875, "loss/jsd": 0.0, "loss/logits": 0.19321363251656293, "step": 1270 }, { "epoch": 0.032, "grad_norm": 33.25, "grad_norm_var": 2.1639973958333334, "learning_rate": 0.0001, "loss": 7.4609, "loss/crossentropy": 1.9938266813755035, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.18334759529680014, "step": 1280 }, { "epoch": 0.03225, "grad_norm": 29.375, "grad_norm_var": 1.67890625, "learning_rate": 0.0001, "loss": 7.4652, "loss/crossentropy": 2.161333967000246, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.19740422032773494, "step": 1290 }, { "epoch": 0.0325, "grad_norm": 32.75, "grad_norm_var": 3.0385416666666667, "learning_rate": 0.0001, "loss": 7.3146, "loss/crossentropy": 2.0165325723588468, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.19117104820907116, "step": 1300 }, { "epoch": 0.03275, "grad_norm": 28.25, "grad_norm_var": 9.158072916666667, "learning_rate": 0.0001, "loss": 7.4955, "loss/crossentropy": 2.124955786764622, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.19802952595055104, "step": 1310 }, { "epoch": 0.033, "grad_norm": 30.75, "grad_norm_var": 2.4535807291666667, "learning_rate": 0.0001, "loss": 7.4311, "loss/crossentropy": 2.018800371140242, "loss/hidden": 3.542578125, "loss/jsd": 0.0, "loss/logits": 0.2196814114227891, "step": 1320 }, { "epoch": 0.03325, "grad_norm": 31.375, "grad_norm_var": 2.39375, "learning_rate": 0.0001, "loss": 7.5164, "loss/crossentropy": 2.0520452961325644, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.2013697015121579, "step": 1330 }, { "epoch": 0.0335, "grad_norm": 32.5, "grad_norm_var": 1.0431640625, "learning_rate": 0.0001, "loss": 7.5302, "loss/crossentropy": 2.12932348549366, "loss/hidden": 3.525, "loss/jsd": 0.0, "loss/logits": 0.20245677568018436, "step": 1340 }, { "epoch": 0.03375, "grad_norm": 30.625, "grad_norm_var": 3.3900390625, "learning_rate": 0.0001, "loss": 7.5292, "loss/crossentropy": 2.031618994474411, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.19062725063413383, "step": 1350 }, { "epoch": 0.034, "grad_norm": 32.0, "grad_norm_var": 3.3447265625, "learning_rate": 0.0001, "loss": 7.5755, "loss/crossentropy": 2.2257011234760284, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.1979327043518424, "step": 1360 }, { "epoch": 0.03425, "grad_norm": 30.625, "grad_norm_var": 3.3421223958333335, "learning_rate": 0.0001, "loss": 7.4219, "loss/crossentropy": 2.155778780579567, "loss/hidden": 3.31796875, "loss/jsd": 0.0, "loss/logits": 0.19018295016139747, "step": 1370 }, { "epoch": 0.0345, "grad_norm": 30.25, "grad_norm_var": 2.5872395833333335, "learning_rate": 0.0001, "loss": 7.4637, "loss/crossentropy": 2.058405503630638, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.2114524593576789, "step": 1380 }, { "epoch": 0.03475, "grad_norm": 32.5, "grad_norm_var": 3.2994140625, "learning_rate": 0.0001, "loss": 7.5834, "loss/crossentropy": 2.1654782712459566, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.2024593001231551, "step": 1390 }, { "epoch": 0.035, "grad_norm": 31.125, "grad_norm_var": 12.812239583333334, "learning_rate": 0.0001, "loss": 7.4442, "loss/crossentropy": 2.0921876966953277, "loss/hidden": 3.286328125, "loss/jsd": 0.0, "loss/logits": 0.19270132519304753, "step": 1400 }, { "epoch": 0.03525, "grad_norm": 29.25, "grad_norm_var": 1.5108723958333334, "learning_rate": 0.0001, "loss": 7.4779, "loss/crossentropy": 1.9434148371219635, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.17576389852911234, "step": 1410 }, { "epoch": 0.0355, "grad_norm": 30.125, "grad_norm_var": 2.154166666666667, "learning_rate": 0.0001, "loss": 7.508, "loss/crossentropy": 2.0766889482736586, "loss/hidden": 3.485546875, "loss/jsd": 0.0, "loss/logits": 0.20394362770020963, "step": 1420 }, { "epoch": 0.03575, "grad_norm": 30.125, "grad_norm_var": 17.580208333333335, "learning_rate": 0.0001, "loss": 7.4612, "loss/crossentropy": 2.00380075648427, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.18816210143268108, "step": 1430 }, { "epoch": 0.036, "grad_norm": 31.375, "grad_norm_var": 16.758268229166667, "learning_rate": 0.0001, "loss": 7.4602, "loss/crossentropy": 2.1938020154833793, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.2016971528530121, "step": 1440 }, { "epoch": 0.03625, "grad_norm": 30.875, "grad_norm_var": 1.2556640625, "learning_rate": 0.0001, "loss": 7.4245, "loss/crossentropy": 2.0232372283935547, "loss/hidden": 3.40234375, "loss/jsd": 0.0, "loss/logits": 0.19209201391786337, "step": 1450 }, { "epoch": 0.0365, "grad_norm": 31.0, "grad_norm_var": 1.4041015625, "learning_rate": 0.0001, "loss": 7.5518, "loss/crossentropy": 2.2000616788864136, "loss/hidden": 3.473046875, "loss/jsd": 0.0, "loss/logits": 0.22938326951116322, "step": 1460 }, { "epoch": 0.03675, "grad_norm": 28.375, "grad_norm_var": 2.0322916666666666, "learning_rate": 0.0001, "loss": 7.4397, "loss/crossentropy": 2.0838582158088683, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.20685861641541123, "step": 1470 }, { "epoch": 0.037, "grad_norm": 32.0, "grad_norm_var": 1.5020833333333334, "learning_rate": 0.0001, "loss": 7.4183, "loss/crossentropy": 2.149951633810997, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.1984950641170144, "step": 1480 }, { "epoch": 0.03725, "grad_norm": 33.75, "grad_norm_var": 34.10826822916667, "learning_rate": 0.0001, "loss": 7.453, "loss/crossentropy": 2.128306310623884, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.19783397912979125, "step": 1490 }, { "epoch": 0.0375, "grad_norm": 29.5, "grad_norm_var": 5.008072916666666, "learning_rate": 0.0001, "loss": 7.469, "loss/crossentropy": 2.042660539597273, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.19274956732988358, "step": 1500 }, { "epoch": 0.03775, "grad_norm": 33.0, "grad_norm_var": 19.1775390625, "learning_rate": 0.0001, "loss": 7.4119, "loss/crossentropy": 2.043857058137655, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.18266947232186795, "step": 1510 }, { "epoch": 0.038, "grad_norm": 29.625, "grad_norm_var": 14.303580729166667, "learning_rate": 0.0001, "loss": 7.4362, "loss/crossentropy": 1.9492302805185318, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.1754497304558754, "step": 1520 }, { "epoch": 0.03825, "grad_norm": 29.75, "grad_norm_var": 23.764518229166665, "learning_rate": 0.0001, "loss": 7.4444, "loss/crossentropy": 2.0668226674199106, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.1921279976144433, "step": 1530 }, { "epoch": 0.0385, "grad_norm": 32.75, "grad_norm_var": 3.2226069790467994e+18, "learning_rate": 0.0001, "loss": 7.5077, "loss/crossentropy": 2.1122784771025183, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.22245875597000123, "step": 1540 }, { "epoch": 0.03875, "grad_norm": 30.25, "grad_norm_var": 5.382291666666666, "learning_rate": 0.0001, "loss": 7.4525, "loss/crossentropy": 2.264697426557541, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.2075907403603196, "step": 1550 }, { "epoch": 0.039, "grad_norm": 30.0, "grad_norm_var": 6.353580729166667, "learning_rate": 0.0001, "loss": 7.5064, "loss/crossentropy": 2.1150408178567885, "loss/hidden": 3.5203125, "loss/jsd": 0.0, "loss/logits": 0.23207673486322164, "step": 1560 }, { "epoch": 0.03925, "grad_norm": 34.25, "grad_norm_var": 6.72265625, "learning_rate": 0.0001, "loss": 7.4578, "loss/crossentropy": 2.188142140209675, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.20429779235273599, "step": 1570 }, { "epoch": 0.0395, "grad_norm": 34.75, "grad_norm_var": 897.6666015625, "learning_rate": 0.0001, "loss": 7.434, "loss/crossentropy": 2.0795677445828913, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.18706642352044583, "step": 1580 }, { "epoch": 0.03975, "grad_norm": 28.0, "grad_norm_var": 903.6327473958333, "learning_rate": 0.0001, "loss": 7.5655, "loss/crossentropy": 2.1025844663381577, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.1966788914054632, "step": 1590 }, { "epoch": 0.04, "grad_norm": 28.625, "grad_norm_var": 11.97890625, "learning_rate": 0.0001, "loss": 7.2578, "loss/crossentropy": 2.050418493151665, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.20104087069630622, "step": 1600 }, { "epoch": 0.04025, "grad_norm": 28.0, "grad_norm_var": 2.255989583333333, "learning_rate": 0.0001, "loss": 7.4393, "loss/crossentropy": 2.1767756581306457, "loss/hidden": 3.5140625, "loss/jsd": 0.0, "loss/logits": 0.2213939843699336, "step": 1610 }, { "epoch": 0.0405, "grad_norm": 29.75, "grad_norm_var": 3.80390625, "learning_rate": 0.0001, "loss": 7.5026, "loss/crossentropy": 2.126803469657898, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.19106289148330688, "step": 1620 }, { "epoch": 0.04075, "grad_norm": 32.0, "grad_norm_var": 3.1249348958333334, "learning_rate": 0.0001, "loss": 7.4274, "loss/crossentropy": 2.144256164133549, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.21435861438512802, "step": 1630 }, { "epoch": 0.041, "grad_norm": 30.25, "grad_norm_var": 29.265559895833334, "learning_rate": 0.0001, "loss": 7.5728, "loss/crossentropy": 2.2575725719332693, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.20658138059079648, "step": 1640 }, { "epoch": 0.04125, "grad_norm": 30.5, "grad_norm_var": 48.35390625, "learning_rate": 0.0001, "loss": 7.5776, "loss/crossentropy": 2.096929042041302, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.18803389491513373, "step": 1650 }, { "epoch": 0.0415, "grad_norm": 30.5, "grad_norm_var": 1.1010416666666667, "learning_rate": 0.0001, "loss": 7.3792, "loss/crossentropy": 2.0290944524109364, "loss/hidden": 3.313671875, "loss/jsd": 0.0, "loss/logits": 0.19023821037262678, "step": 1660 }, { "epoch": 0.04175, "grad_norm": 28.125, "grad_norm_var": 33.49270833333333, "learning_rate": 0.0001, "loss": 7.5018, "loss/crossentropy": 2.0678361281752586, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.18862500675022603, "step": 1670 }, { "epoch": 0.042, "grad_norm": 29.75, "grad_norm_var": 2.2955729166666665, "learning_rate": 0.0001, "loss": 7.4432, "loss/crossentropy": 2.0549797296524046, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.19089050237089394, "step": 1680 }, { "epoch": 0.04225, "grad_norm": 29.75, "grad_norm_var": 1.8791666666666667, "learning_rate": 0.0001, "loss": 7.3842, "loss/crossentropy": 2.0077505365014074, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.18722779098898173, "step": 1690 }, { "epoch": 0.0425, "grad_norm": 29.375, "grad_norm_var": 0.9434895833333333, "learning_rate": 0.0001, "loss": 7.4273, "loss/crossentropy": 2.071325332671404, "loss/hidden": 3.486328125, "loss/jsd": 0.0, "loss/logits": 0.20270166713744403, "step": 1700 }, { "epoch": 0.04275, "grad_norm": 38.25, "grad_norm_var": 7.669791666666667, "learning_rate": 0.0001, "loss": 7.4176, "loss/crossentropy": 2.1353142291307448, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.19663168713450432, "step": 1710 }, { "epoch": 0.043, "grad_norm": 28.25, "grad_norm_var": 7.75, "learning_rate": 0.0001, "loss": 7.3818, "loss/crossentropy": 1.9995346069335938, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.18310597026720643, "step": 1720 }, { "epoch": 0.04325, "grad_norm": 29.5, "grad_norm_var": 3.7619140625, "learning_rate": 0.0001, "loss": 7.4912, "loss/crossentropy": 2.1415088951587675, "loss/hidden": 3.55078125, "loss/jsd": 0.0, "loss/logits": 0.22313783299177886, "step": 1730 }, { "epoch": 0.0435, "grad_norm": 31.625, "grad_norm_var": 3.0416666666666665, "learning_rate": 0.0001, "loss": 7.4999, "loss/crossentropy": 2.1686330527067184, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.20409150077030064, "step": 1740 }, { "epoch": 0.04375, "grad_norm": 31.375, "grad_norm_var": 2.724739583333333, "learning_rate": 0.0001, "loss": 7.438, "loss/crossentropy": 1.9411263287067413, "loss/hidden": 3.304296875, "loss/jsd": 0.0, "loss/logits": 0.17631518254056572, "step": 1750 }, { "epoch": 0.044, "grad_norm": 32.0, "grad_norm_var": 1.9145833333333333, "learning_rate": 0.0001, "loss": 7.679, "loss/crossentropy": 2.1614590853452684, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.194198589771986, "step": 1760 }, { "epoch": 0.04425, "grad_norm": 28.5, "grad_norm_var": 2.039322916666667, "learning_rate": 0.0001, "loss": 7.5095, "loss/crossentropy": 2.282147654891014, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.19978236705064772, "step": 1770 }, { "epoch": 0.0445, "grad_norm": 29.625, "grad_norm_var": 2.34140625, "learning_rate": 0.0001, "loss": 7.5296, "loss/crossentropy": 2.2078514605760575, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.19668537452816964, "step": 1780 }, { "epoch": 0.04475, "grad_norm": 30.25, "grad_norm_var": 2.70390625, "learning_rate": 0.0001, "loss": 7.5779, "loss/crossentropy": 2.1053253799676894, "loss/hidden": 3.433984375, "loss/jsd": 0.0, "loss/logits": 0.20323336366564035, "step": 1790 }, { "epoch": 0.045, "grad_norm": 28.5, "grad_norm_var": 4.8712890625, "learning_rate": 0.0001, "loss": 7.4866, "loss/crossentropy": 2.060333488881588, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.18627767637372017, "step": 1800 }, { "epoch": 0.04525, "grad_norm": 28.0, "grad_norm_var": 14.480989583333333, "learning_rate": 0.0001, "loss": 7.5225, "loss/crossentropy": 1.9755317773669958, "loss/hidden": 3.54375, "loss/jsd": 0.0, "loss/logits": 0.20334282671101392, "step": 1810 }, { "epoch": 0.0455, "grad_norm": 29.875, "grad_norm_var": 12.935872395833334, "learning_rate": 0.0001, "loss": 7.4781, "loss/crossentropy": 2.1289859026670457, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.1973018018528819, "step": 1820 }, { "epoch": 0.04575, "grad_norm": 31.75, "grad_norm_var": 2.123893229166667, "learning_rate": 0.0001, "loss": 7.3915, "loss/crossentropy": 1.9609280914068221, "loss/hidden": 3.386328125, "loss/jsd": 0.0, "loss/logits": 0.1916458262130618, "step": 1830 }, { "epoch": 0.046, "grad_norm": 32.0, "grad_norm_var": 1.6332682291666667, "learning_rate": 0.0001, "loss": 7.5095, "loss/crossentropy": 2.0019985377788543, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.19768325993791222, "step": 1840 }, { "epoch": 0.04625, "grad_norm": 29.875, "grad_norm_var": 2.225455729166667, "learning_rate": 0.0001, "loss": 7.623, "loss/crossentropy": 2.0607564479112623, "loss/hidden": 3.507421875, "loss/jsd": 0.0, "loss/logits": 0.20858939345926047, "step": 1850 }, { "epoch": 0.0465, "grad_norm": 29.5, "grad_norm_var": 1.9863932291666666, "learning_rate": 0.0001, "loss": 7.3836, "loss/crossentropy": 2.132562433928251, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.1956317812204361, "step": 1860 }, { "epoch": 0.04675, "grad_norm": 36.0, "grad_norm_var": 3.2171223958333335, "learning_rate": 0.0001, "loss": 7.4803, "loss/crossentropy": 2.0316790327429772, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.20630075875669718, "step": 1870 }, { "epoch": 0.047, "grad_norm": 33.25, "grad_norm_var": 16.304622395833334, "learning_rate": 0.0001, "loss": 7.576, "loss/crossentropy": 2.161964085698128, "loss/hidden": 3.513671875, "loss/jsd": 0.0, "loss/logits": 0.21842746511101724, "step": 1880 }, { "epoch": 0.04725, "grad_norm": 29.75, "grad_norm_var": 2.3541666666666665, "learning_rate": 0.0001, "loss": 7.5036, "loss/crossentropy": 1.8695943117141725, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.18793469872325658, "step": 1890 }, { "epoch": 0.0475, "grad_norm": 34.25, "grad_norm_var": 2.1780598958333335, "learning_rate": 0.0001, "loss": 7.5623, "loss/crossentropy": 2.2376974314451217, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.21696731727570295, "step": 1900 }, { "epoch": 0.04775, "grad_norm": 30.75, "grad_norm_var": 14.924934895833333, "learning_rate": 0.0001, "loss": 7.388, "loss/crossentropy": 1.9403380863368511, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.18128401823341847, "step": 1910 }, { "epoch": 0.048, "grad_norm": 29.25, "grad_norm_var": 25.1916015625, "learning_rate": 0.0001, "loss": 7.4109, "loss/crossentropy": 2.1744547933340073, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.20097011709585785, "step": 1920 }, { "epoch": 0.04825, "grad_norm": 29.25, "grad_norm_var": 14.801822916666667, "learning_rate": 0.0001, "loss": 7.2893, "loss/crossentropy": 2.101319019496441, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.1921493023633957, "step": 1930 }, { "epoch": 0.0485, "grad_norm": 30.125, "grad_norm_var": 14.517708333333333, "learning_rate": 0.0001, "loss": 7.579, "loss/crossentropy": 2.057158224284649, "loss/hidden": 3.59140625, "loss/jsd": 0.0, "loss/logits": 0.21765361074358225, "step": 1940 }, { "epoch": 0.04875, "grad_norm": 29.625, "grad_norm_var": 15.790559895833333, "learning_rate": 0.0001, "loss": 7.3712, "loss/crossentropy": 1.9415803879499436, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.18346730088815094, "step": 1950 }, { "epoch": 0.049, "grad_norm": 27.625, "grad_norm_var": 9.794791666666667, "learning_rate": 0.0001, "loss": 7.4902, "loss/crossentropy": 2.035348242521286, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.20268035624176264, "step": 1960 }, { "epoch": 0.04925, "grad_norm": 35.25, "grad_norm_var": 12.768684895833333, "learning_rate": 0.0001, "loss": 7.4627, "loss/crossentropy": 2.054542076587677, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.2003987120464444, "step": 1970 }, { "epoch": 0.0495, "grad_norm": 36.0, "grad_norm_var": 12.572916666666666, "learning_rate": 0.0001, "loss": 7.353, "loss/crossentropy": 1.9634785205125809, "loss/hidden": 3.301171875, "loss/jsd": 0.0, "loss/logits": 0.17985089337453247, "step": 1980 }, { "epoch": 0.04975, "grad_norm": 36.25, "grad_norm_var": 9.2353515625, "learning_rate": 0.0001, "loss": 7.4473, "loss/crossentropy": 2.059533824026585, "loss/hidden": 3.3578125, "loss/jsd": 0.0, "loss/logits": 0.19096513148397207, "step": 1990 }, { "epoch": 0.05, "grad_norm": 29.125, "grad_norm_var": 13.320572916666666, "learning_rate": 0.0001, "loss": 7.3914, "loss/crossentropy": 2.011685383319855, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.19188414234668016, "step": 2000 }, { "epoch": 0.05025, "grad_norm": 36.25, "grad_norm_var": 14.026822916666667, "learning_rate": 0.0001, "loss": 7.4213, "loss/crossentropy": 2.309766414761543, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.20372038893401623, "step": 2010 }, { "epoch": 0.0505, "grad_norm": 29.0, "grad_norm_var": 9.237239583333333, "learning_rate": 0.0001, "loss": 7.4145, "loss/crossentropy": 2.1240487143397333, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.20137840434908866, "step": 2020 }, { "epoch": 0.05075, "grad_norm": 38.5, "grad_norm_var": 89.21432291666666, "learning_rate": 0.0001, "loss": 7.3696, "loss/crossentropy": 2.112667274475098, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.19770587887614965, "step": 2030 }, { "epoch": 0.051, "grad_norm": 27.75, "grad_norm_var": 94.06015625, "learning_rate": 0.0001, "loss": 7.2471, "loss/crossentropy": 1.9955052442848682, "loss/hidden": 3.30546875, "loss/jsd": 0.0, "loss/logits": 0.1880181163549423, "step": 2040 }, { "epoch": 0.05125, "grad_norm": 35.25, "grad_norm_var": 3.67265625, "learning_rate": 0.0001, "loss": 7.458, "loss/crossentropy": 2.1320972844958304, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.18908526431769132, "step": 2050 }, { "epoch": 0.0515, "grad_norm": 38.75, "grad_norm_var": 10.776822916666667, "learning_rate": 0.0001, "loss": 7.3769, "loss/crossentropy": 2.171598494052887, "loss/hidden": 3.29765625, "loss/jsd": 0.0, "loss/logits": 0.18929236195981503, "step": 2060 }, { "epoch": 0.05175, "grad_norm": 32.75, "grad_norm_var": 10.53515625, "learning_rate": 0.0001, "loss": 7.5279, "loss/crossentropy": 2.0172302186489106, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.2013201082125306, "step": 2070 }, { "epoch": 0.052, "grad_norm": 32.0, "grad_norm_var": 7.678125, "learning_rate": 0.0001, "loss": 7.3619, "loss/crossentropy": 1.982726515084505, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.17850281894207, "step": 2080 }, { "epoch": 0.05225, "grad_norm": 29.75, "grad_norm_var": 63.6681640625, "learning_rate": 0.0001, "loss": 7.5109, "loss/crossentropy": 2.121504098176956, "loss/hidden": 3.50703125, "loss/jsd": 0.0, "loss/logits": 0.240205854550004, "step": 2090 }, { "epoch": 0.0525, "grad_norm": 34.5, "grad_norm_var": 7.506184895833333, "learning_rate": 0.0001, "loss": 7.4658, "loss/crossentropy": 2.110687591135502, "loss/hidden": 3.530078125, "loss/jsd": 0.0, "loss/logits": 0.2039638390764594, "step": 2100 }, { "epoch": 0.05275, "grad_norm": 32.5, "grad_norm_var": 19.075455729166666, "learning_rate": 0.0001, "loss": 7.5668, "loss/crossentropy": 1.9557841390371322, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.18774209143593906, "step": 2110 }, { "epoch": 0.053, "grad_norm": 31.125, "grad_norm_var": 3.85390625, "learning_rate": 0.0001, "loss": 7.5735, "loss/crossentropy": 2.0219520531594752, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.18533632289618254, "step": 2120 }, { "epoch": 0.05325, "grad_norm": 32.25, "grad_norm_var": 3.8910807291666667, "learning_rate": 0.0001, "loss": 7.4083, "loss/crossentropy": 2.1359280541539194, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.1897095028311014, "step": 2130 }, { "epoch": 0.0535, "grad_norm": 31.25, "grad_norm_var": 2.5957682291666666, "learning_rate": 0.0001, "loss": 7.446, "loss/crossentropy": 2.170258317142725, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.1826348526403308, "step": 2140 }, { "epoch": 0.05375, "grad_norm": 31.25, "grad_norm_var": 3.785416666666667, "learning_rate": 0.0001, "loss": 7.4014, "loss/crossentropy": 2.131239393353462, "loss/hidden": 3.303515625, "loss/jsd": 0.0, "loss/logits": 0.18656531646847724, "step": 2150 }, { "epoch": 0.054, "grad_norm": 31.0, "grad_norm_var": 4.8666015625, "learning_rate": 0.0001, "loss": 7.5478, "loss/crossentropy": 2.223896725475788, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.1951376979239285, "step": 2160 }, { "epoch": 0.05425, "grad_norm": 30.375, "grad_norm_var": 8.437955729166667, "learning_rate": 0.0001, "loss": 7.5562, "loss/crossentropy": 2.1203987300395966, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.1970507999882102, "step": 2170 }, { "epoch": 0.0545, "grad_norm": 32.0, "grad_norm_var": 2.9488932291666665, "learning_rate": 0.0001, "loss": 7.5532, "loss/crossentropy": 2.080265050381422, "loss/hidden": 3.544140625, "loss/jsd": 0.0, "loss/logits": 0.2216239819303155, "step": 2180 }, { "epoch": 0.05475, "grad_norm": 31.125, "grad_norm_var": 8.1728515625, "learning_rate": 0.0001, "loss": 7.382, "loss/crossentropy": 2.2114535331726075, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.20577374435961246, "step": 2190 }, { "epoch": 0.055, "grad_norm": 28.875, "grad_norm_var": 14.520833333333334, "learning_rate": 0.0001, "loss": 7.5766, "loss/crossentropy": 2.1003271512687207, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.18811229150742292, "step": 2200 }, { "epoch": 0.05525, "grad_norm": 33.5, "grad_norm_var": 16.099739583333335, "learning_rate": 0.0001, "loss": 7.5553, "loss/crossentropy": 2.1326127350330353, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.22006579730659723, "step": 2210 }, { "epoch": 0.0555, "grad_norm": 32.25, "grad_norm_var": 9.305143229166667, "learning_rate": 0.0001, "loss": 7.3766, "loss/crossentropy": 2.1496046826243402, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.1952402491122484, "step": 2220 }, { "epoch": 0.05575, "grad_norm": 29.125, "grad_norm_var": 6.805143229166666, "learning_rate": 0.0001, "loss": 7.3648, "loss/crossentropy": 2.13938904479146, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.19394674636423587, "step": 2230 }, { "epoch": 0.056, "grad_norm": 27.625, "grad_norm_var": 15.0712890625, "learning_rate": 0.0001, "loss": 7.4292, "loss/crossentropy": 2.0648645758628845, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.18520106598734856, "step": 2240 }, { "epoch": 0.05625, "grad_norm": 29.25, "grad_norm_var": 12.034309895833333, "learning_rate": 0.0001, "loss": 7.4469, "loss/crossentropy": 2.080448921024799, "loss/hidden": 3.3109375, "loss/jsd": 0.0, "loss/logits": 0.18507405128329993, "step": 2250 }, { "epoch": 0.0565, "grad_norm": 31.375, "grad_norm_var": 2.014518229166667, "learning_rate": 0.0001, "loss": 7.4325, "loss/crossentropy": 2.0871294140815735, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.20059894528239966, "step": 2260 }, { "epoch": 0.05675, "grad_norm": 28.75, "grad_norm_var": 1.8103515625, "learning_rate": 0.0001, "loss": 7.4268, "loss/crossentropy": 2.010594163835049, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.19413960948586464, "step": 2270 }, { "epoch": 0.057, "grad_norm": 32.5, "grad_norm_var": 4.0369140625, "learning_rate": 0.0001, "loss": 7.4346, "loss/crossentropy": 2.1129174560308455, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.1961110396310687, "step": 2280 }, { "epoch": 0.05725, "grad_norm": 39.0, "grad_norm_var": 30.42265625, "learning_rate": 0.0001, "loss": 7.4422, "loss/crossentropy": 2.002947611361742, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.2081361676566303, "step": 2290 }, { "epoch": 0.0575, "grad_norm": 37.25, "grad_norm_var": 25.699934895833334, "learning_rate": 0.0001, "loss": 7.4312, "loss/crossentropy": 2.06134437918663, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.18918452728539706, "step": 2300 }, { "epoch": 0.05775, "grad_norm": 28.875, "grad_norm_var": 9.115559895833334, "learning_rate": 0.0001, "loss": 7.4209, "loss/crossentropy": 2.041922479122877, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.20907302405685185, "step": 2310 }, { "epoch": 0.058, "grad_norm": 30.125, "grad_norm_var": 22.248372395833332, "learning_rate": 0.0001, "loss": 7.6844, "loss/crossentropy": 2.0152460247278214, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.1905667196959257, "step": 2320 }, { "epoch": 0.05825, "grad_norm": 38.25, "grad_norm_var": 31.398893229166667, "learning_rate": 0.0001, "loss": 7.4713, "loss/crossentropy": 2.105386929959059, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.1982942834496498, "step": 2330 }, { "epoch": 0.0585, "grad_norm": 28.375, "grad_norm_var": 54.94264322916667, "learning_rate": 0.0001, "loss": 7.4575, "loss/crossentropy": 2.2358868844807147, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.19232469592243434, "step": 2340 }, { "epoch": 0.05875, "grad_norm": 33.5, "grad_norm_var": 165.74583333333334, "learning_rate": 0.0001, "loss": 7.2987, "loss/crossentropy": 1.9657546751201154, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.18062973748892547, "step": 2350 }, { "epoch": 0.059, "grad_norm": 41.0, "grad_norm_var": 15.376822916666667, "learning_rate": 0.0001, "loss": 7.4431, "loss/crossentropy": 2.191007924079895, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.2068317520432174, "step": 2360 }, { "epoch": 0.05925, "grad_norm": 30.625, "grad_norm_var": 12.109375, "learning_rate": 0.0001, "loss": 7.3325, "loss/crossentropy": 2.0140789330005644, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.18166892379522323, "step": 2370 }, { "epoch": 0.0595, "grad_norm": 31.875, "grad_norm_var": 6.941666666666666, "learning_rate": 0.0001, "loss": 7.4039, "loss/crossentropy": 2.0221361994743345, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.1934544663876295, "step": 2380 }, { "epoch": 0.05975, "grad_norm": 30.125, "grad_norm_var": 10.472330729166666, "learning_rate": 0.0001, "loss": 7.5862, "loss/crossentropy": 1.9840030640363693, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.19178631734102963, "step": 2390 }, { "epoch": 0.06, "grad_norm": 29.875, "grad_norm_var": 14.10625, "learning_rate": 0.0001, "loss": 7.4826, "loss/crossentropy": 2.1700179904699324, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.1915024297311902, "step": 2400 }, { "epoch": 0.06025, "grad_norm": 32.75, "grad_norm_var": 7.370768229166667, "learning_rate": 0.0001, "loss": 7.3889, "loss/crossentropy": 2.091843403875828, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.18695627991110086, "step": 2410 }, { "epoch": 0.0605, "grad_norm": 29.0, "grad_norm_var": 9.922330729166667, "learning_rate": 0.0001, "loss": 7.4655, "loss/crossentropy": 2.172381104528904, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.20078962799161673, "step": 2420 }, { "epoch": 0.06075, "grad_norm": 34.25, "grad_norm_var": 8.637239583333333, "learning_rate": 0.0001, "loss": 7.519, "loss/crossentropy": 1.995463601499796, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.1993358489125967, "step": 2430 }, { "epoch": 0.061, "grad_norm": 31.25, "grad_norm_var": 11.9431640625, "learning_rate": 0.0001, "loss": 7.5169, "loss/crossentropy": 2.296917426586151, "loss/hidden": 3.513671875, "loss/jsd": 0.0, "loss/logits": 0.23228074796497822, "step": 2440 }, { "epoch": 0.06125, "grad_norm": 30.25, "grad_norm_var": 3.4368798046573737e+18, "learning_rate": 0.0001, "loss": 7.5038, "loss/crossentropy": 2.1944432735443113, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.21073084995150565, "step": 2450 }, { "epoch": 0.0615, "grad_norm": 33.5, "grad_norm_var": 3.436879805205814e+18, "learning_rate": 0.0001, "loss": 7.4423, "loss/crossentropy": 2.152103579044342, "loss/hidden": 3.512109375, "loss/jsd": 0.0, "loss/logits": 0.20929353777319193, "step": 2460 }, { "epoch": 0.06175, "grad_norm": 39.0, "grad_norm_var": 2.2045823633093297e+18, "learning_rate": 0.0001, "loss": 7.4382, "loss/crossentropy": 2.017627691477537, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.19590776292607187, "step": 2470 }, { "epoch": 0.062, "grad_norm": 29.375, "grad_norm_var": 2.2045823636681523e+18, "learning_rate": 0.0001, "loss": 7.4072, "loss/crossentropy": 2.1076912328600885, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.1988623272627592, "step": 2480 }, { "epoch": 0.06225, "grad_norm": 30.125, "grad_norm_var": 3.2494140625, "learning_rate": 0.0001, "loss": 7.3192, "loss/crossentropy": 1.9777067750692368, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.20539684109389783, "step": 2490 }, { "epoch": 0.0625, "grad_norm": 29.125, "grad_norm_var": 5.580208333333333, "learning_rate": 0.0001, "loss": 7.3283, "loss/crossentropy": 2.061080713570118, "loss/hidden": 3.4953125, "loss/jsd": 0.0, "loss/logits": 0.20077812522649766, "step": 2500 }, { "epoch": 0.06275, "grad_norm": 28.375, "grad_norm_var": 5.618489583333333, "learning_rate": 0.0001, "loss": 7.4401, "loss/crossentropy": 2.2099071338772776, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.2055276283994317, "step": 2510 }, { "epoch": 0.063, "grad_norm": 28.125, "grad_norm_var": 7.118684895833334, "learning_rate": 0.0001, "loss": 7.3509, "loss/crossentropy": 1.962952435016632, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.19731322024017572, "step": 2520 }, { "epoch": 0.06325, "grad_norm": 31.375, "grad_norm_var": 1.9681640625, "learning_rate": 0.0001, "loss": 7.3695, "loss/crossentropy": 1.9843583509325982, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2062232268974185, "step": 2530 }, { "epoch": 0.0635, "grad_norm": 31.5, "grad_norm_var": 3.7988932291666666, "learning_rate": 0.0001, "loss": 7.4485, "loss/crossentropy": 2.1427679538726805, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.2011977185495198, "step": 2540 }, { "epoch": 0.06375, "grad_norm": 30.0, "grad_norm_var": 2.5885416666666665, "learning_rate": 0.0001, "loss": 7.4157, "loss/crossentropy": 1.9085583783686162, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.17416954301297666, "step": 2550 }, { "epoch": 0.064, "grad_norm": 31.25, "grad_norm_var": 1.21015625, "learning_rate": 0.0001, "loss": 7.5141, "loss/crossentropy": 1.9622327491641045, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.18756412472575903, "step": 2560 }, { "epoch": 0.06425, "grad_norm": 30.0, "grad_norm_var": 1.7143229166666667, "learning_rate": 0.0001, "loss": 7.4624, "loss/crossentropy": 2.192887546122074, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.1984951412305236, "step": 2570 }, { "epoch": 0.0645, "grad_norm": 30.125, "grad_norm_var": 1.9143229166666667, "learning_rate": 0.0001, "loss": 7.3947, "loss/crossentropy": 2.102549520134926, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.1989850653335452, "step": 2580 }, { "epoch": 0.06475, "grad_norm": 32.25, "grad_norm_var": 9.5322265625, "learning_rate": 0.0001, "loss": 7.5147, "loss/crossentropy": 2.213281115144491, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.2027151037938893, "step": 2590 }, { "epoch": 0.065, "grad_norm": 30.625, "grad_norm_var": 2.3427083333333334, "learning_rate": 0.0001, "loss": 7.4691, "loss/crossentropy": 2.1138279482722284, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.20825629755854608, "step": 2600 }, { "epoch": 0.06525, "grad_norm": 36.0, "grad_norm_var": 3.3395182291666665, "learning_rate": 0.0001, "loss": 7.4775, "loss/crossentropy": 2.107349547743797, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.19337845854461194, "step": 2610 }, { "epoch": 0.0655, "grad_norm": 29.25, "grad_norm_var": 12.757291666666667, "learning_rate": 0.0001, "loss": 7.5438, "loss/crossentropy": 2.0628502368927, "loss/hidden": 3.4984375, "loss/jsd": 0.0, "loss/logits": 0.20967572089284658, "step": 2620 }, { "epoch": 0.06575, "grad_norm": 28.625, "grad_norm_var": 11.805208333333333, "learning_rate": 0.0001, "loss": 7.3354, "loss/crossentropy": 2.1009589530527593, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.18132725274190306, "step": 2630 }, { "epoch": 0.066, "grad_norm": 32.5, "grad_norm_var": 2.730208333333333, "learning_rate": 0.0001, "loss": 7.4257, "loss/crossentropy": 1.983342681080103, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.19340286049991845, "step": 2640 }, { "epoch": 0.06625, "grad_norm": 30.25, "grad_norm_var": 3.7549465282226944e+18, "learning_rate": 0.0001, "loss": 7.309, "loss/crossentropy": 2.0057250812649725, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.18936716187745334, "step": 2650 }, { "epoch": 0.0665, "grad_norm": 36.25, "grad_norm_var": 8.832747395833334, "learning_rate": 0.0001, "loss": 7.5442, "loss/crossentropy": 2.054753464460373, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.2035602940246463, "step": 2660 }, { "epoch": 0.06675, "grad_norm": 32.5, "grad_norm_var": 4.8900390625, "learning_rate": 0.0001, "loss": 7.4106, "loss/crossentropy": 2.0181221179664135, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.1878144398331642, "step": 2670 }, { "epoch": 0.067, "grad_norm": 30.125, "grad_norm_var": 4.280989583333334, "learning_rate": 0.0001, "loss": 7.4597, "loss/crossentropy": 2.200540581345558, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.20286752395331858, "step": 2680 }, { "epoch": 0.06725, "grad_norm": 31.75, "grad_norm_var": 3.8559895833333333, "learning_rate": 0.0001, "loss": 7.4643, "loss/crossentropy": 2.0630861818790436, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.20401672925800085, "step": 2690 }, { "epoch": 0.0675, "grad_norm": 33.0, "grad_norm_var": 7.073958333333334, "learning_rate": 0.0001, "loss": 7.4001, "loss/crossentropy": 1.927167509496212, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.17901942003518342, "step": 2700 }, { "epoch": 0.06775, "grad_norm": 30.25, "grad_norm_var": 8.9009765625, "learning_rate": 0.0001, "loss": 7.3461, "loss/crossentropy": 2.0538916781544687, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.1864149821922183, "step": 2710 }, { "epoch": 0.068, "grad_norm": 29.5, "grad_norm_var": 2.218489583333333, "learning_rate": 0.0001, "loss": 7.526, "loss/crossentropy": 2.211588367819786, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.20801848396658898, "step": 2720 }, { "epoch": 0.06825, "grad_norm": 31.375, "grad_norm_var": 1.0768229166666667, "learning_rate": 0.0001, "loss": 7.5535, "loss/crossentropy": 2.268890543282032, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.21352684032171965, "step": 2730 }, { "epoch": 0.0685, "grad_norm": 33.25, "grad_norm_var": 5.663997395833333, "learning_rate": 0.0001, "loss": 7.411, "loss/crossentropy": 1.902898482978344, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.1794701736420393, "step": 2740 }, { "epoch": 0.06875, "grad_norm": 32.25, "grad_norm_var": 6.167708333333334, "learning_rate": 0.0001, "loss": 7.3718, "loss/crossentropy": 1.9450767874717712, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.18759301900863648, "step": 2750 }, { "epoch": 0.069, "grad_norm": 31.125, "grad_norm_var": 31.185872395833332, "learning_rate": 0.0001, "loss": 7.4359, "loss/crossentropy": 2.0783849939703942, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.18503105416893958, "step": 2760 }, { "epoch": 0.06925, "grad_norm": 36.5, "grad_norm_var": 35.412434895833336, "learning_rate": 0.0001, "loss": 7.5806, "loss/crossentropy": 2.2374701410532, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.19615829903632404, "step": 2770 }, { "epoch": 0.0695, "grad_norm": 30.25, "grad_norm_var": 19.787239583333335, "learning_rate": 0.0001, "loss": 7.3197, "loss/crossentropy": 1.8297001466155052, "loss/hidden": 3.3171875, "loss/jsd": 0.0, "loss/logits": 0.16481583826243879, "step": 2780 }, { "epoch": 0.06975, "grad_norm": 428.0, "grad_norm_var": 9873.31640625, "learning_rate": 0.0001, "loss": 7.5313, "loss/crossentropy": 2.249661484360695, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.2018596636131406, "step": 2790 }, { "epoch": 0.07, "grad_norm": 31.0, "grad_norm_var": 9755.6625, "learning_rate": 0.0001, "loss": 7.3957, "loss/crossentropy": 1.9368772380053998, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.18386599626392125, "step": 2800 }, { "epoch": 0.07025, "grad_norm": 30.75, "grad_norm_var": 1.8317057291666667, "learning_rate": 0.0001, "loss": 7.4372, "loss/crossentropy": 1.98307463824749, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.19818334747105837, "step": 2810 }, { "epoch": 0.0705, "grad_norm": 29.375, "grad_norm_var": 2.589583333333333, "learning_rate": 0.0001, "loss": 7.5014, "loss/crossentropy": 2.1463105253875256, "loss/hidden": 3.5046875, "loss/jsd": 0.0, "loss/logits": 0.20105676222592592, "step": 2820 }, { "epoch": 0.07075, "grad_norm": 60.5, "grad_norm_var": 178.2556640625, "learning_rate": 0.0001, "loss": 7.4527, "loss/crossentropy": 2.0776613369584083, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.19452448841184378, "step": 2830 }, { "epoch": 0.071, "grad_norm": 29.25, "grad_norm_var": 172.31451822916668, "learning_rate": 0.0001, "loss": 7.4802, "loss/crossentropy": 2.1200039610266685, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.19831879772245883, "step": 2840 }, { "epoch": 0.07125, "grad_norm": 69.0, "grad_norm_var": 117.23098958333334, "learning_rate": 0.0001, "loss": 7.434, "loss/crossentropy": 2.024143140017986, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.1836528332903981, "step": 2850 }, { "epoch": 0.0715, "grad_norm": 31.375, "grad_norm_var": 92.53723958333333, "learning_rate": 0.0001, "loss": 7.4934, "loss/crossentropy": 2.2765417456626893, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.20736196860671044, "step": 2860 }, { "epoch": 0.07175, "grad_norm": 31.625, "grad_norm_var": 7.986393229166667, "learning_rate": 0.0001, "loss": 7.4826, "loss/crossentropy": 2.269197002053261, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.19869209118187428, "step": 2870 }, { "epoch": 0.072, "grad_norm": 31.25, "grad_norm_var": 3.1806640625, "learning_rate": 0.0001, "loss": 7.4018, "loss/crossentropy": 2.2985214799642564, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.20524807646870613, "step": 2880 }, { "epoch": 0.07225, "grad_norm": 30.875, "grad_norm_var": 4.801822916666667, "learning_rate": 0.0001, "loss": 7.5148, "loss/crossentropy": 2.2387808740139006, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.19951685946434736, "step": 2890 }, { "epoch": 0.0725, "grad_norm": 28.875, "grad_norm_var": 13.836458333333333, "learning_rate": 0.0001, "loss": 7.5232, "loss/crossentropy": 2.049694790691137, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.19052465092390775, "step": 2900 }, { "epoch": 0.07275, "grad_norm": 29.625, "grad_norm_var": 17.91640625, "learning_rate": 0.0001, "loss": 7.3227, "loss/crossentropy": 2.0360258772969244, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.18495636582374572, "step": 2910 }, { "epoch": 0.073, "grad_norm": 32.0, "grad_norm_var": 1.8926377214767268e+18, "learning_rate": 0.0001, "loss": 7.4512, "loss/crossentropy": 2.13848315179348, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.18625867497175932, "step": 2920 }, { "epoch": 0.07325, "grad_norm": 29.875, "grad_norm_var": 1.8926377199175642e+18, "learning_rate": 0.0001, "loss": 7.5038, "loss/crossentropy": 2.166595605015755, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.20948194600641729, "step": 2930 }, { "epoch": 0.0735, "grad_norm": 28.5, "grad_norm_var": 73.08020833333333, "learning_rate": 0.0001, "loss": 7.374, "loss/crossentropy": 1.9849643550813199, "loss/hidden": 3.301171875, "loss/jsd": 0.0, "loss/logits": 0.18302082028239966, "step": 2940 }, { "epoch": 0.07375, "grad_norm": 29.125, "grad_norm_var": 24.825, "learning_rate": 0.0001, "loss": 7.3651, "loss/crossentropy": 2.057874396443367, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.1866615541279316, "step": 2950 }, { "epoch": 0.074, "grad_norm": 30.625, "grad_norm_var": 883.6354166666666, "learning_rate": 0.0001, "loss": 7.5415, "loss/crossentropy": 2.1631729155778885, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.20762786027044058, "step": 2960 }, { "epoch": 0.07425, "grad_norm": 32.75, "grad_norm_var": 887.2705729166667, "learning_rate": 0.0001, "loss": 7.4471, "loss/crossentropy": 1.9493468508124352, "loss/hidden": 3.3578125, "loss/jsd": 0.0, "loss/logits": 0.1884406829252839, "step": 2970 }, { "epoch": 0.0745, "grad_norm": 28.875, "grad_norm_var": 5.070768229166666, "learning_rate": 0.0001, "loss": 7.605, "loss/crossentropy": 2.122344336658716, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.21057356838136912, "step": 2980 }, { "epoch": 0.07475, "grad_norm": 37.0, "grad_norm_var": 21.535416666666666, "learning_rate": 0.0001, "loss": 7.469, "loss/crossentropy": 2.008989527821541, "loss/hidden": 3.54140625, "loss/jsd": 0.0, "loss/logits": 0.2172183733433485, "step": 2990 }, { "epoch": 0.075, "grad_norm": 29.375, "grad_norm_var": 18.198958333333334, "learning_rate": 0.0001, "loss": 7.3932, "loss/crossentropy": 2.1922819674015046, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.20425879992544652, "step": 3000 }, { "epoch": 0.07525, "grad_norm": 29.5, "grad_norm_var": 2.668684895833333, "learning_rate": 0.0001, "loss": 7.3505, "loss/crossentropy": 2.189265179634094, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.20808048862963915, "step": 3010 }, { "epoch": 0.0755, "grad_norm": 30.75, "grad_norm_var": 14.20625, "learning_rate": 0.0001, "loss": 7.5013, "loss/crossentropy": 2.0573098927736284, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.18116160985082388, "step": 3020 }, { "epoch": 0.07575, "grad_norm": 31.375, "grad_norm_var": 16.983333333333334, "learning_rate": 0.0001, "loss": 7.4455, "loss/crossentropy": 1.9735823571681976, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.19495000168681145, "step": 3030 }, { "epoch": 0.076, "grad_norm": 7247757312.0, "grad_norm_var": 3.2831240991582193e+18, "learning_rate": 0.0001, "loss": 7.4881, "loss/crossentropy": 1.971890377253294, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.18015608433634042, "step": 3040 }, { "epoch": 0.07625, "grad_norm": 28.25, "grad_norm_var": 3.283124098780732e+18, "learning_rate": 0.0001, "loss": 7.3664, "loss/crossentropy": 1.8378953270614147, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.1741427879780531, "step": 3050 }, { "epoch": 0.0765, "grad_norm": 31.75, "grad_norm_var": 1.89140625, "learning_rate": 0.0001, "loss": 7.5137, "loss/crossentropy": 2.141886255145073, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.19584037065505983, "step": 3060 }, { "epoch": 0.07675, "grad_norm": 27.25, "grad_norm_var": 2.4244140625, "learning_rate": 0.0001, "loss": 7.4296, "loss/crossentropy": 2.0373554110527037, "loss/hidden": 3.5640625, "loss/jsd": 0.0, "loss/logits": 0.216986732929945, "step": 3070 }, { "epoch": 0.077, "grad_norm": 35.25, "grad_norm_var": 3.7322265625, "learning_rate": 0.0001, "loss": 7.5269, "loss/crossentropy": 1.975497831404209, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.1780722170136869, "step": 3080 }, { "epoch": 0.07725, "grad_norm": 32.75, "grad_norm_var": 3.6895182291666666, "learning_rate": 0.0001, "loss": 7.4938, "loss/crossentropy": 2.151789793372154, "loss/hidden": 3.502734375, "loss/jsd": 0.0, "loss/logits": 0.21854450944811105, "step": 3090 }, { "epoch": 0.0775, "grad_norm": 29.5, "grad_norm_var": 6.82265625, "learning_rate": 0.0001, "loss": 7.4321, "loss/crossentropy": 1.9484706297516823, "loss/hidden": 3.506640625, "loss/jsd": 0.0, "loss/logits": 0.19896488767117262, "step": 3100 }, { "epoch": 0.07775, "grad_norm": 29.75, "grad_norm_var": 3.0780598958333334, "learning_rate": 0.0001, "loss": 7.5471, "loss/crossentropy": 2.165594828128815, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.20095103643834591, "step": 3110 }, { "epoch": 0.078, "grad_norm": 29.0, "grad_norm_var": 2.2197916666666666, "learning_rate": 0.0001, "loss": 7.6334, "loss/crossentropy": 2.1854751259088516, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.21246263310313224, "step": 3120 }, { "epoch": 0.07825, "grad_norm": 29.0, "grad_norm_var": 3.71640625, "learning_rate": 0.0001, "loss": 7.4278, "loss/crossentropy": 1.914103902876377, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.18373754434287548, "step": 3130 }, { "epoch": 0.0785, "grad_norm": 29.0, "grad_norm_var": 1.2952473958333333, "learning_rate": 0.0001, "loss": 7.4487, "loss/crossentropy": 1.9421842776238918, "loss/hidden": 3.5296875, "loss/jsd": 0.0, "loss/logits": 0.19919300880283117, "step": 3140 }, { "epoch": 0.07875, "grad_norm": 29.375, "grad_norm_var": 1.8268229166666667, "learning_rate": 0.0001, "loss": 7.5818, "loss/crossentropy": 2.0765694811940194, "loss/hidden": 3.5171875, "loss/jsd": 0.0, "loss/logits": 0.19946561977267266, "step": 3150 }, { "epoch": 0.079, "grad_norm": 28.125, "grad_norm_var": 11.483268229166667, "learning_rate": 0.0001, "loss": 7.4372, "loss/crossentropy": 2.013955050334334, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.20109358858317136, "step": 3160 }, { "epoch": 0.07925, "grad_norm": 28.875, "grad_norm_var": 12.871809895833334, "learning_rate": 0.0001, "loss": 7.4606, "loss/crossentropy": 2.2802242666482924, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.21229397617280482, "step": 3170 }, { "epoch": 0.0795, "grad_norm": 28.375, "grad_norm_var": 1.6301432291666667, "learning_rate": 0.0001, "loss": 7.4691, "loss/crossentropy": 2.134338477253914, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.18632632456719875, "step": 3180 }, { "epoch": 0.07975, "grad_norm": 30.625, "grad_norm_var": 2.6113932291666666, "learning_rate": 0.0001, "loss": 7.4903, "loss/crossentropy": 2.192245528101921, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.19276445377618073, "step": 3190 }, { "epoch": 0.08, "grad_norm": 27.875, "grad_norm_var": 2.6830729166666667, "learning_rate": 0.0001, "loss": 7.4715, "loss/crossentropy": 2.1333388604223726, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.1902673264965415, "step": 3200 }, { "epoch": 0.08025, "grad_norm": 29.625, "grad_norm_var": 2.7072265625, "learning_rate": 0.0001, "loss": 7.4646, "loss/crossentropy": 2.1069626569747926, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.18933899328112602, "step": 3210 }, { "epoch": 0.0805, "grad_norm": 33.0, "grad_norm_var": 1.6457682291666667, "learning_rate": 0.0001, "loss": 7.3771, "loss/crossentropy": 2.143903985619545, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.19841080270707606, "step": 3220 }, { "epoch": 0.08075, "grad_norm": 29.5, "grad_norm_var": 2.405143229166667, "learning_rate": 0.0001, "loss": 7.4629, "loss/crossentropy": 1.9501185864210129, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.2003694986924529, "step": 3230 }, { "epoch": 0.081, "grad_norm": 35.0, "grad_norm_var": 3.4619140625, "learning_rate": 0.0001, "loss": 7.6085, "loss/crossentropy": 2.1099744185805323, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.1865939747542143, "step": 3240 }, { "epoch": 0.08125, "grad_norm": 38.0, "grad_norm_var": 15.54140625, "learning_rate": 0.0001, "loss": 7.4858, "loss/crossentropy": 1.8915734700858593, "loss/hidden": 3.550390625, "loss/jsd": 0.0, "loss/logits": 0.20414282865822314, "step": 3250 }, { "epoch": 0.0815, "grad_norm": 31.875, "grad_norm_var": 15.074934895833334, "learning_rate": 0.0001, "loss": 7.4995, "loss/crossentropy": 2.0746393710374833, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.19025763403624296, "step": 3260 }, { "epoch": 0.08175, "grad_norm": 29.625, "grad_norm_var": 4.532291666666667, "learning_rate": 0.0001, "loss": 7.4517, "loss/crossentropy": 2.201898355782032, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.1851862959563732, "step": 3270 }, { "epoch": 0.082, "grad_norm": 32.25, "grad_norm_var": 9.199739583333333, "learning_rate": 0.0001, "loss": 7.4085, "loss/crossentropy": 1.9774614453315735, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.1853517958894372, "step": 3280 }, { "epoch": 0.08225, "grad_norm": 31.0, "grad_norm_var": 13.801497395833334, "learning_rate": 0.0001, "loss": 7.4065, "loss/crossentropy": 2.1263367265462874, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.18529028967022895, "step": 3290 }, { "epoch": 0.0825, "grad_norm": 29.5, "grad_norm_var": 2.967643229166667, "learning_rate": 0.0001, "loss": 7.4165, "loss/crossentropy": 2.193544697761536, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.19897244460880756, "step": 3300 }, { "epoch": 0.08275, "grad_norm": 33.75, "grad_norm_var": 9.687239583333334, "learning_rate": 0.0001, "loss": 7.5716, "loss/crossentropy": 2.0868531957268717, "loss/hidden": 3.616796875, "loss/jsd": 0.0, "loss/logits": 0.21278488002717494, "step": 3310 }, { "epoch": 0.083, "grad_norm": 31.0, "grad_norm_var": 7.9478515625, "learning_rate": 0.0001, "loss": 7.5543, "loss/crossentropy": 2.1392074063420297, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.21113577168434858, "step": 3320 }, { "epoch": 0.08325, "grad_norm": 30.0, "grad_norm_var": 2.0268229166666667, "learning_rate": 0.0001, "loss": 7.4454, "loss/crossentropy": 2.0691144198179243, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.20186964478343725, "step": 3330 }, { "epoch": 0.0835, "grad_norm": 31.5, "grad_norm_var": 2.6211653265769103e+18, "learning_rate": 0.0001, "loss": 7.4481, "loss/crossentropy": 2.0832756504416468, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.19915037509053946, "step": 3340 }, { "epoch": 0.08375, "grad_norm": 32.5, "grad_norm_var": 2.621165324337292e+18, "learning_rate": 0.0001, "loss": 7.3606, "loss/crossentropy": 2.102260760962963, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.19333885367959738, "step": 3350 }, { "epoch": 0.084, "grad_norm": 29.25, "grad_norm_var": 85.575, "learning_rate": 0.0001, "loss": 7.4073, "loss/crossentropy": 2.149528594315052, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.2177526842802763, "step": 3360 }, { "epoch": 0.08425, "grad_norm": 30.25, "grad_norm_var": 2.8645833333333335, "learning_rate": 0.0001, "loss": 7.4642, "loss/crossentropy": 2.085590344667435, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.17804578468203544, "step": 3370 }, { "epoch": 0.0845, "grad_norm": 33.25, "grad_norm_var": 2.996875, "learning_rate": 0.0001, "loss": 7.3953, "loss/crossentropy": 2.0975965946912765, "loss/hidden": 3.3, "loss/jsd": 0.0, "loss/logits": 0.1847201505675912, "step": 3380 }, { "epoch": 0.08475, "grad_norm": 32.0, "grad_norm_var": 2.470572916666667, "learning_rate": 0.0001, "loss": 7.4553, "loss/crossentropy": 2.1018140748143197, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.18250287007540464, "step": 3390 }, { "epoch": 0.085, "grad_norm": 30.25, "grad_norm_var": 2.887955729166667, "learning_rate": 0.0001, "loss": 7.5238, "loss/crossentropy": 2.1050665065646172, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.2124734738841653, "step": 3400 }, { "epoch": 0.08525, "grad_norm": 30.0, "grad_norm_var": 1.7143229166666667, "learning_rate": 0.0001, "loss": 7.2754, "loss/crossentropy": 2.0948296964168547, "loss/hidden": 3.37421875, "loss/jsd": 0.0, "loss/logits": 0.178215317055583, "step": 3410 }, { "epoch": 0.0855, "grad_norm": 30.875, "grad_norm_var": 5.354622395833333, "learning_rate": 0.0001, "loss": 7.4191, "loss/crossentropy": 2.0418393671512605, "loss/hidden": 3.4734375, "loss/jsd": 0.0, "loss/logits": 0.18740264605730772, "step": 3420 }, { "epoch": 0.08575, "grad_norm": 32.25, "grad_norm_var": 6.430989583333333, "learning_rate": 0.0001, "loss": 7.5642, "loss/crossentropy": 2.0279636546969413, "loss/hidden": 3.55859375, "loss/jsd": 0.0, "loss/logits": 0.20154636316001415, "step": 3430 }, { "epoch": 0.086, "grad_norm": 29.125, "grad_norm_var": 53.64791666666667, "learning_rate": 0.0001, "loss": 7.485, "loss/crossentropy": 2.0705729112029077, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.22035282999277114, "step": 3440 }, { "epoch": 0.08625, "grad_norm": 30.375, "grad_norm_var": 5.54765625, "learning_rate": 0.0001, "loss": 7.428, "loss/crossentropy": 1.9830067940056324, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.19354272997006775, "step": 3450 }, { "epoch": 0.0865, "grad_norm": 28.375, "grad_norm_var": 2.758736220726598e+18, "learning_rate": 0.0001, "loss": 7.4342, "loss/crossentropy": 2.1590976014733316, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.20231232214719058, "step": 3460 }, { "epoch": 0.08675, "grad_norm": 29.125, "grad_norm_var": 2.470572916666667, "learning_rate": 0.0001, "loss": 7.3376, "loss/crossentropy": 2.108407254517078, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.18425025548785925, "step": 3470 }, { "epoch": 0.087, "grad_norm": 32.5, "grad_norm_var": 19.315559895833335, "learning_rate": 0.0001, "loss": 7.391, "loss/crossentropy": 2.086346108466387, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.19492445401847364, "step": 3480 }, { "epoch": 0.08725, "grad_norm": 30.75, "grad_norm_var": 3.9009765625, "learning_rate": 0.0001, "loss": 7.454, "loss/crossentropy": 2.0728737086057665, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.21246139723807572, "step": 3490 }, { "epoch": 0.0875, "grad_norm": 53.25, "grad_norm_var": 34.962955729166666, "learning_rate": 0.0001, "loss": 7.4001, "loss/crossentropy": 1.9173476293683052, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.18263984741643072, "step": 3500 }, { "epoch": 0.08775, "grad_norm": 29.875, "grad_norm_var": 36.22389322916667, "learning_rate": 0.0001, "loss": 7.5855, "loss/crossentropy": 1.9761252515017986, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.20959299746900797, "step": 3510 }, { "epoch": 0.088, "grad_norm": 32.25, "grad_norm_var": 17.7337890625, "learning_rate": 0.0001, "loss": 7.4728, "loss/crossentropy": 2.0416554152965545, "loss/hidden": 3.4703125, "loss/jsd": 0.0, "loss/logits": 0.19014756735414268, "step": 3520 }, { "epoch": 0.08825, "grad_norm": 29.375, "grad_norm_var": 14.664322916666666, "learning_rate": 0.0001, "loss": 7.5608, "loss/crossentropy": 2.2834356099367143, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.19908843878656626, "step": 3530 }, { "epoch": 0.0885, "grad_norm": 31.875, "grad_norm_var": 2.6702473958333335, "learning_rate": 0.0001, "loss": 7.4804, "loss/crossentropy": 2.0417330890893934, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.20852382443845272, "step": 3540 }, { "epoch": 0.08875, "grad_norm": 31.625, "grad_norm_var": 2.460724589971584e+18, "learning_rate": 0.0001, "loss": 7.5559, "loss/crossentropy": 2.1676768481731417, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.1989177169278264, "step": 3550 }, { "epoch": 0.089, "grad_norm": 30.0, "grad_norm_var": 6.881705729166667, "learning_rate": 0.0001, "loss": 7.4678, "loss/crossentropy": 2.221273897588253, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.19402988757938147, "step": 3560 }, { "epoch": 0.08925, "grad_norm": 31.375, "grad_norm_var": 7.732747395833333, "learning_rate": 0.0001, "loss": 7.4508, "loss/crossentropy": 2.1802149415016174, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.20121808685362338, "step": 3570 }, { "epoch": 0.0895, "grad_norm": 52.5, "grad_norm_var": 30.9775390625, "learning_rate": 0.0001, "loss": 7.3982, "loss/crossentropy": 2.085124118626118, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.18448642026633025, "step": 3580 }, { "epoch": 0.08975, "grad_norm": 30.875, "grad_norm_var": 32.91295572916667, "learning_rate": 0.0001, "loss": 7.4381, "loss/crossentropy": 2.1467449337244036, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.19393185302615165, "step": 3590 }, { "epoch": 0.09, "grad_norm": 29.25, "grad_norm_var": 1.4708333333333334, "learning_rate": 0.0001, "loss": 7.415, "loss/crossentropy": 2.0135369554162024, "loss/hidden": 3.37109375, "loss/jsd": 0.0, "loss/logits": 0.18443848174065353, "step": 3600 }, { "epoch": 0.09025, "grad_norm": 31.375, "grad_norm_var": 6.1962890625, "learning_rate": 0.0001, "loss": 7.4028, "loss/crossentropy": 2.1443901300430297, "loss/hidden": 3.440234375, "loss/jsd": 0.0, "loss/logits": 0.2054579086601734, "step": 3610 }, { "epoch": 0.0905, "grad_norm": 26.5, "grad_norm_var": 3.562239583333333, "learning_rate": 0.0001, "loss": 7.3255, "loss/crossentropy": 1.799356396496296, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.17441922090947629, "step": 3620 }, { "epoch": 0.09075, "grad_norm": 31.375, "grad_norm_var": 2.2083333333333335, "learning_rate": 0.0001, "loss": 7.4272, "loss/crossentropy": 1.9925116747617722, "loss/hidden": 3.52578125, "loss/jsd": 0.0, "loss/logits": 0.21653544921427964, "step": 3630 }, { "epoch": 0.091, "grad_norm": 30.125, "grad_norm_var": 0.6125, "learning_rate": 0.0001, "loss": 7.3649, "loss/crossentropy": 2.135761073231697, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.18989351522177458, "step": 3640 }, { "epoch": 0.09125, "grad_norm": 31.375, "grad_norm_var": 1.4330729166666667, "learning_rate": 0.0001, "loss": 7.4505, "loss/crossentropy": 2.0986070543527604, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.18352905213832854, "step": 3650 }, { "epoch": 0.0915, "grad_norm": 29.625, "grad_norm_var": 2.5869140625, "learning_rate": 0.0001, "loss": 7.4199, "loss/crossentropy": 2.1555575743317603, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19261632524430752, "step": 3660 }, { "epoch": 0.09175, "grad_norm": 31.5, "grad_norm_var": 2.371875, "learning_rate": 0.0001, "loss": 7.5463, "loss/crossentropy": 2.1411691516637803, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.2046652188524604, "step": 3670 }, { "epoch": 0.092, "grad_norm": 30.625, "grad_norm_var": 4.703580729166666, "learning_rate": 0.0001, "loss": 7.404, "loss/crossentropy": 2.142404294013977, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.20414466112852098, "step": 3680 }, { "epoch": 0.09225, "grad_norm": 30.375, "grad_norm_var": 3.25625, "learning_rate": 0.0001, "loss": 7.4774, "loss/crossentropy": 2.187901920080185, "loss/hidden": 3.480859375, "loss/jsd": 0.0, "loss/logits": 0.21911972090601922, "step": 3690 }, { "epoch": 0.0925, "grad_norm": 31.875, "grad_norm_var": 1.2166666666666666, "learning_rate": 0.0001, "loss": 7.5965, "loss/crossentropy": 2.086391404271126, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.2020766455680132, "step": 3700 }, { "epoch": 0.09275, "grad_norm": 30.625, "grad_norm_var": 2.147330729166667, "learning_rate": 0.0001, "loss": 7.4579, "loss/crossentropy": 2.09081457182765, "loss/hidden": 3.369140625, "loss/jsd": 0.0, "loss/logits": 0.1868050311692059, "step": 3710 }, { "epoch": 0.093, "grad_norm": 34.25, "grad_norm_var": 2.467643229166667, "learning_rate": 0.0001, "loss": 7.522, "loss/crossentropy": 2.12264247238636, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.18927707765251398, "step": 3720 }, { "epoch": 0.09325, "grad_norm": 32.25, "grad_norm_var": 3.981184895833333, "learning_rate": 0.0001, "loss": 7.4155, "loss/crossentropy": 2.1118928104639054, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.19489197488874196, "step": 3730 }, { "epoch": 0.0935, "grad_norm": 34.0, "grad_norm_var": 5.312434895833333, "learning_rate": 0.0001, "loss": 7.5053, "loss/crossentropy": 2.1360882744193077, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.19313989579677582, "step": 3740 }, { "epoch": 0.09375, "grad_norm": 29.125, "grad_norm_var": 4.549739583333333, "learning_rate": 0.0001, "loss": 7.3275, "loss/crossentropy": 2.010613538324833, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.18421147018671036, "step": 3750 }, { "epoch": 0.094, "grad_norm": 31.625, "grad_norm_var": 1.5541666666666667, "learning_rate": 0.0001, "loss": 7.4784, "loss/crossentropy": 2.1465295113623144, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.18987073097378016, "step": 3760 }, { "epoch": 0.09425, "grad_norm": 32.75, "grad_norm_var": 1.9018229166666667, "learning_rate": 0.0001, "loss": 7.3495, "loss/crossentropy": 2.17747982442379, "loss/hidden": 3.48046875, "loss/jsd": 0.0, "loss/logits": 0.2016214355826378, "step": 3770 }, { "epoch": 0.0945, "grad_norm": 30.875, "grad_norm_var": 3.088997395833333, "learning_rate": 0.0001, "loss": 7.5384, "loss/crossentropy": 2.179350584745407, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.19142594784498215, "step": 3780 }, { "epoch": 0.09475, "grad_norm": 29.625, "grad_norm_var": 1.1559895833333333, "learning_rate": 0.0001, "loss": 7.4035, "loss/crossentropy": 2.155378046631813, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.19720839541405438, "step": 3790 }, { "epoch": 0.095, "grad_norm": 30.625, "grad_norm_var": 1.1999348958333333, "learning_rate": 0.0001, "loss": 7.4441, "loss/crossentropy": 2.0597486779093743, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.19279775265604257, "step": 3800 }, { "epoch": 0.09525, "grad_norm": 33.5, "grad_norm_var": 2.1666666666666665, "learning_rate": 0.0001, "loss": 7.5146, "loss/crossentropy": 2.1966816753149034, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.20174810625612735, "step": 3810 }, { "epoch": 0.0955, "grad_norm": 31.5, "grad_norm_var": 1.9593098958333333, "learning_rate": 0.0001, "loss": 7.539, "loss/crossentropy": 2.165803623199463, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.1953417781740427, "step": 3820 }, { "epoch": 0.09575, "grad_norm": 32.0, "grad_norm_var": 6.690625, "learning_rate": 0.0001, "loss": 7.514, "loss/crossentropy": 2.0817860513925552, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.20838446952402592, "step": 3830 }, { "epoch": 0.096, "grad_norm": 32.75, "grad_norm_var": 7.6431640625, "learning_rate": 0.0001, "loss": 7.5472, "loss/crossentropy": 2.231910442560911, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.21717903479002415, "step": 3840 }, { "epoch": 0.09625, "grad_norm": 32.0, "grad_norm_var": 16.134375, "learning_rate": 0.0001, "loss": 7.5807, "loss/crossentropy": 2.0746277555823327, "loss/hidden": 3.47578125, "loss/jsd": 0.0, "loss/logits": 0.20851925816386938, "step": 3850 }, { "epoch": 0.0965, "grad_norm": 30.625, "grad_norm_var": 16.132747395833334, "learning_rate": 0.0001, "loss": 7.3749, "loss/crossentropy": 2.1463438466191294, "loss/hidden": 3.356640625, "loss/jsd": 0.0, "loss/logits": 0.19305863380432128, "step": 3860 }, { "epoch": 0.09675, "grad_norm": 32.5, "grad_norm_var": 1.0895182291666667, "learning_rate": 0.0001, "loss": 7.5499, "loss/crossentropy": 2.2108413323760034, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.20310868676751853, "step": 3870 }, { "epoch": 0.097, "grad_norm": 30.75, "grad_norm_var": 1.4559895833333334, "learning_rate": 0.0001, "loss": 7.4788, "loss/crossentropy": 2.0900154620409013, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.18780422061681748, "step": 3880 }, { "epoch": 0.09725, "grad_norm": 30.625, "grad_norm_var": 13.917643229166666, "learning_rate": 0.0001, "loss": 7.4391, "loss/crossentropy": 2.0574848279356956, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.19390027467161416, "step": 3890 }, { "epoch": 0.0975, "grad_norm": 27.375, "grad_norm_var": 13.55, "learning_rate": 0.0001, "loss": 7.4327, "loss/crossentropy": 2.2832688719034193, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.20608801003545524, "step": 3900 }, { "epoch": 0.09775, "grad_norm": 29.0, "grad_norm_var": 3.296875, "learning_rate": 0.0001, "loss": 7.3691, "loss/crossentropy": 1.9183307077735663, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.1917601386550814, "step": 3910 }, { "epoch": 0.098, "grad_norm": 34.0, "grad_norm_var": 3.24765625, "learning_rate": 0.0001, "loss": 7.4628, "loss/crossentropy": 2.0630046002566815, "loss/hidden": 3.338671875, "loss/jsd": 0.0, "loss/logits": 0.1871832549571991, "step": 3920 }, { "epoch": 0.09825, "grad_norm": 31.75, "grad_norm_var": 1.5384765625, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 2.061261148750782, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.18525551967322826, "step": 3930 }, { "epoch": 0.0985, "grad_norm": 29.75, "grad_norm_var": 1.584375, "learning_rate": 0.0001, "loss": 7.5498, "loss/crossentropy": 2.0895790114998816, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.1932330032810569, "step": 3940 }, { "epoch": 0.09875, "grad_norm": 30.625, "grad_norm_var": 25.79765625, "learning_rate": 0.0001, "loss": 7.6502, "loss/crossentropy": 2.1616804771125318, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.18905209768563508, "step": 3950 }, { "epoch": 0.099, "grad_norm": 30.5, "grad_norm_var": 28.547916666666666, "learning_rate": 0.0001, "loss": 7.3334, "loss/crossentropy": 2.1435488507151605, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.1910943292081356, "step": 3960 }, { "epoch": 0.09925, "grad_norm": 32.75, "grad_norm_var": 6.3650390625, "learning_rate": 0.0001, "loss": 7.542, "loss/crossentropy": 2.176460310816765, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.18821860365569593, "step": 3970 }, { "epoch": 0.0995, "grad_norm": 31.625, "grad_norm_var": 3.9905598958333335, "learning_rate": 0.0001, "loss": 7.5231, "loss/crossentropy": 2.2077176332473756, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.21911400128155947, "step": 3980 }, { "epoch": 0.09975, "grad_norm": 31.125, "grad_norm_var": 1.75625, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 2.105836200714111, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.1997914554551244, "step": 3990 }, { "epoch": 0.1, "grad_norm": 38.0, "grad_norm_var": 4.710416666666666, "learning_rate": 0.0001, "loss": 7.5675, "loss/crossentropy": 2.233233967423439, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.20876242108643056, "step": 4000 }, { "epoch": 0.10025, "grad_norm": 28.625, "grad_norm_var": 7.56640625, "learning_rate": 0.0001, "loss": 7.4736, "loss/crossentropy": 2.103509198874235, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.1953927006572485, "step": 4010 }, { "epoch": 0.1005, "grad_norm": 28.875, "grad_norm_var": 4.119791666666667, "learning_rate": 0.0001, "loss": 7.4509, "loss/crossentropy": 1.9697775058448315, "loss/hidden": 3.308984375, "loss/jsd": 0.0, "loss/logits": 0.17186311883851885, "step": 4020 }, { "epoch": 0.10075, "grad_norm": 29.5, "grad_norm_var": 1.3177083333333333, "learning_rate": 0.0001, "loss": 7.333, "loss/crossentropy": 2.0519870311021804, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.1872571600601077, "step": 4030 }, { "epoch": 0.101, "grad_norm": 29.5, "grad_norm_var": 1.2785807291666667, "learning_rate": 0.0001, "loss": 7.3466, "loss/crossentropy": 2.0663713179528713, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.18582073990255593, "step": 4040 }, { "epoch": 0.10125, "grad_norm": 30.375, "grad_norm_var": 1.9577473958333333, "learning_rate": 0.0001, "loss": 7.3812, "loss/crossentropy": 2.1256399258971213, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.19628962082788348, "step": 4050 }, { "epoch": 0.1015, "grad_norm": 30.625, "grad_norm_var": 0.53125, "learning_rate": 0.0001, "loss": 7.3726, "loss/crossentropy": 2.1235328309237955, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.18646292947232723, "step": 4060 }, { "epoch": 0.10175, "grad_norm": 29.0, "grad_norm_var": 3.19255952647709e+18, "learning_rate": 0.0001, "loss": 7.4564, "loss/crossentropy": 2.0213126331567763, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.19607899691909553, "step": 4070 }, { "epoch": 0.102, "grad_norm": 28.75, "grad_norm_var": 3.48515625, "learning_rate": 0.0001, "loss": 7.3886, "loss/crossentropy": 2.0899658009409903, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.1851665174588561, "step": 4080 }, { "epoch": 0.10225, "grad_norm": 29.5, "grad_norm_var": 1.8692057291666666, "learning_rate": 0.0001, "loss": 7.4838, "loss/crossentropy": 2.027493818849325, "loss/hidden": 3.49765625, "loss/jsd": 0.0, "loss/logits": 0.19640162959694862, "step": 4090 }, { "epoch": 0.1025, "grad_norm": 29.125, "grad_norm_var": 11.762434895833334, "learning_rate": 0.0001, "loss": 7.5099, "loss/crossentropy": 2.056584618985653, "loss/hidden": 3.32265625, "loss/jsd": 0.0, "loss/logits": 0.17638762388378382, "step": 4100 }, { "epoch": 0.10275, "grad_norm": 30.125, "grad_norm_var": 12.459375, "learning_rate": 0.0001, "loss": 7.5255, "loss/crossentropy": 2.0713445380330087, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.18587317056953906, "step": 4110 }, { "epoch": 0.103, "grad_norm": 33.25, "grad_norm_var": 1.9958333333333333, "learning_rate": 0.0001, "loss": 7.4437, "loss/crossentropy": 2.2338072419166566, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.18814200926572083, "step": 4120 }, { "epoch": 0.10325, "grad_norm": 32.75, "grad_norm_var": 3.1259765625, "learning_rate": 0.0001, "loss": 7.3184, "loss/crossentropy": 2.0210259817540646, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.18816483654081823, "step": 4130 }, { "epoch": 0.1035, "grad_norm": 29.5, "grad_norm_var": 2.870247395833333, "learning_rate": 0.0001, "loss": 7.5124, "loss/crossentropy": 2.0151045128703116, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.19255878478288652, "step": 4140 }, { "epoch": 0.10375, "grad_norm": 30.625, "grad_norm_var": 1.3926432291666666, "learning_rate": 0.0001, "loss": 7.5096, "loss/crossentropy": 1.9808883003890514, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.19115560222417116, "step": 4150 }, { "epoch": 0.104, "grad_norm": 30.75, "grad_norm_var": 1.6979166666666667, "learning_rate": 0.0001, "loss": 7.549, "loss/crossentropy": 2.1932784736156465, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.20479805655777455, "step": 4160 }, { "epoch": 0.10425, "grad_norm": 30.125, "grad_norm_var": 2.3333333333333335, "learning_rate": 0.0001, "loss": 7.3875, "loss/crossentropy": 1.8820222720503808, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.17310038600116967, "step": 4170 }, { "epoch": 0.1045, "grad_norm": 33.0, "grad_norm_var": 3.7728515625, "learning_rate": 0.0001, "loss": 7.4212, "loss/crossentropy": 2.082476270198822, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.19099258184432982, "step": 4180 }, { "epoch": 0.10475, "grad_norm": 30.875, "grad_norm_var": 11.408268229166667, "learning_rate": 0.0001, "loss": 7.4991, "loss/crossentropy": 2.287242355942726, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.1982285875827074, "step": 4190 }, { "epoch": 0.105, "grad_norm": 28.75, "grad_norm_var": 2.999739583333333, "learning_rate": 0.0001, "loss": 7.5959, "loss/crossentropy": 2.1783332407474516, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.2117959801107645, "step": 4200 }, { "epoch": 0.10525, "grad_norm": 30.0, "grad_norm_var": 4.708268229166666, "learning_rate": 0.0001, "loss": 7.3363, "loss/crossentropy": 1.955865352600813, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.18177355360239744, "step": 4210 }, { "epoch": 0.1055, "grad_norm": 30.625, "grad_norm_var": 3.0254557291666666, "learning_rate": 0.0001, "loss": 7.4673, "loss/crossentropy": 1.833389012515545, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.1878132861107588, "step": 4220 }, { "epoch": 0.10575, "grad_norm": 32.0, "grad_norm_var": 3.05, "learning_rate": 0.0001, "loss": 7.3969, "loss/crossentropy": 1.9096243590116502, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.17025592969730496, "step": 4230 }, { "epoch": 0.106, "grad_norm": 30.875, "grad_norm_var": 1.82265625, "learning_rate": 0.0001, "loss": 7.4638, "loss/crossentropy": 2.0454175233840943, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.20515710916370153, "step": 4240 }, { "epoch": 0.10625, "grad_norm": 30.125, "grad_norm_var": 3.1333333333333333, "learning_rate": 0.0001, "loss": 7.5126, "loss/crossentropy": 2.089062933623791, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.19156677946448325, "step": 4250 }, { "epoch": 0.1065, "grad_norm": 29.0, "grad_norm_var": 4.311393229166667, "learning_rate": 0.0001, "loss": 7.4468, "loss/crossentropy": 2.0564094200730323, "loss/hidden": 3.433984375, "loss/jsd": 0.0, "loss/logits": 0.19553639348596336, "step": 4260 }, { "epoch": 0.10675, "grad_norm": 32.0, "grad_norm_var": 3.2587890625, "learning_rate": 0.0001, "loss": 7.4186, "loss/crossentropy": 2.13806764036417, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.19822277761995793, "step": 4270 }, { "epoch": 0.107, "grad_norm": 28.0, "grad_norm_var": 1.6926432291666667, "learning_rate": 0.0001, "loss": 7.4595, "loss/crossentropy": 2.0767486467957497, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.1884168043732643, "step": 4280 }, { "epoch": 0.10725, "grad_norm": 33.0, "grad_norm_var": 2.3059895833333335, "learning_rate": 0.0001, "loss": 7.4481, "loss/crossentropy": 2.033916361629963, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.20558829829096795, "step": 4290 }, { "epoch": 0.1075, "grad_norm": 31.0, "grad_norm_var": 2.9375, "learning_rate": 0.0001, "loss": 7.4871, "loss/crossentropy": 2.078028707951307, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.188079852424562, "step": 4300 }, { "epoch": 0.10775, "grad_norm": 33.25, "grad_norm_var": 2.1020833333333333, "learning_rate": 0.0001, "loss": 7.5379, "loss/crossentropy": 2.003500834107399, "loss/hidden": 3.544921875, "loss/jsd": 0.0, "loss/logits": 0.20521650360897184, "step": 4310 }, { "epoch": 0.108, "grad_norm": 29.625, "grad_norm_var": 2.8447916666666666, "learning_rate": 0.0001, "loss": 7.3536, "loss/crossentropy": 2.043112625181675, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.19910661596804857, "step": 4320 }, { "epoch": 0.10825, "grad_norm": 28.5, "grad_norm_var": 4.000455729166666, "learning_rate": 0.0001, "loss": 7.3717, "loss/crossentropy": 2.1422011658549307, "loss/hidden": 3.38671875, "loss/jsd": 0.0, "loss/logits": 0.19375871792435645, "step": 4330 }, { "epoch": 0.1085, "grad_norm": 29.0, "grad_norm_var": 3.6259765625, "learning_rate": 0.0001, "loss": 7.5021, "loss/crossentropy": 2.131446525454521, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.2063008865341544, "step": 4340 }, { "epoch": 0.10875, "grad_norm": 32.0, "grad_norm_var": 5.9525390625, "learning_rate": 0.0001, "loss": 7.4749, "loss/crossentropy": 2.085691845417023, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.1889802658930421, "step": 4350 }, { "epoch": 0.109, "grad_norm": 30.75, "grad_norm_var": 3.154166666666667, "learning_rate": 0.0001, "loss": 7.3816, "loss/crossentropy": 1.8972876839339734, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.17174729090183974, "step": 4360 }, { "epoch": 0.10925, "grad_norm": 29.875, "grad_norm_var": 1.7509765625, "learning_rate": 0.0001, "loss": 7.4444, "loss/crossentropy": 2.127763804793358, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.18679574280977249, "step": 4370 }, { "epoch": 0.1095, "grad_norm": 29.875, "grad_norm_var": 2.16015625, "learning_rate": 0.0001, "loss": 7.4682, "loss/crossentropy": 2.1872297644615175, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.183891461789608, "step": 4380 }, { "epoch": 0.10975, "grad_norm": 28.875, "grad_norm_var": 3.3692057291666666, "learning_rate": 0.0001, "loss": 7.429, "loss/crossentropy": 2.19267495572567, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.20111876968294382, "step": 4390 }, { "epoch": 0.11, "grad_norm": 29.375, "grad_norm_var": 1.6858723958333333, "learning_rate": 0.0001, "loss": 7.556, "loss/crossentropy": 2.1324411287903784, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.2090261412784457, "step": 4400 }, { "epoch": 0.11025, "grad_norm": 33.5, "grad_norm_var": 3.374739583333333, "learning_rate": 0.0001, "loss": 7.4081, "loss/crossentropy": 1.9800483137369156, "loss/hidden": 3.584375, "loss/jsd": 0.0, "loss/logits": 0.19881114605814218, "step": 4410 }, { "epoch": 0.1105, "grad_norm": 31.75, "grad_norm_var": 4.13515625, "learning_rate": 0.0001, "loss": 7.4904, "loss/crossentropy": 2.053773292154074, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.18270381446927786, "step": 4420 }, { "epoch": 0.11075, "grad_norm": 30.75, "grad_norm_var": 2.3499348958333335, "learning_rate": 0.0001, "loss": 7.4509, "loss/crossentropy": 2.0641689248383046, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.19245190378278493, "step": 4430 }, { "epoch": 0.111, "grad_norm": 33.25, "grad_norm_var": 3.158333333333333, "learning_rate": 0.0001, "loss": 7.3576, "loss/crossentropy": 2.073286408931017, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.1892416624352336, "step": 4440 }, { "epoch": 0.11125, "grad_norm": 35.75, "grad_norm_var": 6.167122395833333, "learning_rate": 0.0001, "loss": 7.456, "loss/crossentropy": 2.191167525947094, "loss/hidden": 3.3140625, "loss/jsd": 0.0, "loss/logits": 0.19327596500515937, "step": 4450 }, { "epoch": 0.1115, "grad_norm": 28.0, "grad_norm_var": 6.762239583333334, "learning_rate": 0.0001, "loss": 7.4254, "loss/crossentropy": 1.9917161837220192, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.18673346154391765, "step": 4460 }, { "epoch": 0.11175, "grad_norm": 31.0, "grad_norm_var": 2.763541666666667, "learning_rate": 0.0001, "loss": 7.4458, "loss/crossentropy": 2.0167058646678924, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.20151916183531285, "step": 4470 }, { "epoch": 0.112, "grad_norm": 30.5, "grad_norm_var": 7.175455729166667, "learning_rate": 0.0001, "loss": 7.4057, "loss/crossentropy": 2.013149876892567, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.1819242848083377, "step": 4480 }, { "epoch": 0.11225, "grad_norm": 43.25, "grad_norm_var": 13.478580729166667, "learning_rate": 0.0001, "loss": 7.4416, "loss/crossentropy": 2.111778366565704, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.20088088884949684, "step": 4490 }, { "epoch": 0.1125, "grad_norm": 30.125, "grad_norm_var": 11.905143229166667, "learning_rate": 0.0001, "loss": 7.4435, "loss/crossentropy": 2.0223396182060243, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.1967620700597763, "step": 4500 }, { "epoch": 0.11275, "grad_norm": 28.375, "grad_norm_var": 2.2978515625, "learning_rate": 0.0001, "loss": 7.3969, "loss/crossentropy": 1.9966137878596784, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.19062119219452142, "step": 4510 }, { "epoch": 0.113, "grad_norm": 29.75, "grad_norm_var": 3.1759765625, "learning_rate": 0.0001, "loss": 7.2845, "loss/crossentropy": 1.8878834903240205, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.16922880560159684, "step": 4520 }, { "epoch": 0.11325, "grad_norm": 33.5, "grad_norm_var": 3.78515625, "learning_rate": 0.0001, "loss": 7.5223, "loss/crossentropy": 2.0424712359905244, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.18261839263141155, "step": 4530 }, { "epoch": 0.1135, "grad_norm": 41.75, "grad_norm_var": 13.172330729166667, "learning_rate": 0.0001, "loss": 7.4917, "loss/crossentropy": 2.1800880253314974, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.1875661849975586, "step": 4540 }, { "epoch": 0.11375, "grad_norm": 29.5, "grad_norm_var": 13.737239583333333, "learning_rate": 0.0001, "loss": 7.4929, "loss/crossentropy": 2.1130245834589005, "loss/hidden": 3.514453125, "loss/jsd": 0.0, "loss/logits": 0.20742647554725407, "step": 4550 }, { "epoch": 0.114, "grad_norm": 31.875, "grad_norm_var": 3.1447265625, "learning_rate": 0.0001, "loss": 7.4885, "loss/crossentropy": 2.0878429099917413, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.19807947240769863, "step": 4560 }, { "epoch": 0.11425, "grad_norm": 32.0, "grad_norm_var": 1.9080729166666666, "learning_rate": 0.0001, "loss": 7.412, "loss/crossentropy": 2.045598204433918, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.19935160782188177, "step": 4570 }, { "epoch": 0.1145, "grad_norm": 31.25, "grad_norm_var": 2.703285650940459e+18, "learning_rate": 0.0001, "loss": 7.4112, "loss/crossentropy": 1.9612677067518234, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.1939171139150858, "step": 4580 }, { "epoch": 0.11475, "grad_norm": 30.125, "grad_norm_var": 9.067708333333334, "learning_rate": 0.0001, "loss": 7.4109, "loss/crossentropy": 2.066862888634205, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.20057452656328678, "step": 4590 }, { "epoch": 0.115, "grad_norm": 29.25, "grad_norm_var": 6.670833333333333, "learning_rate": 0.0001, "loss": 7.3857, "loss/crossentropy": 2.0378803849220275, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.19217969439923763, "step": 4600 }, { "epoch": 0.11525, "grad_norm": 32.0, "grad_norm_var": 8.108268229166667, "learning_rate": 0.0001, "loss": 7.4449, "loss/crossentropy": 1.9883966132998467, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.1796421378850937, "step": 4610 }, { "epoch": 0.1155, "grad_norm": 28.5, "grad_norm_var": 2.8853515625, "learning_rate": 0.0001, "loss": 7.43, "loss/crossentropy": 2.2122382700443266, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.20737907551229, "step": 4620 }, { "epoch": 0.11575, "grad_norm": 30.375, "grad_norm_var": 3.7968098958333334, "learning_rate": 0.0001, "loss": 7.3858, "loss/crossentropy": 2.0896764233708383, "loss/hidden": 3.540234375, "loss/jsd": 0.0, "loss/logits": 0.20905990786850454, "step": 4630 }, { "epoch": 0.116, "grad_norm": 27.5, "grad_norm_var": 3.6879557291666667, "learning_rate": 0.0001, "loss": 7.5145, "loss/crossentropy": 2.104724445939064, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.19548750538378953, "step": 4640 }, { "epoch": 0.11625, "grad_norm": 29.875, "grad_norm_var": 8.7056640625, "learning_rate": 0.0001, "loss": 7.4009, "loss/crossentropy": 2.155320603400469, "loss/hidden": 3.47578125, "loss/jsd": 0.0, "loss/logits": 0.2002986514940858, "step": 4650 }, { "epoch": 0.1165, "grad_norm": 27.0, "grad_norm_var": 5.1541015625, "learning_rate": 0.0001, "loss": 7.3193, "loss/crossentropy": 2.085461828112602, "loss/hidden": 3.38671875, "loss/jsd": 0.0, "loss/logits": 0.1905359473079443, "step": 4660 }, { "epoch": 0.11675, "grad_norm": 30.5, "grad_norm_var": 1.5926432291666666, "learning_rate": 0.0001, "loss": 7.3125, "loss/crossentropy": 1.9927285239100456, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.17640038076788186, "step": 4670 }, { "epoch": 0.117, "grad_norm": 33.75, "grad_norm_var": 4.747330729166666, "learning_rate": 0.0001, "loss": 7.469, "loss/crossentropy": 2.1633560836315153, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.1862495567649603, "step": 4680 }, { "epoch": 0.11725, "grad_norm": 28.25, "grad_norm_var": 7.198372395833333, "learning_rate": 0.0001, "loss": 7.4318, "loss/crossentropy": 2.2390024289488792, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.2097862558439374, "step": 4690 }, { "epoch": 0.1175, "grad_norm": 31.375, "grad_norm_var": 5.760872395833333, "learning_rate": 0.0001, "loss": 7.4669, "loss/crossentropy": 2.0608770951628683, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.19615320730954408, "step": 4700 }, { "epoch": 0.11775, "grad_norm": 34.25, "grad_norm_var": 4.1894735190686346e+18, "learning_rate": 0.0001, "loss": 7.4596, "loss/crossentropy": 2.0900899082422257, "loss/hidden": 3.360546875, "loss/jsd": 0.0, "loss/logits": 0.17933723451569678, "step": 4710 }, { "epoch": 0.118, "grad_norm": 29.625, "grad_norm_var": 58.10729166666667, "learning_rate": 0.0001, "loss": 7.3979, "loss/crossentropy": 2.094898019731045, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.20720194689929486, "step": 4720 }, { "epoch": 0.11825, "grad_norm": 30.25, "grad_norm_var": 1.98515625, "learning_rate": 0.0001, "loss": 7.4519, "loss/crossentropy": 2.083225329220295, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.20777787994593383, "step": 4730 }, { "epoch": 0.1185, "grad_norm": 30.375, "grad_norm_var": 4.818684895833333, "learning_rate": 0.0001, "loss": 7.4795, "loss/crossentropy": 2.1974314540624618, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.19978385213762523, "step": 4740 }, { "epoch": 0.11875, "grad_norm": 32.5, "grad_norm_var": 3.439322916666667, "learning_rate": 0.0001, "loss": 7.3843, "loss/crossentropy": 1.9562335655093193, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.18924889974296094, "step": 4750 }, { "epoch": 0.119, "grad_norm": 30.625, "grad_norm_var": 1.3015402743274143e+18, "learning_rate": 0.0001, "loss": 7.5729, "loss/crossentropy": 2.0693807609379293, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.18801879994571208, "step": 4760 }, { "epoch": 0.11925, "grad_norm": 35.25, "grad_norm_var": 258.8791015625, "learning_rate": 0.0001, "loss": 7.3013, "loss/crossentropy": 2.0631250627338886, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.18974527437239885, "step": 4770 }, { "epoch": 0.1195, "grad_norm": 28.625, "grad_norm_var": 301.52233072916664, "learning_rate": 0.0001, "loss": 7.4639, "loss/crossentropy": 2.1473939388990404, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.19722200892865657, "step": 4780 }, { "epoch": 0.11975, "grad_norm": 31.125, "grad_norm_var": 25.472330729166668, "learning_rate": 0.0001, "loss": 7.3161, "loss/crossentropy": 2.1767601929605007, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.20041130091995002, "step": 4790 }, { "epoch": 0.12, "grad_norm": 29.5, "grad_norm_var": 2.8580729166666665, "learning_rate": 0.0001, "loss": 7.3077, "loss/crossentropy": 2.0214909121394156, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.19553480856120586, "step": 4800 }, { "epoch": 0.12025, "grad_norm": 34.25, "grad_norm_var": 2.3666015625, "learning_rate": 0.0001, "loss": 7.4537, "loss/crossentropy": 2.092876334488392, "loss/hidden": 3.276171875, "loss/jsd": 0.0, "loss/logits": 0.19079044535756112, "step": 4810 }, { "epoch": 0.1205, "grad_norm": 28.75, "grad_norm_var": 2.1494140625, "learning_rate": 0.0001, "loss": 7.3579, "loss/crossentropy": 2.159788618981838, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.20938555523753166, "step": 4820 }, { "epoch": 0.12075, "grad_norm": 31.625, "grad_norm_var": 1.2635411529466906e+18, "learning_rate": 0.0001, "loss": 7.3822, "loss/crossentropy": 2.221826246380806, "loss/hidden": 3.3140625, "loss/jsd": 0.0, "loss/logits": 0.18899439387023448, "step": 4830 }, { "epoch": 0.121, "grad_norm": 29.375, "grad_norm_var": 7.171875, "learning_rate": 0.0001, "loss": 7.3649, "loss/crossentropy": 2.2076950490474703, "loss/hidden": 3.321875, "loss/jsd": 0.0, "loss/logits": 0.1911212421953678, "step": 4840 }, { "epoch": 0.12125, "grad_norm": 28.875, "grad_norm_var": 5.397916666666666, "learning_rate": 0.0001, "loss": 7.2934, "loss/crossentropy": 2.1398009806871414, "loss/hidden": 3.276953125, "loss/jsd": 0.0, "loss/logits": 0.18104367554187775, "step": 4850 }, { "epoch": 0.1215, "grad_norm": 33.25, "grad_norm_var": 2.292122395833333, "learning_rate": 0.0001, "loss": 7.3944, "loss/crossentropy": 2.0568679124116898, "loss/hidden": 3.31953125, "loss/jsd": 0.0, "loss/logits": 0.19066975675523282, "step": 4860 }, { "epoch": 0.12175, "grad_norm": 31.75, "grad_norm_var": 1.5145182291666666, "learning_rate": 0.0001, "loss": 7.5365, "loss/crossentropy": 2.2600763499736787, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.20988074019551278, "step": 4870 }, { "epoch": 0.122, "grad_norm": 30.125, "grad_norm_var": 0.8442057291666667, "learning_rate": 0.0001, "loss": 7.4425, "loss/crossentropy": 2.087808459997177, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.20126468148082494, "step": 4880 }, { "epoch": 0.12225, "grad_norm": 29.25, "grad_norm_var": 1.9455729166666667, "learning_rate": 0.0001, "loss": 7.4649, "loss/crossentropy": 2.089573635160923, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.18984669484198094, "step": 4890 }, { "epoch": 0.1225, "grad_norm": 29.125, "grad_norm_var": 2.7552083333333335, "learning_rate": 0.0001, "loss": 7.4894, "loss/crossentropy": 2.1424145482480528, "loss/hidden": 3.47890625, "loss/jsd": 0.0, "loss/logits": 0.20886036530137062, "step": 4900 }, { "epoch": 0.12275, "grad_norm": 31.0, "grad_norm_var": 4.751497395833334, "learning_rate": 0.0001, "loss": 7.5033, "loss/crossentropy": 2.104494086652994, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.1945918256416917, "step": 4910 }, { "epoch": 0.123, "grad_norm": 28.125, "grad_norm_var": 5.330989583333333, "learning_rate": 0.0001, "loss": 7.4954, "loss/crossentropy": 2.0843611776828768, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.1925347488373518, "step": 4920 }, { "epoch": 0.12325, "grad_norm": 28.625, "grad_norm_var": 3.8166015625, "learning_rate": 0.0001, "loss": 7.4404, "loss/crossentropy": 2.205425333976746, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.18580489940941333, "step": 4930 }, { "epoch": 0.1235, "grad_norm": 29.375, "grad_norm_var": 14.980208333333334, "learning_rate": 0.0001, "loss": 7.3481, "loss/crossentropy": 1.9896500617265702, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.1904701752588153, "step": 4940 }, { "epoch": 0.12375, "grad_norm": 32.75, "grad_norm_var": 19.178580729166665, "learning_rate": 0.0001, "loss": 7.5252, "loss/crossentropy": 2.1207278318703175, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.19760717861354352, "step": 4950 }, { "epoch": 0.124, "grad_norm": 32.5, "grad_norm_var": 17.264583333333334, "learning_rate": 0.0001, "loss": 7.2678, "loss/crossentropy": 1.9271991185843944, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.19860625620931388, "step": 4960 }, { "epoch": 0.12425, "grad_norm": 28.625, "grad_norm_var": 11.196809895833333, "learning_rate": 0.0001, "loss": 7.3703, "loss/crossentropy": 2.0659097760915754, "loss/hidden": 3.287109375, "loss/jsd": 0.0, "loss/logits": 0.18224728610366583, "step": 4970 }, { "epoch": 0.1245, "grad_norm": 37.75, "grad_norm_var": 10.03515625, "learning_rate": 0.0001, "loss": 7.5041, "loss/crossentropy": 1.9809176340699195, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19965030066668987, "step": 4980 }, { "epoch": 0.12475, "grad_norm": 27.125, "grad_norm_var": 11.567708333333334, "learning_rate": 0.0001, "loss": 7.327, "loss/crossentropy": 2.0197409205138683, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.18525638189166785, "step": 4990 }, { "epoch": 0.125, "grad_norm": 34.75, "grad_norm_var": 8.558268229166666, "learning_rate": 0.0001, "loss": 7.393, "loss/crossentropy": 2.100055608153343, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.19607669236138464, "step": 5000 }, { "epoch": 0.12525, "grad_norm": 29.75, "grad_norm_var": 5.602083333333334, "learning_rate": 0.0001, "loss": 7.4165, "loss/crossentropy": 1.9898378394544125, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.18131834492087365, "step": 5010 }, { "epoch": 0.1255, "grad_norm": 34.75, "grad_norm_var": 5.866666666666666, "learning_rate": 0.0001, "loss": 7.555, "loss/crossentropy": 2.086017055809498, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.1889553153887391, "step": 5020 }, { "epoch": 0.12575, "grad_norm": 33.25, "grad_norm_var": 9.083268229166666, "learning_rate": 0.0001, "loss": 7.4098, "loss/crossentropy": 2.1133529357612133, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.1881294794380665, "step": 5030 }, { "epoch": 0.126, "grad_norm": 37.25, "grad_norm_var": 14.268489583333333, "learning_rate": 0.0001, "loss": 7.4117, "loss/crossentropy": 2.1818468660116195, "loss/hidden": 3.3390625, "loss/jsd": 0.0, "loss/logits": 0.19351670220494271, "step": 5040 }, { "epoch": 0.12625, "grad_norm": 27.75, "grad_norm_var": 15.270572916666667, "learning_rate": 0.0001, "loss": 7.448, "loss/crossentropy": 2.133790023624897, "loss/hidden": 3.267578125, "loss/jsd": 0.0, "loss/logits": 0.17900315206497908, "step": 5050 }, { "epoch": 0.1265, "grad_norm": 27.75, "grad_norm_var": 12.469791666666667, "learning_rate": 0.0001, "loss": 7.4514, "loss/crossentropy": 2.013399636745453, "loss/hidden": 3.502734375, "loss/jsd": 0.0, "loss/logits": 0.19984339475631713, "step": 5060 }, { "epoch": 0.12675, "grad_norm": 28.625, "grad_norm_var": 6.479622395833333, "learning_rate": 0.0001, "loss": 7.4411, "loss/crossentropy": 2.1552533119916917, "loss/hidden": 3.4265625, "loss/jsd": 0.0, "loss/logits": 0.20999168753623962, "step": 5070 }, { "epoch": 0.127, "grad_norm": 30.125, "grad_norm_var": 4.280989583333334, "learning_rate": 0.0001, "loss": 7.4238, "loss/crossentropy": 2.1047082796692846, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.19757428932935, "step": 5080 }, { "epoch": 0.12725, "grad_norm": 29.25, "grad_norm_var": 3.971875, "learning_rate": 0.0001, "loss": 7.547, "loss/crossentropy": 2.2064288735389708, "loss/hidden": 3.472265625, "loss/jsd": 0.0, "loss/logits": 0.2037733059376478, "step": 5090 }, { "epoch": 0.1275, "grad_norm": 30.125, "grad_norm_var": 3.809309895833333, "learning_rate": 0.0001, "loss": 7.5359, "loss/crossentropy": 2.307460626959801, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.2208320491015911, "step": 5100 }, { "epoch": 0.12775, "grad_norm": 30.0, "grad_norm_var": 6.887434895833334, "learning_rate": 0.0001, "loss": 7.3695, "loss/crossentropy": 2.1241589702665804, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.18603499811142682, "step": 5110 }, { "epoch": 0.128, "grad_norm": 30.125, "grad_norm_var": 1.8353515625, "learning_rate": 0.0001, "loss": 7.4045, "loss/crossentropy": 2.1248120576143266, "loss/hidden": 3.364453125, "loss/jsd": 0.0, "loss/logits": 0.19218573588877916, "step": 5120 }, { "epoch": 0.12825, "grad_norm": 30.875, "grad_norm_var": 18.843489583333334, "learning_rate": 0.0001, "loss": 7.3683, "loss/crossentropy": 2.0221078641712666, "loss/hidden": 3.396484375, "loss/jsd": 0.0, "loss/logits": 0.19441262539476156, "step": 5130 }, { "epoch": 0.1285, "grad_norm": 30.25, "grad_norm_var": 19.755989583333335, "learning_rate": 0.0001, "loss": 7.4467, "loss/crossentropy": 2.0746863678097727, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.20940047055482863, "step": 5140 }, { "epoch": 0.12875, "grad_norm": 29.75, "grad_norm_var": 7.226497395833333, "learning_rate": 0.0001, "loss": 7.4125, "loss/crossentropy": 2.127023458480835, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.19320496991276742, "step": 5150 }, { "epoch": 0.129, "grad_norm": 30.875, "grad_norm_var": 8.332747395833334, "learning_rate": 0.0001, "loss": 7.3609, "loss/crossentropy": 2.0404578357934953, "loss/hidden": 3.271484375, "loss/jsd": 0.0, "loss/logits": 0.18524497244507074, "step": 5160 }, { "epoch": 0.12925, "grad_norm": 30.375, "grad_norm_var": 5.566080729166667, "learning_rate": 0.0001, "loss": 7.3806, "loss/crossentropy": 2.05174797475338, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.20412184661254287, "step": 5170 }, { "epoch": 0.1295, "grad_norm": 30.875, "grad_norm_var": 72.65201822916667, "learning_rate": 0.0001, "loss": 7.4596, "loss/crossentropy": 2.0945761643350123, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.1920377543196082, "step": 5180 }, { "epoch": 0.12975, "grad_norm": 34.25, "grad_norm_var": 1.8330729166666666, "learning_rate": 0.0001, "loss": 7.3953, "loss/crossentropy": 2.0848742216825484, "loss/hidden": 3.548046875, "loss/jsd": 0.0, "loss/logits": 0.22283064387738705, "step": 5190 }, { "epoch": 0.13, "grad_norm": 31.25, "grad_norm_var": 2.4244140625, "learning_rate": 0.0001, "loss": 7.5061, "loss/crossentropy": 1.997230054438114, "loss/hidden": 3.488671875, "loss/jsd": 0.0, "loss/logits": 0.198976163379848, "step": 5200 }, { "epoch": 0.13025, "grad_norm": 42.0, "grad_norm_var": 9.762239583333333, "learning_rate": 0.0001, "loss": 7.523, "loss/crossentropy": 2.169138702750206, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.2180183682590723, "step": 5210 }, { "epoch": 0.1305, "grad_norm": 27.75, "grad_norm_var": 13.330989583333333, "learning_rate": 0.0001, "loss": 7.4174, "loss/crossentropy": 2.0436717979609966, "loss/hidden": 3.4265625, "loss/jsd": 0.0, "loss/logits": 0.18874377477914095, "step": 5220 }, { "epoch": 0.13075, "grad_norm": 33.5, "grad_norm_var": 5.186393229166667, "learning_rate": 0.0001, "loss": 7.4735, "loss/crossentropy": 2.1061771392822264, "loss/hidden": 3.3125, "loss/jsd": 0.0, "loss/logits": 0.1841479053720832, "step": 5230 }, { "epoch": 0.131, "grad_norm": 28.25, "grad_norm_var": 3.480208333333333, "learning_rate": 0.0001, "loss": 7.4379, "loss/crossentropy": 1.9957973182201385, "loss/hidden": 3.46171875, "loss/jsd": 0.0, "loss/logits": 0.19976749327033758, "step": 5240 }, { "epoch": 0.13125, "grad_norm": 31.0, "grad_norm_var": 2.2249348958333335, "learning_rate": 0.0001, "loss": 7.445, "loss/crossentropy": 2.089694794267416, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.18123079631477595, "step": 5250 }, { "epoch": 0.1315, "grad_norm": 30.625, "grad_norm_var": 2.062239583333333, "learning_rate": 0.0001, "loss": 7.3525, "loss/crossentropy": 2.096596322953701, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.17838086038827897, "step": 5260 }, { "epoch": 0.13175, "grad_norm": 27.75, "grad_norm_var": 2.6619140625, "learning_rate": 0.0001, "loss": 7.4042, "loss/crossentropy": 2.086874121427536, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.18623477015644313, "step": 5270 }, { "epoch": 0.132, "grad_norm": 31.125, "grad_norm_var": 1.8416015625, "learning_rate": 0.0001, "loss": 7.3634, "loss/crossentropy": 2.131754931807518, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.20106223467737436, "step": 5280 }, { "epoch": 0.13225, "grad_norm": 32.5, "grad_norm_var": 6.598372395833334, "learning_rate": 0.0001, "loss": 7.4969, "loss/crossentropy": 2.1548122704029082, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.20439809635281564, "step": 5290 }, { "epoch": 0.1325, "grad_norm": 28.125, "grad_norm_var": 5.252083333333333, "learning_rate": 0.0001, "loss": 7.4373, "loss/crossentropy": 2.1770398393273354, "loss/hidden": 3.40078125, "loss/jsd": 0.0, "loss/logits": 0.19656166546046733, "step": 5300 }, { "epoch": 0.13275, "grad_norm": 29.875, "grad_norm_var": 1.2473307291666667, "learning_rate": 0.0001, "loss": 7.4308, "loss/crossentropy": 2.152033807337284, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.18985964702442287, "step": 5310 }, { "epoch": 0.133, "grad_norm": 31.625, "grad_norm_var": 1.0041666666666667, "learning_rate": 0.0001, "loss": 7.4327, "loss/crossentropy": 2.087932828068733, "loss/hidden": 3.3390625, "loss/jsd": 0.0, "loss/logits": 0.184254783205688, "step": 5320 }, { "epoch": 0.13325, "grad_norm": 31.0, "grad_norm_var": 2.4344770491390623e+18, "learning_rate": 0.0001, "loss": 7.3366, "loss/crossentropy": 2.034941144287586, "loss/hidden": 3.38359375, "loss/jsd": 0.0, "loss/logits": 0.1904723599553108, "step": 5330 }, { "epoch": 0.1335, "grad_norm": 31.5, "grad_norm_var": 6.255989583333333, "learning_rate": 0.0001, "loss": 7.4492, "loss/crossentropy": 2.152811796963215, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.18841406889259815, "step": 5340 }, { "epoch": 0.13375, "grad_norm": 31.625, "grad_norm_var": 1.6822916666666667, "learning_rate": 0.0001, "loss": 7.465, "loss/crossentropy": 2.1877569228410723, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.19606791157275438, "step": 5350 }, { "epoch": 0.134, "grad_norm": 30.25, "grad_norm_var": 2.17890625, "learning_rate": 0.0001, "loss": 7.3975, "loss/crossentropy": 2.0353255167603495, "loss/hidden": 3.340234375, "loss/jsd": 0.0, "loss/logits": 0.1815076546743512, "step": 5360 }, { "epoch": 0.13425, "grad_norm": 31.0, "grad_norm_var": 12.4525390625, "learning_rate": 0.0001, "loss": 7.3462, "loss/crossentropy": 1.919140312820673, "loss/hidden": 3.370703125, "loss/jsd": 0.0, "loss/logits": 0.18229803508147596, "step": 5370 }, { "epoch": 0.1345, "grad_norm": 29.625, "grad_norm_var": 7.26640625, "learning_rate": 0.0001, "loss": 7.506, "loss/crossentropy": 2.11019846200943, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.19277678560465575, "step": 5380 }, { "epoch": 0.13475, "grad_norm": 32.0, "grad_norm_var": 0.9275390625, "learning_rate": 0.0001, "loss": 7.3668, "loss/crossentropy": 2.1108837127685547, "loss/hidden": 3.493359375, "loss/jsd": 0.0, "loss/logits": 0.20013203900307416, "step": 5390 }, { "epoch": 0.135, "grad_norm": 28.125, "grad_norm_var": 1.7330729166666667, "learning_rate": 0.0001, "loss": 7.5524, "loss/crossentropy": 2.16382010653615, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.1924815428443253, "step": 5400 }, { "epoch": 0.13525, "grad_norm": 30.25, "grad_norm_var": 2.5893229166666667, "learning_rate": 0.0001, "loss": 7.5393, "loss/crossentropy": 2.0622613176703455, "loss/hidden": 3.498828125, "loss/jsd": 0.0, "loss/logits": 0.20035731326788664, "step": 5410 }, { "epoch": 0.1355, "grad_norm": 29.625, "grad_norm_var": 2.8499348958333335, "learning_rate": 0.0001, "loss": 7.3466, "loss/crossentropy": 2.169532992690802, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.2085475005209446, "step": 5420 }, { "epoch": 0.13575, "grad_norm": 31.0, "grad_norm_var": 2.8212890625, "learning_rate": 0.0001, "loss": 7.4299, "loss/crossentropy": 2.056758251786232, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.19217969793826342, "step": 5430 }, { "epoch": 0.136, "grad_norm": 32.25, "grad_norm_var": 3.9749348958333335, "learning_rate": 0.0001, "loss": 7.4334, "loss/crossentropy": 2.1805212616920473, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.21980819348245859, "step": 5440 }, { "epoch": 0.13625, "grad_norm": 29.5, "grad_norm_var": 2.0218098958333335, "learning_rate": 0.0001, "loss": 7.5382, "loss/crossentropy": 2.1516773015260697, "loss/hidden": 3.50703125, "loss/jsd": 0.0, "loss/logits": 0.208776849322021, "step": 5450 }, { "epoch": 0.1365, "grad_norm": 31.375, "grad_norm_var": 2.082747395833333, "learning_rate": 0.0001, "loss": 7.4684, "loss/crossentropy": 2.1602507561445234, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.1842126866802573, "step": 5460 }, { "epoch": 0.13675, "grad_norm": 31.25, "grad_norm_var": 2.6197265625, "learning_rate": 0.0001, "loss": 7.3823, "loss/crossentropy": 2.081377077102661, "loss/hidden": 3.38359375, "loss/jsd": 0.0, "loss/logits": 0.1906685210764408, "step": 5470 }, { "epoch": 0.137, "grad_norm": 30.875, "grad_norm_var": 2.21875, "learning_rate": 0.0001, "loss": 7.4128, "loss/crossentropy": 2.138934540748596, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.18631890565156936, "step": 5480 }, { "epoch": 0.13725, "grad_norm": 31.0, "grad_norm_var": 3.958268229166667, "learning_rate": 0.0001, "loss": 7.568, "loss/crossentropy": 2.02208868265152, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.19863407909870148, "step": 5490 }, { "epoch": 0.1375, "grad_norm": 33.0, "grad_norm_var": 6.1775390625, "learning_rate": 0.0001, "loss": 7.3806, "loss/crossentropy": 2.0247954726219177, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.18538292730227113, "step": 5500 }, { "epoch": 0.13775, "grad_norm": 30.625, "grad_norm_var": 6.076041666666667, "learning_rate": 0.0001, "loss": 7.6241, "loss/crossentropy": 2.2269895624369385, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.197488261340186, "step": 5510 }, { "epoch": 0.138, "grad_norm": 30.25, "grad_norm_var": 4.1947265625, "learning_rate": 0.0001, "loss": 7.4968, "loss/crossentropy": 2.077942840754986, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.19135653134435415, "step": 5520 }, { "epoch": 0.13825, "grad_norm": 27.75, "grad_norm_var": 2.701041666666667, "learning_rate": 0.0001, "loss": 7.3842, "loss/crossentropy": 2.104434663057327, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.18928063409402968, "step": 5530 }, { "epoch": 0.1385, "grad_norm": 31.5, "grad_norm_var": 6.4087890625, "learning_rate": 0.0001, "loss": 7.443, "loss/crossentropy": 2.0420807294547556, "loss/hidden": 3.362890625, "loss/jsd": 0.0, "loss/logits": 0.18584198467433452, "step": 5540 }, { "epoch": 0.13875, "grad_norm": 32.5, "grad_norm_var": 4.7556640625, "learning_rate": 0.0001, "loss": 7.418, "loss/crossentropy": 2.0941856279969215, "loss/hidden": 3.311328125, "loss/jsd": 0.0, "loss/logits": 0.18290557386353612, "step": 5550 }, { "epoch": 0.139, "grad_norm": 34.0, "grad_norm_var": 3.0947916666666666, "learning_rate": 0.0001, "loss": 7.3594, "loss/crossentropy": 2.1482032746076585, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19914243686944247, "step": 5560 }, { "epoch": 0.13925, "grad_norm": 31.875, "grad_norm_var": 4.449739583333334, "learning_rate": 0.0001, "loss": 7.5652, "loss/crossentropy": 2.150991679728031, "loss/hidden": 3.4984375, "loss/jsd": 0.0, "loss/logits": 0.20609580241143705, "step": 5570 }, { "epoch": 0.1395, "grad_norm": 30.125, "grad_norm_var": 8.527083333333334, "learning_rate": 0.0001, "loss": 7.3112, "loss/crossentropy": 2.1712302803993224, "loss/hidden": 3.38203125, "loss/jsd": 0.0, "loss/logits": 0.18877983894199132, "step": 5580 }, { "epoch": 0.13975, "grad_norm": 28.5, "grad_norm_var": 2.1197265625, "learning_rate": 0.0001, "loss": 7.4033, "loss/crossentropy": 2.1502134561538697, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.21474836114794016, "step": 5590 }, { "epoch": 0.14, "grad_norm": 30.25, "grad_norm_var": 3.6809895833333335, "learning_rate": 0.0001, "loss": 7.5089, "loss/crossentropy": 2.230164831876755, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.2127472611144185, "step": 5600 }, { "epoch": 0.14025, "grad_norm": 30.625, "grad_norm_var": 52.13333333333333, "learning_rate": 0.0001, "loss": 7.5328, "loss/crossentropy": 2.1207681491971018, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.20428987089544534, "step": 5610 }, { "epoch": 0.1405, "grad_norm": 30.625, "grad_norm_var": 2.037239583333333, "learning_rate": 0.0001, "loss": 7.3972, "loss/crossentropy": 2.065328547358513, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.19245364069938659, "step": 5620 }, { "epoch": 0.14075, "grad_norm": 29.375, "grad_norm_var": 2.40625, "learning_rate": 0.0001, "loss": 7.2803, "loss/crossentropy": 2.0791175961494446, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.1857742078602314, "step": 5630 }, { "epoch": 0.141, "grad_norm": 29.0, "grad_norm_var": 2.687239583333333, "learning_rate": 0.0001, "loss": 7.3324, "loss/crossentropy": 2.054654690623283, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.18879605047404766, "step": 5640 }, { "epoch": 0.14125, "grad_norm": 27.625, "grad_norm_var": 3.06015625, "learning_rate": 0.0001, "loss": 7.4804, "loss/crossentropy": 2.163857588917017, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.2018537001684308, "step": 5650 }, { "epoch": 0.1415, "grad_norm": 31.625, "grad_norm_var": 3.54140625, "learning_rate": 0.0001, "loss": 7.4337, "loss/crossentropy": 2.1230690620839594, "loss/hidden": 3.33046875, "loss/jsd": 0.0, "loss/logits": 0.18610329292714595, "step": 5660 }, { "epoch": 0.14175, "grad_norm": 30.0, "grad_norm_var": 2.0171223958333333, "learning_rate": 0.0001, "loss": 7.484, "loss/crossentropy": 2.1232656478881835, "loss/hidden": 3.306640625, "loss/jsd": 0.0, "loss/logits": 0.18209880087524652, "step": 5670 }, { "epoch": 0.142, "grad_norm": 30.625, "grad_norm_var": 3.6244140625, "learning_rate": 0.0001, "loss": 7.3601, "loss/crossentropy": 1.9925632011145353, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.18384792990982532, "step": 5680 }, { "epoch": 0.14225, "grad_norm": 28.875, "grad_norm_var": 4.595247395833334, "learning_rate": 0.0001, "loss": 7.3492, "loss/crossentropy": 1.9747695334255695, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.18710496351122857, "step": 5690 }, { "epoch": 0.1425, "grad_norm": 30.0, "grad_norm_var": 5.160872395833334, "learning_rate": 0.0001, "loss": 7.1869, "loss/crossentropy": 1.9229816131293773, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.1764959843829274, "step": 5700 }, { "epoch": 0.14275, "grad_norm": 31.375, "grad_norm_var": 1.1177083333333333, "learning_rate": 0.0001, "loss": 7.5353, "loss/crossentropy": 2.1759460479021073, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.2011772884055972, "step": 5710 }, { "epoch": 0.143, "grad_norm": 30.875, "grad_norm_var": 19.32265625, "learning_rate": 0.0001, "loss": 7.4364, "loss/crossentropy": 1.9983633741736413, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.2036839971318841, "step": 5720 }, { "epoch": 0.14325, "grad_norm": 31.75, "grad_norm_var": 6.330989583333333, "learning_rate": 0.0001, "loss": 7.3473, "loss/crossentropy": 2.2608693316578865, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.20173839703202248, "step": 5730 }, { "epoch": 0.1435, "grad_norm": 29.0, "grad_norm_var": 30.8306640625, "learning_rate": 0.0001, "loss": 7.4861, "loss/crossentropy": 2.191919285058975, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.1848000530153513, "step": 5740 }, { "epoch": 0.14375, "grad_norm": 32.5, "grad_norm_var": 8.167643229166666, "learning_rate": 0.0001, "loss": 7.5179, "loss/crossentropy": 2.098912109434605, "loss/hidden": 3.544921875, "loss/jsd": 0.0, "loss/logits": 0.22364525627344847, "step": 5750 }, { "epoch": 0.144, "grad_norm": 30.5, "grad_norm_var": 2.5479166666666666, "learning_rate": 0.0001, "loss": 7.4241, "loss/crossentropy": 2.089163874089718, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.19125983892008663, "step": 5760 }, { "epoch": 0.14425, "grad_norm": 29.875, "grad_norm_var": 8.223958333333334, "learning_rate": 0.0001, "loss": 7.45, "loss/crossentropy": 2.2600366115570067, "loss/hidden": 3.444140625, "loss/jsd": 0.0, "loss/logits": 0.1984367400407791, "step": 5770 }, { "epoch": 0.1445, "grad_norm": 32.0, "grad_norm_var": 14.806705729166667, "learning_rate": 0.0001, "loss": 7.3632, "loss/crossentropy": 1.9510320864617825, "loss/hidden": 3.347265625, "loss/jsd": 0.0, "loss/logits": 0.18547183061018585, "step": 5780 }, { "epoch": 0.14475, "grad_norm": 30.0, "grad_norm_var": 9.655989583333334, "learning_rate": 0.0001, "loss": 7.5503, "loss/crossentropy": 2.143619356304407, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.20010631643235682, "step": 5790 }, { "epoch": 0.145, "grad_norm": 28.375, "grad_norm_var": 9.556184895833333, "learning_rate": 0.0001, "loss": 7.5076, "loss/crossentropy": 1.9316529139876366, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.18306834027171134, "step": 5800 }, { "epoch": 0.14525, "grad_norm": 37.25, "grad_norm_var": 10.718489583333334, "learning_rate": 0.0001, "loss": 7.5004, "loss/crossentropy": 2.136544609069824, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.20682645812630654, "step": 5810 }, { "epoch": 0.1455, "grad_norm": 34.75, "grad_norm_var": 9.395572916666667, "learning_rate": 0.0001, "loss": 7.4109, "loss/crossentropy": 2.0811544865369798, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.187607554346323, "step": 5820 }, { "epoch": 0.14575, "grad_norm": 30.125, "grad_norm_var": 11.476041666666667, "learning_rate": 0.0001, "loss": 7.4615, "loss/crossentropy": 2.2464685887098312, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.21834317222237587, "step": 5830 }, { "epoch": 0.146, "grad_norm": 31.875, "grad_norm_var": 8.106705729166666, "learning_rate": 0.0001, "loss": 7.379, "loss/crossentropy": 2.1494223892688753, "loss/hidden": 3.4875, "loss/jsd": 0.0, "loss/logits": 0.20599585752934219, "step": 5840 }, { "epoch": 0.14625, "grad_norm": 32.25, "grad_norm_var": 119.0541015625, "learning_rate": 0.0001, "loss": 7.4223, "loss/crossentropy": 2.013238602876663, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.17934355642646552, "step": 5850 }, { "epoch": 0.1465, "grad_norm": 55.5, "grad_norm_var": 40.619205729166666, "learning_rate": 0.0001, "loss": 7.4766, "loss/crossentropy": 2.1309464499354362, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.1953899236395955, "step": 5860 }, { "epoch": 0.14675, "grad_norm": 31.125, "grad_norm_var": 51.764322916666664, "learning_rate": 0.0001, "loss": 7.5385, "loss/crossentropy": 2.203585295379162, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.20035494081676006, "step": 5870 }, { "epoch": 0.147, "grad_norm": 30.125, "grad_norm_var": 8.2541015625, "learning_rate": 0.0001, "loss": 7.3861, "loss/crossentropy": 2.057890709489584, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.18680873457342387, "step": 5880 }, { "epoch": 0.14725, "grad_norm": 30.25, "grad_norm_var": 4.555989583333333, "learning_rate": 0.0001, "loss": 7.4283, "loss/crossentropy": 2.049139867722988, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.1856512013822794, "step": 5890 }, { "epoch": 0.1475, "grad_norm": 30.375, "grad_norm_var": 10.570768229166667, "learning_rate": 0.0001, "loss": 7.4651, "loss/crossentropy": 2.0553019613027574, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.18676785845309496, "step": 5900 }, { "epoch": 0.14775, "grad_norm": 30.125, "grad_norm_var": 14.59140625, "learning_rate": 0.0001, "loss": 7.4727, "loss/crossentropy": 2.0098409935832025, "loss/hidden": 3.48125, "loss/jsd": 0.0, "loss/logits": 0.20080684809945523, "step": 5910 }, { "epoch": 0.148, "grad_norm": 28.625, "grad_norm_var": 8.489322916666667, "learning_rate": 0.0001, "loss": 7.4203, "loss/crossentropy": 2.2615666806697847, "loss/hidden": 3.3, "loss/jsd": 0.0, "loss/logits": 0.18850413355976342, "step": 5920 }, { "epoch": 0.14825, "grad_norm": 30.0, "grad_norm_var": 3.6233723958333335, "learning_rate": 0.0001, "loss": 7.4441, "loss/crossentropy": 2.178256964683533, "loss/hidden": 3.29609375, "loss/jsd": 0.0, "loss/logits": 0.19248567353934048, "step": 5930 }, { "epoch": 0.1485, "grad_norm": 31.0, "grad_norm_var": 3.455208333333333, "learning_rate": 0.0001, "loss": 7.4573, "loss/crossentropy": 2.246034747362137, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.2096735591068864, "step": 5940 }, { "epoch": 0.14875, "grad_norm": 30.625, "grad_norm_var": 3.8494140625, "learning_rate": 0.0001, "loss": 7.4811, "loss/crossentropy": 2.180899788439274, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.19460927378386259, "step": 5950 }, { "epoch": 0.149, "grad_norm": 30.5, "grad_norm_var": 3.4385416666666666, "learning_rate": 0.0001, "loss": 7.3829, "loss/crossentropy": 2.258976912498474, "loss/hidden": 3.332421875, "loss/jsd": 0.0, "loss/logits": 0.19133044108748437, "step": 5960 }, { "epoch": 0.14925, "grad_norm": 29.0, "grad_norm_var": 13.80390625, "learning_rate": 0.0001, "loss": 7.5384, "loss/crossentropy": 2.012222741544247, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.19505210760980846, "step": 5970 }, { "epoch": 0.1495, "grad_norm": 27.875, "grad_norm_var": 12.9369140625, "learning_rate": 0.0001, "loss": 7.4033, "loss/crossentropy": 2.0392286255955696, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.1930427584797144, "step": 5980 }, { "epoch": 0.14975, "grad_norm": 28.25, "grad_norm_var": 18.174739583333334, "learning_rate": 0.0001, "loss": 7.399, "loss/crossentropy": 1.9029529005289079, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.198425155505538, "step": 5990 }, { "epoch": 0.15, "grad_norm": 31.875, "grad_norm_var": 3.0737770860662226e+18, "learning_rate": 0.0001, "loss": 7.4994, "loss/crossentropy": 1.8985859856009484, "loss/hidden": 3.4609375, "loss/jsd": 0.0, "loss/logits": 0.1951824951916933, "step": 6000 }, { "epoch": 0.15025, "grad_norm": 36.0, "grad_norm_var": 3.073777086665239e+18, "learning_rate": 0.0001, "loss": 7.4659, "loss/crossentropy": 2.097201645374298, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.18253911342471837, "step": 6010 }, { "epoch": 0.1505, "grad_norm": 27.875, "grad_norm_var": 6.801041666666666, "learning_rate": 0.0001, "loss": 7.2415, "loss/crossentropy": 2.0210610911250115, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.197306059114635, "step": 6020 }, { "epoch": 0.15075, "grad_norm": 31.0, "grad_norm_var": 14.46640625, "learning_rate": 0.0001, "loss": 7.4519, "loss/crossentropy": 2.1985476523637772, "loss/hidden": 3.475, "loss/jsd": 0.0, "loss/logits": 0.20262509360909461, "step": 6030 }, { "epoch": 0.151, "grad_norm": 30.625, "grad_norm_var": 6.254622395833334, "learning_rate": 0.0001, "loss": 7.3353, "loss/crossentropy": 2.0093181416392327, "loss/hidden": 3.316015625, "loss/jsd": 0.0, "loss/logits": 0.17620250331237913, "step": 6040 }, { "epoch": 0.15125, "grad_norm": 31.625, "grad_norm_var": 56.4291015625, "learning_rate": 0.0001, "loss": 7.4034, "loss/crossentropy": 2.177773226797581, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.20441538300365208, "step": 6050 }, { "epoch": 0.1515, "grad_norm": 26.75, "grad_norm_var": 55.889322916666664, "learning_rate": 0.0001, "loss": 7.3245, "loss/crossentropy": 2.1666259437799456, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.19311951845884323, "step": 6060 }, { "epoch": 0.15175, "grad_norm": 30.25, "grad_norm_var": 91.0103515625, "learning_rate": 0.0001, "loss": 7.368, "loss/crossentropy": 2.063462796807289, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.1834208857268095, "step": 6070 }, { "epoch": 0.152, "grad_norm": 30.5, "grad_norm_var": 18.1212890625, "learning_rate": 0.0001, "loss": 7.4335, "loss/crossentropy": 1.9907098844647408, "loss/hidden": 3.53671875, "loss/jsd": 0.0, "loss/logits": 0.20707368329167367, "step": 6080 }, { "epoch": 0.15225, "grad_norm": 38.25, "grad_norm_var": 11.470247395833333, "learning_rate": 0.0001, "loss": 7.3789, "loss/crossentropy": 2.083692157268524, "loss/hidden": 3.465625, "loss/jsd": 0.0, "loss/logits": 0.1846569798886776, "step": 6090 }, { "epoch": 0.1525, "grad_norm": 28.25, "grad_norm_var": 21.829622395833333, "learning_rate": 0.0001, "loss": 7.3767, "loss/crossentropy": 2.1113929279148578, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.1990992769598961, "step": 6100 }, { "epoch": 0.15275, "grad_norm": 33.75, "grad_norm_var": 20.676497395833334, "learning_rate": 0.0001, "loss": 7.339, "loss/crossentropy": 2.1296695113182067, "loss/hidden": 3.385546875, "loss/jsd": 0.0, "loss/logits": 0.1936045665293932, "step": 6110 }, { "epoch": 0.153, "grad_norm": 36.25, "grad_norm_var": 7.8166015625, "learning_rate": 0.0001, "loss": 7.5027, "loss/crossentropy": 2.1011226207017897, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.20695240292698144, "step": 6120 }, { "epoch": 0.15325, "grad_norm": 30.625, "grad_norm_var": 2.1304840750224113e+18, "learning_rate": 0.0001, "loss": 7.506, "loss/crossentropy": 2.2427969723939896, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.1966634625568986, "step": 6130 }, { "epoch": 0.1535, "grad_norm": 34.25, "grad_norm_var": 36.542643229166664, "learning_rate": 0.0001, "loss": 7.4413, "loss/crossentropy": 2.0855264641344546, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.19436217453330756, "step": 6140 }, { "epoch": 0.15375, "grad_norm": 32.75, "grad_norm_var": 10.153580729166666, "learning_rate": 0.0001, "loss": 7.3096, "loss/crossentropy": 2.0322439685463904, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.1753252800554037, "step": 6150 }, { "epoch": 0.154, "grad_norm": 31.0, "grad_norm_var": 10.216080729166666, "learning_rate": 0.0001, "loss": 7.2481, "loss/crossentropy": 2.074477408081293, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.17578690703958272, "step": 6160 }, { "epoch": 0.15425, "grad_norm": 33.25, "grad_norm_var": 28.79765625, "learning_rate": 0.0001, "loss": 7.4403, "loss/crossentropy": 2.0863804474473, "loss/hidden": 3.431640625, "loss/jsd": 0.0, "loss/logits": 0.20699662044644357, "step": 6170 }, { "epoch": 0.1545, "grad_norm": 33.5, "grad_norm_var": 24.84375, "learning_rate": 0.0001, "loss": 7.4609, "loss/crossentropy": 2.0696858704090118, "loss/hidden": 3.458984375, "loss/jsd": 0.0, "loss/logits": 0.2128069180995226, "step": 6180 }, { "epoch": 0.15475, "grad_norm": 31.375, "grad_norm_var": 5.099739583333333, "learning_rate": 0.0001, "loss": 7.2575, "loss/crossentropy": 2.182169410586357, "loss/hidden": 3.31796875, "loss/jsd": 0.0, "loss/logits": 0.181897877715528, "step": 6190 }, { "epoch": 0.155, "grad_norm": 34.25, "grad_norm_var": 4.699934895833334, "learning_rate": 0.0001, "loss": 7.4975, "loss/crossentropy": 2.165008749067783, "loss/hidden": 3.492578125, "loss/jsd": 0.0, "loss/logits": 0.19800901636481286, "step": 6200 }, { "epoch": 0.15525, "grad_norm": 33.75, "grad_norm_var": 2.7738932291666667, "learning_rate": 0.0001, "loss": 7.3992, "loss/crossentropy": 2.0653695166110992, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.19180236533284187, "step": 6210 }, { "epoch": 0.1555, "grad_norm": 35.25, "grad_norm_var": 5.31015625, "learning_rate": 0.0001, "loss": 7.3638, "loss/crossentropy": 2.1244568385183813, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.18238217020407319, "step": 6220 }, { "epoch": 0.15575, "grad_norm": 32.25, "grad_norm_var": 3.2817057291666667, "learning_rate": 0.0001, "loss": 7.5073, "loss/crossentropy": 2.1881898671388624, "loss/hidden": 3.498828125, "loss/jsd": 0.0, "loss/logits": 0.2045454490929842, "step": 6230 }, { "epoch": 0.156, "grad_norm": 28.25, "grad_norm_var": 2.6572265625, "learning_rate": 0.0001, "loss": 7.4609, "loss/crossentropy": 2.14600064009428, "loss/hidden": 3.4265625, "loss/jsd": 0.0, "loss/logits": 0.18945380430668593, "step": 6240 }, { "epoch": 0.15625, "grad_norm": 30.75, "grad_norm_var": 35.73118489583333, "learning_rate": 0.0001, "loss": 7.3786, "loss/crossentropy": 2.168429624289274, "loss/hidden": 3.296484375, "loss/jsd": 0.0, "loss/logits": 0.18439108245074748, "step": 6250 }, { "epoch": 0.1565, "grad_norm": 52.5, "grad_norm_var": 64.9962890625, "learning_rate": 0.0001, "loss": 7.3511, "loss/crossentropy": 2.1293379329144955, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.1828605517745018, "step": 6260 }, { "epoch": 0.15675, "grad_norm": 29.25, "grad_norm_var": 59.703125, "learning_rate": 0.0001, "loss": 7.3978, "loss/crossentropy": 1.8641120925545693, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.18577109538018705, "step": 6270 }, { "epoch": 0.157, "grad_norm": 28.75, "grad_norm_var": 32.1994140625, "learning_rate": 0.0001, "loss": 7.4066, "loss/crossentropy": 2.0997040398418902, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.18688563201576472, "step": 6280 }, { "epoch": 0.15725, "grad_norm": 31.375, "grad_norm_var": 17.302018229166666, "learning_rate": 0.0001, "loss": 7.4043, "loss/crossentropy": 1.9712626039981842, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.20054549565538765, "step": 6290 }, { "epoch": 0.1575, "grad_norm": 31.5, "grad_norm_var": 17.8431640625, "learning_rate": 0.0001, "loss": 7.4502, "loss/crossentropy": 2.0252815186977386, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.18488127905875446, "step": 6300 }, { "epoch": 0.15775, "grad_norm": 30.75, "grad_norm_var": 7.995572916666666, "learning_rate": 0.0001, "loss": 7.3829, "loss/crossentropy": 2.030302118510008, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.19101340658962726, "step": 6310 }, { "epoch": 0.158, "grad_norm": 30.125, "grad_norm_var": 5.805143229166666, "learning_rate": 0.0001, "loss": 7.3852, "loss/crossentropy": 1.9795936658978461, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.19110800279304385, "step": 6320 }, { "epoch": 0.15825, "grad_norm": 34.0, "grad_norm_var": 6.91640625, "learning_rate": 0.0001, "loss": 7.4417, "loss/crossentropy": 2.0620448149740698, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.194018579646945, "step": 6330 }, { "epoch": 0.1585, "grad_norm": 28.125, "grad_norm_var": 31.058268229166668, "learning_rate": 0.0001, "loss": 7.4142, "loss/crossentropy": 2.012200343608856, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.21260247621685266, "step": 6340 }, { "epoch": 0.15875, "grad_norm": 36.75, "grad_norm_var": 35.18118489583333, "learning_rate": 0.0001, "loss": 7.3776, "loss/crossentropy": 1.9757203698158263, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.200297892652452, "step": 6350 }, { "epoch": 0.159, "grad_norm": 31.125, "grad_norm_var": 17.764583333333334, "learning_rate": 0.0001, "loss": 7.4762, "loss/crossentropy": 2.195678301155567, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.1989523505792022, "step": 6360 }, { "epoch": 0.15925, "grad_norm": 29.625, "grad_norm_var": 12.851041666666667, "learning_rate": 0.0001, "loss": 7.4661, "loss/crossentropy": 2.0537394002079963, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.20311654023826123, "step": 6370 }, { "epoch": 0.1595, "grad_norm": 30.0, "grad_norm_var": 10.0994140625, "learning_rate": 0.0001, "loss": 7.2759, "loss/crossentropy": 2.02838040292263, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.19148585237562657, "step": 6380 }, { "epoch": 0.15975, "grad_norm": 39.0, "grad_norm_var": 2324.6707682291667, "learning_rate": 0.0001, "loss": 7.3973, "loss/crossentropy": 2.0951177358627318, "loss/hidden": 3.430859375, "loss/jsd": 0.0, "loss/logits": 0.2158473737537861, "step": 6390 }, { "epoch": 0.16, "grad_norm": 40.0, "grad_norm_var": 21.121809895833334, "learning_rate": 0.0001, "loss": 7.2877, "loss/crossentropy": 1.878954614698887, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.18514612764120103, "step": 6400 }, { "epoch": 0.16025, "grad_norm": 33.5, "grad_norm_var": 23.1666015625, "learning_rate": 0.0001, "loss": 7.3598, "loss/crossentropy": 2.123918867111206, "loss/hidden": 3.3140625, "loss/jsd": 0.0, "loss/logits": 0.18728599287569522, "step": 6410 }, { "epoch": 0.1605, "grad_norm": 29.25, "grad_norm_var": 11.230143229166666, "learning_rate": 0.0001, "loss": 7.476, "loss/crossentropy": 2.168968527019024, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.20153266489505767, "step": 6420 }, { "epoch": 0.16075, "grad_norm": 30.0, "grad_norm_var": 90.8056640625, "learning_rate": 0.0001, "loss": 7.4289, "loss/crossentropy": 2.0426762118935584, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.19033107869327068, "step": 6430 }, { "epoch": 0.161, "grad_norm": 38.25, "grad_norm_var": 15.570247395833333, "learning_rate": 0.0001, "loss": 7.3992, "loss/crossentropy": 2.0535445332527162, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.19420330366119742, "step": 6440 }, { "epoch": 0.16125, "grad_norm": 45.5, "grad_norm_var": 30.326822916666668, "learning_rate": 0.0001, "loss": 7.3881, "loss/crossentropy": 1.949498599767685, "loss/hidden": 3.337890625, "loss/jsd": 0.0, "loss/logits": 0.18119702748954297, "step": 6450 }, { "epoch": 0.1615, "grad_norm": 30.625, "grad_norm_var": 96.64837239583333, "learning_rate": 0.0001, "loss": 7.432, "loss/crossentropy": 2.2534308552742006, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.20010371711105107, "step": 6460 }, { "epoch": 0.16175, "grad_norm": 34.0, "grad_norm_var": 82.66015625, "learning_rate": 0.0001, "loss": 7.3555, "loss/crossentropy": 2.114310759305954, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.19822147954255342, "step": 6470 }, { "epoch": 0.162, "grad_norm": 29.875, "grad_norm_var": 8.4541015625, "learning_rate": 0.0001, "loss": 7.1893, "loss/crossentropy": 2.062894639372826, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.17771479729562997, "step": 6480 }, { "epoch": 0.16225, "grad_norm": 30.375, "grad_norm_var": 15.72265625, "learning_rate": 0.0001, "loss": 7.3665, "loss/crossentropy": 2.0109994761645793, "loss/hidden": 3.45625, "loss/jsd": 0.0, "loss/logits": 0.19815738410688938, "step": 6490 }, { "epoch": 0.1625, "grad_norm": 33.75, "grad_norm_var": 162.74680989583334, "learning_rate": 0.0001, "loss": 7.4105, "loss/crossentropy": 2.100720961391926, "loss/hidden": 3.26484375, "loss/jsd": 0.0, "loss/logits": 0.18080853056162596, "step": 6500 }, { "epoch": 0.16275, "grad_norm": 36.0, "grad_norm_var": 10.530143229166667, "learning_rate": 0.0001, "loss": 7.4993, "loss/crossentropy": 2.208073277771473, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.18481182418763636, "step": 6510 }, { "epoch": 0.163, "grad_norm": 37.0, "grad_norm_var": 8.981705729166666, "learning_rate": 0.0001, "loss": 7.5196, "loss/crossentropy": 2.2666310742497444, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.20527655016630889, "step": 6520 }, { "epoch": 0.16325, "grad_norm": 29.625, "grad_norm_var": 13.959830729166667, "learning_rate": 0.0001, "loss": 7.4239, "loss/crossentropy": 2.2184250116348267, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.19118925426155328, "step": 6530 }, { "epoch": 0.1635, "grad_norm": 29.375, "grad_norm_var": 8.820833333333333, "learning_rate": 0.0001, "loss": 7.3298, "loss/crossentropy": 2.119840921461582, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.20046985391527414, "step": 6540 }, { "epoch": 0.16375, "grad_norm": 31.375, "grad_norm_var": 3.1910807291666665, "learning_rate": 0.0001, "loss": 7.4507, "loss/crossentropy": 2.109931045770645, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.18326662238687277, "step": 6550 }, { "epoch": 0.164, "grad_norm": 35.25, "grad_norm_var": 8.6625, "learning_rate": 0.0001, "loss": 7.43, "loss/crossentropy": 2.090756069123745, "loss/hidden": 3.348046875, "loss/jsd": 0.0, "loss/logits": 0.1922204466536641, "step": 6560 }, { "epoch": 0.16425, "grad_norm": 30.625, "grad_norm_var": 14.567643229166666, "learning_rate": 0.0001, "loss": 7.1796, "loss/crossentropy": 1.9266313910484314, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.20681370329111814, "step": 6570 }, { "epoch": 0.1645, "grad_norm": 37.5, "grad_norm_var": 12.9447265625, "learning_rate": 0.0001, "loss": 7.5097, "loss/crossentropy": 1.875116103887558, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.2045787101611495, "step": 6580 }, { "epoch": 0.16475, "grad_norm": 29.875, "grad_norm_var": 5.706184895833333, "learning_rate": 0.0001, "loss": 7.329, "loss/crossentropy": 2.116366655379534, "loss/hidden": 3.4546875, "loss/jsd": 0.0, "loss/logits": 0.18577212654054165, "step": 6590 }, { "epoch": 0.165, "grad_norm": 33.75, "grad_norm_var": 2.3650390625, "learning_rate": 0.0001, "loss": 7.3765, "loss/crossentropy": 2.0037689693272114, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.1971780034713447, "step": 6600 }, { "epoch": 0.16525, "grad_norm": 29.625, "grad_norm_var": 4.601497395833333, "learning_rate": 0.0001, "loss": 7.4001, "loss/crossentropy": 2.1523181863129137, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.192273567058146, "step": 6610 }, { "epoch": 0.1655, "grad_norm": 29.875, "grad_norm_var": 7.6525390625, "learning_rate": 0.0001, "loss": 7.3804, "loss/crossentropy": 2.0919234342873096, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.20757663380354643, "step": 6620 }, { "epoch": 0.16575, "grad_norm": 31.875, "grad_norm_var": 6.917708333333334, "learning_rate": 0.0001, "loss": 7.3889, "loss/crossentropy": 2.035097151994705, "loss/hidden": 3.31484375, "loss/jsd": 0.0, "loss/logits": 0.19357213731855155, "step": 6630 }, { "epoch": 0.166, "grad_norm": 28.125, "grad_norm_var": 2.4607245906905574e+18, "learning_rate": 0.0001, "loss": 7.5149, "loss/crossentropy": 2.114539227634668, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.19631449952721597, "step": 6640 }, { "epoch": 0.16625, "grad_norm": 28.375, "grad_norm_var": 2.4607245908931773e+18, "learning_rate": 0.0001, "loss": 7.3157, "loss/crossentropy": 2.0170676171779633, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.1893145913258195, "step": 6650 }, { "epoch": 0.1665, "grad_norm": 28.0, "grad_norm_var": 32.25807291666667, "learning_rate": 0.0001, "loss": 7.2947, "loss/crossentropy": 1.9412188947200775, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.18349691890180111, "step": 6660 }, { "epoch": 0.16675, "grad_norm": 30.25, "grad_norm_var": 48.2375, "learning_rate": 0.0001, "loss": 7.4086, "loss/crossentropy": 2.1157006829977036, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.1936411712318659, "step": 6670 }, { "epoch": 0.167, "grad_norm": 29.75, "grad_norm_var": 36.18333333333333, "learning_rate": 0.0001, "loss": 7.4096, "loss/crossentropy": 1.9384170174598694, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.1831628430634737, "step": 6680 }, { "epoch": 0.16725, "grad_norm": 30.0, "grad_norm_var": 62.6150390625, "learning_rate": 0.0001, "loss": 7.4604, "loss/crossentropy": 2.152873657643795, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.20049556214362382, "step": 6690 }, { "epoch": 0.1675, "grad_norm": 29.75, "grad_norm_var": 28.671875, "learning_rate": 0.0001, "loss": 7.5739, "loss/crossentropy": 2.1935679107904433, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.20188184324651956, "step": 6700 }, { "epoch": 0.16775, "grad_norm": 28.25, "grad_norm_var": 2.278580729166667, "learning_rate": 0.0001, "loss": 7.4709, "loss/crossentropy": 2.062030902504921, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.18874304387718438, "step": 6710 }, { "epoch": 0.168, "grad_norm": 29.125, "grad_norm_var": 3.3150390625, "learning_rate": 0.0001, "loss": 7.3655, "loss/crossentropy": 1.999978879839182, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.18526637642644345, "step": 6720 }, { "epoch": 0.16825, "grad_norm": 35.25, "grad_norm_var": 6.237239583333333, "learning_rate": 0.0001, "loss": 7.4037, "loss/crossentropy": 2.0561595499515533, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.20046296287328005, "step": 6730 }, { "epoch": 0.1685, "grad_norm": 28.75, "grad_norm_var": 5.5619140625, "learning_rate": 0.0001, "loss": 7.368, "loss/crossentropy": 2.0664093092083933, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.1885912848636508, "step": 6740 }, { "epoch": 0.16875, "grad_norm": 30.875, "grad_norm_var": 5.843489583333334, "learning_rate": 0.0001, "loss": 7.4899, "loss/crossentropy": 2.1205774366855623, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.21412673257291318, "step": 6750 }, { "epoch": 0.169, "grad_norm": 32.0, "grad_norm_var": 6.3775390625, "learning_rate": 0.0001, "loss": 7.3708, "loss/crossentropy": 2.0314668610692026, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.18550706487149, "step": 6760 }, { "epoch": 0.16925, "grad_norm": 32.5, "grad_norm_var": 7.378059895833333, "learning_rate": 0.0001, "loss": 7.4055, "loss/crossentropy": 2.1428691864013674, "loss/hidden": 3.506640625, "loss/jsd": 0.0, "loss/logits": 0.21024896781891583, "step": 6770 }, { "epoch": 0.1695, "grad_norm": 31.375, "grad_norm_var": 3.9791666666666665, "learning_rate": 0.0001, "loss": 7.4444, "loss/crossentropy": 1.9500044576823712, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.19028451843187213, "step": 6780 }, { "epoch": 0.16975, "grad_norm": 29.75, "grad_norm_var": 3.2853515625, "learning_rate": 0.0001, "loss": 7.4369, "loss/crossentropy": 2.1563921123743057, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.18290520180016756, "step": 6790 }, { "epoch": 0.17, "grad_norm": 38.0, "grad_norm_var": 8.0947265625, "learning_rate": 0.0001, "loss": 7.3756, "loss/crossentropy": 2.133736363053322, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.1851572971791029, "step": 6800 }, { "epoch": 0.17025, "grad_norm": 34.75, "grad_norm_var": 14.757747395833333, "learning_rate": 0.0001, "loss": 7.2181, "loss/crossentropy": 2.054655596613884, "loss/hidden": 3.349609375, "loss/jsd": 0.0, "loss/logits": 0.18349520340561867, "step": 6810 }, { "epoch": 0.1705, "grad_norm": 30.375, "grad_norm_var": 5.17890625, "learning_rate": 0.0001, "loss": 7.303, "loss/crossentropy": 2.024763736128807, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.19044207576662303, "step": 6820 }, { "epoch": 0.17075, "grad_norm": 32.25, "grad_norm_var": 3.7955729166666665, "learning_rate": 0.0001, "loss": 7.4884, "loss/crossentropy": 1.9924081854522229, "loss/hidden": 3.49609375, "loss/jsd": 0.0, "loss/logits": 0.20656490996479987, "step": 6830 }, { "epoch": 0.171, "grad_norm": 30.875, "grad_norm_var": 3.215559895833333, "learning_rate": 0.0001, "loss": 7.4845, "loss/crossentropy": 2.1104799427092074, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.18316805781796575, "step": 6840 }, { "epoch": 0.17125, "grad_norm": 31.625, "grad_norm_var": 25.610872395833333, "learning_rate": 0.0001, "loss": 7.3771, "loss/crossentropy": 2.02793128117919, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.17605492258444427, "step": 6850 }, { "epoch": 0.1715, "grad_norm": 29.375, "grad_norm_var": 34.992122395833334, "learning_rate": 0.0001, "loss": 7.5157, "loss/crossentropy": 2.1305345237255096, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.19113806802779437, "step": 6860 }, { "epoch": 0.17175, "grad_norm": 28.375, "grad_norm_var": 12.389322916666666, "learning_rate": 0.0001, "loss": 7.2752, "loss/crossentropy": 2.0788576349616052, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.19617441901937127, "step": 6870 }, { "epoch": 0.172, "grad_norm": 27.625, "grad_norm_var": 17.758072916666666, "learning_rate": 0.0001, "loss": 7.4366, "loss/crossentropy": 2.068412736058235, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.18466003462672234, "step": 6880 }, { "epoch": 0.17225, "grad_norm": 30.375, "grad_norm_var": 19.836393229166667, "learning_rate": 0.0001, "loss": 7.5047, "loss/crossentropy": 2.07881121635437, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.19641269743442535, "step": 6890 }, { "epoch": 0.1725, "grad_norm": 29.625, "grad_norm_var": 14.70625, "learning_rate": 0.0001, "loss": 7.4086, "loss/crossentropy": 1.84702168405056, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.1932983512058854, "step": 6900 }, { "epoch": 0.17275, "grad_norm": 31.875, "grad_norm_var": 31.006705729166665, "learning_rate": 0.0001, "loss": 7.3715, "loss/crossentropy": 2.099086304008961, "loss/hidden": 3.406640625, "loss/jsd": 0.0, "loss/logits": 0.20250021573156118, "step": 6910 }, { "epoch": 0.173, "grad_norm": 29.0, "grad_norm_var": 26.07265625, "learning_rate": 0.0001, "loss": 7.3343, "loss/crossentropy": 2.0957569405436516, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.2010388659313321, "step": 6920 }, { "epoch": 0.17325, "grad_norm": 33.0, "grad_norm_var": 3.753059895833333, "learning_rate": 0.0001, "loss": 7.4937, "loss/crossentropy": 2.071177572757006, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.19133195597678423, "step": 6930 }, { "epoch": 0.1735, "grad_norm": 31.125, "grad_norm_var": 5.88515625, "learning_rate": 0.0001, "loss": 7.4757, "loss/crossentropy": 2.0803056344389916, "loss/hidden": 3.51875, "loss/jsd": 0.0, "loss/logits": 0.20995833892375232, "step": 6940 }, { "epoch": 0.17375, "grad_norm": 28.5, "grad_norm_var": 7.328059895833333, "learning_rate": 0.0001, "loss": 7.2293, "loss/crossentropy": 1.9285863403230905, "loss/hidden": 3.283203125, "loss/jsd": 0.0, "loss/logits": 0.1665677004493773, "step": 6950 }, { "epoch": 0.174, "grad_norm": 27.375, "grad_norm_var": 11.585872395833333, "learning_rate": 0.0001, "loss": 7.313, "loss/crossentropy": 2.0258478805422784, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.1821833540685475, "step": 6960 }, { "epoch": 0.17425, "grad_norm": 30.5, "grad_norm_var": 10.760416666666666, "learning_rate": 0.0001, "loss": 7.5413, "loss/crossentropy": 2.1308654129505156, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.21524183861911297, "step": 6970 }, { "epoch": 0.1745, "grad_norm": 29.625, "grad_norm_var": 4.138541666666667, "learning_rate": 0.0001, "loss": 7.4692, "loss/crossentropy": 2.1182237058877944, "loss/hidden": 3.362890625, "loss/jsd": 0.0, "loss/logits": 0.18946228343993426, "step": 6980 }, { "epoch": 0.17475, "grad_norm": 31.0, "grad_norm_var": 5.499934895833333, "learning_rate": 0.0001, "loss": 7.4658, "loss/crossentropy": 2.0863646306097507, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.2054815970361233, "step": 6990 }, { "epoch": 0.175, "grad_norm": 31.375, "grad_norm_var": 1.7067057291666667, "learning_rate": 0.0001, "loss": 7.3196, "loss/crossentropy": 2.1002516582608224, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.19273097421973945, "step": 7000 }, { "epoch": 0.17525, "grad_norm": 31.25, "grad_norm_var": 1.7999348958333334, "learning_rate": 0.0001, "loss": 7.3643, "loss/crossentropy": 2.015849883854389, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.18935495987534523, "step": 7010 }, { "epoch": 0.1755, "grad_norm": 28.875, "grad_norm_var": 3.3645833333333335, "learning_rate": 0.0001, "loss": 7.3973, "loss/crossentropy": 2.072261115908623, "loss/hidden": 3.482421875, "loss/jsd": 0.0, "loss/logits": 0.19815812185406684, "step": 7020 }, { "epoch": 0.17575, "grad_norm": 29.375, "grad_norm_var": 8.442708333333334, "learning_rate": 0.0001, "loss": 7.4193, "loss/crossentropy": 2.1367180705070496, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.1970324844121933, "step": 7030 }, { "epoch": 0.176, "grad_norm": 30.375, "grad_norm_var": 5.426822916666667, "learning_rate": 0.0001, "loss": 7.518, "loss/crossentropy": 2.210773140192032, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.1955376474186778, "step": 7040 }, { "epoch": 0.17625, "grad_norm": 30.125, "grad_norm_var": 3.1791015625, "learning_rate": 0.0001, "loss": 7.3883, "loss/crossentropy": 2.1343745410442354, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.19158909022808074, "step": 7050 }, { "epoch": 0.1765, "grad_norm": 29.25, "grad_norm_var": 4.112434895833333, "learning_rate": 0.0001, "loss": 7.4451, "loss/crossentropy": 1.9646480686962604, "loss/hidden": 3.465234375, "loss/jsd": 0.0, "loss/logits": 0.19925388041883707, "step": 7060 }, { "epoch": 0.17675, "grad_norm": 31.5, "grad_norm_var": 3.3275390625, "learning_rate": 0.0001, "loss": 7.4874, "loss/crossentropy": 2.1882633604109287, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.19836742132902146, "step": 7070 }, { "epoch": 0.177, "grad_norm": 34.75, "grad_norm_var": 5.7291015625, "learning_rate": 0.0001, "loss": 7.4907, "loss/crossentropy": 2.2362487465143204, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.19153916742652655, "step": 7080 }, { "epoch": 0.17725, "grad_norm": 29.5, "grad_norm_var": 5.002083333333333, "learning_rate": 0.0001, "loss": 7.4459, "loss/crossentropy": 2.164398466050625, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.20325577780604362, "step": 7090 }, { "epoch": 0.1775, "grad_norm": 32.25, "grad_norm_var": 21.282747395833333, "learning_rate": 0.0001, "loss": 7.4933, "loss/crossentropy": 2.1975876331329345, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.19638306740671396, "step": 7100 }, { "epoch": 0.17775, "grad_norm": 28.5, "grad_norm_var": 32.18020833333333, "learning_rate": 0.0001, "loss": 7.358, "loss/crossentropy": 1.9753169894218445, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.17774544414132834, "step": 7110 }, { "epoch": 0.178, "grad_norm": 30.0, "grad_norm_var": 11.808333333333334, "learning_rate": 0.0001, "loss": 7.4718, "loss/crossentropy": 2.0591190218925477, "loss/hidden": 3.4578125, "loss/jsd": 0.0, "loss/logits": 0.19963474106043577, "step": 7120 }, { "epoch": 0.17825, "grad_norm": 29.25, "grad_norm_var": 7.753580729166667, "learning_rate": 0.0001, "loss": 7.3979, "loss/crossentropy": 2.062809920310974, "loss/hidden": 3.437109375, "loss/jsd": 0.0, "loss/logits": 0.19096513669937848, "step": 7130 }, { "epoch": 0.1785, "grad_norm": 30.125, "grad_norm_var": 6.6025390625, "learning_rate": 0.0001, "loss": 7.3036, "loss/crossentropy": 2.0240534149110316, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.18496394343674183, "step": 7140 }, { "epoch": 0.17875, "grad_norm": 31.375, "grad_norm_var": 2.4942057291666666, "learning_rate": 0.0001, "loss": 7.4778, "loss/crossentropy": 2.124583348631859, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.2003519142046571, "step": 7150 }, { "epoch": 0.179, "grad_norm": 29.0, "grad_norm_var": 13.8978515625, "learning_rate": 0.0001, "loss": 7.3866, "loss/crossentropy": 2.035899819433689, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.18300745636224747, "step": 7160 }, { "epoch": 0.17925, "grad_norm": 28.125, "grad_norm_var": 18.375455729166667, "learning_rate": 0.0001, "loss": 7.3499, "loss/crossentropy": 2.086082286387682, "loss/hidden": 3.26875, "loss/jsd": 0.0, "loss/logits": 0.17294995756819845, "step": 7170 }, { "epoch": 0.1795, "grad_norm": 28.5, "grad_norm_var": 22.834830729166665, "learning_rate": 0.0001, "loss": 7.4265, "loss/crossentropy": 2.0105697728693483, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.19119318593293427, "step": 7180 }, { "epoch": 0.17975, "grad_norm": 33.0, "grad_norm_var": 19.51640625, "learning_rate": 0.0001, "loss": 7.4722, "loss/crossentropy": 2.1506593719124796, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.19607089888304471, "step": 7190 }, { "epoch": 0.18, "grad_norm": 31.5, "grad_norm_var": 9.854622395833333, "learning_rate": 0.0001, "loss": 7.3205, "loss/crossentropy": 2.0767677523195744, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.19086614530533552, "step": 7200 }, { "epoch": 0.18025, "grad_norm": 27.625, "grad_norm_var": 8.885416666666666, "learning_rate": 0.0001, "loss": 7.2908, "loss/crossentropy": 2.230179136991501, "loss/hidden": 3.26953125, "loss/jsd": 0.0, "loss/logits": 0.18500201255083085, "step": 7210 }, { "epoch": 0.1805, "grad_norm": 38.25, "grad_norm_var": 8.60625, "learning_rate": 0.0001, "loss": 7.4664, "loss/crossentropy": 2.192136238515377, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.1965922711417079, "step": 7220 }, { "epoch": 0.18075, "grad_norm": 33.0, "grad_norm_var": 7.258333333333334, "learning_rate": 0.0001, "loss": 7.3796, "loss/crossentropy": 2.010923378914595, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.19027266185730696, "step": 7230 }, { "epoch": 0.181, "grad_norm": 39.75, "grad_norm_var": 11.3353515625, "learning_rate": 0.0001, "loss": 7.5537, "loss/crossentropy": 2.055556283891201, "loss/hidden": 3.47421875, "loss/jsd": 0.0, "loss/logits": 0.19257053220644593, "step": 7240 }, { "epoch": 0.18125, "grad_norm": 27.5, "grad_norm_var": 14.424739583333333, "learning_rate": 0.0001, "loss": 7.4216, "loss/crossentropy": 2.136477355659008, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.20195687096565962, "step": 7250 }, { "epoch": 0.1815, "grad_norm": 30.125, "grad_norm_var": 9.70390625, "learning_rate": 0.0001, "loss": 7.4656, "loss/crossentropy": 2.0134367659687995, "loss/hidden": 3.523046875, "loss/jsd": 0.0, "loss/logits": 0.22354123163968326, "step": 7260 }, { "epoch": 0.18175, "grad_norm": 30.0, "grad_norm_var": 6.324739583333334, "learning_rate": 0.0001, "loss": 7.3959, "loss/crossentropy": 2.1245115220546724, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.19102834183722733, "step": 7270 }, { "epoch": 0.182, "grad_norm": 29.125, "grad_norm_var": 18.903580729166666, "learning_rate": 0.0001, "loss": 7.3761, "loss/crossentropy": 2.035867254436016, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.19426564145833253, "step": 7280 }, { "epoch": 0.18225, "grad_norm": 30.5, "grad_norm_var": 20.406705729166667, "learning_rate": 0.0001, "loss": 7.3884, "loss/crossentropy": 1.9805133253335954, "loss/hidden": 3.370703125, "loss/jsd": 0.0, "loss/logits": 0.19129701480269432, "step": 7290 }, { "epoch": 0.1825, "grad_norm": 31.375, "grad_norm_var": 5.842708333333333, "learning_rate": 0.0001, "loss": 7.3833, "loss/crossentropy": 2.0621849209070207, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.19919742476195096, "step": 7300 }, { "epoch": 0.18275, "grad_norm": 47.0, "grad_norm_var": 2.675771470406222e+18, "learning_rate": 0.0001, "loss": 7.2357, "loss/crossentropy": 2.1282688602805138, "loss/hidden": 3.298046875, "loss/jsd": 0.0, "loss/logits": 0.18128441767767073, "step": 7310 }, { "epoch": 0.183, "grad_norm": 29.25, "grad_norm_var": 28.158333333333335, "learning_rate": 0.0001, "loss": 7.4531, "loss/crossentropy": 2.1078659296035767, "loss/hidden": 3.580078125, "loss/jsd": 0.0, "loss/logits": 0.2171280149370432, "step": 7320 }, { "epoch": 0.18325, "grad_norm": 35.25, "grad_norm_var": 25.365625, "learning_rate": 0.0001, "loss": 7.3538, "loss/crossentropy": 2.200744313001633, "loss/hidden": 3.37421875, "loss/jsd": 0.0, "loss/logits": 0.19498275145888327, "step": 7330 }, { "epoch": 0.1835, "grad_norm": 28.375, "grad_norm_var": 23.859830729166667, "learning_rate": 0.0001, "loss": 7.3437, "loss/crossentropy": 1.9968993581831456, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.1875888810493052, "step": 7340 }, { "epoch": 0.18375, "grad_norm": 28.0, "grad_norm_var": 8.989518229166666, "learning_rate": 0.0001, "loss": 7.3984, "loss/crossentropy": 2.0748091831803324, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.1947355069220066, "step": 7350 }, { "epoch": 0.184, "grad_norm": 29.875, "grad_norm_var": 7.728125, "learning_rate": 0.0001, "loss": 7.3008, "loss/crossentropy": 2.067877373099327, "loss/hidden": 3.313671875, "loss/jsd": 0.0, "loss/logits": 0.18336500320583582, "step": 7360 }, { "epoch": 0.18425, "grad_norm": 29.25, "grad_norm_var": 13.189583333333333, "learning_rate": 0.0001, "loss": 7.4547, "loss/crossentropy": 2.152864509820938, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.2102669222280383, "step": 7370 }, { "epoch": 0.1845, "grad_norm": 28.5, "grad_norm_var": 8.7556640625, "learning_rate": 0.0001, "loss": 7.286, "loss/crossentropy": 1.9991149730980395, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.18364950213581324, "step": 7380 }, { "epoch": 0.18475, "grad_norm": 31.75, "grad_norm_var": 7.770572916666667, "learning_rate": 0.0001, "loss": 7.3546, "loss/crossentropy": 2.0513292245566843, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.1970413200557232, "step": 7390 }, { "epoch": 0.185, "grad_norm": 30.125, "grad_norm_var": 7.620572916666666, "learning_rate": 0.0001, "loss": 7.5116, "loss/crossentropy": 2.102216296643019, "loss/hidden": 3.51875, "loss/jsd": 0.0, "loss/logits": 0.21091360161080958, "step": 7400 }, { "epoch": 0.18525, "grad_norm": 32.75, "grad_norm_var": 9.191666666666666, "learning_rate": 0.0001, "loss": 7.3421, "loss/crossentropy": 1.9926266744732857, "loss/hidden": 3.455078125, "loss/jsd": 0.0, "loss/logits": 0.19832582902163268, "step": 7410 }, { "epoch": 0.1855, "grad_norm": 29.5, "grad_norm_var": 5.945833333333334, "learning_rate": 0.0001, "loss": 7.4563, "loss/crossentropy": 2.141331580281258, "loss/hidden": 3.408203125, "loss/jsd": 0.0, "loss/logits": 0.20020943265408278, "step": 7420 }, { "epoch": 0.18575, "grad_norm": 31.0, "grad_norm_var": 7.198372395833333, "learning_rate": 0.0001, "loss": 7.4227, "loss/crossentropy": 1.9694693490862847, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.18249646089971067, "step": 7430 }, { "epoch": 0.186, "grad_norm": 29.375, "grad_norm_var": 5.9697265625, "learning_rate": 0.0001, "loss": 7.4119, "loss/crossentropy": 2.1407265037298204, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.17720893137156962, "step": 7440 }, { "epoch": 0.18625, "grad_norm": 29.875, "grad_norm_var": 0.8072916666666666, "learning_rate": 0.0001, "loss": 7.3441, "loss/crossentropy": 2.124198019504547, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.1946978410705924, "step": 7450 }, { "epoch": 0.1865, "grad_norm": 32.75, "grad_norm_var": 1.9634765625, "learning_rate": 0.0001, "loss": 7.4652, "loss/crossentropy": 2.131994958221912, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.19461573138833047, "step": 7460 }, { "epoch": 0.18675, "grad_norm": 30.875, "grad_norm_var": 2.7280598958333333, "learning_rate": 0.0001, "loss": 7.3278, "loss/crossentropy": 2.117748848348856, "loss/hidden": 3.2578125, "loss/jsd": 0.0, "loss/logits": 0.18584235943853855, "step": 7470 }, { "epoch": 0.187, "grad_norm": 31.625, "grad_norm_var": 2.466080729166667, "learning_rate": 0.0001, "loss": 7.5817, "loss/crossentropy": 2.1364282086491584, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.21069204956293106, "step": 7480 }, { "epoch": 0.18725, "grad_norm": 31.25, "grad_norm_var": 3.012434895833333, "learning_rate": 0.0001, "loss": 7.5197, "loss/crossentropy": 2.0523312032222747, "loss/hidden": 3.50078125, "loss/jsd": 0.0, "loss/logits": 0.21221144162118435, "step": 7490 }, { "epoch": 0.1875, "grad_norm": 33.75, "grad_norm_var": 4.995768229166667, "learning_rate": 0.0001, "loss": 7.4317, "loss/crossentropy": 2.0852270901203154, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.17908250950276852, "step": 7500 }, { "epoch": 0.18775, "grad_norm": 30.875, "grad_norm_var": 2.4518229166666665, "learning_rate": 0.0001, "loss": 7.5444, "loss/crossentropy": 2.059383874386549, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.19440155941992998, "step": 7510 }, { "epoch": 0.188, "grad_norm": 27.625, "grad_norm_var": 2.851822916666667, "learning_rate": 0.0001, "loss": 7.5199, "loss/crossentropy": 2.1382876858115196, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.19888029601424934, "step": 7520 }, { "epoch": 0.18825, "grad_norm": 29.375, "grad_norm_var": 3.073372395833333, "learning_rate": 0.0001, "loss": 7.3012, "loss/crossentropy": 2.0625696159899234, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.18495072829537093, "step": 7530 }, { "epoch": 0.1885, "grad_norm": 32.0, "grad_norm_var": 2.2122395833333335, "learning_rate": 0.0001, "loss": 7.3643, "loss/crossentropy": 2.124967637658119, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.18375679664313793, "step": 7540 }, { "epoch": 0.18875, "grad_norm": 29.125, "grad_norm_var": 3.2681640625, "learning_rate": 0.0001, "loss": 7.351, "loss/crossentropy": 2.0680116668343542, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.18827605471014977, "step": 7550 }, { "epoch": 0.189, "grad_norm": 31.75, "grad_norm_var": 1.5337890625, "learning_rate": 0.0001, "loss": 7.4207, "loss/crossentropy": 2.079096484184265, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.20160295628011227, "step": 7560 }, { "epoch": 0.18925, "grad_norm": 30.625, "grad_norm_var": 18.2197265625, "learning_rate": 0.0001, "loss": 7.4789, "loss/crossentropy": 2.058067685365677, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.21216327100992202, "step": 7570 }, { "epoch": 0.1895, "grad_norm": 34.25, "grad_norm_var": 14.415625, "learning_rate": 0.0001, "loss": 7.4987, "loss/crossentropy": 2.0142914205789566, "loss/hidden": 3.580078125, "loss/jsd": 0.0, "loss/logits": 0.20794902741909027, "step": 7580 }, { "epoch": 0.18975, "grad_norm": 30.5, "grad_norm_var": 1.9809895833333333, "learning_rate": 0.0001, "loss": 7.4585, "loss/crossentropy": 2.299562671780586, "loss/hidden": 3.347265625, "loss/jsd": 0.0, "loss/logits": 0.19880922697484493, "step": 7590 }, { "epoch": 0.19, "grad_norm": 30.875, "grad_norm_var": 15.101822916666666, "learning_rate": 0.0001, "loss": 7.3204, "loss/crossentropy": 2.1472302600741386, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.18796155080199242, "step": 7600 }, { "epoch": 0.19025, "grad_norm": 29.0, "grad_norm_var": 2.5940733610451533e+18, "learning_rate": 0.0001, "loss": 7.5335, "loss/crossentropy": 2.1664531916379928, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.18874028734862805, "step": 7610 }, { "epoch": 0.1905, "grad_norm": 29.125, "grad_norm_var": 0.8843098958333333, "learning_rate": 0.0001, "loss": 7.4846, "loss/crossentropy": 2.0765088513493537, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.18674521408975125, "step": 7620 }, { "epoch": 0.19075, "grad_norm": 27.25, "grad_norm_var": 4.381184895833333, "learning_rate": 0.0001, "loss": 7.3401, "loss/crossentropy": 1.7713539503514766, "loss/hidden": 3.515234375, "loss/jsd": 0.0, "loss/logits": 0.1749590938910842, "step": 7630 }, { "epoch": 0.191, "grad_norm": 33.5, "grad_norm_var": 5.26640625, "learning_rate": 0.0001, "loss": 7.4838, "loss/crossentropy": 1.922049730271101, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.18183694053441285, "step": 7640 }, { "epoch": 0.19125, "grad_norm": 30.5, "grad_norm_var": 4.030989583333334, "learning_rate": 0.0001, "loss": 7.3887, "loss/crossentropy": 2.094516658782959, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.18757299687713386, "step": 7650 }, { "epoch": 0.1915, "grad_norm": 32.0, "grad_norm_var": 1.5035807291666667, "learning_rate": 0.0001, "loss": 7.4445, "loss/crossentropy": 1.999229770898819, "loss/hidden": 3.516015625, "loss/jsd": 0.0, "loss/logits": 0.19523975029587745, "step": 7660 }, { "epoch": 0.19175, "grad_norm": 32.25, "grad_norm_var": 3.0275390625, "learning_rate": 0.0001, "loss": 7.4626, "loss/crossentropy": 2.0366951674222946, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.18243371956050397, "step": 7670 }, { "epoch": 0.192, "grad_norm": 32.75, "grad_norm_var": 2.5931640625, "learning_rate": 0.0001, "loss": 7.5274, "loss/crossentropy": 2.0489632681012155, "loss/hidden": 3.564453125, "loss/jsd": 0.0, "loss/logits": 0.21230401135981083, "step": 7680 }, { "epoch": 0.19225, "grad_norm": 32.75, "grad_norm_var": 3.738997395833333, "learning_rate": 0.0001, "loss": 7.4128, "loss/crossentropy": 2.0826203912496566, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.18644160348922015, "step": 7690 }, { "epoch": 0.1925, "grad_norm": 29.875, "grad_norm_var": 4.351041666666666, "learning_rate": 0.0001, "loss": 7.5195, "loss/crossentropy": 2.1674430795013904, "loss/hidden": 3.3640625, "loss/jsd": 0.0, "loss/logits": 0.19499621093273162, "step": 7700 }, { "epoch": 0.19275, "grad_norm": 28.625, "grad_norm_var": 4.939322916666667, "learning_rate": 0.0001, "loss": 7.4236, "loss/crossentropy": 2.092283549904823, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.19506504610180855, "step": 7710 }, { "epoch": 0.193, "grad_norm": 33.0, "grad_norm_var": 9.080208333333333, "learning_rate": 0.0001, "loss": 7.3501, "loss/crossentropy": 2.0526101261377336, "loss/hidden": 3.51328125, "loss/jsd": 0.0, "loss/logits": 0.19770189765840768, "step": 7720 }, { "epoch": 0.19325, "grad_norm": 29.0, "grad_norm_var": 9.731184895833334, "learning_rate": 0.0001, "loss": 7.4607, "loss/crossentropy": 2.05912861302495, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.19287334326654673, "step": 7730 }, { "epoch": 0.1935, "grad_norm": 30.5, "grad_norm_var": 1.8052083333333333, "learning_rate": 0.0001, "loss": 7.4243, "loss/crossentropy": 1.8983285859227181, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.18626301139593124, "step": 7740 }, { "epoch": 0.19375, "grad_norm": 40.75, "grad_norm_var": 9.1541015625, "learning_rate": 0.0001, "loss": 7.4332, "loss/crossentropy": 2.099681233614683, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.20556394904851913, "step": 7750 }, { "epoch": 0.194, "grad_norm": 37.0, "grad_norm_var": 12.584375, "learning_rate": 0.0001, "loss": 7.516, "loss/crossentropy": 2.0436215907335282, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.17505232142284513, "step": 7760 }, { "epoch": 0.19425, "grad_norm": 30.125, "grad_norm_var": 5.2681640625, "learning_rate": 0.0001, "loss": 7.5486, "loss/crossentropy": 2.0449838273227217, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.19324074545875192, "step": 7770 }, { "epoch": 0.1945, "grad_norm": 30.125, "grad_norm_var": 1.4518229166666667, "learning_rate": 0.0001, "loss": 7.4052, "loss/crossentropy": 2.1020638972520826, "loss/hidden": 3.3234375, "loss/jsd": 0.0, "loss/logits": 0.1987349819391966, "step": 7780 }, { "epoch": 0.19475, "grad_norm": 32.25, "grad_norm_var": 1.525, "learning_rate": 0.0001, "loss": 7.3148, "loss/crossentropy": 2.0913542471826077, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.1900358498096466, "step": 7790 }, { "epoch": 0.195, "grad_norm": 29.875, "grad_norm_var": 1.2280598958333333, "learning_rate": 0.0001, "loss": 7.43, "loss/crossentropy": 1.9448820307850838, "loss/hidden": 3.423828125, "loss/jsd": 0.0, "loss/logits": 0.18228193083778024, "step": 7800 }, { "epoch": 0.19525, "grad_norm": 28.75, "grad_norm_var": 2.3791015625, "learning_rate": 0.0001, "loss": 7.4772, "loss/crossentropy": 2.0547610491514208, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.19370344914495946, "step": 7810 }, { "epoch": 0.1955, "grad_norm": 31.375, "grad_norm_var": 6.208072916666667, "learning_rate": 0.0001, "loss": 7.4448, "loss/crossentropy": 2.0798824220895766, "loss/hidden": 3.431640625, "loss/jsd": 0.0, "loss/logits": 0.18972196318209172, "step": 7820 }, { "epoch": 0.19575, "grad_norm": 29.5, "grad_norm_var": 6.248893229166667, "learning_rate": 0.0001, "loss": 7.5358, "loss/crossentropy": 2.2324195951223373, "loss/hidden": 3.4375, "loss/jsd": 0.0, "loss/logits": 0.2047037549316883, "step": 7830 }, { "epoch": 0.196, "grad_norm": 29.375, "grad_norm_var": 4.453059895833333, "learning_rate": 0.0001, "loss": 7.3114, "loss/crossentropy": 2.1020479179918765, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.19328910131007432, "step": 7840 }, { "epoch": 0.19625, "grad_norm": 33.5, "grad_norm_var": 590.0817057291666, "learning_rate": 0.0001, "loss": 7.4281, "loss/crossentropy": 2.0953447744250298, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.1854689259082079, "step": 7850 }, { "epoch": 0.1965, "grad_norm": 33.0, "grad_norm_var": 625.6192057291667, "learning_rate": 0.0001, "loss": 7.5283, "loss/crossentropy": 2.061315707862377, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.18506519980728625, "step": 7860 }, { "epoch": 0.19675, "grad_norm": 32.0, "grad_norm_var": 69.38430989583334, "learning_rate": 0.0001, "loss": 7.5314, "loss/crossentropy": 2.1605025470256805, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.20316088199615479, "step": 7870 }, { "epoch": 0.197, "grad_norm": 31.375, "grad_norm_var": 1.1067057291666667, "learning_rate": 0.0001, "loss": 7.4244, "loss/crossentropy": 2.1977868393063544, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.1992826245725155, "step": 7880 }, { "epoch": 0.19725, "grad_norm": 29.25, "grad_norm_var": 9.578059895833333, "learning_rate": 0.0001, "loss": 7.502, "loss/crossentropy": 2.0480964958667753, "loss/hidden": 3.636328125, "loss/jsd": 0.0, "loss/logits": 0.21483200527727603, "step": 7890 }, { "epoch": 0.1975, "grad_norm": 30.875, "grad_norm_var": 3.6372395833333333, "learning_rate": 0.0001, "loss": 7.548, "loss/crossentropy": 2.157261362671852, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.2084518164396286, "step": 7900 }, { "epoch": 0.19775, "grad_norm": 28.5, "grad_norm_var": 2.095572916666667, "learning_rate": 0.0001, "loss": 7.3864, "loss/crossentropy": 2.1441849052906035, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.20142039898782968, "step": 7910 }, { "epoch": 0.198, "grad_norm": 37.5, "grad_norm_var": 16.170247395833332, "learning_rate": 0.0001, "loss": 7.429, "loss/crossentropy": 2.001983726769686, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.19066998092457652, "step": 7920 }, { "epoch": 0.19825, "grad_norm": 29.375, "grad_norm_var": 15.561393229166667, "learning_rate": 0.0001, "loss": 7.4093, "loss/crossentropy": 2.225774070620537, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.1993710033595562, "step": 7930 }, { "epoch": 0.1985, "grad_norm": 34.0, "grad_norm_var": 25.343489583333334, "learning_rate": 0.0001, "loss": 7.4932, "loss/crossentropy": 2.0083594918251038, "loss/hidden": 3.3453125, "loss/jsd": 0.0, "loss/logits": 0.19412665143609048, "step": 7940 }, { "epoch": 0.19875, "grad_norm": 31.625, "grad_norm_var": 23.877083333333335, "learning_rate": 0.0001, "loss": 7.441, "loss/crossentropy": 2.08128562271595, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.17889103144407273, "step": 7950 }, { "epoch": 0.199, "grad_norm": 27.875, "grad_norm_var": 1.5921223958333333, "learning_rate": 0.0001, "loss": 7.4251, "loss/crossentropy": 2.1019147261977196, "loss/hidden": 3.571484375, "loss/jsd": 0.0, "loss/logits": 0.2211546439677477, "step": 7960 }, { "epoch": 0.19925, "grad_norm": 32.0, "grad_norm_var": 3.3059895833333335, "learning_rate": 0.0001, "loss": 7.4213, "loss/crossentropy": 2.14247687458992, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.18870262056589127, "step": 7970 }, { "epoch": 0.1995, "grad_norm": 31.5, "grad_norm_var": 2.0947265625, "learning_rate": 0.0001, "loss": 7.5788, "loss/crossentropy": 2.1896591186523438, "loss/hidden": 3.479296875, "loss/jsd": 0.0, "loss/logits": 0.20883973222225904, "step": 7980 }, { "epoch": 0.19975, "grad_norm": 28.25, "grad_norm_var": 2.982291666666667, "learning_rate": 0.0001, "loss": 7.5217, "loss/crossentropy": 2.1884095311164855, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.2022842913866043, "step": 7990 }, { "epoch": 0.2, "grad_norm": 28.5, "grad_norm_var": 2.379622395833333, "learning_rate": 0.0001, "loss": 7.4975, "loss/crossentropy": 2.225523295998573, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.2032675376161933, "step": 8000 }, { "epoch": 0.20025, "grad_norm": 30.625, "grad_norm_var": 4.181705729166667, "learning_rate": 0.0001, "loss": 7.3874, "loss/crossentropy": 1.9566738605499268, "loss/hidden": 3.576953125, "loss/jsd": 0.0, "loss/logits": 0.19702311754226684, "step": 8010 }, { "epoch": 0.2005, "grad_norm": 32.75, "grad_norm_var": 6.3509765625, "learning_rate": 0.0001, "loss": 7.4528, "loss/crossentropy": 2.1517204724252226, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.2005010774359107, "step": 8020 }, { "epoch": 0.20075, "grad_norm": 30.875, "grad_norm_var": 5.44140625, "learning_rate": 0.0001, "loss": 7.5306, "loss/crossentropy": 2.0300184957683087, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.18887464031577111, "step": 8030 }, { "epoch": 0.201, "grad_norm": 31.75, "grad_norm_var": 3.1302083333333335, "learning_rate": 0.0001, "loss": 7.4025, "loss/crossentropy": 2.0889772072434427, "loss/hidden": 3.27109375, "loss/jsd": 0.0, "loss/logits": 0.17016669576987625, "step": 8040 }, { "epoch": 0.20125, "grad_norm": 27.125, "grad_norm_var": 94.78125, "learning_rate": 0.0001, "loss": 7.4379, "loss/crossentropy": 2.158484524488449, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.19672231934964657, "step": 8050 }, { "epoch": 0.2015, "grad_norm": 38.0, "grad_norm_var": 13.220572916666667, "learning_rate": 0.0001, "loss": 7.4314, "loss/crossentropy": 1.9623262777924537, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.1833876773715019, "step": 8060 }, { "epoch": 0.20175, "grad_norm": 29.875, "grad_norm_var": 7.858268229166667, "learning_rate": 0.0001, "loss": 7.4662, "loss/crossentropy": 2.2177498638629913, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.1954023003578186, "step": 8070 }, { "epoch": 0.202, "grad_norm": 28.875, "grad_norm_var": 7.627083333333333, "learning_rate": 0.0001, "loss": 7.4792, "loss/crossentropy": 2.1019868202507497, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.1995641984976828, "step": 8080 }, { "epoch": 0.20225, "grad_norm": 44.75, "grad_norm_var": 20.4306640625, "learning_rate": 0.0001, "loss": 7.5664, "loss/crossentropy": 2.299755599349737, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.19075682908296585, "step": 8090 }, { "epoch": 0.2025, "grad_norm": 37.75, "grad_norm_var": 2.5671221292944763e+18, "learning_rate": 0.0001, "loss": 7.4526, "loss/crossentropy": 2.131952489167452, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.1955398641526699, "step": 8100 }, { "epoch": 0.20275, "grad_norm": 31.25, "grad_norm_var": 20.342643229166665, "learning_rate": 0.0001, "loss": 7.4687, "loss/crossentropy": 1.9825796701014042, "loss/hidden": 3.475, "loss/jsd": 0.0, "loss/logits": 0.20226136669516565, "step": 8110 }, { "epoch": 0.203, "grad_norm": 27.125, "grad_norm_var": 15.192643229166666, "learning_rate": 0.0001, "loss": 7.1604, "loss/crossentropy": 2.0296560734510423, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.1786259189248085, "step": 8120 }, { "epoch": 0.20325, "grad_norm": 31.0, "grad_norm_var": 7.1775390625, "learning_rate": 0.0001, "loss": 7.2815, "loss/crossentropy": 2.104996609687805, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.19307580199092628, "step": 8130 }, { "epoch": 0.2035, "grad_norm": 34.75, "grad_norm_var": 6.820572916666666, "learning_rate": 0.0001, "loss": 7.3774, "loss/crossentropy": 2.1900447353720667, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.20103423558175565, "step": 8140 }, { "epoch": 0.20375, "grad_norm": 31.0, "grad_norm_var": 5.48125, "learning_rate": 0.0001, "loss": 7.3628, "loss/crossentropy": 2.0671978294849396, "loss/hidden": 3.338671875, "loss/jsd": 0.0, "loss/logits": 0.18412660714238882, "step": 8150 }, { "epoch": 0.204, "grad_norm": 41.25, "grad_norm_var": 14.245572916666667, "learning_rate": 0.0001, "loss": 7.5547, "loss/crossentropy": 2.05537860840559, "loss/hidden": 3.48515625, "loss/jsd": 0.0, "loss/logits": 0.1820721985772252, "step": 8160 }, { "epoch": 0.20425, "grad_norm": 29.125, "grad_norm_var": 13.8625, "learning_rate": 0.0001, "loss": 7.345, "loss/crossentropy": 1.9336151838302613, "loss/hidden": 3.564453125, "loss/jsd": 0.0, "loss/logits": 0.20152895338833332, "step": 8170 }, { "epoch": 0.2045, "grad_norm": 28.125, "grad_norm_var": 7.096875, "learning_rate": 0.0001, "loss": 7.5005, "loss/crossentropy": 2.055295965075493, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.1865659100934863, "step": 8180 }, { "epoch": 0.20475, "grad_norm": 28.25, "grad_norm_var": 8.805989583333334, "learning_rate": 0.0001, "loss": 7.5324, "loss/crossentropy": 1.9976115971803665, "loss/hidden": 3.508203125, "loss/jsd": 0.0, "loss/logits": 0.2114253517240286, "step": 8190 }, { "epoch": 0.205, "grad_norm": 29.875, "grad_norm_var": 12.435416666666667, "learning_rate": 0.0001, "loss": 7.4232, "loss/crossentropy": 2.1122905567288397, "loss/hidden": 3.400390625, "loss/jsd": 0.0, "loss/logits": 0.21649520397186278, "step": 8200 }, { "epoch": 0.20525, "grad_norm": 28.125, "grad_norm_var": 10.055208333333333, "learning_rate": 0.0001, "loss": 7.4875, "loss/crossentropy": 2.1132961876690386, "loss/hidden": 3.47734375, "loss/jsd": 0.0, "loss/logits": 0.2026100393384695, "step": 8210 }, { "epoch": 0.2055, "grad_norm": 29.125, "grad_norm_var": 6.793489583333334, "learning_rate": 0.0001, "loss": 7.3834, "loss/crossentropy": 2.191230720281601, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19885572660714387, "step": 8220 }, { "epoch": 0.20575, "grad_norm": 30.75, "grad_norm_var": 4.630208333333333, "learning_rate": 0.0001, "loss": 7.4544, "loss/crossentropy": 2.080083931982517, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.2282587742432952, "step": 8230 }, { "epoch": 0.206, "grad_norm": 31.375, "grad_norm_var": 11.458268229166666, "learning_rate": 0.0001, "loss": 7.449, "loss/crossentropy": 2.1384637162089346, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.1842938730493188, "step": 8240 }, { "epoch": 0.20625, "grad_norm": 31.875, "grad_norm_var": 1.7087890625, "learning_rate": 0.0001, "loss": 7.3946, "loss/crossentropy": 2.1355771869421005, "loss/hidden": 3.264453125, "loss/jsd": 0.0, "loss/logits": 0.18052869867533444, "step": 8250 }, { "epoch": 0.2065, "grad_norm": 31.75, "grad_norm_var": 13.0447265625, "learning_rate": 0.0001, "loss": 7.6316, "loss/crossentropy": 2.0203320410102608, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.192235934920609, "step": 8260 }, { "epoch": 0.20675, "grad_norm": 29.5, "grad_norm_var": 12.270247395833334, "learning_rate": 0.0001, "loss": 7.4572, "loss/crossentropy": 1.9869592547416688, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.202651490829885, "step": 8270 }, { "epoch": 0.207, "grad_norm": 30.125, "grad_norm_var": 1.4822916666666666, "learning_rate": 0.0001, "loss": 7.5227, "loss/crossentropy": 2.051181730628014, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.20994498692452906, "step": 8280 }, { "epoch": 0.20725, "grad_norm": 43.75, "grad_norm_var": 12.776822916666667, "learning_rate": 0.0001, "loss": 7.3664, "loss/crossentropy": 1.9618318520486355, "loss/hidden": 3.558984375, "loss/jsd": 0.0, "loss/logits": 0.19053993374109268, "step": 8290 }, { "epoch": 0.2075, "grad_norm": 30.5, "grad_norm_var": 14.92890625, "learning_rate": 0.0001, "loss": 7.318, "loss/crossentropy": 2.0114831268787383, "loss/hidden": 3.459375, "loss/jsd": 0.0, "loss/logits": 0.20329152811318635, "step": 8300 }, { "epoch": 0.20775, "grad_norm": 34.0, "grad_norm_var": 2.8869140625, "learning_rate": 0.0001, "loss": 7.5489, "loss/crossentropy": 2.1187786638736723, "loss/hidden": 3.512890625, "loss/jsd": 0.0, "loss/logits": 0.20839224103838205, "step": 8310 }, { "epoch": 0.208, "grad_norm": 27.875, "grad_norm_var": 2.565559895833333, "learning_rate": 0.0001, "loss": 7.3443, "loss/crossentropy": 2.1263110756874086, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.18627664018422366, "step": 8320 }, { "epoch": 0.20825, "grad_norm": 30.375, "grad_norm_var": 2.755989583333333, "learning_rate": 0.0001, "loss": 7.3857, "loss/crossentropy": 1.944861602783203, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.18570939563214778, "step": 8330 }, { "epoch": 0.2085, "grad_norm": 31.125, "grad_norm_var": 2.340625, "learning_rate": 0.0001, "loss": 7.5344, "loss/crossentropy": 2.1811724051833155, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.19298948515206577, "step": 8340 }, { "epoch": 0.20875, "grad_norm": 31.5, "grad_norm_var": 1.56015625, "learning_rate": 0.0001, "loss": 7.3826, "loss/crossentropy": 2.152976579964161, "loss/hidden": 3.305859375, "loss/jsd": 0.0, "loss/logits": 0.190250195749104, "step": 8350 }, { "epoch": 0.209, "grad_norm": 28.875, "grad_norm_var": 1.3416015625, "learning_rate": 0.0001, "loss": 7.323, "loss/crossentropy": 2.2053099036216737, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.1929330924525857, "step": 8360 }, { "epoch": 0.20925, "grad_norm": 29.875, "grad_norm_var": 1.65625, "learning_rate": 0.0001, "loss": 7.4376, "loss/crossentropy": 2.011850906908512, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.20794765576720237, "step": 8370 }, { "epoch": 0.2095, "grad_norm": 29.25, "grad_norm_var": 2.381705729166667, "learning_rate": 0.0001, "loss": 7.4518, "loss/crossentropy": 2.284806078672409, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.2203810729086399, "step": 8380 }, { "epoch": 0.20975, "grad_norm": 43.25, "grad_norm_var": 17.695247395833334, "learning_rate": 0.0001, "loss": 7.418, "loss/crossentropy": 2.1161764934659004, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.1863908626139164, "step": 8390 }, { "epoch": 0.21, "grad_norm": 29.625, "grad_norm_var": 19.8697265625, "learning_rate": 0.0001, "loss": 7.4564, "loss/crossentropy": 2.0293585821986198, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.1884205201640725, "step": 8400 }, { "epoch": 0.21025, "grad_norm": 27.25, "grad_norm_var": 11.422330729166667, "learning_rate": 0.0001, "loss": 7.4317, "loss/crossentropy": 2.2351802065968513, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.20424611177295446, "step": 8410 }, { "epoch": 0.2105, "grad_norm": 30.125, "grad_norm_var": 10.734830729166667, "learning_rate": 0.0001, "loss": 7.3989, "loss/crossentropy": 2.1349810734391212, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.18979013338685036, "step": 8420 }, { "epoch": 0.21075, "grad_norm": 30.875, "grad_norm_var": 2.387955729166667, "learning_rate": 0.0001, "loss": 7.3957, "loss/crossentropy": 2.080636392533779, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.18893024744465947, "step": 8430 }, { "epoch": 0.211, "grad_norm": 28.75, "grad_norm_var": 20.089518229166668, "learning_rate": 0.0001, "loss": 7.4847, "loss/crossentropy": 1.9267802774906158, "loss/hidden": 3.31484375, "loss/jsd": 0.0, "loss/logits": 0.18156335428357123, "step": 8440 }, { "epoch": 0.21125, "grad_norm": 29.125, "grad_norm_var": 2.2650390625, "learning_rate": 0.0001, "loss": 7.3839, "loss/crossentropy": 2.14048397988081, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.1889717074111104, "step": 8450 }, { "epoch": 0.2115, "grad_norm": 28.375, "grad_norm_var": 3.283124099188418e+18, "learning_rate": 0.0001, "loss": 7.4661, "loss/crossentropy": 2.1406930878758432, "loss/hidden": 3.699609375, "loss/jsd": 0.0, "loss/logits": 0.20904620084911585, "step": 8460 }, { "epoch": 0.21175, "grad_norm": 30.75, "grad_norm_var": 23.509830729166666, "learning_rate": 0.0001, "loss": 7.2867, "loss/crossentropy": 2.1153231114149094, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.18998019583523273, "step": 8470 }, { "epoch": 0.212, "grad_norm": 31.0, "grad_norm_var": 2.3139973958333333, "learning_rate": 0.0001, "loss": 7.3997, "loss/crossentropy": 2.169650764763355, "loss/hidden": 3.276953125, "loss/jsd": 0.0, "loss/logits": 0.17556187361478806, "step": 8480 }, { "epoch": 0.21225, "grad_norm": 29.625, "grad_norm_var": 2.312239583333333, "learning_rate": 0.0001, "loss": 7.4486, "loss/crossentropy": 2.128412726521492, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.1975807584822178, "step": 8490 }, { "epoch": 0.2125, "grad_norm": 30.75, "grad_norm_var": 2.903125, "learning_rate": 0.0001, "loss": 7.4526, "loss/crossentropy": 2.029564914107323, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.19610330546274782, "step": 8500 }, { "epoch": 0.21275, "grad_norm": 31.375, "grad_norm_var": 4.004622395833334, "learning_rate": 0.0001, "loss": 7.2845, "loss/crossentropy": 1.9631854377686977, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.1855375848710537, "step": 8510 }, { "epoch": 0.213, "grad_norm": 30.0, "grad_norm_var": 3.476822916666667, "learning_rate": 0.0001, "loss": 7.4124, "loss/crossentropy": 2.1494273841381073, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.19540442936122418, "step": 8520 }, { "epoch": 0.21325, "grad_norm": 31.625, "grad_norm_var": 0.9452473958333333, "learning_rate": 0.0001, "loss": 7.4659, "loss/crossentropy": 2.0508621491491796, "loss/hidden": 3.3078125, "loss/jsd": 0.0, "loss/logits": 0.18023168351501226, "step": 8530 }, { "epoch": 0.2135, "grad_norm": 31.25, "grad_norm_var": 2.5809895833333334, "learning_rate": 0.0001, "loss": 7.4522, "loss/crossentropy": 2.090715576708317, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.18382459450513125, "step": 8540 }, { "epoch": 0.21375, "grad_norm": 28.75, "grad_norm_var": 2.986168000807314e+18, "learning_rate": 0.0001, "loss": 7.4917, "loss/crossentropy": 2.177123633027077, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.197673611715436, "step": 8550 }, { "epoch": 0.214, "grad_norm": 27.75, "grad_norm_var": 4.035872395833334, "learning_rate": 0.0001, "loss": 7.3008, "loss/crossentropy": 2.028589369356632, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.196718043461442, "step": 8560 }, { "epoch": 0.21425, "grad_norm": 30.5, "grad_norm_var": 4.758072916666666, "learning_rate": 0.0001, "loss": 7.3829, "loss/crossentropy": 2.02908306196332, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.1757409404963255, "step": 8570 }, { "epoch": 0.2145, "grad_norm": 30.875, "grad_norm_var": 1.9754557291666666, "learning_rate": 0.0001, "loss": 7.4107, "loss/crossentropy": 2.0411842301487924, "loss/hidden": 3.580078125, "loss/jsd": 0.0, "loss/logits": 0.2309743857011199, "step": 8580 }, { "epoch": 0.21475, "grad_norm": 30.0, "grad_norm_var": 25.7244140625, "learning_rate": 0.0001, "loss": 7.4038, "loss/crossentropy": 2.1026074662804604, "loss/hidden": 3.490234375, "loss/jsd": 0.0, "loss/logits": 0.202506691403687, "step": 8590 }, { "epoch": 0.215, "grad_norm": 28.625, "grad_norm_var": 3.386458333333333, "learning_rate": 0.0001, "loss": 7.3559, "loss/crossentropy": 2.1690615713596344, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.19095612335950135, "step": 8600 }, { "epoch": 0.21525, "grad_norm": 29.0, "grad_norm_var": 23.880989583333335, "learning_rate": 0.0001, "loss": 7.4164, "loss/crossentropy": 2.099227898567915, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.19162636022083462, "step": 8610 }, { "epoch": 0.2155, "grad_norm": 34.25, "grad_norm_var": 23.880208333333332, "learning_rate": 0.0001, "loss": 7.3725, "loss/crossentropy": 1.9689884655177594, "loss/hidden": 3.52265625, "loss/jsd": 0.0, "loss/logits": 0.1830376190133393, "step": 8620 }, { "epoch": 0.21575, "grad_norm": 29.875, "grad_norm_var": 2.77890625, "learning_rate": 0.0001, "loss": 7.4511, "loss/crossentropy": 2.0263702854514123, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.18029460608959197, "step": 8630 }, { "epoch": 0.216, "grad_norm": 32.0, "grad_norm_var": 23.001822916666665, "learning_rate": 0.0001, "loss": 7.3863, "loss/crossentropy": 1.9046964697539805, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.1935629203915596, "step": 8640 }, { "epoch": 0.21625, "grad_norm": 33.5, "grad_norm_var": 25.009375, "learning_rate": 0.0001, "loss": 7.3083, "loss/crossentropy": 2.129426471889019, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.20037918202579022, "step": 8650 }, { "epoch": 0.2165, "grad_norm": 28.375, "grad_norm_var": 12.242643229166667, "learning_rate": 0.0001, "loss": 7.4603, "loss/crossentropy": 2.2266604125499727, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.18952292017638683, "step": 8660 }, { "epoch": 0.21675, "grad_norm": 34.75, "grad_norm_var": 19.517708333333335, "learning_rate": 0.0001, "loss": 7.4031, "loss/crossentropy": 1.9756697475910188, "loss/hidden": 3.529296875, "loss/jsd": 0.0, "loss/logits": 0.20053059812635182, "step": 8670 }, { "epoch": 0.217, "grad_norm": 29.25, "grad_norm_var": 8.315625, "learning_rate": 0.0001, "loss": 7.4013, "loss/crossentropy": 2.13729098290205, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.19440573658794164, "step": 8680 }, { "epoch": 0.21725, "grad_norm": 31.5, "grad_norm_var": 1.9936848958333333, "learning_rate": 0.0001, "loss": 7.2982, "loss/crossentropy": 2.0935462579131126, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.200434254668653, "step": 8690 }, { "epoch": 0.2175, "grad_norm": 31.625, "grad_norm_var": 26.66640625, "learning_rate": 0.0001, "loss": 7.4596, "loss/crossentropy": 2.0081637501716614, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.196201959438622, "step": 8700 }, { "epoch": 0.21775, "grad_norm": 28.125, "grad_norm_var": 1.869627142435242e+18, "learning_rate": 0.0001, "loss": 7.4726, "loss/crossentropy": 2.057238683104515, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.18527965154498816, "step": 8710 }, { "epoch": 0.218, "grad_norm": 30.25, "grad_norm_var": 4.806705729166667, "learning_rate": 0.0001, "loss": 7.424, "loss/crossentropy": 2.225848586857319, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.20839223694056272, "step": 8720 }, { "epoch": 0.21825, "grad_norm": 29.5, "grad_norm_var": 3.2572265625, "learning_rate": 0.0001, "loss": 7.3713, "loss/crossentropy": 1.9737806752324105, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.18166052605956792, "step": 8730 }, { "epoch": 0.2185, "grad_norm": 31.25, "grad_norm_var": 3.49765625, "learning_rate": 0.0001, "loss": 7.3192, "loss/crossentropy": 2.1625877559185027, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.196309875510633, "step": 8740 }, { "epoch": 0.21875, "grad_norm": 31.625, "grad_norm_var": 15.733333333333333, "learning_rate": 0.0001, "loss": 7.3994, "loss/crossentropy": 2.1306996777653695, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.17798179090023042, "step": 8750 }, { "epoch": 0.219, "grad_norm": 31.25, "grad_norm_var": 5.465625, "learning_rate": 0.0001, "loss": 7.4225, "loss/crossentropy": 2.168031161278486, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.19842574130743743, "step": 8760 }, { "epoch": 0.21925, "grad_norm": 29.5, "grad_norm_var": 3.457747395833333, "learning_rate": 0.0001, "loss": 7.4515, "loss/crossentropy": 2.081401216983795, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.1922367751598358, "step": 8770 }, { "epoch": 0.2195, "grad_norm": 28.625, "grad_norm_var": 2.9389973958333333, "learning_rate": 0.0001, "loss": 7.4641, "loss/crossentropy": 1.9896476596593857, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.18900877684354783, "step": 8780 }, { "epoch": 0.21975, "grad_norm": 29.375, "grad_norm_var": 3.5931640625, "learning_rate": 0.0001, "loss": 7.5034, "loss/crossentropy": 2.131504286080599, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.1981994620524347, "step": 8790 }, { "epoch": 0.22, "grad_norm": 29.75, "grad_norm_var": 17.612955729166668, "learning_rate": 0.0001, "loss": 7.3602, "loss/crossentropy": 2.367118790745735, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.20705808699131012, "step": 8800 }, { "epoch": 0.22025, "grad_norm": 30.625, "grad_norm_var": 3.588997395833333, "learning_rate": 0.0001, "loss": 7.3512, "loss/crossentropy": 1.915165586769581, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.18030493911355733, "step": 8810 }, { "epoch": 0.2205, "grad_norm": 31.625, "grad_norm_var": 2.732291666666667, "learning_rate": 0.0001, "loss": 7.3832, "loss/crossentropy": 2.1191729307174683, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.20246538575738668, "step": 8820 }, { "epoch": 0.22075, "grad_norm": 32.0, "grad_norm_var": 2.034309895833333, "learning_rate": 0.0001, "loss": 7.4184, "loss/crossentropy": 2.280582541972399, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.1913912059739232, "step": 8830 }, { "epoch": 0.221, "grad_norm": 28.75, "grad_norm_var": 1.4629557291666666, "learning_rate": 0.0001, "loss": 7.5177, "loss/crossentropy": 2.07154730707407, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.18417379464954137, "step": 8840 }, { "epoch": 0.22125, "grad_norm": 29.25, "grad_norm_var": 2.0306640625, "learning_rate": 0.0001, "loss": 7.4098, "loss/crossentropy": 2.0918928742408753, "loss/hidden": 3.478515625, "loss/jsd": 0.0, "loss/logits": 0.21339783817529678, "step": 8850 }, { "epoch": 0.2215, "grad_norm": 34.75, "grad_norm_var": 2.4098307291666665, "learning_rate": 0.0001, "loss": 7.3907, "loss/crossentropy": 2.0262165658175944, "loss/hidden": 3.4796875, "loss/jsd": 0.0, "loss/logits": 0.1963033676147461, "step": 8860 }, { "epoch": 0.22175, "grad_norm": 29.625, "grad_norm_var": 3.1041015625, "learning_rate": 0.0001, "loss": 7.3643, "loss/crossentropy": 2.119324280321598, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.19243048634380103, "step": 8870 }, { "epoch": 0.222, "grad_norm": 29.25, "grad_norm_var": 3.3593098958333334, "learning_rate": 0.0001, "loss": 7.335, "loss/crossentropy": 2.1042064100503923, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.2055317424237728, "step": 8880 }, { "epoch": 0.22225, "grad_norm": 29.0, "grad_norm_var": 4.35390625, "learning_rate": 0.0001, "loss": 7.3974, "loss/crossentropy": 2.110988216102123, "loss/hidden": 3.344140625, "loss/jsd": 0.0, "loss/logits": 0.18536690715700388, "step": 8890 }, { "epoch": 0.2225, "grad_norm": 30.0, "grad_norm_var": 6.014322916666667, "learning_rate": 0.0001, "loss": 7.4155, "loss/crossentropy": 2.033397987484932, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.19111265633255242, "step": 8900 }, { "epoch": 0.22275, "grad_norm": 32.25, "grad_norm_var": 4.747916666666667, "learning_rate": 0.0001, "loss": 7.4631, "loss/crossentropy": 2.090746468305588, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.20997797157615422, "step": 8910 }, { "epoch": 0.223, "grad_norm": 32.75, "grad_norm_var": 2.0247395833333335, "learning_rate": 0.0001, "loss": 7.3647, "loss/crossentropy": 2.107650229334831, "loss/hidden": 3.41171875, "loss/jsd": 0.0, "loss/logits": 0.18095682561397552, "step": 8920 }, { "epoch": 0.22325, "grad_norm": 33.0, "grad_norm_var": 4.983072916666667, "learning_rate": 0.0001, "loss": 7.4161, "loss/crossentropy": 2.1033721581101417, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.1837721960619092, "step": 8930 }, { "epoch": 0.2235, "grad_norm": 30.0, "grad_norm_var": 5.160872395833334, "learning_rate": 0.0001, "loss": 7.381, "loss/crossentropy": 2.2039036631584166, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.20040026511996983, "step": 8940 }, { "epoch": 0.22375, "grad_norm": 32.75, "grad_norm_var": 5.625455729166666, "learning_rate": 0.0001, "loss": 7.4581, "loss/crossentropy": 2.0015091970562935, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.1968998895958066, "step": 8950 }, { "epoch": 0.224, "grad_norm": 30.125, "grad_norm_var": 7.976497395833333, "learning_rate": 0.0001, "loss": 7.446, "loss/crossentropy": 2.1770635031163694, "loss/hidden": 3.290234375, "loss/jsd": 0.0, "loss/logits": 0.18164771795272827, "step": 8960 }, { "epoch": 0.22425, "grad_norm": 26.875, "grad_norm_var": 2.037239583333333, "learning_rate": 0.0001, "loss": 7.2677, "loss/crossentropy": 1.9846746385097505, "loss/hidden": 3.45859375, "loss/jsd": 0.0, "loss/logits": 0.19175144601613284, "step": 8970 }, { "epoch": 0.2245, "grad_norm": 30.25, "grad_norm_var": 2.8353515625, "learning_rate": 0.0001, "loss": 7.4645, "loss/crossentropy": 2.169223573803902, "loss/hidden": 3.33359375, "loss/jsd": 0.0, "loss/logits": 0.18991702441126107, "step": 8980 }, { "epoch": 0.22475, "grad_norm": 29.875, "grad_norm_var": 10.3228515625, "learning_rate": 0.0001, "loss": 7.4607, "loss/crossentropy": 2.1323930069804193, "loss/hidden": 3.348046875, "loss/jsd": 0.0, "loss/logits": 0.18631141390651465, "step": 8990 }, { "epoch": 0.225, "grad_norm": 29.625, "grad_norm_var": 11.431705729166667, "learning_rate": 0.0001, "loss": 7.4203, "loss/crossentropy": 2.1948125064373016, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.21075339019298553, "step": 9000 }, { "epoch": 0.22525, "grad_norm": 31.0, "grad_norm_var": 3.21875, "learning_rate": 0.0001, "loss": 7.3937, "loss/crossentropy": 2.061431697010994, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.20084240343421697, "step": 9010 }, { "epoch": 0.2255, "grad_norm": 30.0, "grad_norm_var": 9.7125, "learning_rate": 0.0001, "loss": 7.5364, "loss/crossentropy": 2.1422473564743996, "loss/hidden": 3.38359375, "loss/jsd": 0.0, "loss/logits": 0.20135180205106734, "step": 9020 }, { "epoch": 0.22575, "grad_norm": 29.0, "grad_norm_var": 10.320247395833333, "learning_rate": 0.0001, "loss": 7.3638, "loss/crossentropy": 2.064805781841278, "loss/hidden": 3.509375, "loss/jsd": 0.0, "loss/logits": 0.19638751186430453, "step": 9030 }, { "epoch": 0.226, "grad_norm": 29.0, "grad_norm_var": 7.7306640625, "learning_rate": 0.0001, "loss": 7.367, "loss/crossentropy": 2.0203323513269424, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.1730576554313302, "step": 9040 }, { "epoch": 0.22625, "grad_norm": 28.0, "grad_norm_var": 7.5291015625, "learning_rate": 0.0001, "loss": 7.4083, "loss/crossentropy": 2.1449467122554777, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.20745128113776445, "step": 9050 }, { "epoch": 0.2265, "grad_norm": 29.625, "grad_norm_var": 10.059830729166666, "learning_rate": 0.0001, "loss": 7.3739, "loss/crossentropy": 2.277996188402176, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.1961175424978137, "step": 9060 }, { "epoch": 0.22675, "grad_norm": 29.75, "grad_norm_var": 2.9497395833333333, "learning_rate": 0.0001, "loss": 7.425, "loss/crossentropy": 2.128904873877764, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.1912422338500619, "step": 9070 }, { "epoch": 0.227, "grad_norm": 30.0, "grad_norm_var": 2.4291666666666667, "learning_rate": 0.0001, "loss": 7.3724, "loss/crossentropy": 1.8966167330741883, "loss/hidden": 3.315234375, "loss/jsd": 0.0, "loss/logits": 0.1772749178111553, "step": 9080 }, { "epoch": 0.22725, "grad_norm": 30.625, "grad_norm_var": 2.2955729166666665, "learning_rate": 0.0001, "loss": 7.4341, "loss/crossentropy": 1.9993775576353072, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.17548400331288577, "step": 9090 }, { "epoch": 0.2275, "grad_norm": 30.125, "grad_norm_var": 2.3212890625, "learning_rate": 0.0001, "loss": 7.4512, "loss/crossentropy": 2.1553252935409546, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.1904754728078842, "step": 9100 }, { "epoch": 0.22775, "grad_norm": 31.875, "grad_norm_var": 1.8264973958333333, "learning_rate": 0.0001, "loss": 7.4299, "loss/crossentropy": 2.0791067980229854, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.18264174591749907, "step": 9110 }, { "epoch": 0.228, "grad_norm": 30.625, "grad_norm_var": 8.670572916666666, "learning_rate": 0.0001, "loss": 7.3621, "loss/crossentropy": 2.2495081633329392, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.19295338317751884, "step": 9120 }, { "epoch": 0.22825, "grad_norm": 34.25, "grad_norm_var": 9.628580729166666, "learning_rate": 0.0001, "loss": 7.4069, "loss/crossentropy": 2.0221784450113773, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.18131311442703008, "step": 9130 }, { "epoch": 0.2285, "grad_norm": 31.375, "grad_norm_var": 3.8372395833333335, "learning_rate": 0.0001, "loss": 7.3788, "loss/crossentropy": 2.096472094208002, "loss/hidden": 3.533984375, "loss/jsd": 0.0, "loss/logits": 0.19181215222924947, "step": 9140 }, { "epoch": 0.22875, "grad_norm": 32.75, "grad_norm_var": 2.9369140625, "learning_rate": 0.0001, "loss": 7.4625, "loss/crossentropy": 2.161876367032528, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.19645992666482925, "step": 9150 }, { "epoch": 0.229, "grad_norm": 36.5, "grad_norm_var": 13.144791666666666, "learning_rate": 0.0001, "loss": 7.3883, "loss/crossentropy": 1.870260328054428, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.19089705124497414, "step": 9160 }, { "epoch": 0.22925, "grad_norm": 30.125, "grad_norm_var": 11.746809895833334, "learning_rate": 0.0001, "loss": 7.3778, "loss/crossentropy": 2.304895442724228, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.19963221047073604, "step": 9170 }, { "epoch": 0.2295, "grad_norm": 30.875, "grad_norm_var": 2.64839803006287e+18, "learning_rate": 0.0001, "loss": 7.4899, "loss/crossentropy": 2.1439118653535845, "loss/hidden": 3.46484375, "loss/jsd": 0.0, "loss/logits": 0.20680125001817942, "step": 9180 }, { "epoch": 0.22975, "grad_norm": 29.625, "grad_norm_var": 16.839518229166668, "learning_rate": 0.0001, "loss": 7.3153, "loss/crossentropy": 1.9939407154917717, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.19113040501251816, "step": 9190 }, { "epoch": 0.23, "grad_norm": 30.125, "grad_norm_var": 1.8309895833333334, "learning_rate": 0.0001, "loss": 7.5357, "loss/crossentropy": 2.088300883769989, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.19538584928959607, "step": 9200 }, { "epoch": 0.23025, "grad_norm": 30.75, "grad_norm_var": 1.66640625, "learning_rate": 0.0001, "loss": 7.3255, "loss/crossentropy": 2.0176690459251403, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.1823873495683074, "step": 9210 }, { "epoch": 0.2305, "grad_norm": 32.75, "grad_norm_var": 4.837955729166667, "learning_rate": 0.0001, "loss": 7.3334, "loss/crossentropy": 2.061754436790943, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.18769590836018324, "step": 9220 }, { "epoch": 0.23075, "grad_norm": 31.75, "grad_norm_var": 4.683268229166667, "learning_rate": 0.0001, "loss": 7.3932, "loss/crossentropy": 2.0443666532635687, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.19146692994982004, "step": 9230 }, { "epoch": 0.231, "grad_norm": 33.25, "grad_norm_var": 2.594073359575469e+18, "learning_rate": 0.0001, "loss": 7.5992, "loss/crossentropy": 2.189889648556709, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.19348510541021824, "step": 9240 }, { "epoch": 0.23125, "grad_norm": 29.125, "grad_norm_var": 2.594073359702976e+18, "learning_rate": 0.0001, "loss": 7.4217, "loss/crossentropy": 2.0103170931339265, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.19076041504740715, "step": 9250 }, { "epoch": 0.2315, "grad_norm": 31.375, "grad_norm_var": 13.5134765625, "learning_rate": 0.0001, "loss": 7.3004, "loss/crossentropy": 2.0597189858555796, "loss/hidden": 3.3421875, "loss/jsd": 0.0, "loss/logits": 0.18948603458702565, "step": 9260 }, { "epoch": 0.23175, "grad_norm": 31.625, "grad_norm_var": 13.402083333333334, "learning_rate": 0.0001, "loss": 7.3946, "loss/crossentropy": 2.0123729363083838, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.1885578565299511, "step": 9270 }, { "epoch": 0.232, "grad_norm": 29.875, "grad_norm_var": 5.879622395833334, "learning_rate": 0.0001, "loss": 7.1578, "loss/crossentropy": 2.2501363843679427, "loss/hidden": 3.286328125, "loss/jsd": 0.0, "loss/logits": 0.19268405642360448, "step": 9280 }, { "epoch": 0.23225, "grad_norm": 30.25, "grad_norm_var": 16.982291666666665, "learning_rate": 0.0001, "loss": 7.3481, "loss/crossentropy": 2.076053886115551, "loss/hidden": 3.571875, "loss/jsd": 0.0, "loss/logits": 0.21077420487999915, "step": 9290 }, { "epoch": 0.2325, "grad_norm": 29.5, "grad_norm_var": 21.769791666666666, "learning_rate": 0.0001, "loss": 7.2711, "loss/crossentropy": 2.193411388993263, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.18807493168860673, "step": 9300 }, { "epoch": 0.23275, "grad_norm": 34.25, "grad_norm_var": 12.355143229166666, "learning_rate": 0.0001, "loss": 7.3819, "loss/crossentropy": 2.166665832698345, "loss/hidden": 3.3203125, "loss/jsd": 0.0, "loss/logits": 0.18620893750339745, "step": 9310 }, { "epoch": 0.233, "grad_norm": 29.875, "grad_norm_var": 3.002083333333333, "learning_rate": 0.0001, "loss": 7.4076, "loss/crossentropy": 2.022654353827238, "loss/hidden": 3.2984375, "loss/jsd": 0.0, "loss/logits": 0.18289813362061977, "step": 9320 }, { "epoch": 0.23325, "grad_norm": 30.0, "grad_norm_var": 3.0509765625, "learning_rate": 0.0001, "loss": 7.3206, "loss/crossentropy": 2.020027980953455, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.18972874553874136, "step": 9330 }, { "epoch": 0.2335, "grad_norm": 31.125, "grad_norm_var": 2.098893229166667, "learning_rate": 0.0001, "loss": 7.4091, "loss/crossentropy": 2.029365235567093, "loss/hidden": 3.43125, "loss/jsd": 0.0, "loss/logits": 0.2005317559465766, "step": 9340 }, { "epoch": 0.23375, "grad_norm": 29.5, "grad_norm_var": 10.520768229166666, "learning_rate": 0.0001, "loss": 7.3704, "loss/crossentropy": 1.95634398534894, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.18962019477039577, "step": 9350 }, { "epoch": 0.234, "grad_norm": 27.75, "grad_norm_var": 13.475, "learning_rate": 0.0001, "loss": 7.3797, "loss/crossentropy": 2.063827896118164, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.19280518041923642, "step": 9360 }, { "epoch": 0.23425, "grad_norm": 30.5, "grad_norm_var": 2.0869140625, "learning_rate": 0.0001, "loss": 7.4918, "loss/crossentropy": 2.068470099568367, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.19361184667795897, "step": 9370 }, { "epoch": 0.2345, "grad_norm": 28.0, "grad_norm_var": 6.534830729166667, "learning_rate": 0.0001, "loss": 7.4265, "loss/crossentropy": 2.125886672735214, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.19355848152190447, "step": 9380 }, { "epoch": 0.23475, "grad_norm": 30.375, "grad_norm_var": 2.2770182291666665, "learning_rate": 0.0001, "loss": 7.3769, "loss/crossentropy": 2.2057121500372885, "loss/hidden": 3.28515625, "loss/jsd": 0.0, "loss/logits": 0.1843592157587409, "step": 9390 }, { "epoch": 0.235, "grad_norm": 87.5, "grad_norm_var": 202.43932291666667, "learning_rate": 0.0001, "loss": 7.4444, "loss/crossentropy": 2.17584248483181, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.1986805137246847, "step": 9400 }, { "epoch": 0.23525, "grad_norm": 33.0, "grad_norm_var": 205.446875, "learning_rate": 0.0001, "loss": 7.5575, "loss/crossentropy": 1.9989879056811333, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.21514312122017146, "step": 9410 }, { "epoch": 0.2355, "grad_norm": 33.25, "grad_norm_var": 3.4775390625, "learning_rate": 0.0001, "loss": 7.3965, "loss/crossentropy": 2.31248200237751, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.20192647576332093, "step": 9420 }, { "epoch": 0.23575, "grad_norm": 34.0, "grad_norm_var": 2.5723307291666666, "learning_rate": 0.0001, "loss": 7.425, "loss/crossentropy": 2.0246524304151534, "loss/hidden": 3.43125, "loss/jsd": 0.0, "loss/logits": 0.1936120893806219, "step": 9430 }, { "epoch": 0.236, "grad_norm": 31.5, "grad_norm_var": 3.8207682291666667, "learning_rate": 0.0001, "loss": 7.3458, "loss/crossentropy": 2.0714851915836334, "loss/hidden": 3.315625, "loss/jsd": 0.0, "loss/logits": 0.19567677434533834, "step": 9440 }, { "epoch": 0.23625, "grad_norm": 32.25, "grad_norm_var": 2.0452473958333335, "learning_rate": 0.0001, "loss": 7.3934, "loss/crossentropy": 2.024892423301935, "loss/hidden": 3.541796875, "loss/jsd": 0.0, "loss/logits": 0.19850265365093947, "step": 9450 }, { "epoch": 0.2365, "grad_norm": 32.5, "grad_norm_var": 3.16640625, "learning_rate": 0.0001, "loss": 7.4192, "loss/crossentropy": 2.146814212203026, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.20048882197588683, "step": 9460 }, { "epoch": 0.23675, "grad_norm": 27.875, "grad_norm_var": 12.364322916666667, "learning_rate": 0.0001, "loss": 7.3745, "loss/crossentropy": 2.0004019677639007, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.19019459020346402, "step": 9470 }, { "epoch": 0.237, "grad_norm": 28.75, "grad_norm_var": 12.885416666666666, "learning_rate": 0.0001, "loss": 7.2858, "loss/crossentropy": 2.0364832431077957, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.18539349418133497, "step": 9480 }, { "epoch": 0.23725, "grad_norm": 26.75, "grad_norm_var": 4.488997395833334, "learning_rate": 0.0001, "loss": 7.2865, "loss/crossentropy": 2.257227724790573, "loss/hidden": 3.247265625, "loss/jsd": 0.0, "loss/logits": 0.1774066084995866, "step": 9490 }, { "epoch": 0.2375, "grad_norm": 28.25, "grad_norm_var": 5.15625, "learning_rate": 0.0001, "loss": 7.3742, "loss/crossentropy": 2.192533364892006, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.18531391881406306, "step": 9500 }, { "epoch": 0.23775, "grad_norm": 28.5, "grad_norm_var": 8.7025390625, "learning_rate": 0.0001, "loss": 7.4007, "loss/crossentropy": 1.9954842567443847, "loss/hidden": 3.512890625, "loss/jsd": 0.0, "loss/logits": 0.21250668447464705, "step": 9510 }, { "epoch": 0.238, "grad_norm": 27.0, "grad_norm_var": 3.2018229166666665, "learning_rate": 0.0001, "loss": 7.3589, "loss/crossentropy": 2.0501152604818342, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19774708338081837, "step": 9520 }, { "epoch": 0.23825, "grad_norm": 31.125, "grad_norm_var": 2.471875, "learning_rate": 0.0001, "loss": 7.3611, "loss/crossentropy": 2.1446721121668815, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.19521092902868986, "step": 9530 }, { "epoch": 0.2385, "grad_norm": 33.25, "grad_norm_var": 2.911393229166667, "learning_rate": 0.0001, "loss": 7.4037, "loss/crossentropy": 2.050874675065279, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.18872325737029313, "step": 9540 }, { "epoch": 0.23875, "grad_norm": 32.25, "grad_norm_var": 1.69375, "learning_rate": 0.0001, "loss": 7.3851, "loss/crossentropy": 2.092629846930504, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.1858744696713984, "step": 9550 }, { "epoch": 0.239, "grad_norm": 30.25, "grad_norm_var": 1.6229166666666666, "learning_rate": 0.0001, "loss": 7.324, "loss/crossentropy": 2.1389416724443437, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.18115091007202863, "step": 9560 }, { "epoch": 0.23925, "grad_norm": 28.375, "grad_norm_var": 0.7893229166666667, "learning_rate": 0.0001, "loss": 7.3684, "loss/crossentropy": 2.1148220866918566, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.18993774689733983, "step": 9570 }, { "epoch": 0.2395, "grad_norm": 29.125, "grad_norm_var": 1.0260416666666667, "learning_rate": 0.0001, "loss": 7.4711, "loss/crossentropy": 1.987958113849163, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.19200538750737906, "step": 9580 }, { "epoch": 0.23975, "grad_norm": 29.625, "grad_norm_var": 0.6018229166666667, "learning_rate": 0.0001, "loss": 7.3578, "loss/crossentropy": 2.076607885956764, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.20001101978123187, "step": 9590 }, { "epoch": 0.24, "grad_norm": 30.75, "grad_norm_var": 1.4124348958333333, "learning_rate": 0.0001, "loss": 7.4458, "loss/crossentropy": 2.1496243715286254, "loss/hidden": 3.357421875, "loss/jsd": 0.0, "loss/logits": 0.1889566643163562, "step": 9600 }, { "epoch": 0.24025, "grad_norm": 29.875, "grad_norm_var": 1.04140625, "learning_rate": 0.0001, "loss": 7.4632, "loss/crossentropy": 2.162689308822155, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.19204493910074233, "step": 9610 }, { "epoch": 0.2405, "grad_norm": 27.375, "grad_norm_var": 1.5863932291666667, "learning_rate": 0.0001, "loss": 7.3897, "loss/crossentropy": 2.1323146484792233, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.20718522872775794, "step": 9620 }, { "epoch": 0.24075, "grad_norm": 32.25, "grad_norm_var": 2.09375, "learning_rate": 0.0001, "loss": 7.2848, "loss/crossentropy": 2.0804166465997698, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.1966247998178005, "step": 9630 }, { "epoch": 0.241, "grad_norm": 31.25, "grad_norm_var": 1.3559895833333333, "learning_rate": 0.0001, "loss": 7.4174, "loss/crossentropy": 2.3036757931113243, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.18416995517909526, "step": 9640 }, { "epoch": 0.24125, "grad_norm": 33.0, "grad_norm_var": 2.41015625, "learning_rate": 0.0001, "loss": 7.2853, "loss/crossentropy": 1.9936259984970093, "loss/hidden": 3.428125, "loss/jsd": 0.0, "loss/logits": 0.21977887880057095, "step": 9650 }, { "epoch": 0.2415, "grad_norm": 32.5, "grad_norm_var": 6.1228515625, "learning_rate": 0.0001, "loss": 7.4938, "loss/crossentropy": 2.2000851720571517, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.20301534831523896, "step": 9660 }, { "epoch": 0.24175, "grad_norm": 31.0, "grad_norm_var": 3.59765625, "learning_rate": 0.0001, "loss": 7.5133, "loss/crossentropy": 2.192278115451336, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.19489070847630502, "step": 9670 }, { "epoch": 0.242, "grad_norm": 28.875, "grad_norm_var": 6.8978515625, "learning_rate": 0.0001, "loss": 7.4247, "loss/crossentropy": 2.194097451120615, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.20746590523049235, "step": 9680 }, { "epoch": 0.24225, "grad_norm": 29.75, "grad_norm_var": 4.0587890625, "learning_rate": 0.0001, "loss": 7.3413, "loss/crossentropy": 2.1902857303619383, "loss/hidden": 3.187109375, "loss/jsd": 0.0, "loss/logits": 0.17439354099333287, "step": 9690 }, { "epoch": 0.2425, "grad_norm": 41.5, "grad_norm_var": 3.952541960104418e+18, "learning_rate": 0.0001, "loss": 7.5429, "loss/crossentropy": 2.046304853260517, "loss/hidden": 3.31953125, "loss/jsd": 0.0, "loss/logits": 0.18022917695343493, "step": 9700 }, { "epoch": 0.24275, "grad_norm": 30.375, "grad_norm_var": 3.952541960261809e+18, "learning_rate": 0.0001, "loss": 7.2729, "loss/crossentropy": 2.082195009291172, "loss/hidden": 3.5140625, "loss/jsd": 0.0, "loss/logits": 0.1983122780919075, "step": 9710 }, { "epoch": 0.243, "grad_norm": 28.375, "grad_norm_var": 2.6056640625, "learning_rate": 0.0001, "loss": 7.2271, "loss/crossentropy": 1.970278625190258, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.19305091574788094, "step": 9720 }, { "epoch": 0.24325, "grad_norm": 30.375, "grad_norm_var": 2.0576524262052987e+18, "learning_rate": 0.0001, "loss": 7.3979, "loss/crossentropy": 2.1140229746699335, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.19698369763791562, "step": 9730 }, { "epoch": 0.2435, "grad_norm": 29.0, "grad_norm_var": 2.057652425978177e+18, "learning_rate": 0.0001, "loss": 7.2491, "loss/crossentropy": 1.9204902969300748, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.18290270324796437, "step": 9740 }, { "epoch": 0.24375, "grad_norm": 49.25, "grad_norm_var": 27.333072916666666, "learning_rate": 0.0001, "loss": 7.4168, "loss/crossentropy": 2.203968660533428, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.19210707377642394, "step": 9750 }, { "epoch": 0.244, "grad_norm": 27.75, "grad_norm_var": 28.5837890625, "learning_rate": 0.0001, "loss": 7.3667, "loss/crossentropy": 2.0645239472389223, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.1849998442456126, "step": 9760 }, { "epoch": 0.24425, "grad_norm": 30.0, "grad_norm_var": 7.441666666666666, "learning_rate": 0.0001, "loss": 7.3958, "loss/crossentropy": 2.0423281893134115, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.19458430632948875, "step": 9770 }, { "epoch": 0.2445, "grad_norm": 28.25, "grad_norm_var": 25.725, "learning_rate": 0.0001, "loss": 7.371, "loss/crossentropy": 2.2162967801094053, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.19894264116883278, "step": 9780 }, { "epoch": 0.24475, "grad_norm": 31.875, "grad_norm_var": 25.701822916666668, "learning_rate": 0.0001, "loss": 7.4136, "loss/crossentropy": 2.052091246843338, "loss/hidden": 3.331640625, "loss/jsd": 0.0, "loss/logits": 0.19768061954528093, "step": 9790 }, { "epoch": 0.245, "grad_norm": 29.375, "grad_norm_var": 2.4247395833333334, "learning_rate": 0.0001, "loss": 7.3752, "loss/crossentropy": 2.0378331199288366, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.1888222724199295, "step": 9800 }, { "epoch": 0.24525, "grad_norm": 29.75, "grad_norm_var": 17.073958333333334, "learning_rate": 0.0001, "loss": 7.389, "loss/crossentropy": 2.0574826121330263, "loss/hidden": 3.313671875, "loss/jsd": 0.0, "loss/logits": 0.1817958688363433, "step": 9810 }, { "epoch": 0.2455, "grad_norm": 28.125, "grad_norm_var": 24.198372395833335, "learning_rate": 0.0001, "loss": 7.284, "loss/crossentropy": 2.068347904086113, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.1905125178396702, "step": 9820 }, { "epoch": 0.24575, "grad_norm": 28.5, "grad_norm_var": 9.8041015625, "learning_rate": 0.0001, "loss": 7.3867, "loss/crossentropy": 2.0113710410892964, "loss/hidden": 3.318359375, "loss/jsd": 0.0, "loss/logits": 0.1825895557180047, "step": 9830 }, { "epoch": 0.246, "grad_norm": 30.5, "grad_norm_var": 1.16015625, "learning_rate": 0.0001, "loss": 7.3496, "loss/crossentropy": 2.0473890252411366, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.18930337531492114, "step": 9840 }, { "epoch": 0.24625, "grad_norm": 29.875, "grad_norm_var": 1.75, "learning_rate": 0.0001, "loss": 7.4305, "loss/crossentropy": 2.167546259611845, "loss/hidden": 3.344921875, "loss/jsd": 0.0, "loss/logits": 0.18493952229619026, "step": 9850 }, { "epoch": 0.2465, "grad_norm": 28.375, "grad_norm_var": 2.7625, "learning_rate": 0.0001, "loss": 7.444, "loss/crossentropy": 2.1046394810080526, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.18062518630176783, "step": 9860 }, { "epoch": 0.24675, "grad_norm": 28.75, "grad_norm_var": 7819.056705729166, "learning_rate": 0.0001, "loss": 7.4016, "loss/crossentropy": 2.0700221791863442, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.190355353243649, "step": 9870 }, { "epoch": 0.247, "grad_norm": 29.0, "grad_norm_var": 7843.72265625, "learning_rate": 0.0001, "loss": 7.2875, "loss/crossentropy": 2.057565826922655, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.18850490506738424, "step": 9880 }, { "epoch": 0.24725, "grad_norm": 32.5, "grad_norm_var": 3.778059895833333, "learning_rate": 0.0001, "loss": 7.4329, "loss/crossentropy": 2.0141181223094464, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.19769613686949014, "step": 9890 }, { "epoch": 0.2475, "grad_norm": 30.375, "grad_norm_var": 6.312239583333334, "learning_rate": 0.0001, "loss": 7.336, "loss/crossentropy": 2.146551664918661, "loss/hidden": 3.34296875, "loss/jsd": 0.0, "loss/logits": 0.18510441221296786, "step": 9900 }, { "epoch": 0.24775, "grad_norm": 33.0, "grad_norm_var": 6.36640625, "learning_rate": 0.0001, "loss": 7.306, "loss/crossentropy": 2.2865438759326935, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.19417236726731063, "step": 9910 }, { "epoch": 0.248, "grad_norm": 33.5, "grad_norm_var": 2.405989583333333, "learning_rate": 0.0001, "loss": 7.3855, "loss/crossentropy": 2.133404280245304, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.19065556656569244, "step": 9920 }, { "epoch": 0.24825, "grad_norm": 34.25, "grad_norm_var": 3.1626528109486234e+18, "learning_rate": 0.0001, "loss": 7.4378, "loss/crossentropy": 2.0854051023721696, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.20351361632347106, "step": 9930 }, { "epoch": 0.2485, "grad_norm": 32.25, "grad_norm_var": 3.1626528099408717e+18, "learning_rate": 0.0001, "loss": 7.3826, "loss/crossentropy": 2.087091060727835, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.19672764679417015, "step": 9940 }, { "epoch": 0.24875, "grad_norm": 31.875, "grad_norm_var": 3.729622395833333, "learning_rate": 0.0001, "loss": 7.4303, "loss/crossentropy": 1.9529958970844745, "loss/hidden": 3.461328125, "loss/jsd": 0.0, "loss/logits": 0.1957404987886548, "step": 9950 }, { "epoch": 0.249, "grad_norm": 27.0, "grad_norm_var": 3.515625, "learning_rate": 0.0001, "loss": 7.2963, "loss/crossentropy": 2.1291355013847353, "loss/hidden": 3.278125, "loss/jsd": 0.0, "loss/logits": 0.1832482174038887, "step": 9960 }, { "epoch": 0.24925, "grad_norm": 31.5, "grad_norm_var": 2.403580729166667, "learning_rate": 0.0001, "loss": 7.3011, "loss/crossentropy": 2.073649263381958, "loss/hidden": 3.2859375, "loss/jsd": 0.0, "loss/logits": 0.17194083742797375, "step": 9970 }, { "epoch": 0.2495, "grad_norm": 30.5, "grad_norm_var": 3.7934895833333333, "learning_rate": 0.0001, "loss": 7.3253, "loss/crossentropy": 2.312130460143089, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.19424791410565376, "step": 9980 }, { "epoch": 0.24975, "grad_norm": 31.875, "grad_norm_var": 6.4666015625, "learning_rate": 0.0001, "loss": 7.3395, "loss/crossentropy": 1.9769040577113628, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.1865438589360565, "step": 9990 }, { "epoch": 0.25, "grad_norm": 37.0, "grad_norm_var": 5.488997395833334, "learning_rate": 0.0001, "loss": 7.3121, "loss/crossentropy": 2.0656296610832214, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.19510807991027831, "step": 10000 }, { "epoch": 0.25025, "grad_norm": 30.5, "grad_norm_var": 6.201822916666667, "learning_rate": 0.0001, "loss": 7.3059, "loss/crossentropy": 2.1552528128027917, "loss/hidden": 3.338671875, "loss/jsd": 0.0, "loss/logits": 0.1871857862919569, "step": 10010 }, { "epoch": 0.2505, "grad_norm": 29.375, "grad_norm_var": 3.7697265625, "learning_rate": 0.0001, "loss": 7.402, "loss/crossentropy": 2.1086191907525063, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.20956437531858682, "step": 10020 }, { "epoch": 0.25075, "grad_norm": 27.0, "grad_norm_var": 79.871875, "learning_rate": 0.0001, "loss": 7.2906, "loss/crossentropy": 2.24168366715312, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.1914551755413413, "step": 10030 }, { "epoch": 0.251, "grad_norm": 28.25, "grad_norm_var": 3.5348307291666665, "learning_rate": 0.0001, "loss": 7.3933, "loss/crossentropy": 1.9110309466719628, "loss/hidden": 3.52421875, "loss/jsd": 0.0, "loss/logits": 0.1938932467252016, "step": 10040 }, { "epoch": 0.25125, "grad_norm": 29.5, "grad_norm_var": 3.318489583333333, "learning_rate": 0.0001, "loss": 7.3066, "loss/crossentropy": 2.2151729106903075, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.1898823155090213, "step": 10050 }, { "epoch": 0.2515, "grad_norm": 27.125, "grad_norm_var": 5.114583333333333, "learning_rate": 0.0001, "loss": 7.3306, "loss/crossentropy": 1.9325201705098152, "loss/hidden": 3.387109375, "loss/jsd": 0.0, "loss/logits": 0.1914183372631669, "step": 10060 }, { "epoch": 0.25175, "grad_norm": 28.375, "grad_norm_var": 5.568489583333333, "learning_rate": 0.0001, "loss": 7.4302, "loss/crossentropy": 2.053273378312588, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.193053549900651, "step": 10070 }, { "epoch": 0.252, "grad_norm": 32.25, "grad_norm_var": 1.5947265625, "learning_rate": 0.0001, "loss": 7.3679, "loss/crossentropy": 2.1425619572401047, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.19612068887799977, "step": 10080 }, { "epoch": 0.25225, "grad_norm": 28.125, "grad_norm_var": 1.2666666666666666, "learning_rate": 0.0001, "loss": 7.4696, "loss/crossentropy": 2.1615766674280166, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.21496526505798103, "step": 10090 }, { "epoch": 0.2525, "grad_norm": 31.375, "grad_norm_var": 87.72389322916666, "learning_rate": 0.0001, "loss": 7.3933, "loss/crossentropy": 2.093507520854473, "loss/hidden": 3.306640625, "loss/jsd": 0.0, "loss/logits": 0.18447848707437514, "step": 10100 }, { "epoch": 0.25275, "grad_norm": 29.125, "grad_norm_var": 87.9962890625, "learning_rate": 0.0001, "loss": 7.42, "loss/crossentropy": 2.2579648420214653, "loss/hidden": 3.415625, "loss/jsd": 0.0, "loss/logits": 0.19929521884769202, "step": 10110 }, { "epoch": 0.253, "grad_norm": 31.0, "grad_norm_var": 3.0921223958333335, "learning_rate": 0.0001, "loss": 7.4462, "loss/crossentropy": 2.187203352898359, "loss/hidden": 3.360546875, "loss/jsd": 0.0, "loss/logits": 0.18776683378964662, "step": 10120 }, { "epoch": 0.25325, "grad_norm": 35.0, "grad_norm_var": 37.43333333333333, "learning_rate": 0.0001, "loss": 7.493, "loss/crossentropy": 2.156028524041176, "loss/hidden": 3.41171875, "loss/jsd": 0.0, "loss/logits": 0.20926487501710653, "step": 10130 }, { "epoch": 0.2535, "grad_norm": 30.625, "grad_norm_var": 36.87180989583333, "learning_rate": 0.0001, "loss": 7.4227, "loss/crossentropy": 1.9757653154432773, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.1927978384308517, "step": 10140 }, { "epoch": 0.25375, "grad_norm": 30.75, "grad_norm_var": 2.1582682291666666, "learning_rate": 0.0001, "loss": 7.3403, "loss/crossentropy": 2.1322492100298405, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.1974259439855814, "step": 10150 }, { "epoch": 0.254, "grad_norm": 28.125, "grad_norm_var": 2.9259765625, "learning_rate": 0.0001, "loss": 7.3083, "loss/crossentropy": 2.0464363016188143, "loss/hidden": 3.27734375, "loss/jsd": 0.0, "loss/logits": 0.17546537732705475, "step": 10160 }, { "epoch": 0.25425, "grad_norm": 28.625, "grad_norm_var": 3.7514973958333333, "learning_rate": 0.0001, "loss": 7.3988, "loss/crossentropy": 2.059445019811392, "loss/hidden": 3.396484375, "loss/jsd": 0.0, "loss/logits": 0.19312301548197866, "step": 10170 }, { "epoch": 0.2545, "grad_norm": 29.5, "grad_norm_var": 3.36015625, "learning_rate": 0.0001, "loss": 7.3178, "loss/crossentropy": 2.0300123430788517, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.18283235086128116, "step": 10180 }, { "epoch": 0.25475, "grad_norm": 31.625, "grad_norm_var": 2.7192057291666667, "learning_rate": 0.0001, "loss": 7.4117, "loss/crossentropy": 2.052568303793669, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.17590052969753742, "step": 10190 }, { "epoch": 0.255, "grad_norm": 30.875, "grad_norm_var": 3.0660807291666665, "learning_rate": 0.0001, "loss": 7.4384, "loss/crossentropy": 2.1124994076788424, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.20564852859824895, "step": 10200 }, { "epoch": 0.25525, "grad_norm": 32.25, "grad_norm_var": 1.8223307291666666, "learning_rate": 0.0001, "loss": 7.4006, "loss/crossentropy": 1.997454211115837, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.189224443025887, "step": 10210 }, { "epoch": 0.2555, "grad_norm": 30.75, "grad_norm_var": 12.555989583333334, "learning_rate": 0.0001, "loss": 7.5754, "loss/crossentropy": 2.095755438506603, "loss/hidden": 3.470703125, "loss/jsd": 0.0, "loss/logits": 0.1905398152768612, "step": 10220 }, { "epoch": 0.25575, "grad_norm": 30.25, "grad_norm_var": 2.0666015625, "learning_rate": 0.0001, "loss": 7.4499, "loss/crossentropy": 2.031408229470253, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.1826389298774302, "step": 10230 }, { "epoch": 0.256, "grad_norm": 30.875, "grad_norm_var": 1.1973307291666666, "learning_rate": 0.0001, "loss": 7.4736, "loss/crossentropy": 2.103855460882187, "loss/hidden": 3.479296875, "loss/jsd": 0.0, "loss/logits": 0.21460786666721104, "step": 10240 }, { "epoch": 0.25625, "grad_norm": 31.125, "grad_norm_var": 2.6556640625, "learning_rate": 0.0001, "loss": 7.5078, "loss/crossentropy": 2.0419384971261025, "loss/hidden": 3.44609375, "loss/jsd": 0.0, "loss/logits": 0.1855954358354211, "step": 10250 }, { "epoch": 0.2565, "grad_norm": 31.0, "grad_norm_var": 3.025, "learning_rate": 0.0001, "loss": 7.3116, "loss/crossentropy": 2.2164057731628417, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.18515868298709393, "step": 10260 }, { "epoch": 0.25675, "grad_norm": 30.75, "grad_norm_var": 2.653059895833333, "learning_rate": 0.0001, "loss": 7.2962, "loss/crossentropy": 2.0967153370380402, "loss/hidden": 3.5140625, "loss/jsd": 0.0, "loss/logits": 0.2144587781280279, "step": 10270 }, { "epoch": 0.257, "grad_norm": 27.625, "grad_norm_var": 2874.977018229167, "learning_rate": 0.0001, "loss": 7.3634, "loss/crossentropy": 2.197367396950722, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.1903674839064479, "step": 10280 }, { "epoch": 0.25725, "grad_norm": 30.25, "grad_norm_var": 2830.1666015625, "learning_rate": 0.0001, "loss": 7.4461, "loss/crossentropy": 1.9168385773897172, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.18945675920695065, "step": 10290 }, { "epoch": 0.2575, "grad_norm": 30.625, "grad_norm_var": 10.3634765625, "learning_rate": 0.0001, "loss": 7.4383, "loss/crossentropy": 2.196033439040184, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.20106703452765942, "step": 10300 }, { "epoch": 0.25775, "grad_norm": 29.75, "grad_norm_var": 2.41015625, "learning_rate": 0.0001, "loss": 7.4, "loss/crossentropy": 1.9811425000429153, "loss/hidden": 3.2875, "loss/jsd": 0.0, "loss/logits": 0.18086131382733583, "step": 10310 }, { "epoch": 0.258, "grad_norm": 28.625, "grad_norm_var": 1.3056640625, "learning_rate": 0.0001, "loss": 7.3814, "loss/crossentropy": 2.141839873790741, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.19040859565138818, "step": 10320 }, { "epoch": 0.25825, "grad_norm": 29.75, "grad_norm_var": 0.9582682291666667, "learning_rate": 0.0001, "loss": 7.3988, "loss/crossentropy": 2.0412203505635262, "loss/hidden": 3.471484375, "loss/jsd": 0.0, "loss/logits": 0.19841136487666516, "step": 10330 }, { "epoch": 0.2585, "grad_norm": 31.0, "grad_norm_var": 1.94765625, "learning_rate": 0.0001, "loss": 7.464, "loss/crossentropy": 2.124895977973938, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.19423805810511113, "step": 10340 }, { "epoch": 0.25875, "grad_norm": 27.125, "grad_norm_var": 20.603125, "learning_rate": 0.0001, "loss": 7.3612, "loss/crossentropy": 2.271273523569107, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.19086832217872143, "step": 10350 }, { "epoch": 0.259, "grad_norm": 31.75, "grad_norm_var": 41.87265625, "learning_rate": 0.0001, "loss": 7.3645, "loss/crossentropy": 2.148430307209492, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.18722805399447678, "step": 10360 }, { "epoch": 0.25925, "grad_norm": 31.125, "grad_norm_var": 28.49140625, "learning_rate": 0.0001, "loss": 7.2841, "loss/crossentropy": 2.088160905241966, "loss/hidden": 3.43359375, "loss/jsd": 0.0, "loss/logits": 0.19452540278434755, "step": 10370 }, { "epoch": 0.2595, "grad_norm": 30.375, "grad_norm_var": 3.8275390625, "learning_rate": 0.0001, "loss": 7.3269, "loss/crossentropy": 2.037956405431032, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.19043162725865842, "step": 10380 }, { "epoch": 0.25975, "grad_norm": 28.25, "grad_norm_var": 8.718489583333334, "learning_rate": 0.0001, "loss": 7.2968, "loss/crossentropy": 1.9823431193828582, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.19301451742649078, "step": 10390 }, { "epoch": 0.26, "grad_norm": 31.625, "grad_norm_var": 8.5625, "learning_rate": 0.0001, "loss": 7.3314, "loss/crossentropy": 2.0179347068071367, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.18727120459079744, "step": 10400 }, { "epoch": 0.26025, "grad_norm": 29.375, "grad_norm_var": 7.3625, "learning_rate": 0.0001, "loss": 7.3878, "loss/crossentropy": 2.0856245614588262, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.18157497737556696, "step": 10410 }, { "epoch": 0.2605, "grad_norm": 33.0, "grad_norm_var": 3.0134765625, "learning_rate": 0.0001, "loss": 7.3355, "loss/crossentropy": 2.0798820555210114, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.1937016123905778, "step": 10420 }, { "epoch": 0.26075, "grad_norm": 30.625, "grad_norm_var": 2.5400390625, "learning_rate": 0.0001, "loss": 7.4527, "loss/crossentropy": 2.0538913279771807, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.19370690621435643, "step": 10430 }, { "epoch": 0.261, "grad_norm": 30.125, "grad_norm_var": 3.42890625, "learning_rate": 0.0001, "loss": 7.46, "loss/crossentropy": 2.181585241854191, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.1953628245741129, "step": 10440 }, { "epoch": 0.26125, "grad_norm": 27.75, "grad_norm_var": 3.783072916666667, "learning_rate": 0.0001, "loss": 7.3934, "loss/crossentropy": 2.183496044576168, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.20456707030534743, "step": 10450 }, { "epoch": 0.2615, "grad_norm": 30.125, "grad_norm_var": 1.4379557291666667, "learning_rate": 0.0001, "loss": 7.3211, "loss/crossentropy": 2.132909268140793, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.18731370605528355, "step": 10460 }, { "epoch": 0.26175, "grad_norm": 31.25, "grad_norm_var": 2.5791666666666666, "learning_rate": 0.0001, "loss": 7.3587, "loss/crossentropy": 2.0115842171013356, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.18043918311595916, "step": 10470 }, { "epoch": 0.262, "grad_norm": 30.25, "grad_norm_var": 3.8268229166666665, "learning_rate": 0.0001, "loss": 7.2962, "loss/crossentropy": 2.1343379452824593, "loss/hidden": 3.305859375, "loss/jsd": 0.0, "loss/logits": 0.18047874867916108, "step": 10480 }, { "epoch": 0.26225, "grad_norm": 36.0, "grad_norm_var": 8.967708333333333, "learning_rate": 0.0001, "loss": 7.2903, "loss/crossentropy": 1.8975451827049254, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.17858757209032775, "step": 10490 }, { "epoch": 0.2625, "grad_norm": 31.75, "grad_norm_var": 10.077083333333333, "learning_rate": 0.0001, "loss": 7.3074, "loss/crossentropy": 2.0936397939920424, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.19368096999824047, "step": 10500 }, { "epoch": 0.26275, "grad_norm": 29.375, "grad_norm_var": 2.020768229166667, "learning_rate": 0.0001, "loss": 7.3675, "loss/crossentropy": 2.06072843298316, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.17674694433808327, "step": 10510 }, { "epoch": 0.263, "grad_norm": 29.125, "grad_norm_var": 3.4686848958333334, "learning_rate": 0.0001, "loss": 7.3847, "loss/crossentropy": 2.057940775156021, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.18738706540316344, "step": 10520 }, { "epoch": 0.26325, "grad_norm": 31.625, "grad_norm_var": 3.225, "learning_rate": 0.0001, "loss": 7.3449, "loss/crossentropy": 2.051366009563208, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.18555977791547776, "step": 10530 }, { "epoch": 0.2635, "grad_norm": 29.75, "grad_norm_var": 32.99108072916667, "learning_rate": 0.0001, "loss": 7.3795, "loss/crossentropy": 2.061921717226505, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.1927286960184574, "step": 10540 }, { "epoch": 0.26375, "grad_norm": 31.875, "grad_norm_var": 34.16451822916667, "learning_rate": 0.0001, "loss": 7.3305, "loss/crossentropy": 2.051614002883434, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.20516632981598376, "step": 10550 }, { "epoch": 0.264, "grad_norm": 30.25, "grad_norm_var": 2.2802083333333334, "learning_rate": 0.0001, "loss": 7.3251, "loss/crossentropy": 2.208609312772751, "loss/hidden": 3.302734375, "loss/jsd": 0.0, "loss/logits": 0.1906878400593996, "step": 10560 }, { "epoch": 0.26425, "grad_norm": 28.0, "grad_norm_var": 2.8785807291666665, "learning_rate": 0.0001, "loss": 7.4584, "loss/crossentropy": 2.2136432066559792, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.1983118023723364, "step": 10570 }, { "epoch": 0.2645, "grad_norm": 30.5, "grad_norm_var": 3.562239583333333, "learning_rate": 0.0001, "loss": 7.3017, "loss/crossentropy": 2.2243916779756545, "loss/hidden": 3.276953125, "loss/jsd": 0.0, "loss/logits": 0.19225324541330338, "step": 10580 }, { "epoch": 0.26475, "grad_norm": 31.75, "grad_norm_var": 2.47265625, "learning_rate": 0.0001, "loss": 7.2623, "loss/crossentropy": 2.085415804386139, "loss/hidden": 3.308203125, "loss/jsd": 0.0, "loss/logits": 0.1769318761304021, "step": 10590 }, { "epoch": 0.265, "grad_norm": 30.5, "grad_norm_var": 1.8895182291666666, "learning_rate": 0.0001, "loss": 7.4543, "loss/crossentropy": 2.0929081469774244, "loss/hidden": 3.431640625, "loss/jsd": 0.0, "loss/logits": 0.21267817355692387, "step": 10600 }, { "epoch": 0.26525, "grad_norm": 28.75, "grad_norm_var": 2.505989583333333, "learning_rate": 0.0001, "loss": 7.2136, "loss/crossentropy": 1.885066507011652, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.17078990247100592, "step": 10610 }, { "epoch": 0.2655, "grad_norm": 27.25, "grad_norm_var": 2.18515625, "learning_rate": 0.0001, "loss": 7.258, "loss/crossentropy": 1.9193078093230724, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.18096741195768118, "step": 10620 }, { "epoch": 0.26575, "grad_norm": 29.5, "grad_norm_var": 2.499739583333333, "learning_rate": 0.0001, "loss": 7.256, "loss/crossentropy": 1.9643688909709454, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.16975617725402117, "step": 10630 }, { "epoch": 0.266, "grad_norm": 41.25, "grad_norm_var": 2.3308942592018744e+18, "learning_rate": 0.0001, "loss": 7.3436, "loss/crossentropy": 1.8120718583464623, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.17283489797264337, "step": 10640 }, { "epoch": 0.26625, "grad_norm": 30.5, "grad_norm_var": 2.3308942571726003e+18, "learning_rate": 0.0001, "loss": 7.398, "loss/crossentropy": 2.2462258487939835, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.20116339288651944, "step": 10650 }, { "epoch": 0.2665, "grad_norm": 32.0, "grad_norm_var": 21.258268229166667, "learning_rate": 0.0001, "loss": 7.2994, "loss/crossentropy": 2.044428373128176, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.1989501324482262, "step": 10660 }, { "epoch": 0.26675, "grad_norm": 29.875, "grad_norm_var": 1.8681640625, "learning_rate": 0.0001, "loss": 7.3881, "loss/crossentropy": 1.9573393151164056, "loss/hidden": 3.27421875, "loss/jsd": 0.0, "loss/logits": 0.17021252401173115, "step": 10670 }, { "epoch": 0.267, "grad_norm": 33.0, "grad_norm_var": 2.6197265625, "learning_rate": 0.0001, "loss": 7.449, "loss/crossentropy": 2.1520077705383303, "loss/hidden": 3.498046875, "loss/jsd": 0.0, "loss/logits": 0.20022546350955964, "step": 10680 }, { "epoch": 0.26725, "grad_norm": 30.25, "grad_norm_var": 69.97473958333333, "learning_rate": 0.0001, "loss": 7.4428, "loss/crossentropy": 2.1147203534841537, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.18886133264750243, "step": 10690 }, { "epoch": 0.2675, "grad_norm": 33.25, "grad_norm_var": 27.978125, "learning_rate": 0.0001, "loss": 7.3953, "loss/crossentropy": 2.1656775265932082, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.18777367267757655, "step": 10700 }, { "epoch": 0.26775, "grad_norm": 30.75, "grad_norm_var": 39.307291666666664, "learning_rate": 0.0001, "loss": 7.2137, "loss/crossentropy": 2.2314133137464522, "loss/hidden": 3.280859375, "loss/jsd": 0.0, "loss/logits": 0.18172509353607894, "step": 10710 }, { "epoch": 0.268, "grad_norm": 30.5, "grad_norm_var": 43.19055989583333, "learning_rate": 0.0001, "loss": 7.3973, "loss/crossentropy": 2.0373408157378434, "loss/hidden": 3.500390625, "loss/jsd": 0.0, "loss/logits": 0.19881470818072558, "step": 10720 }, { "epoch": 0.26825, "grad_norm": 28.125, "grad_norm_var": 7.35625, "learning_rate": 0.0001, "loss": 7.3064, "loss/crossentropy": 2.065886814892292, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.19001882169395684, "step": 10730 }, { "epoch": 0.2685, "grad_norm": 30.0, "grad_norm_var": 2.437239583333333, "learning_rate": 0.0001, "loss": 7.4337, "loss/crossentropy": 2.1057795181870462, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.19459401965141296, "step": 10740 }, { "epoch": 0.26875, "grad_norm": 29.75, "grad_norm_var": 3.24765625, "learning_rate": 0.0001, "loss": 7.4157, "loss/crossentropy": 1.7320681288838387, "loss/hidden": 3.533984375, "loss/jsd": 0.0, "loss/logits": 0.20864265877753496, "step": 10750 }, { "epoch": 0.269, "grad_norm": 31.0, "grad_norm_var": 2.9098307291666665, "learning_rate": 0.0001, "loss": 7.3267, "loss/crossentropy": 2.1352997794747353, "loss/hidden": 3.398828125, "loss/jsd": 0.0, "loss/logits": 0.19193184426985682, "step": 10760 }, { "epoch": 0.26925, "grad_norm": 32.25, "grad_norm_var": 3.0444333193999944e+18, "learning_rate": 0.0001, "loss": 7.3861, "loss/crossentropy": 2.070178285241127, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.19077087007462978, "step": 10770 }, { "epoch": 0.2695, "grad_norm": 28.5, "grad_norm_var": 23.91640625, "learning_rate": 0.0001, "loss": 7.4494, "loss/crossentropy": 2.147341425716877, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.18359180726110935, "step": 10780 }, { "epoch": 0.26975, "grad_norm": 30.875, "grad_norm_var": 1.5843098958333333, "learning_rate": 0.0001, "loss": 7.3566, "loss/crossentropy": 2.0286200530827045, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.1995188482105732, "step": 10790 }, { "epoch": 0.27, "grad_norm": 30.625, "grad_norm_var": 1.8247395833333333, "learning_rate": 0.0001, "loss": 7.3752, "loss/crossentropy": 2.1403896272182465, "loss/hidden": 3.408203125, "loss/jsd": 0.0, "loss/logits": 0.19066570214927198, "step": 10800 }, { "epoch": 0.27025, "grad_norm": 32.25, "grad_norm_var": 3.2462890625, "learning_rate": 0.0001, "loss": 7.3297, "loss/crossentropy": 2.109960842132568, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.19408894181251526, "step": 10810 }, { "epoch": 0.2705, "grad_norm": 31.0, "grad_norm_var": 2.125, "learning_rate": 0.0001, "loss": 7.4115, "loss/crossentropy": 2.170359855145216, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.1864105872809887, "step": 10820 }, { "epoch": 0.27075, "grad_norm": 30.125, "grad_norm_var": 2.5025390625, "learning_rate": 0.0001, "loss": 7.3568, "loss/crossentropy": 2.0696767389774324, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.18454026989638805, "step": 10830 }, { "epoch": 0.271, "grad_norm": 29.625, "grad_norm_var": 3.187239583333333, "learning_rate": 0.0001, "loss": 7.3986, "loss/crossentropy": 2.1671934336423875, "loss/hidden": 3.377734375, "loss/jsd": 0.0, "loss/logits": 0.19040703494101763, "step": 10840 }, { "epoch": 0.27125, "grad_norm": 29.25, "grad_norm_var": 3.470833333333333, "learning_rate": 0.0001, "loss": 7.3105, "loss/crossentropy": 1.9061052799224854, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.1798303933814168, "step": 10850 }, { "epoch": 0.2715, "grad_norm": 33.0, "grad_norm_var": 3.2087890625, "learning_rate": 0.0001, "loss": 7.3692, "loss/crossentropy": 2.1218462653458117, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.20523111913353204, "step": 10860 }, { "epoch": 0.27175, "grad_norm": 29.375, "grad_norm_var": 3.544205729166667, "learning_rate": 0.0001, "loss": 7.3206, "loss/crossentropy": 2.006873355805874, "loss/hidden": 3.4734375, "loss/jsd": 0.0, "loss/logits": 0.2144127245992422, "step": 10870 }, { "epoch": 0.272, "grad_norm": 29.875, "grad_norm_var": 1.5832967245463552e+18, "learning_rate": 0.0001, "loss": 7.4255, "loss/crossentropy": 2.1331238821148872, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.19302980024367572, "step": 10880 }, { "epoch": 0.27225, "grad_norm": 31.875, "grad_norm_var": 1.5832967244414976e+18, "learning_rate": 0.0001, "loss": 7.3477, "loss/crossentropy": 2.041455736756325, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.230126572214067, "step": 10890 }, { "epoch": 0.2725, "grad_norm": 32.25, "grad_norm_var": 16.21015625, "learning_rate": 0.0001, "loss": 7.3668, "loss/crossentropy": 2.11685880869627, "loss/hidden": 3.32890625, "loss/jsd": 0.0, "loss/logits": 0.18238812778145075, "step": 10900 }, { "epoch": 0.27275, "grad_norm": 29.625, "grad_norm_var": 15.566666666666666, "learning_rate": 0.0001, "loss": 7.349, "loss/crossentropy": 2.1108590021729468, "loss/hidden": 3.448828125, "loss/jsd": 0.0, "loss/logits": 0.2031965653412044, "step": 10910 }, { "epoch": 0.273, "grad_norm": 30.5, "grad_norm_var": 6.133072916666666, "learning_rate": 0.0001, "loss": 7.3685, "loss/crossentropy": 1.9768708415329457, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.1786771345883608, "step": 10920 }, { "epoch": 0.27325, "grad_norm": 28.75, "grad_norm_var": 6.448372395833333, "learning_rate": 0.0001, "loss": 7.3505, "loss/crossentropy": 1.9710132874548436, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.18313440950587392, "step": 10930 }, { "epoch": 0.2735, "grad_norm": 29.5, "grad_norm_var": 2.63515625, "learning_rate": 0.0001, "loss": 7.4147, "loss/crossentropy": 2.1898112446069717, "loss/hidden": 3.3546875, "loss/jsd": 0.0, "loss/logits": 0.1952790988609195, "step": 10940 }, { "epoch": 0.27375, "grad_norm": 31.25, "grad_norm_var": 1.8978515625, "learning_rate": 0.0001, "loss": 7.3679, "loss/crossentropy": 2.135420022904873, "loss/hidden": 3.43203125, "loss/jsd": 0.0, "loss/logits": 0.21568923965096473, "step": 10950 }, { "epoch": 0.274, "grad_norm": 31.625, "grad_norm_var": 3.5184895833333334, "learning_rate": 0.0001, "loss": 7.2611, "loss/crossentropy": 1.9667419284582137, "loss/hidden": 3.42265625, "loss/jsd": 0.0, "loss/logits": 0.1834744794294238, "step": 10960 }, { "epoch": 0.27425, "grad_norm": 30.625, "grad_norm_var": 1.3561848958333333, "learning_rate": 0.0001, "loss": 7.4273, "loss/crossentropy": 2.074997512996197, "loss/hidden": 3.356640625, "loss/jsd": 0.0, "loss/logits": 0.18813611660152674, "step": 10970 }, { "epoch": 0.2745, "grad_norm": 31.75, "grad_norm_var": 2.325455729166667, "learning_rate": 0.0001, "loss": 7.3688, "loss/crossentropy": 2.234239089488983, "loss/hidden": 3.254296875, "loss/jsd": 0.0, "loss/logits": 0.18571207812055945, "step": 10980 }, { "epoch": 0.27475, "grad_norm": 33.25, "grad_norm_var": 5.548958333333333, "learning_rate": 0.0001, "loss": 7.4528, "loss/crossentropy": 2.1355573579669, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.20484505780041218, "step": 10990 }, { "epoch": 0.275, "grad_norm": 26.625, "grad_norm_var": 5.2759765625, "learning_rate": 0.0001, "loss": 7.4261, "loss/crossentropy": 2.2509318992495535, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.19250140003859997, "step": 11000 }, { "epoch": 0.27525, "grad_norm": 31.0, "grad_norm_var": 5.387239583333334, "learning_rate": 0.0001, "loss": 7.3389, "loss/crossentropy": 2.114951176941395, "loss/hidden": 3.3640625, "loss/jsd": 0.0, "loss/logits": 0.198347245156765, "step": 11010 }, { "epoch": 0.2755, "grad_norm": 30.375, "grad_norm_var": 1.5468098958333334, "learning_rate": 0.0001, "loss": 7.3199, "loss/crossentropy": 2.110514160990715, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.1900305664166808, "step": 11020 }, { "epoch": 0.27575, "grad_norm": 27.25, "grad_norm_var": 2.003059895833333, "learning_rate": 0.0001, "loss": 7.3304, "loss/crossentropy": 2.201597589254379, "loss/hidden": 3.393359375, "loss/jsd": 0.0, "loss/logits": 0.20035982932895421, "step": 11030 }, { "epoch": 0.276, "grad_norm": 32.25, "grad_norm_var": 3.0139973958333335, "learning_rate": 0.0001, "loss": 7.4256, "loss/crossentropy": 2.1428385630249975, "loss/hidden": 3.287890625, "loss/jsd": 0.0, "loss/logits": 0.18116684164851904, "step": 11040 }, { "epoch": 0.27625, "grad_norm": 31.25, "grad_norm_var": 2.4671223958333335, "learning_rate": 0.0001, "loss": 7.2733, "loss/crossentropy": 2.043394061923027, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.17936107851564884, "step": 11050 }, { "epoch": 0.2765, "grad_norm": 32.75, "grad_norm_var": 1.3830729166666667, "learning_rate": 0.0001, "loss": 7.4904, "loss/crossentropy": 2.1975908786058427, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.21010352578014135, "step": 11060 }, { "epoch": 0.27675, "grad_norm": 30.0, "grad_norm_var": 4.671875, "learning_rate": 0.0001, "loss": 7.3718, "loss/crossentropy": 2.1417069509625435, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.17585361283272505, "step": 11070 }, { "epoch": 0.277, "grad_norm": 29.375, "grad_norm_var": 4.309375, "learning_rate": 0.0001, "loss": 7.2855, "loss/crossentropy": 2.041763362288475, "loss/hidden": 3.221875, "loss/jsd": 0.0, "loss/logits": 0.17130352668464183, "step": 11080 }, { "epoch": 0.27725, "grad_norm": 28.375, "grad_norm_var": 11.15390625, "learning_rate": 0.0001, "loss": 7.3131, "loss/crossentropy": 2.1522015750408174, "loss/hidden": 3.3375, "loss/jsd": 0.0, "loss/logits": 0.18296321779489516, "step": 11090 }, { "epoch": 0.2775, "grad_norm": 30.375, "grad_norm_var": 11.266080729166667, "learning_rate": 0.0001, "loss": 7.3499, "loss/crossentropy": 2.1265663146972655, "loss/hidden": 3.318359375, "loss/jsd": 0.0, "loss/logits": 0.18307223506271839, "step": 11100 }, { "epoch": 0.27775, "grad_norm": 30.75, "grad_norm_var": 1.6994140625, "learning_rate": 0.0001, "loss": 7.3764, "loss/crossentropy": 2.126886320114136, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.1963003095239401, "step": 11110 }, { "epoch": 0.278, "grad_norm": 29.75, "grad_norm_var": 1.9780598958333333, "learning_rate": 0.0001, "loss": 7.3686, "loss/crossentropy": 2.209390402585268, "loss/hidden": 3.284375, "loss/jsd": 0.0, "loss/logits": 0.18762579131871462, "step": 11120 }, { "epoch": 0.27825, "grad_norm": 29.875, "grad_norm_var": 2.3806640625, "learning_rate": 0.0001, "loss": 7.4086, "loss/crossentropy": 2.0636716064065697, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.18702280572615565, "step": 11130 }, { "epoch": 0.2785, "grad_norm": 30.0, "grad_norm_var": 11.084375, "learning_rate": 0.0001, "loss": 7.3611, "loss/crossentropy": 1.9728175386786462, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.18249935954809188, "step": 11140 }, { "epoch": 0.27875, "grad_norm": 30.75, "grad_norm_var": 13.183333333333334, "learning_rate": 0.0001, "loss": 7.2901, "loss/crossentropy": 1.9563469380140304, "loss/hidden": 3.32421875, "loss/jsd": 0.0, "loss/logits": 0.1720572842285037, "step": 11150 }, { "epoch": 0.279, "grad_norm": 40.75, "grad_norm_var": 11.42890625, "learning_rate": 0.0001, "loss": 7.2633, "loss/crossentropy": 2.1441919445991515, "loss/hidden": 3.30703125, "loss/jsd": 0.0, "loss/logits": 0.1793568328022957, "step": 11160 }, { "epoch": 0.27925, "grad_norm": 31.125, "grad_norm_var": 9.245833333333334, "learning_rate": 0.0001, "loss": 7.3546, "loss/crossentropy": 2.0893258482217787, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.18886512089520693, "step": 11170 }, { "epoch": 0.2795, "grad_norm": 29.25, "grad_norm_var": 15.84140625, "learning_rate": 0.0001, "loss": 7.3265, "loss/crossentropy": 2.028592649102211, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.1916877321898937, "step": 11180 }, { "epoch": 0.27975, "grad_norm": 29.0, "grad_norm_var": 5.451822916666667, "learning_rate": 0.0001, "loss": 7.4362, "loss/crossentropy": 2.1735318168997764, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.18261966463178397, "step": 11190 }, { "epoch": 0.28, "grad_norm": 30.75, "grad_norm_var": 3.405143229166667, "learning_rate": 0.0001, "loss": 7.3648, "loss/crossentropy": 2.129372274875641, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.19846085608005523, "step": 11200 }, { "epoch": 0.28025, "grad_norm": 30.125, "grad_norm_var": 3.4462890625, "learning_rate": 0.0001, "loss": 7.4302, "loss/crossentropy": 2.1702652648091316, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.20027328319847584, "step": 11210 }, { "epoch": 0.2805, "grad_norm": 30.625, "grad_norm_var": 2.58515625, "learning_rate": 0.0001, "loss": 7.3575, "loss/crossentropy": 2.134139196574688, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.1903523735702038, "step": 11220 }, { "epoch": 0.28075, "grad_norm": 32.25, "grad_norm_var": 3.1093098958333334, "learning_rate": 0.0001, "loss": 7.43, "loss/crossentropy": 2.026133489608765, "loss/hidden": 3.45390625, "loss/jsd": 0.0, "loss/logits": 0.201400394923985, "step": 11230 }, { "epoch": 0.281, "grad_norm": 31.25, "grad_norm_var": 12.038997395833333, "learning_rate": 0.0001, "loss": 7.4162, "loss/crossentropy": 2.1442901253700257, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.20070651173591614, "step": 11240 }, { "epoch": 0.28125, "grad_norm": 28.875, "grad_norm_var": 162.22962239583333, "learning_rate": 0.0001, "loss": 7.4058, "loss/crossentropy": 2.1855412632226945, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.1976642705500126, "step": 11250 }, { "epoch": 0.2815, "grad_norm": 31.75, "grad_norm_var": 10.683072916666667, "learning_rate": 0.0001, "loss": 7.3535, "loss/crossentropy": 2.030480499565601, "loss/hidden": 3.40078125, "loss/jsd": 0.0, "loss/logits": 0.20611500162631274, "step": 11260 }, { "epoch": 0.28175, "grad_norm": 36.75, "grad_norm_var": 11.08515625, "learning_rate": 0.0001, "loss": 7.287, "loss/crossentropy": 2.01583767645061, "loss/hidden": 3.312109375, "loss/jsd": 0.0, "loss/logits": 0.18007306316867472, "step": 11270 }, { "epoch": 0.282, "grad_norm": 30.625, "grad_norm_var": 7.605989583333334, "learning_rate": 0.0001, "loss": 7.3012, "loss/crossentropy": 2.2034949243068693, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.18539215214550495, "step": 11280 }, { "epoch": 0.28225, "grad_norm": 29.125, "grad_norm_var": 4.1791015625, "learning_rate": 0.0001, "loss": 7.3566, "loss/crossentropy": 2.0296434491872786, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.18707384672015906, "step": 11290 }, { "epoch": 0.2825, "grad_norm": 30.375, "grad_norm_var": 7.01015625, "learning_rate": 0.0001, "loss": 7.2793, "loss/crossentropy": 2.0983324527740477, "loss/hidden": 3.327734375, "loss/jsd": 0.0, "loss/logits": 0.17865268159657716, "step": 11300 }, { "epoch": 0.28275, "grad_norm": 34.0, "grad_norm_var": 7.029622395833333, "learning_rate": 0.0001, "loss": 7.3716, "loss/crossentropy": 2.127777361869812, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.20420280396938323, "step": 11310 }, { "epoch": 0.283, "grad_norm": 32.5, "grad_norm_var": 38.66712239583333, "learning_rate": 0.0001, "loss": 7.3326, "loss/crossentropy": 2.1566628091037274, "loss/hidden": 3.28828125, "loss/jsd": 0.0, "loss/logits": 0.180232659727335, "step": 11320 }, { "epoch": 0.28325, "grad_norm": 30.5, "grad_norm_var": 44.13743489583333, "learning_rate": 0.0001, "loss": 7.3567, "loss/crossentropy": 2.091618612408638, "loss/hidden": 3.330078125, "loss/jsd": 0.0, "loss/logits": 0.18182430015876888, "step": 11330 }, { "epoch": 0.2835, "grad_norm": 29.625, "grad_norm_var": 4.887434895833334, "learning_rate": 0.0001, "loss": 7.3312, "loss/crossentropy": 2.1611782908439636, "loss/hidden": 3.369140625, "loss/jsd": 0.0, "loss/logits": 0.1828035417944193, "step": 11340 }, { "epoch": 0.28375, "grad_norm": 34.25, "grad_norm_var": 6.423893229166667, "learning_rate": 0.0001, "loss": 7.1759, "loss/crossentropy": 1.9736236594617367, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.18899882938712836, "step": 11350 }, { "epoch": 0.284, "grad_norm": 30.0, "grad_norm_var": 9.488541666666666, "learning_rate": 0.0001, "loss": 7.1771, "loss/crossentropy": 2.0265045419335364, "loss/hidden": 3.273828125, "loss/jsd": 0.0, "loss/logits": 0.1712389207445085, "step": 11360 }, { "epoch": 0.28425, "grad_norm": 33.75, "grad_norm_var": 12.092122395833334, "learning_rate": 0.0001, "loss": 7.361, "loss/crossentropy": 2.2598242223262788, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.19073396287858485, "step": 11370 }, { "epoch": 0.2845, "grad_norm": 29.0, "grad_norm_var": 8.455989583333333, "learning_rate": 0.0001, "loss": 7.3234, "loss/crossentropy": 2.144850969314575, "loss/hidden": 3.321484375, "loss/jsd": 0.0, "loss/logits": 0.18078193911351265, "step": 11380 }, { "epoch": 0.28475, "grad_norm": 29.25, "grad_norm_var": 1.7212890625, "learning_rate": 0.0001, "loss": 7.3, "loss/crossentropy": 2.1646609872579576, "loss/hidden": 3.3140625, "loss/jsd": 0.0, "loss/logits": 0.18855173885822296, "step": 11390 }, { "epoch": 0.285, "grad_norm": 31.875, "grad_norm_var": 2.5837890625, "learning_rate": 0.0001, "loss": 7.2976, "loss/crossentropy": 2.120090515911579, "loss/hidden": 3.308203125, "loss/jsd": 0.0, "loss/logits": 0.17507072482258082, "step": 11400 }, { "epoch": 0.28525, "grad_norm": 29.375, "grad_norm_var": 2.0785807291666667, "learning_rate": 0.0001, "loss": 7.4018, "loss/crossentropy": 2.0800730645656587, "loss/hidden": 3.43046875, "loss/jsd": 0.0, "loss/logits": 0.20872990731149912, "step": 11410 }, { "epoch": 0.2855, "grad_norm": 30.25, "grad_norm_var": 13.87890625, "learning_rate": 0.0001, "loss": 7.1549, "loss/crossentropy": 2.1735212251544, "loss/hidden": 3.25625, "loss/jsd": 0.0, "loss/logits": 0.18148906547576188, "step": 11420 }, { "epoch": 0.28575, "grad_norm": 30.375, "grad_norm_var": 15.132747395833333, "learning_rate": 0.0001, "loss": 7.206, "loss/crossentropy": 1.8314741916954518, "loss/hidden": 3.272265625, "loss/jsd": 0.0, "loss/logits": 0.17273464631289243, "step": 11430 }, { "epoch": 0.286, "grad_norm": 28.875, "grad_norm_var": 2.2330729166666665, "learning_rate": 0.0001, "loss": 7.3502, "loss/crossentropy": 1.9636109337210654, "loss/hidden": 3.329296875, "loss/jsd": 0.0, "loss/logits": 0.16999556813389063, "step": 11440 }, { "epoch": 0.28625, "grad_norm": 30.5, "grad_norm_var": 1.8009765625, "learning_rate": 0.0001, "loss": 7.2893, "loss/crossentropy": 1.9146189287304878, "loss/hidden": 3.425390625, "loss/jsd": 0.0, "loss/logits": 0.1754764079116285, "step": 11450 }, { "epoch": 0.2865, "grad_norm": 31.75, "grad_norm_var": 1.9197265625, "learning_rate": 0.0001, "loss": 7.3233, "loss/crossentropy": 2.0301330149173737, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.1742406915873289, "step": 11460 }, { "epoch": 0.28675, "grad_norm": 34.25, "grad_norm_var": 3.8009765625, "learning_rate": 0.0001, "loss": 7.4231, "loss/crossentropy": 2.1827415734529496, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.18332260251045226, "step": 11470 }, { "epoch": 0.287, "grad_norm": 30.875, "grad_norm_var": 3.479622395833333, "learning_rate": 0.0001, "loss": 7.3842, "loss/crossentropy": 1.9815428338944912, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.17839936017990113, "step": 11480 }, { "epoch": 0.28725, "grad_norm": 30.125, "grad_norm_var": 12.9228515625, "learning_rate": 0.0001, "loss": 7.3888, "loss/crossentropy": 2.10543007850647, "loss/hidden": 3.29765625, "loss/jsd": 0.0, "loss/logits": 0.18028822876513004, "step": 11490 }, { "epoch": 0.2875, "grad_norm": 30.5, "grad_norm_var": 0.9973307291666667, "learning_rate": 0.0001, "loss": 7.2278, "loss/crossentropy": 1.9690312303602695, "loss/hidden": 3.35390625, "loss/jsd": 0.0, "loss/logits": 0.19321996718645096, "step": 11500 }, { "epoch": 0.28775, "grad_norm": 29.5, "grad_norm_var": 7.462239583333333, "learning_rate": 0.0001, "loss": 7.4485, "loss/crossentropy": 2.0997009545564653, "loss/hidden": 3.371875, "loss/jsd": 0.0, "loss/logits": 0.1947010463103652, "step": 11510 }, { "epoch": 0.288, "grad_norm": 30.625, "grad_norm_var": 6.814583333333333, "learning_rate": 0.0001, "loss": 7.4014, "loss/crossentropy": 2.041436542570591, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.17614111984148623, "step": 11520 }, { "epoch": 0.28825, "grad_norm": 29.625, "grad_norm_var": 1.7247395833333334, "learning_rate": 0.0001, "loss": 7.3231, "loss/crossentropy": 2.056125995516777, "loss/hidden": 3.475, "loss/jsd": 0.0, "loss/logits": 0.20333750136196613, "step": 11530 }, { "epoch": 0.2885, "grad_norm": 30.5, "grad_norm_var": 3.0014973958333333, "learning_rate": 0.0001, "loss": 7.2735, "loss/crossentropy": 1.9806349158287049, "loss/hidden": 3.306640625, "loss/jsd": 0.0, "loss/logits": 0.17432715664617718, "step": 11540 }, { "epoch": 0.28875, "grad_norm": 32.75, "grad_norm_var": 1.6973307291666666, "learning_rate": 0.0001, "loss": 7.3842, "loss/crossentropy": 2.0468244731426237, "loss/hidden": 3.506640625, "loss/jsd": 0.0, "loss/logits": 0.19939990881830455, "step": 11550 }, { "epoch": 0.289, "grad_norm": 31.625, "grad_norm_var": 3.546875, "learning_rate": 0.0001, "loss": 7.4212, "loss/crossentropy": 2.1170118719339373, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.1938545262441039, "step": 11560 }, { "epoch": 0.28925, "grad_norm": 29.0, "grad_norm_var": 6.987239583333333, "learning_rate": 0.0001, "loss": 7.3068, "loss/crossentropy": 1.9378852248191833, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.18525508288294076, "step": 11570 }, { "epoch": 0.2895, "grad_norm": 32.25, "grad_norm_var": 7.7650390625, "learning_rate": 0.0001, "loss": 7.3939, "loss/crossentropy": 2.094913274049759, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.19857972972095012, "step": 11580 }, { "epoch": 0.28975, "grad_norm": 31.0, "grad_norm_var": 2.3934895833333334, "learning_rate": 0.0001, "loss": 7.3634, "loss/crossentropy": 2.112175312638283, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.1923027027398348, "step": 11590 }, { "epoch": 0.29, "grad_norm": 29.75, "grad_norm_var": 2.8676432291666667, "learning_rate": 0.0001, "loss": 7.4788, "loss/crossentropy": 2.347763030230999, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.20385736282914876, "step": 11600 }, { "epoch": 0.29025, "grad_norm": 36.0, "grad_norm_var": 2.1060661210680481e+18, "learning_rate": 0.0001, "loss": 7.482, "loss/crossentropy": 2.1438632771372794, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.1947392811998725, "step": 11610 }, { "epoch": 0.2905, "grad_norm": 31.5, "grad_norm_var": 2.1060661214006216e+18, "learning_rate": 0.0001, "loss": 7.2853, "loss/crossentropy": 2.080791215598583, "loss/hidden": 3.2875, "loss/jsd": 0.0, "loss/logits": 0.19497475158423186, "step": 11620 }, { "epoch": 0.29075, "grad_norm": 30.0, "grad_norm_var": 1.5931640625, "learning_rate": 0.0001, "loss": 7.2971, "loss/crossentropy": 2.16863374710083, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.18860027231276036, "step": 11630 }, { "epoch": 0.291, "grad_norm": 31.625, "grad_norm_var": 1.5994140625, "learning_rate": 0.0001, "loss": 7.3835, "loss/crossentropy": 2.061967647075653, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.18620339650660753, "step": 11640 }, { "epoch": 0.29125, "grad_norm": 33.0, "grad_norm_var": 3.690207280948122e+18, "learning_rate": 0.0001, "loss": 7.3738, "loss/crossentropy": 2.0998649850487707, "loss/hidden": 3.279296875, "loss/jsd": 0.0, "loss/logits": 0.18567133340984582, "step": 11650 }, { "epoch": 0.2915, "grad_norm": 29.0, "grad_norm_var": 29.6228515625, "learning_rate": 0.0001, "loss": 7.2405, "loss/crossentropy": 2.1259921073913572, "loss/hidden": 3.326953125, "loss/jsd": 0.0, "loss/logits": 0.18609032463282346, "step": 11660 }, { "epoch": 0.29175, "grad_norm": 31.0, "grad_norm_var": 14.421809895833333, "learning_rate": 0.0001, "loss": 7.4326, "loss/crossentropy": 2.066218351200223, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.1920524787157774, "step": 11670 }, { "epoch": 0.292, "grad_norm": 27.875, "grad_norm_var": 49.305989583333336, "learning_rate": 0.0001, "loss": 7.2881, "loss/crossentropy": 2.07202163413167, "loss/hidden": 3.35, "loss/jsd": 0.0, "loss/logits": 0.18048244724050164, "step": 11680 }, { "epoch": 0.29225, "grad_norm": 31.875, "grad_norm_var": 4.995247395833333, "learning_rate": 0.0001, "loss": 7.2687, "loss/crossentropy": 2.045185898244381, "loss/hidden": 3.275, "loss/jsd": 0.0, "loss/logits": 0.17395208198577167, "step": 11690 }, { "epoch": 0.2925, "grad_norm": 30.0, "grad_norm_var": 5.696875, "learning_rate": 0.0001, "loss": 7.3259, "loss/crossentropy": 2.3408665537834166, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.20136666856706142, "step": 11700 }, { "epoch": 0.29275, "grad_norm": 30.0, "grad_norm_var": 3.7421223958333334, "learning_rate": 0.0001, "loss": 7.2137, "loss/crossentropy": 1.9074330985546113, "loss/hidden": 3.349609375, "loss/jsd": 0.0, "loss/logits": 0.1704694928601384, "step": 11710 }, { "epoch": 0.293, "grad_norm": 31.5, "grad_norm_var": 5.803125, "learning_rate": 0.0001, "loss": 7.2817, "loss/crossentropy": 2.290114316344261, "loss/hidden": 3.36484375, "loss/jsd": 0.0, "loss/logits": 0.20061923637986184, "step": 11720 }, { "epoch": 0.29325, "grad_norm": 31.125, "grad_norm_var": 2.4936848958333333, "learning_rate": 0.0001, "loss": 7.4136, "loss/crossentropy": 2.2652835667133333, "loss/hidden": 3.29375, "loss/jsd": 0.0, "loss/logits": 0.18269798178225755, "step": 11730 }, { "epoch": 0.2935, "grad_norm": 31.875, "grad_norm_var": 2.06640625, "learning_rate": 0.0001, "loss": 7.3219, "loss/crossentropy": 2.047122722864151, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.18665554728358985, "step": 11740 }, { "epoch": 0.29375, "grad_norm": 29.875, "grad_norm_var": 11.484309895833333, "learning_rate": 0.0001, "loss": 7.4668, "loss/crossentropy": 2.0893666088581084, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.18700389545410873, "step": 11750 }, { "epoch": 0.294, "grad_norm": 33.25, "grad_norm_var": 2.412239583333333, "learning_rate": 0.0001, "loss": 7.3389, "loss/crossentropy": 2.0646077781915664, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.1896628426387906, "step": 11760 }, { "epoch": 0.29425, "grad_norm": 36.75, "grad_norm_var": 15.843489583333334, "learning_rate": 0.0001, "loss": 7.3537, "loss/crossentropy": 2.14951953291893, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.19772466979920864, "step": 11770 }, { "epoch": 0.2945, "grad_norm": 34.0, "grad_norm_var": 15.808333333333334, "learning_rate": 0.0001, "loss": 7.2266, "loss/crossentropy": 1.9084580048918725, "loss/hidden": 3.31953125, "loss/jsd": 0.0, "loss/logits": 0.17457483559846879, "step": 11780 }, { "epoch": 0.29475, "grad_norm": 30.5, "grad_norm_var": 17.133268229166667, "learning_rate": 0.0001, "loss": 7.4198, "loss/crossentropy": 2.1059816129505635, "loss/hidden": 3.2609375, "loss/jsd": 0.0, "loss/logits": 0.19060559011995792, "step": 11790 }, { "epoch": 0.295, "grad_norm": 28.875, "grad_norm_var": 15.191666666666666, "learning_rate": 0.0001, "loss": 7.3353, "loss/crossentropy": 2.0854270696640014, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.18200992513448, "step": 11800 }, { "epoch": 0.29525, "grad_norm": 33.0, "grad_norm_var": 5.77890625, "learning_rate": 0.0001, "loss": 7.338, "loss/crossentropy": 2.0409404814243315, "loss/hidden": 3.3640625, "loss/jsd": 0.0, "loss/logits": 0.18492084443569184, "step": 11810 }, { "epoch": 0.2955, "grad_norm": 28.25, "grad_norm_var": 4.9556640625, "learning_rate": 0.0001, "loss": 7.2394, "loss/crossentropy": 2.1242421194911003, "loss/hidden": 3.318359375, "loss/jsd": 0.0, "loss/logits": 0.17592983674257995, "step": 11820 }, { "epoch": 0.29575, "grad_norm": 32.75, "grad_norm_var": 12.3994140625, "learning_rate": 0.0001, "loss": 7.3172, "loss/crossentropy": 2.135970714688301, "loss/hidden": 3.328125, "loss/jsd": 0.0, "loss/logits": 0.1905512258410454, "step": 11830 }, { "epoch": 0.296, "grad_norm": 31.125, "grad_norm_var": 13.479166666666666, "learning_rate": 0.0001, "loss": 7.3928, "loss/crossentropy": 2.0788447827100756, "loss/hidden": 3.35859375, "loss/jsd": 0.0, "loss/logits": 0.18583104945719242, "step": 11840 }, { "epoch": 0.29625, "grad_norm": 29.375, "grad_norm_var": 5.16640625, "learning_rate": 0.0001, "loss": 7.2575, "loss/crossentropy": 2.0609687596559523, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.1885878125205636, "step": 11850 }, { "epoch": 0.2965, "grad_norm": 33.5, "grad_norm_var": 4.534309895833333, "learning_rate": 0.0001, "loss": 7.2397, "loss/crossentropy": 2.113059785962105, "loss/hidden": 3.347265625, "loss/jsd": 0.0, "loss/logits": 0.19279801957309245, "step": 11860 }, { "epoch": 0.29675, "grad_norm": 28.0, "grad_norm_var": 2.833268229166667, "learning_rate": 0.0001, "loss": 7.4813, "loss/crossentropy": 2.2016214139759542, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.19246475584805012, "step": 11870 }, { "epoch": 0.297, "grad_norm": 35.5, "grad_norm_var": 5.829622395833334, "learning_rate": 0.0001, "loss": 7.4107, "loss/crossentropy": 2.1937885224819182, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.20546972285956144, "step": 11880 }, { "epoch": 0.29725, "grad_norm": 32.25, "grad_norm_var": 5.101041666666666, "learning_rate": 0.0001, "loss": 7.4133, "loss/crossentropy": 2.112873890995979, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.19512166157364846, "step": 11890 }, { "epoch": 0.2975, "grad_norm": 29.0, "grad_norm_var": 4.71875, "learning_rate": 0.0001, "loss": 7.3659, "loss/crossentropy": 2.068368895351887, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.22229769406840205, "step": 11900 }, { "epoch": 0.29775, "grad_norm": 28.25, "grad_norm_var": 30.575, "learning_rate": 0.0001, "loss": 7.3459, "loss/crossentropy": 1.9633590430021286, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.1919392876327038, "step": 11910 }, { "epoch": 0.298, "grad_norm": 31.5, "grad_norm_var": 25.169205729166666, "learning_rate": 0.0001, "loss": 7.2575, "loss/crossentropy": 2.0932082675397394, "loss/hidden": 3.3484375, "loss/jsd": 0.0, "loss/logits": 0.18578519038856028, "step": 11920 }, { "epoch": 0.29825, "grad_norm": 29.375, "grad_norm_var": 1.9197916666666666, "learning_rate": 0.0001, "loss": 7.3554, "loss/crossentropy": 2.0443739868700503, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.18646586760878564, "step": 11930 }, { "epoch": 0.2985, "grad_norm": 29.125, "grad_norm_var": 1.1629557291666666, "learning_rate": 0.0001, "loss": 7.3341, "loss/crossentropy": 2.0645432278513907, "loss/hidden": 3.369921875, "loss/jsd": 0.0, "loss/logits": 0.20268544293940066, "step": 11940 }, { "epoch": 0.29875, "grad_norm": 29.25, "grad_norm_var": 4.085416666666666, "learning_rate": 0.0001, "loss": 7.2596, "loss/crossentropy": 1.8821386724710465, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.17970443107187747, "step": 11950 }, { "epoch": 0.299, "grad_norm": 29.75, "grad_norm_var": 2.4072265625, "learning_rate": 0.0001, "loss": 7.3139, "loss/crossentropy": 2.1848059430718423, "loss/hidden": 3.311328125, "loss/jsd": 0.0, "loss/logits": 0.19258468672633172, "step": 11960 }, { "epoch": 0.29925, "grad_norm": 28.625, "grad_norm_var": 1.49140625, "learning_rate": 0.0001, "loss": 7.3064, "loss/crossentropy": 1.9953081101179122, "loss/hidden": 3.341015625, "loss/jsd": 0.0, "loss/logits": 0.17766128927469255, "step": 11970 }, { "epoch": 0.2995, "grad_norm": 30.625, "grad_norm_var": 3.527083333333333, "learning_rate": 0.0001, "loss": 7.3213, "loss/crossentropy": 1.9512401312589644, "loss/hidden": 3.257421875, "loss/jsd": 0.0, "loss/logits": 0.16765224877744914, "step": 11980 }, { "epoch": 0.29975, "grad_norm": 31.875, "grad_norm_var": 2.8322265625, "learning_rate": 0.0001, "loss": 7.3373, "loss/crossentropy": 2.134880567342043, "loss/hidden": 3.353515625, "loss/jsd": 0.0, "loss/logits": 0.19845206197351217, "step": 11990 }, { "epoch": 0.3, "grad_norm": 29.0, "grad_norm_var": 1.88515625, "learning_rate": 0.0001, "loss": 7.3355, "loss/crossentropy": 2.084690621495247, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.18881614562124013, "step": 12000 }, { "epoch": 0.30025, "grad_norm": 29.375, "grad_norm_var": 1.678125, "learning_rate": 0.0001, "loss": 7.2797, "loss/crossentropy": 2.0142040476202965, "loss/hidden": 3.277734375, "loss/jsd": 0.0, "loss/logits": 0.17630351725965737, "step": 12010 }, { "epoch": 0.3005, "grad_norm": 30.125, "grad_norm_var": 1.0702473958333334, "learning_rate": 0.0001, "loss": 7.3524, "loss/crossentropy": 2.0099969252943994, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.17516886200755835, "step": 12020 }, { "epoch": 0.30075, "grad_norm": 30.125, "grad_norm_var": 2.1509765625, "learning_rate": 0.0001, "loss": 7.4081, "loss/crossentropy": 2.0550965458154677, "loss/hidden": 3.33359375, "loss/jsd": 0.0, "loss/logits": 0.18450612868182362, "step": 12030 }, { "epoch": 0.301, "grad_norm": 29.375, "grad_norm_var": 2.088997395833333, "learning_rate": 0.0001, "loss": 7.3732, "loss/crossentropy": 2.015381334722042, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.19866329655051232, "step": 12040 }, { "epoch": 0.30125, "grad_norm": 32.5, "grad_norm_var": 28.125455729166667, "learning_rate": 0.0001, "loss": 7.3689, "loss/crossentropy": 2.0619105845689774, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.19213079251348972, "step": 12050 }, { "epoch": 0.3015, "grad_norm": 29.5, "grad_norm_var": 2.5869140625, "learning_rate": 0.0001, "loss": 7.3533, "loss/crossentropy": 2.059162912517786, "loss/hidden": 3.3625, "loss/jsd": 0.0, "loss/logits": 0.1831585830077529, "step": 12060 }, { "epoch": 0.30175, "grad_norm": 30.375, "grad_norm_var": 6.6150390625, "learning_rate": 0.0001, "loss": 7.339, "loss/crossentropy": 2.0344214349985124, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.18243852034211158, "step": 12070 }, { "epoch": 0.302, "grad_norm": 31.375, "grad_norm_var": 10.05390625, "learning_rate": 0.0001, "loss": 7.3457, "loss/crossentropy": 2.222717672586441, "loss/hidden": 3.28671875, "loss/jsd": 0.0, "loss/logits": 0.19160165078938007, "step": 12080 }, { "epoch": 0.30225, "grad_norm": 30.375, "grad_norm_var": 8.060416666666667, "learning_rate": 0.0001, "loss": 7.2928, "loss/crossentropy": 1.9902059100568295, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.1817794761620462, "step": 12090 }, { "epoch": 0.3025, "grad_norm": 29.0, "grad_norm_var": 4.048958333333333, "learning_rate": 0.0001, "loss": 7.403, "loss/crossentropy": 1.9990896947681904, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.17872856128960848, "step": 12100 }, { "epoch": 0.30275, "grad_norm": 32.0, "grad_norm_var": 14.486393229166667, "learning_rate": 0.0001, "loss": 7.3774, "loss/crossentropy": 2.178261125087738, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.19976076446473598, "step": 12110 }, { "epoch": 0.303, "grad_norm": 29.0, "grad_norm_var": 25.053059895833332, "learning_rate": 0.0001, "loss": 7.3852, "loss/crossentropy": 2.1738795921206475, "loss/hidden": 3.21171875, "loss/jsd": 0.0, "loss/logits": 0.1802441133186221, "step": 12120 }, { "epoch": 0.30325, "grad_norm": 32.5, "grad_norm_var": 27.863541666666666, "learning_rate": 0.0001, "loss": 7.3119, "loss/crossentropy": 2.017075891792774, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.1819239752367139, "step": 12130 }, { "epoch": 0.3035, "grad_norm": 5704253440.0, "grad_norm_var": 2.033656683216328e+18, "learning_rate": 0.0001, "loss": 7.3187, "loss/crossentropy": 2.1378798320889474, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.18621215373277664, "step": 12140 }, { "epoch": 0.30375, "grad_norm": 28.5, "grad_norm_var": 2.0336566834599473e+18, "learning_rate": 0.0001, "loss": 7.3375, "loss/crossentropy": 2.0977103441953657, "loss/hidden": 3.33046875, "loss/jsd": 0.0, "loss/logits": 0.18937066961079835, "step": 12150 }, { "epoch": 0.304, "grad_norm": 30.0, "grad_norm_var": 25.7119140625, "learning_rate": 0.0001, "loss": 7.2813, "loss/crossentropy": 2.0422945946455, "loss/hidden": 3.29453125, "loss/jsd": 0.0, "loss/logits": 0.17747708465903997, "step": 12160 }, { "epoch": 0.30425, "grad_norm": 29.375, "grad_norm_var": 1.8973307291666666, "learning_rate": 0.0001, "loss": 7.2982, "loss/crossentropy": 2.1438152998685838, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.19225314557552337, "step": 12170 }, { "epoch": 0.3045, "grad_norm": 32.5, "grad_norm_var": 6.688997395833334, "learning_rate": 0.0001, "loss": 7.2886, "loss/crossentropy": 2.033204630762339, "loss/hidden": 3.326953125, "loss/jsd": 0.0, "loss/logits": 0.1741345826536417, "step": 12180 }, { "epoch": 0.30475, "grad_norm": 28.5, "grad_norm_var": 7.262434895833334, "learning_rate": 0.0001, "loss": 7.3812, "loss/crossentropy": 1.9149722829461098, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.16993715493008493, "step": 12190 }, { "epoch": 0.305, "grad_norm": 30.625, "grad_norm_var": 2.315625, "learning_rate": 0.0001, "loss": 7.355, "loss/crossentropy": 2.200256870687008, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.1957419654354453, "step": 12200 }, { "epoch": 0.30525, "grad_norm": 32.25, "grad_norm_var": 1.8291666666666666, "learning_rate": 0.0001, "loss": 7.2689, "loss/crossentropy": 1.9441036701202392, "loss/hidden": 3.450390625, "loss/jsd": 0.0, "loss/logits": 0.19158205185085536, "step": 12210 }, { "epoch": 0.3055, "grad_norm": 32.5, "grad_norm_var": 224.99993489583332, "learning_rate": 0.0001, "loss": 7.4205, "loss/crossentropy": 2.21477717012167, "loss/hidden": 3.353515625, "loss/jsd": 0.0, "loss/logits": 0.19972570687532426, "step": 12220 }, { "epoch": 0.30575, "grad_norm": 77.0, "grad_norm_var": 161.4994140625, "learning_rate": 0.0001, "loss": 7.3221, "loss/crossentropy": 2.243621972203255, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.20230026841163634, "step": 12230 }, { "epoch": 0.306, "grad_norm": 29.125, "grad_norm_var": 136.2509765625, "learning_rate": 0.0001, "loss": 7.2544, "loss/crossentropy": 2.1267666652798654, "loss/hidden": 3.30078125, "loss/jsd": 0.0, "loss/logits": 0.18361538834869862, "step": 12240 }, { "epoch": 0.30625, "grad_norm": 43.0, "grad_norm_var": 15.1384765625, "learning_rate": 0.0001, "loss": 7.3457, "loss/crossentropy": 2.097319718450308, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.1941752176731825, "step": 12250 }, { "epoch": 0.3065, "grad_norm": 27.125, "grad_norm_var": 45.91875, "learning_rate": 0.0001, "loss": 7.1913, "loss/crossentropy": 2.024158325791359, "loss/hidden": 3.301171875, "loss/jsd": 0.0, "loss/logits": 0.18036661259829997, "step": 12260 }, { "epoch": 0.30675, "grad_norm": 32.5, "grad_norm_var": 34.94765625, "learning_rate": 0.0001, "loss": 7.2604, "loss/crossentropy": 2.05818811878562, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.19272202141582967, "step": 12270 }, { "epoch": 0.307, "grad_norm": 31.625, "grad_norm_var": 1.7958333333333334, "learning_rate": 0.0001, "loss": 7.3478, "loss/crossentropy": 2.058871729671955, "loss/hidden": 3.316015625, "loss/jsd": 0.0, "loss/logits": 0.1805756026878953, "step": 12280 }, { "epoch": 0.30725, "grad_norm": 37.0, "grad_norm_var": 4.418489583333334, "learning_rate": 0.0001, "loss": 7.3202, "loss/crossentropy": 1.9400940239429474, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.19434008933603764, "step": 12290 }, { "epoch": 0.3075, "grad_norm": 30.5, "grad_norm_var": 30.1572265625, "learning_rate": 0.0001, "loss": 7.2939, "loss/crossentropy": 2.069047340750694, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.20558738969266416, "step": 12300 }, { "epoch": 0.30775, "grad_norm": 33.0, "grad_norm_var": 11.9666015625, "learning_rate": 0.0001, "loss": 7.4109, "loss/crossentropy": 2.227694994211197, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.18980047646909953, "step": 12310 }, { "epoch": 0.308, "grad_norm": 29.625, "grad_norm_var": 4.90390625, "learning_rate": 0.0001, "loss": 7.3686, "loss/crossentropy": 2.0309864141047003, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.1816682495176792, "step": 12320 }, { "epoch": 0.30825, "grad_norm": 30.125, "grad_norm_var": 1.7249348958333333, "learning_rate": 0.0001, "loss": 7.4105, "loss/crossentropy": 2.2505668699741364, "loss/hidden": 3.307421875, "loss/jsd": 0.0, "loss/logits": 0.19684594608843325, "step": 12330 }, { "epoch": 0.3085, "grad_norm": 30.75, "grad_norm_var": 1.9535807291666667, "learning_rate": 0.0001, "loss": 7.3679, "loss/crossentropy": 2.0422501020133494, "loss/hidden": 3.223828125, "loss/jsd": 0.0, "loss/logits": 0.1723391016945243, "step": 12340 }, { "epoch": 0.30875, "grad_norm": 30.25, "grad_norm_var": 1.9186848958333333, "learning_rate": 0.0001, "loss": 7.3186, "loss/crossentropy": 2.190096604824066, "loss/hidden": 3.299609375, "loss/jsd": 0.0, "loss/logits": 0.1794669708237052, "step": 12350 }, { "epoch": 0.309, "grad_norm": 30.0, "grad_norm_var": 1.7926432291666667, "learning_rate": 0.0001, "loss": 7.2068, "loss/crossentropy": 2.011892383173108, "loss/hidden": 3.3046875, "loss/jsd": 0.0, "loss/logits": 0.17626657225191594, "step": 12360 }, { "epoch": 0.30925, "grad_norm": 32.25, "grad_norm_var": 3.3900390625, "learning_rate": 0.0001, "loss": 7.3963, "loss/crossentropy": 2.1367972552776338, "loss/hidden": 3.356640625, "loss/jsd": 0.0, "loss/logits": 0.19975323602557182, "step": 12370 }, { "epoch": 0.3095, "grad_norm": 30.875, "grad_norm_var": 2.9785807291666666, "learning_rate": 0.0001, "loss": 7.3586, "loss/crossentropy": 2.1065737903118134, "loss/hidden": 3.3296875, "loss/jsd": 0.0, "loss/logits": 0.1861647253856063, "step": 12380 }, { "epoch": 0.30975, "grad_norm": 34.0, "grad_norm_var": 49.94583333333333, "learning_rate": 0.0001, "loss": 7.4425, "loss/crossentropy": 2.047655486315489, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.19645441174507142, "step": 12390 }, { "epoch": 0.31, "grad_norm": 30.75, "grad_norm_var": 55.521809895833336, "learning_rate": 0.0001, "loss": 7.3384, "loss/crossentropy": 2.1421022772789002, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.21594423688948156, "step": 12400 }, { "epoch": 0.31025, "grad_norm": 28.625, "grad_norm_var": 3.630989583333333, "learning_rate": 0.0001, "loss": 7.4915, "loss/crossentropy": 1.9852606005966664, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.1970728723332286, "step": 12410 }, { "epoch": 0.3105, "grad_norm": 30.375, "grad_norm_var": 13.821875, "learning_rate": 0.0001, "loss": 7.316, "loss/crossentropy": 2.0221208080649378, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.18263351675122977, "step": 12420 }, { "epoch": 0.31075, "grad_norm": 27.75, "grad_norm_var": 3.7905598958333333, "learning_rate": 0.0001, "loss": 7.4569, "loss/crossentropy": 2.2214868292212486, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.20803299229592084, "step": 12430 }, { "epoch": 0.311, "grad_norm": 28.0, "grad_norm_var": 5.039322916666666, "learning_rate": 0.0001, "loss": 7.2246, "loss/crossentropy": 1.968916893005371, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.1823066620156169, "step": 12440 }, { "epoch": 0.31125, "grad_norm": 32.25, "grad_norm_var": 3.011458333333333, "learning_rate": 0.0001, "loss": 7.2986, "loss/crossentropy": 2.102560856938362, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.19421595316380263, "step": 12450 }, { "epoch": 0.3115, "grad_norm": 31.625, "grad_norm_var": 1.7122395833333333, "learning_rate": 0.0001, "loss": 7.4102, "loss/crossentropy": 1.8904587842524052, "loss/hidden": 3.33671875, "loss/jsd": 0.0, "loss/logits": 0.16765992902219296, "step": 12460 }, { "epoch": 0.31175, "grad_norm": 32.75, "grad_norm_var": 1.3614583333333334, "learning_rate": 0.0001, "loss": 7.317, "loss/crossentropy": 2.2456528916954994, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.18445245549082756, "step": 12470 }, { "epoch": 0.312, "grad_norm": 32.5, "grad_norm_var": 2.9613932291666667, "learning_rate": 0.0001, "loss": 7.2527, "loss/crossentropy": 1.9254849448800087, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.17210696106776596, "step": 12480 }, { "epoch": 0.31225, "grad_norm": 29.0, "grad_norm_var": 2.503059895833333, "learning_rate": 0.0001, "loss": 7.3589, "loss/crossentropy": 1.963121373206377, "loss/hidden": 3.3421875, "loss/jsd": 0.0, "loss/logits": 0.17580699175596237, "step": 12490 }, { "epoch": 0.3125, "grad_norm": 27.375, "grad_norm_var": 2.6510416666666665, "learning_rate": 0.0001, "loss": 7.3302, "loss/crossentropy": 2.0668979212641716, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.1756731817498803, "step": 12500 }, { "epoch": 0.31275, "grad_norm": 32.5, "grad_norm_var": 9.658072916666667, "learning_rate": 0.0001, "loss": 7.396, "loss/crossentropy": 2.0963846892118454, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.19187606330960988, "step": 12510 }, { "epoch": 0.313, "grad_norm": 28.125, "grad_norm_var": 8.387239583333333, "learning_rate": 0.0001, "loss": 7.291, "loss/crossentropy": 2.1238081738352776, "loss/hidden": 3.319921875, "loss/jsd": 0.0, "loss/logits": 0.18899311302229763, "step": 12520 }, { "epoch": 0.31325, "grad_norm": 30.625, "grad_norm_var": 2.2712890625, "learning_rate": 0.0001, "loss": 7.2927, "loss/crossentropy": 1.8629853092133999, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.18793845577165486, "step": 12530 }, { "epoch": 0.3135, "grad_norm": 31.25, "grad_norm_var": 29.926041666666666, "learning_rate": 0.0001, "loss": 7.334, "loss/crossentropy": 2.1922558069229128, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.18994415253400804, "step": 12540 }, { "epoch": 0.31375, "grad_norm": 33.0, "grad_norm_var": 19.626822916666665, "learning_rate": 0.0001, "loss": 7.4246, "loss/crossentropy": 2.0747738771140574, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.1956952316686511, "step": 12550 }, { "epoch": 0.314, "grad_norm": 31.0, "grad_norm_var": 5.9822265625, "learning_rate": 0.0001, "loss": 7.3921, "loss/crossentropy": 2.168559101223946, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.19871533028781413, "step": 12560 }, { "epoch": 0.31425, "grad_norm": 29.875, "grad_norm_var": 23.478125, "learning_rate": 0.0001, "loss": 7.381, "loss/crossentropy": 2.1687012270092962, "loss/hidden": 3.2828125, "loss/jsd": 0.0, "loss/logits": 0.17819978184998037, "step": 12570 }, { "epoch": 0.3145, "grad_norm": 30.5, "grad_norm_var": 24.369791666666668, "learning_rate": 0.0001, "loss": 7.2273, "loss/crossentropy": 2.0223761543631555, "loss/hidden": 3.2953125, "loss/jsd": 0.0, "loss/logits": 0.17995049208402633, "step": 12580 }, { "epoch": 0.31475, "grad_norm": 31.125, "grad_norm_var": 3.8067057291666666, "learning_rate": 0.0001, "loss": 7.3672, "loss/crossentropy": 2.056264813989401, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.19224987253546716, "step": 12590 }, { "epoch": 0.315, "grad_norm": 30.125, "grad_norm_var": 3.3018229166666666, "learning_rate": 0.0001, "loss": 7.2404, "loss/crossentropy": 2.0127123326063154, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.17083216030150652, "step": 12600 }, { "epoch": 0.31525, "grad_norm": 31.25, "grad_norm_var": 2.94375, "learning_rate": 0.0001, "loss": 7.3918, "loss/crossentropy": 2.0961378000676634, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.18820952698588372, "step": 12610 }, { "epoch": 0.3155, "grad_norm": 30.75, "grad_norm_var": 3.910416666666667, "learning_rate": 0.0001, "loss": 7.2715, "loss/crossentropy": 2.1737264052033423, "loss/hidden": 3.33359375, "loss/jsd": 0.0, "loss/logits": 0.1752359176054597, "step": 12620 }, { "epoch": 0.31575, "grad_norm": 28.5, "grad_norm_var": 12.914322916666666, "learning_rate": 0.0001, "loss": 7.2866, "loss/crossentropy": 2.04836600497365, "loss/hidden": 3.308984375, "loss/jsd": 0.0, "loss/logits": 0.1814719710499048, "step": 12630 }, { "epoch": 0.316, "grad_norm": 32.5, "grad_norm_var": 5.895572916666667, "learning_rate": 0.0001, "loss": 7.4359, "loss/crossentropy": 2.0633833378553392, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.1844972724094987, "step": 12640 }, { "epoch": 0.31625, "grad_norm": 29.5, "grad_norm_var": 4.556705729166667, "learning_rate": 0.0001, "loss": 7.3404, "loss/crossentropy": 2.1031613536179066, "loss/hidden": 3.38671875, "loss/jsd": 0.0, "loss/logits": 0.1890261113177985, "step": 12650 }, { "epoch": 0.3165, "grad_norm": 33.25, "grad_norm_var": 7.43515625, "learning_rate": 0.0001, "loss": 7.3116, "loss/crossentropy": 2.0913070663809776, "loss/hidden": 3.48515625, "loss/jsd": 0.0, "loss/logits": 0.19625448603183032, "step": 12660 }, { "epoch": 0.31675, "grad_norm": 30.0, "grad_norm_var": 24.72265625, "learning_rate": 0.0001, "loss": 7.309, "loss/crossentropy": 2.1766112834215163, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.19453570377081633, "step": 12670 }, { "epoch": 0.317, "grad_norm": 28.0, "grad_norm_var": 3.5518229166666666, "learning_rate": 0.0001, "loss": 7.2193, "loss/crossentropy": 2.1292213678359984, "loss/hidden": 3.312890625, "loss/jsd": 0.0, "loss/logits": 0.17923556556925177, "step": 12680 }, { "epoch": 0.31725, "grad_norm": 33.75, "grad_norm_var": 3.2041666666666666, "learning_rate": 0.0001, "loss": 7.3568, "loss/crossentropy": 2.0966979548335076, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.1796438904479146, "step": 12690 }, { "epoch": 0.3175, "grad_norm": 30.125, "grad_norm_var": 21.9853515625, "learning_rate": 0.0001, "loss": 7.3281, "loss/crossentropy": 2.2050976656377315, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.18696724623441696, "step": 12700 }, { "epoch": 0.31775, "grad_norm": 31.25, "grad_norm_var": 20.936458333333334, "learning_rate": 0.0001, "loss": 7.2583, "loss/crossentropy": 2.0703148849308493, "loss/hidden": 3.267578125, "loss/jsd": 0.0, "loss/logits": 0.17689838781952857, "step": 12710 }, { "epoch": 0.318, "grad_norm": 31.75, "grad_norm_var": 1.9395833333333334, "learning_rate": 0.0001, "loss": 7.3007, "loss/crossentropy": 1.9791635520756246, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.17443582694977522, "step": 12720 }, { "epoch": 0.31825, "grad_norm": 30.25, "grad_norm_var": 2.5155598958333334, "learning_rate": 0.0001, "loss": 7.2557, "loss/crossentropy": 2.0753975957632065, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.18456004988402128, "step": 12730 }, { "epoch": 0.3185, "grad_norm": 29.125, "grad_norm_var": 1.3518229166666667, "learning_rate": 0.0001, "loss": 7.2658, "loss/crossentropy": 2.0314756602048876, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.18754468094557525, "step": 12740 }, { "epoch": 0.31875, "grad_norm": 29.25, "grad_norm_var": 1.6634765625, "learning_rate": 0.0001, "loss": 7.2545, "loss/crossentropy": 2.027373602986336, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.18138464018702508, "step": 12750 }, { "epoch": 0.319, "grad_norm": 33.0, "grad_norm_var": 2.2837890625, "learning_rate": 0.0001, "loss": 7.2711, "loss/crossentropy": 2.0996690608561037, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.19384829625487326, "step": 12760 }, { "epoch": 0.31925, "grad_norm": 29.5, "grad_norm_var": 1.7580729166666667, "learning_rate": 0.0001, "loss": 7.3448, "loss/crossentropy": 2.081255576014519, "loss/hidden": 3.48515625, "loss/jsd": 0.0, "loss/logits": 0.2054756512865424, "step": 12770 }, { "epoch": 0.3195, "grad_norm": 30.375, "grad_norm_var": 1.6337890625, "learning_rate": 0.0001, "loss": 7.3614, "loss/crossentropy": 2.109497997164726, "loss/hidden": 3.2984375, "loss/jsd": 0.0, "loss/logits": 0.1826931856572628, "step": 12780 }, { "epoch": 0.31975, "grad_norm": 29.125, "grad_norm_var": 2.9125, "learning_rate": 0.0001, "loss": 7.3569, "loss/crossentropy": 2.071355660259724, "loss/hidden": 3.390625, "loss/jsd": 0.0, "loss/logits": 0.18764556515961886, "step": 12790 }, { "epoch": 0.32, "grad_norm": 31.0, "grad_norm_var": 5.68125, "learning_rate": 0.0001, "loss": 7.3399, "loss/crossentropy": 1.9978457435965538, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.17209083335474135, "step": 12800 }, { "epoch": 0.32025, "grad_norm": 28.75, "grad_norm_var": 20.754166666666666, "learning_rate": 0.0001, "loss": 7.3016, "loss/crossentropy": 1.984939170628786, "loss/hidden": 3.50625, "loss/jsd": 0.0, "loss/logits": 0.19444225933402776, "step": 12810 }, { "epoch": 0.3205, "grad_norm": 28.875, "grad_norm_var": 3.3942057291666665, "learning_rate": 0.0001, "loss": 7.3066, "loss/crossentropy": 1.8981015786528588, "loss/hidden": 3.41171875, "loss/jsd": 0.0, "loss/logits": 0.18696952695026994, "step": 12820 }, { "epoch": 0.32075, "grad_norm": 30.25, "grad_norm_var": 3.9218098958333334, "learning_rate": 0.0001, "loss": 7.3974, "loss/crossentropy": 2.1415200501680376, "loss/hidden": 3.434375, "loss/jsd": 0.0, "loss/logits": 0.20154954344034196, "step": 12830 }, { "epoch": 0.321, "grad_norm": 31.375, "grad_norm_var": 35.8400390625, "learning_rate": 0.0001, "loss": 7.3048, "loss/crossentropy": 2.1483262166380883, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.19059547781944275, "step": 12840 }, { "epoch": 0.32125, "grad_norm": 31.0, "grad_norm_var": 2.561393229166667, "learning_rate": 0.0001, "loss": 7.4233, "loss/crossentropy": 2.1605855494737627, "loss/hidden": 3.3015625, "loss/jsd": 0.0, "loss/logits": 0.19197470052167773, "step": 12850 }, { "epoch": 0.3215, "grad_norm": 29.375, "grad_norm_var": 1.98125, "learning_rate": 0.0001, "loss": 7.3826, "loss/crossentropy": 2.1228017687797545, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.21111836601048709, "step": 12860 }, { "epoch": 0.32175, "grad_norm": 32.75, "grad_norm_var": 2.6666666666666665, "learning_rate": 0.0001, "loss": 7.317, "loss/crossentropy": 1.9638956546783448, "loss/hidden": 3.3625, "loss/jsd": 0.0, "loss/logits": 0.18815859649330377, "step": 12870 }, { "epoch": 0.322, "grad_norm": 30.875, "grad_norm_var": 2.5927083333333334, "learning_rate": 0.0001, "loss": 7.3412, "loss/crossentropy": 2.15369998216629, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.19398804232478142, "step": 12880 }, { "epoch": 0.32225, "grad_norm": 28.875, "grad_norm_var": 4.522916666666666, "learning_rate": 0.0001, "loss": 7.3785, "loss/crossentropy": 2.134490595757961, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.19743212331086396, "step": 12890 }, { "epoch": 0.3225, "grad_norm": 28.125, "grad_norm_var": 4.824739583333334, "learning_rate": 0.0001, "loss": 7.2795, "loss/crossentropy": 2.05883831679821, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.1887126075103879, "step": 12900 }, { "epoch": 0.32275, "grad_norm": 31.125, "grad_norm_var": 1.6257994456428096e+18, "learning_rate": 0.0001, "loss": 7.3637, "loss/crossentropy": 1.865815930068493, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.16476531345397233, "step": 12910 }, { "epoch": 0.323, "grad_norm": 27.375, "grad_norm_var": 1.6257994456056202e+18, "learning_rate": 0.0001, "loss": 7.3231, "loss/crossentropy": 2.1335863292217256, "loss/hidden": 3.3234375, "loss/jsd": 0.0, "loss/logits": 0.18054731655865908, "step": 12920 }, { "epoch": 0.32325, "grad_norm": 30.5, "grad_norm_var": 1.6822265625, "learning_rate": 0.0001, "loss": 7.2992, "loss/crossentropy": 2.029070366919041, "loss/hidden": 3.292578125, "loss/jsd": 0.0, "loss/logits": 0.1656917490065098, "step": 12930 }, { "epoch": 0.3235, "grad_norm": 29.875, "grad_norm_var": 1.9139973958333334, "learning_rate": 0.0001, "loss": 7.3529, "loss/crossentropy": 2.0367226876318454, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.18601053059101105, "step": 12940 }, { "epoch": 0.32375, "grad_norm": 27.875, "grad_norm_var": 2.1228515625, "learning_rate": 0.0001, "loss": 7.146, "loss/crossentropy": 2.008280509710312, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.17591428142040968, "step": 12950 }, { "epoch": 0.324, "grad_norm": 31.5, "grad_norm_var": 1.8893229166666667, "learning_rate": 0.0001, "loss": 7.3616, "loss/crossentropy": 2.1186923176050185, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.20130661278963088, "step": 12960 }, { "epoch": 0.32425, "grad_norm": 28.0, "grad_norm_var": 1.4749348958333333, "learning_rate": 0.0001, "loss": 7.3933, "loss/crossentropy": 2.1367026805877685, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.19593327604234217, "step": 12970 }, { "epoch": 0.3245, "grad_norm": 33.5, "grad_norm_var": 2.5009765625, "learning_rate": 0.0001, "loss": 7.36, "loss/crossentropy": 1.9709487736225129, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.18522804593667389, "step": 12980 }, { "epoch": 0.32475, "grad_norm": 32.25, "grad_norm_var": 3.037955729166667, "learning_rate": 0.0001, "loss": 7.3995, "loss/crossentropy": 2.182401825487614, "loss/hidden": 3.288671875, "loss/jsd": 0.0, "loss/logits": 0.1944254267960787, "step": 12990 }, { "epoch": 0.325, "grad_norm": 31.875, "grad_norm_var": 1.8259765625, "learning_rate": 0.0001, "loss": 7.297, "loss/crossentropy": 2.144014260172844, "loss/hidden": 3.335546875, "loss/jsd": 0.0, "loss/logits": 0.18785431496798993, "step": 13000 }, { "epoch": 0.32525, "grad_norm": 29.125, "grad_norm_var": 1.5858723958333334, "learning_rate": 0.0001, "loss": 7.4172, "loss/crossentropy": 2.185704696178436, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.21230035796761512, "step": 13010 }, { "epoch": 0.3255, "grad_norm": 33.75, "grad_norm_var": 2.0989583333333335, "learning_rate": 0.0001, "loss": 7.3542, "loss/crossentropy": 2.169140038639307, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.20137128187343478, "step": 13020 }, { "epoch": 0.32575, "grad_norm": 29.0, "grad_norm_var": 1.7613932291666667, "learning_rate": 0.0001, "loss": 7.3018, "loss/crossentropy": 2.0429045438766478, "loss/hidden": 3.306640625, "loss/jsd": 0.0, "loss/logits": 0.1801679054275155, "step": 13030 }, { "epoch": 0.326, "grad_norm": 28.25, "grad_norm_var": 20.3603515625, "learning_rate": 0.0001, "loss": 7.3076, "loss/crossentropy": 2.090936814248562, "loss/hidden": 3.363671875, "loss/jsd": 0.0, "loss/logits": 0.18894913028925658, "step": 13040 }, { "epoch": 0.32625, "grad_norm": 29.5, "grad_norm_var": 21.5759765625, "learning_rate": 0.0001, "loss": 7.3032, "loss/crossentropy": 2.0933708012104035, "loss/hidden": 3.306640625, "loss/jsd": 0.0, "loss/logits": 0.18349476121366023, "step": 13050 }, { "epoch": 0.3265, "grad_norm": 31.125, "grad_norm_var": 23.737434895833335, "learning_rate": 0.0001, "loss": 7.4007, "loss/crossentropy": 1.9636264845728875, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.17653203681111335, "step": 13060 }, { "epoch": 0.32675, "grad_norm": 30.875, "grad_norm_var": 26.170833333333334, "learning_rate": 0.0001, "loss": 7.2685, "loss/crossentropy": 2.0459864370524885, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.18335657650604845, "step": 13070 }, { "epoch": 0.327, "grad_norm": 32.25, "grad_norm_var": 13.033072916666667, "learning_rate": 0.0001, "loss": 7.3321, "loss/crossentropy": 2.2077785804867744, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.1968739289790392, "step": 13080 }, { "epoch": 0.32725, "grad_norm": 29.625, "grad_norm_var": 8.8634765625, "learning_rate": 0.0001, "loss": 7.2613, "loss/crossentropy": 2.039609357714653, "loss/hidden": 3.45078125, "loss/jsd": 0.0, "loss/logits": 0.19310947302728892, "step": 13090 }, { "epoch": 0.3275, "grad_norm": 29.875, "grad_norm_var": 8.334309895833334, "learning_rate": 0.0001, "loss": 7.3587, "loss/crossentropy": 2.0822102136909963, "loss/hidden": 3.26640625, "loss/jsd": 0.0, "loss/logits": 0.1863661216571927, "step": 13100 }, { "epoch": 0.32775, "grad_norm": 34.25, "grad_norm_var": 8.694205729166667, "learning_rate": 0.0001, "loss": 7.3602, "loss/crossentropy": 2.167416235804558, "loss/hidden": 3.36484375, "loss/jsd": 0.0, "loss/logits": 0.19521966725587844, "step": 13110 }, { "epoch": 0.328, "grad_norm": 29.125, "grad_norm_var": 7.191080729166667, "learning_rate": 0.0001, "loss": 7.3767, "loss/crossentropy": 2.07031906619668, "loss/hidden": 3.4015625, "loss/jsd": 0.0, "loss/logits": 0.22052818778902292, "step": 13120 }, { "epoch": 0.32825, "grad_norm": 35.5, "grad_norm_var": 5.821809895833334, "learning_rate": 0.0001, "loss": 7.5034, "loss/crossentropy": 2.1318026900291445, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.19695397429168224, "step": 13130 }, { "epoch": 0.3285, "grad_norm": 29.5, "grad_norm_var": 4.738541666666666, "learning_rate": 0.0001, "loss": 7.2907, "loss/crossentropy": 1.893085064738989, "loss/hidden": 3.421875, "loss/jsd": 0.0, "loss/logits": 0.17454516123980285, "step": 13140 }, { "epoch": 0.32875, "grad_norm": 33.25, "grad_norm_var": 4.96640625, "learning_rate": 0.0001, "loss": 7.3911, "loss/crossentropy": 2.0709325544536115, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.19108830243349076, "step": 13150 }, { "epoch": 0.329, "grad_norm": 30.375, "grad_norm_var": 6.593489583333334, "learning_rate": 0.0001, "loss": 7.278, "loss/crossentropy": 2.0809618443250657, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.1867564545944333, "step": 13160 }, { "epoch": 0.32925, "grad_norm": 28.875, "grad_norm_var": 5.355143229166667, "learning_rate": 0.0001, "loss": 7.1865, "loss/crossentropy": 1.86175979077816, "loss/hidden": 3.41640625, "loss/jsd": 0.0, "loss/logits": 0.18658734802156687, "step": 13170 }, { "epoch": 0.3295, "grad_norm": 32.5, "grad_norm_var": 7.297916666666667, "learning_rate": 0.0001, "loss": 7.2993, "loss/crossentropy": 2.0688765406608582, "loss/hidden": 3.320703125, "loss/jsd": 0.0, "loss/logits": 0.1887798959389329, "step": 13180 }, { "epoch": 0.32975, "grad_norm": 32.0, "grad_norm_var": 3.1462890625, "learning_rate": 0.0001, "loss": 7.396, "loss/crossentropy": 2.1349074259400367, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.1891916124150157, "step": 13190 }, { "epoch": 0.33, "grad_norm": 29.375, "grad_norm_var": 3.2080729166666666, "learning_rate": 0.0001, "loss": 7.2194, "loss/crossentropy": 2.0701663225889204, "loss/hidden": 3.290625, "loss/jsd": 0.0, "loss/logits": 0.19238610491156577, "step": 13200 }, { "epoch": 0.33025, "grad_norm": 29.875, "grad_norm_var": 1.3447265625, "learning_rate": 0.0001, "loss": 7.3476, "loss/crossentropy": 2.126928760111332, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.18552705701440572, "step": 13210 }, { "epoch": 0.3305, "grad_norm": 29.875, "grad_norm_var": 2.4916666666666667, "learning_rate": 0.0001, "loss": 7.3275, "loss/crossentropy": 2.0960667990148067, "loss/hidden": 3.3546875, "loss/jsd": 0.0, "loss/logits": 0.18840519580990076, "step": 13220 }, { "epoch": 0.33075, "grad_norm": 29.0, "grad_norm_var": 31.097916666666666, "learning_rate": 0.0001, "loss": 7.3449, "loss/crossentropy": 1.9724566139280797, "loss/hidden": 3.401171875, "loss/jsd": 0.0, "loss/logits": 0.17257819082587958, "step": 13230 }, { "epoch": 0.331, "grad_norm": 30.25, "grad_norm_var": 29.516080729166667, "learning_rate": 0.0001, "loss": 7.5214, "loss/crossentropy": 2.144184100627899, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.19250041600316764, "step": 13240 }, { "epoch": 0.33125, "grad_norm": 28.25, "grad_norm_var": 6.55, "learning_rate": 0.0001, "loss": 7.3245, "loss/crossentropy": 1.9724821582436562, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.18454758413136005, "step": 13250 }, { "epoch": 0.3315, "grad_norm": 35.75, "grad_norm_var": 10.574739583333333, "learning_rate": 0.0001, "loss": 7.2067, "loss/crossentropy": 2.0341859996318816, "loss/hidden": 3.466015625, "loss/jsd": 0.0, "loss/logits": 0.19355686828494073, "step": 13260 }, { "epoch": 0.33175, "grad_norm": 31.875, "grad_norm_var": 6.093489583333334, "learning_rate": 0.0001, "loss": 7.3386, "loss/crossentropy": 2.1593449860811234, "loss/hidden": 3.282421875, "loss/jsd": 0.0, "loss/logits": 0.18228702070191502, "step": 13270 }, { "epoch": 0.332, "grad_norm": 32.25, "grad_norm_var": 4.518489583333333, "learning_rate": 0.0001, "loss": 7.2584, "loss/crossentropy": 2.021310421824455, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.193563433829695, "step": 13280 }, { "epoch": 0.33225, "grad_norm": 33.25, "grad_norm_var": 3.008072916666667, "learning_rate": 0.0001, "loss": 7.3654, "loss/crossentropy": 2.010246267169714, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.18375244587659836, "step": 13290 }, { "epoch": 0.3325, "grad_norm": 33.5, "grad_norm_var": 1.8309895833333334, "learning_rate": 0.0001, "loss": 7.4013, "loss/crossentropy": 2.1324703454971314, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.19651575423777104, "step": 13300 }, { "epoch": 0.33275, "grad_norm": 31.625, "grad_norm_var": 9.018489583333333, "learning_rate": 0.0001, "loss": 7.4123, "loss/crossentropy": 2.2024870067834854, "loss/hidden": 3.36484375, "loss/jsd": 0.0, "loss/logits": 0.19117040373384953, "step": 13310 }, { "epoch": 0.333, "grad_norm": 29.5, "grad_norm_var": 54.362239583333334, "learning_rate": 0.0001, "loss": 7.3561, "loss/crossentropy": 1.9291939452290534, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.18055410776287317, "step": 13320 }, { "epoch": 0.33325, "grad_norm": 30.5, "grad_norm_var": 53.271875, "learning_rate": 0.0001, "loss": 7.3417, "loss/crossentropy": 1.9812588825821877, "loss/hidden": 3.246875, "loss/jsd": 0.0, "loss/logits": 0.17277830513194203, "step": 13330 }, { "epoch": 0.3335, "grad_norm": 30.625, "grad_norm_var": 2.0332682291666666, "learning_rate": 0.0001, "loss": 7.4792, "loss/crossentropy": 2.1468162171542646, "loss/hidden": 3.29453125, "loss/jsd": 0.0, "loss/logits": 0.1815652133896947, "step": 13340 }, { "epoch": 0.33375, "grad_norm": 30.5, "grad_norm_var": 1.8942057291666667, "learning_rate": 0.0001, "loss": 7.3437, "loss/crossentropy": 1.9899487346410751, "loss/hidden": 3.336328125, "loss/jsd": 0.0, "loss/logits": 0.1736677044071257, "step": 13350 }, { "epoch": 0.334, "grad_norm": 32.75, "grad_norm_var": 3.374955311060432e+18, "learning_rate": 0.0001, "loss": 7.4592, "loss/crossentropy": 2.0041578873991965, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.19686540886759757, "step": 13360 }, { "epoch": 0.33425, "grad_norm": 29.375, "grad_norm_var": 3.3749553110757407e+18, "learning_rate": 0.0001, "loss": 7.3521, "loss/crossentropy": 2.050678627192974, "loss/hidden": 3.35859375, "loss/jsd": 0.0, "loss/logits": 0.1835683614946902, "step": 13370 }, { "epoch": 0.3345, "grad_norm": 32.5, "grad_norm_var": 1.4561848958333334, "learning_rate": 0.0001, "loss": 7.3463, "loss/crossentropy": 2.255459001660347, "loss/hidden": 3.3484375, "loss/jsd": 0.0, "loss/logits": 0.2026721488684416, "step": 13380 }, { "epoch": 0.33475, "grad_norm": 55.0, "grad_norm_var": 37.805989583333336, "learning_rate": 0.0001, "loss": 7.432, "loss/crossentropy": 2.092339722812176, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.20365451760590075, "step": 13390 }, { "epoch": 0.335, "grad_norm": 29.5, "grad_norm_var": 39.1962890625, "learning_rate": 0.0001, "loss": 7.3082, "loss/crossentropy": 2.060484157502651, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.19208470694720745, "step": 13400 }, { "epoch": 0.33525, "grad_norm": 30.875, "grad_norm_var": 4.119205729166667, "learning_rate": 0.0001, "loss": 7.3159, "loss/crossentropy": 2.0885482341051103, "loss/hidden": 3.3484375, "loss/jsd": 0.0, "loss/logits": 0.18701701434329152, "step": 13410 }, { "epoch": 0.3355, "grad_norm": 34.75, "grad_norm_var": 3.8354166666666667, "learning_rate": 0.0001, "loss": 7.3792, "loss/crossentropy": 2.064309825748205, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.20586481597274542, "step": 13420 }, { "epoch": 0.33575, "grad_norm": 29.75, "grad_norm_var": 2.535416666666667, "learning_rate": 0.0001, "loss": 7.4193, "loss/crossentropy": 1.8737250491976738, "loss/hidden": 3.427734375, "loss/jsd": 0.0, "loss/logits": 0.19758351668715476, "step": 13430 }, { "epoch": 0.336, "grad_norm": 30.25, "grad_norm_var": 3.8686848958333333, "learning_rate": 0.0001, "loss": 7.4324, "loss/crossentropy": 1.9396042831242084, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.18089414723217487, "step": 13440 }, { "epoch": 0.33625, "grad_norm": 28.875, "grad_norm_var": 4.540625, "learning_rate": 0.0001, "loss": 7.3798, "loss/crossentropy": 2.004528859257698, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.22120596412569285, "step": 13450 }, { "epoch": 0.3365, "grad_norm": 29.125, "grad_norm_var": 1.69375, "learning_rate": 0.0001, "loss": 7.3708, "loss/crossentropy": 2.170805335044861, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.18870076667517424, "step": 13460 }, { "epoch": 0.33675, "grad_norm": 29.375, "grad_norm_var": 2.9284656301175997e+18, "learning_rate": 0.0001, "loss": 7.3594, "loss/crossentropy": 2.0726011991500854, "loss/hidden": 3.310546875, "loss/jsd": 0.0, "loss/logits": 0.18316982481628657, "step": 13470 }, { "epoch": 0.337, "grad_norm": 32.25, "grad_norm_var": 3.999739583333333, "learning_rate": 0.0001, "loss": 7.3072, "loss/crossentropy": 2.0701726540923118, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.18232779456302523, "step": 13480 }, { "epoch": 0.33725, "grad_norm": 31.625, "grad_norm_var": 1.01640625, "learning_rate": 0.0001, "loss": 7.3553, "loss/crossentropy": 2.066875821352005, "loss/hidden": 3.34765625, "loss/jsd": 0.0, "loss/logits": 0.18299774192273616, "step": 13490 }, { "epoch": 0.3375, "grad_norm": 32.0, "grad_norm_var": 1.4489583333333333, "learning_rate": 0.0001, "loss": 7.2708, "loss/crossentropy": 2.110422171652317, "loss/hidden": 3.353125, "loss/jsd": 0.0, "loss/logits": 0.19538786429911853, "step": 13500 }, { "epoch": 0.33775, "grad_norm": 29.875, "grad_norm_var": 1.06015625, "learning_rate": 0.0001, "loss": 7.3529, "loss/crossentropy": 2.1061682522296907, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.19031856823712587, "step": 13510 }, { "epoch": 0.338, "grad_norm": 32.5, "grad_norm_var": 1.40390625, "learning_rate": 0.0001, "loss": 7.2473, "loss/crossentropy": 2.0029176853597166, "loss/hidden": 3.284375, "loss/jsd": 0.0, "loss/logits": 0.182035060133785, "step": 13520 }, { "epoch": 0.33825, "grad_norm": 29.375, "grad_norm_var": 9.417708333333334, "learning_rate": 0.0001, "loss": 7.3084, "loss/crossentropy": 2.1238187912851574, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.188486148416996, "step": 13530 }, { "epoch": 0.3385, "grad_norm": 30.25, "grad_norm_var": 1.3983723958333334, "learning_rate": 0.0001, "loss": 7.2256, "loss/crossentropy": 1.9766217768192291, "loss/hidden": 3.326953125, "loss/jsd": 0.0, "loss/logits": 0.17294846558943391, "step": 13540 }, { "epoch": 0.33875, "grad_norm": 29.375, "grad_norm_var": 3.0874348958333333, "learning_rate": 0.0001, "loss": 7.327, "loss/crossentropy": 2.1272383123636245, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.18510236088186502, "step": 13550 }, { "epoch": 0.339, "grad_norm": 31.0, "grad_norm_var": 3.10390625, "learning_rate": 0.0001, "loss": 7.2885, "loss/crossentropy": 2.0515687823295594, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.18967761769890784, "step": 13560 }, { "epoch": 0.33925, "grad_norm": 29.625, "grad_norm_var": 11.933072916666667, "learning_rate": 0.0001, "loss": 7.3573, "loss/crossentropy": 2.0446967758238315, "loss/hidden": 3.425, "loss/jsd": 0.0, "loss/logits": 0.18640230242162942, "step": 13570 }, { "epoch": 0.3395, "grad_norm": 28.875, "grad_norm_var": 1.84765625, "learning_rate": 0.0001, "loss": 7.2965, "loss/crossentropy": 1.9837520524859429, "loss/hidden": 3.3171875, "loss/jsd": 0.0, "loss/logits": 0.1916389312595129, "step": 13580 }, { "epoch": 0.33975, "grad_norm": 30.5, "grad_norm_var": 3.1791666666666667, "learning_rate": 0.0001, "loss": 7.3632, "loss/crossentropy": 2.1806528866291046, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.19690024815499782, "step": 13590 }, { "epoch": 0.34, "grad_norm": 30.75, "grad_norm_var": 1.3385416666666667, "learning_rate": 0.0001, "loss": 7.3761, "loss/crossentropy": 2.215220719575882, "loss/hidden": 3.312109375, "loss/jsd": 0.0, "loss/logits": 0.19034639187157154, "step": 13600 }, { "epoch": 0.34025, "grad_norm": 27.375, "grad_norm_var": 2.7129557291666666, "learning_rate": 0.0001, "loss": 7.3129, "loss/crossentropy": 2.16262392103672, "loss/hidden": 3.503125, "loss/jsd": 0.0, "loss/logits": 0.20174582321196793, "step": 13610 }, { "epoch": 0.3405, "grad_norm": 30.5, "grad_norm_var": 3.2363932291666666, "learning_rate": 0.0001, "loss": 7.3501, "loss/crossentropy": 2.126379433274269, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.18432336021214724, "step": 13620 }, { "epoch": 0.34075, "grad_norm": 30.875, "grad_norm_var": 1.8176432291666667, "learning_rate": 0.0001, "loss": 7.3601, "loss/crossentropy": 2.0324837125837805, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.18331200983375312, "step": 13630 }, { "epoch": 0.341, "grad_norm": 29.0, "grad_norm_var": 1.7942057291666667, "learning_rate": 0.0001, "loss": 7.3882, "loss/crossentropy": 2.176556921005249, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.19637300558388232, "step": 13640 }, { "epoch": 0.34125, "grad_norm": 29.875, "grad_norm_var": 3.758072916666667, "learning_rate": 0.0001, "loss": 7.1992, "loss/crossentropy": 1.9680285774171353, "loss/hidden": 3.30546875, "loss/jsd": 0.0, "loss/logits": 0.1837764661759138, "step": 13650 }, { "epoch": 0.3415, "grad_norm": 29.125, "grad_norm_var": 2.6936848958333335, "learning_rate": 0.0001, "loss": 7.361, "loss/crossentropy": 2.1511219844222067, "loss/hidden": 3.29140625, "loss/jsd": 0.0, "loss/logits": 0.1746565790846944, "step": 13660 }, { "epoch": 0.34175, "grad_norm": 28.375, "grad_norm_var": 2.312239583333333, "learning_rate": 0.0001, "loss": 7.3856, "loss/crossentropy": 2.187084162980318, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.18925763312727212, "step": 13670 }, { "epoch": 0.342, "grad_norm": 29.75, "grad_norm_var": 2.4139973958333334, "learning_rate": 0.0001, "loss": 7.196, "loss/crossentropy": 2.0472760550677775, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.17813790533691645, "step": 13680 }, { "epoch": 0.34225, "grad_norm": 32.25, "grad_norm_var": 1.6458333333333333, "learning_rate": 0.0001, "loss": 7.4184, "loss/crossentropy": 2.0482982218265535, "loss/hidden": 3.483203125, "loss/jsd": 0.0, "loss/logits": 0.1855379816144705, "step": 13690 }, { "epoch": 0.3425, "grad_norm": 27.875, "grad_norm_var": 98.41432291666666, "learning_rate": 0.0001, "loss": 7.2863, "loss/crossentropy": 2.0204505778849127, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.17318440061062573, "step": 13700 }, { "epoch": 0.34275, "grad_norm": 28.375, "grad_norm_var": 33.1900390625, "learning_rate": 0.0001, "loss": 7.3409, "loss/crossentropy": 2.0783301174640654, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.19091757852584124, "step": 13710 }, { "epoch": 0.343, "grad_norm": 32.0, "grad_norm_var": 3.7150390625, "learning_rate": 0.0001, "loss": 7.3412, "loss/crossentropy": 2.0553587220609186, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.18813807256519793, "step": 13720 }, { "epoch": 0.34325, "grad_norm": 34.25, "grad_norm_var": 1586.7400390625, "learning_rate": 0.0001, "loss": 7.4918, "loss/crossentropy": 2.14101425036788, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.1929340995848179, "step": 13730 }, { "epoch": 0.3435, "grad_norm": 32.5, "grad_norm_var": 4.633072916666666, "learning_rate": 0.0001, "loss": 7.3162, "loss/crossentropy": 2.0531166955828666, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.18961485847830772, "step": 13740 }, { "epoch": 0.34375, "grad_norm": 51.5, "grad_norm_var": 29.780989583333334, "learning_rate": 0.0001, "loss": 7.3981, "loss/crossentropy": 2.146902695298195, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.18639619778841734, "step": 13750 }, { "epoch": 0.344, "grad_norm": 31.875, "grad_norm_var": 27.141080729166667, "learning_rate": 0.0001, "loss": 7.3596, "loss/crossentropy": 2.2127513885498047, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.19791291914880277, "step": 13760 }, { "epoch": 0.34425, "grad_norm": 32.25, "grad_norm_var": 1.9809895833333333, "learning_rate": 0.0001, "loss": 7.3245, "loss/crossentropy": 2.113487794995308, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.18752570655196904, "step": 13770 }, { "epoch": 0.3445, "grad_norm": 28.875, "grad_norm_var": 2.7666666666666666, "learning_rate": 0.0001, "loss": 7.3737, "loss/crossentropy": 2.192716282606125, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.1977760722860694, "step": 13780 }, { "epoch": 0.34475, "grad_norm": 30.375, "grad_norm_var": 1.4999348958333334, "learning_rate": 0.0001, "loss": 7.2308, "loss/crossentropy": 2.1602986216545106, "loss/hidden": 3.2796875, "loss/jsd": 0.0, "loss/logits": 0.1859057329595089, "step": 13790 }, { "epoch": 0.345, "grad_norm": 45.25, "grad_norm_var": 15.587434895833333, "learning_rate": 0.0001, "loss": 7.3689, "loss/crossentropy": 2.042529730498791, "loss/hidden": 3.34765625, "loss/jsd": 0.0, "loss/logits": 0.2055663662031293, "step": 13800 }, { "epoch": 0.34525, "grad_norm": 31.375, "grad_norm_var": 16.111458333333335, "learning_rate": 0.0001, "loss": 7.355, "loss/crossentropy": 2.004579763114452, "loss/hidden": 3.46171875, "loss/jsd": 0.0, "loss/logits": 0.18781258668750525, "step": 13810 }, { "epoch": 0.3455, "grad_norm": 27.75, "grad_norm_var": 2.9369140625, "learning_rate": 0.0001, "loss": 7.2946, "loss/crossentropy": 1.9647000446915626, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.18860411625355483, "step": 13820 }, { "epoch": 0.34575, "grad_norm": 28.625, "grad_norm_var": 2.4830729166666665, "learning_rate": 0.0001, "loss": 7.3643, "loss/crossentropy": 2.1840324103832245, "loss/hidden": 3.321875, "loss/jsd": 0.0, "loss/logits": 0.18433275502175092, "step": 13830 }, { "epoch": 0.346, "grad_norm": 31.0, "grad_norm_var": 2.061393229166667, "learning_rate": 0.0001, "loss": 7.39, "loss/crossentropy": 2.1660002395510674, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.18589738458395005, "step": 13840 }, { "epoch": 0.34625, "grad_norm": 30.625, "grad_norm_var": 3.237434895833333, "learning_rate": 0.0001, "loss": 7.2772, "loss/crossentropy": 2.0304790273308755, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.19276629090309144, "step": 13850 }, { "epoch": 0.3465, "grad_norm": 27.375, "grad_norm_var": 4.35390625, "learning_rate": 0.0001, "loss": 7.2433, "loss/crossentropy": 2.0913878247141837, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.17961858343333006, "step": 13860 }, { "epoch": 0.34675, "grad_norm": 31.625, "grad_norm_var": 1.9833333333333334, "learning_rate": 0.0001, "loss": 7.298, "loss/crossentropy": 1.9085475612431764, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.1773257221095264, "step": 13870 }, { "epoch": 0.347, "grad_norm": 31.75, "grad_norm_var": 1.2061848958333334, "learning_rate": 0.0001, "loss": 7.3026, "loss/crossentropy": 2.134579537808895, "loss/hidden": 3.29375, "loss/jsd": 0.0, "loss/logits": 0.18533094711601733, "step": 13880 }, { "epoch": 0.34725, "grad_norm": 29.75, "grad_norm_var": 1.74140625, "learning_rate": 0.0001, "loss": 7.2397, "loss/crossentropy": 2.0479501873254775, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.1840794005431235, "step": 13890 }, { "epoch": 0.3475, "grad_norm": 30.625, "grad_norm_var": 2.6400390625, "learning_rate": 0.0001, "loss": 7.2447, "loss/crossentropy": 2.0175841793417932, "loss/hidden": 3.29296875, "loss/jsd": 0.0, "loss/logits": 0.17613419592380525, "step": 13900 }, { "epoch": 0.34775, "grad_norm": 32.5, "grad_norm_var": 5.333072916666667, "learning_rate": 0.0001, "loss": 7.3573, "loss/crossentropy": 2.248972457647324, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.19207954667508603, "step": 13910 }, { "epoch": 0.348, "grad_norm": 31.0, "grad_norm_var": 4.843489583333334, "learning_rate": 0.0001, "loss": 7.3649, "loss/crossentropy": 2.215424671769142, "loss/hidden": 3.3328125, "loss/jsd": 0.0, "loss/logits": 0.19513901360332966, "step": 13920 }, { "epoch": 0.34825, "grad_norm": 30.25, "grad_norm_var": 2.004622395833333, "learning_rate": 0.0001, "loss": 7.4324, "loss/crossentropy": 2.157894307374954, "loss/hidden": 3.457421875, "loss/jsd": 0.0, "loss/logits": 0.20247049070894718, "step": 13930 }, { "epoch": 0.3485, "grad_norm": 28.5, "grad_norm_var": 2.4150390625, "learning_rate": 0.0001, "loss": 7.3422, "loss/crossentropy": 2.14641355201602, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.1898975013755262, "step": 13940 }, { "epoch": 0.34875, "grad_norm": 28.5, "grad_norm_var": 6.0103515625, "learning_rate": 0.0001, "loss": 7.3309, "loss/crossentropy": 2.0432268232107162, "loss/hidden": 3.461328125, "loss/jsd": 0.0, "loss/logits": 0.19791582636535168, "step": 13950 }, { "epoch": 0.349, "grad_norm": 28.875, "grad_norm_var": 4.4744140625, "learning_rate": 0.0001, "loss": 7.3187, "loss/crossentropy": 2.1648118555545808, "loss/hidden": 3.301171875, "loss/jsd": 0.0, "loss/logits": 0.1794663654640317, "step": 13960 }, { "epoch": 0.34925, "grad_norm": 29.5, "grad_norm_var": 3.1332682291666667, "learning_rate": 0.0001, "loss": 7.3122, "loss/crossentropy": 2.0566162675619126, "loss/hidden": 3.373046875, "loss/jsd": 0.0, "loss/logits": 0.18033871669322252, "step": 13970 }, { "epoch": 0.3495, "grad_norm": 27.875, "grad_norm_var": 1.3686848958333333, "learning_rate": 0.0001, "loss": 7.2538, "loss/crossentropy": 2.0309479638934134, "loss/hidden": 3.316015625, "loss/jsd": 0.0, "loss/logits": 0.1834420131519437, "step": 13980 }, { "epoch": 0.34975, "grad_norm": 30.0, "grad_norm_var": 2.3510416666666667, "learning_rate": 0.0001, "loss": 7.3068, "loss/crossentropy": 2.0843526370823384, "loss/hidden": 3.35859375, "loss/jsd": 0.0, "loss/logits": 0.18843147568404675, "step": 13990 }, { "epoch": 0.35, "grad_norm": 34.25, "grad_norm_var": 10.67265625, "learning_rate": 0.0001, "loss": 7.3839, "loss/crossentropy": 2.0708370715379716, "loss/hidden": 3.389453125, "loss/jsd": 0.0, "loss/logits": 0.19229107201099396, "step": 14000 }, { "epoch": 0.35025, "grad_norm": 28.5, "grad_norm_var": 11.178059895833334, "learning_rate": 0.0001, "loss": 7.3404, "loss/crossentropy": 2.016050732135773, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.1993856718763709, "step": 14010 }, { "epoch": 0.3505, "grad_norm": 31.375, "grad_norm_var": 1.55625, "learning_rate": 0.0001, "loss": 7.4397, "loss/crossentropy": 2.0941019743680953, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.1941377494484186, "step": 14020 }, { "epoch": 0.35075, "grad_norm": 28.125, "grad_norm_var": 1.7934895833333333, "learning_rate": 0.0001, "loss": 7.2787, "loss/crossentropy": 2.175595435500145, "loss/hidden": 3.36484375, "loss/jsd": 0.0, "loss/logits": 0.1906340267509222, "step": 14030 }, { "epoch": 0.351, "grad_norm": 29.625, "grad_norm_var": 3.01015625, "learning_rate": 0.0001, "loss": 7.3322, "loss/crossentropy": 1.945583702623844, "loss/hidden": 3.308203125, "loss/jsd": 0.0, "loss/logits": 0.17433239622041583, "step": 14040 }, { "epoch": 0.35125, "grad_norm": 32.0, "grad_norm_var": 11.308072916666667, "learning_rate": 0.0001, "loss": 7.4361, "loss/crossentropy": 2.078453540802002, "loss/hidden": 3.271484375, "loss/jsd": 0.0, "loss/logits": 0.1803632376715541, "step": 14050 }, { "epoch": 0.3515, "grad_norm": 31.125, "grad_norm_var": 11.920833333333333, "learning_rate": 0.0001, "loss": 7.3917, "loss/crossentropy": 2.2025377944111826, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.18862273804843427, "step": 14060 }, { "epoch": 0.35175, "grad_norm": 26.875, "grad_norm_var": 14.82890625, "learning_rate": 0.0001, "loss": 7.294, "loss/crossentropy": 2.0608675971627237, "loss/hidden": 3.229296875, "loss/jsd": 0.0, "loss/logits": 0.17895151115953922, "step": 14070 }, { "epoch": 0.352, "grad_norm": 31.5, "grad_norm_var": 16.512239583333333, "learning_rate": 0.0001, "loss": 7.2397, "loss/crossentropy": 1.951252208650112, "loss/hidden": 3.347265625, "loss/jsd": 0.0, "loss/logits": 0.17488121166825293, "step": 14080 }, { "epoch": 0.35225, "grad_norm": 29.25, "grad_norm_var": 1.6044777159285975e+18, "learning_rate": 0.0001, "loss": 7.3104, "loss/crossentropy": 2.1797886729240417, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.19085733741521835, "step": 14090 }, { "epoch": 0.3525, "grad_norm": 30.625, "grad_norm_var": 1.6044777150524774e+18, "learning_rate": 0.0001, "loss": 7.269, "loss/crossentropy": 2.133794055879116, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.18788108974695206, "step": 14100 }, { "epoch": 0.35275, "grad_norm": 31.125, "grad_norm_var": 57.800455729166664, "learning_rate": 0.0001, "loss": 7.2859, "loss/crossentropy": 2.0392952769994737, "loss/hidden": 3.317578125, "loss/jsd": 0.0, "loss/logits": 0.1854009686037898, "step": 14110 }, { "epoch": 0.353, "grad_norm": 29.0, "grad_norm_var": 2.729622395833333, "learning_rate": 0.0001, "loss": 7.3152, "loss/crossentropy": 2.026089659333229, "loss/hidden": 3.36875, "loss/jsd": 0.0, "loss/logits": 0.18226654808968307, "step": 14120 }, { "epoch": 0.35325, "grad_norm": 30.375, "grad_norm_var": 2.703125, "learning_rate": 0.0001, "loss": 7.4705, "loss/crossentropy": 2.132002358883619, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.2102237056940794, "step": 14130 }, { "epoch": 0.3535, "grad_norm": 29.0, "grad_norm_var": 2.9497395833333333, "learning_rate": 0.0001, "loss": 7.3236, "loss/crossentropy": 2.0131492763757706, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.19223638353869318, "step": 14140 }, { "epoch": 0.35375, "grad_norm": 29.5, "grad_norm_var": 2.7212890625, "learning_rate": 0.0001, "loss": 7.271, "loss/crossentropy": 2.1130865029990673, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.1914577091112733, "step": 14150 }, { "epoch": 0.354, "grad_norm": 31.125, "grad_norm_var": 1.8254557291666667, "learning_rate": 0.0001, "loss": 7.4232, "loss/crossentropy": 2.12569759786129, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.19339151214808226, "step": 14160 }, { "epoch": 0.35425, "grad_norm": 29.875, "grad_norm_var": 1064.6176432291666, "learning_rate": 0.0001, "loss": 7.2083, "loss/crossentropy": 2.0492417976260184, "loss/hidden": 3.37109375, "loss/jsd": 0.0, "loss/logits": 0.18359060864895582, "step": 14170 }, { "epoch": 0.3545, "grad_norm": 31.375, "grad_norm_var": 9.59765625, "learning_rate": 0.0001, "loss": 7.3908, "loss/crossentropy": 2.099221628904343, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.18481487538665534, "step": 14180 }, { "epoch": 0.35475, "grad_norm": 44.25, "grad_norm_var": 12.892708333333333, "learning_rate": 0.0001, "loss": 7.3495, "loss/crossentropy": 2.04138702750206, "loss/hidden": 3.405078125, "loss/jsd": 0.0, "loss/logits": 0.1797080934047699, "step": 14190 }, { "epoch": 0.355, "grad_norm": 29.875, "grad_norm_var": 15.040559895833333, "learning_rate": 0.0001, "loss": 7.3724, "loss/crossentropy": 2.0216434367001055, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.19053793977946043, "step": 14200 }, { "epoch": 0.35525, "grad_norm": 28.25, "grad_norm_var": 13.096809895833333, "learning_rate": 0.0001, "loss": 7.299, "loss/crossentropy": 2.160589988529682, "loss/hidden": 3.31953125, "loss/jsd": 0.0, "loss/logits": 0.1827318999916315, "step": 14210 }, { "epoch": 0.3555, "grad_norm": 31.625, "grad_norm_var": 9.878059895833333, "learning_rate": 0.0001, "loss": 7.3381, "loss/crossentropy": 1.7803485259413718, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.16204789485782384, "step": 14220 }, { "epoch": 0.35575, "grad_norm": 28.625, "grad_norm_var": 5.543489583333334, "learning_rate": 0.0001, "loss": 7.2506, "loss/crossentropy": 1.8810015477240085, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.17873666901141405, "step": 14230 }, { "epoch": 0.356, "grad_norm": 30.375, "grad_norm_var": 4.208268229166666, "learning_rate": 0.0001, "loss": 7.4941, "loss/crossentropy": 2.016931130737066, "loss/hidden": 3.300390625, "loss/jsd": 0.0, "loss/logits": 0.17651305962353944, "step": 14240 }, { "epoch": 0.35625, "grad_norm": 29.75, "grad_norm_var": 2.067122395833333, "learning_rate": 0.0001, "loss": 7.3659, "loss/crossentropy": 2.1511716455221177, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.18431759886443616, "step": 14250 }, { "epoch": 0.3565, "grad_norm": 31.5, "grad_norm_var": 14.267708333333333, "learning_rate": 0.0001, "loss": 7.4284, "loss/crossentropy": 2.0923829920589925, "loss/hidden": 3.414453125, "loss/jsd": 0.0, "loss/logits": 0.20195957068353892, "step": 14260 }, { "epoch": 0.35675, "grad_norm": 32.75, "grad_norm_var": 2.780989583333333, "learning_rate": 0.0001, "loss": 7.3516, "loss/crossentropy": 2.1890153646469117, "loss/hidden": 3.407421875, "loss/jsd": 0.0, "loss/logits": 0.19206568598747253, "step": 14270 }, { "epoch": 0.357, "grad_norm": 26.875, "grad_norm_var": 3.0858723958333334, "learning_rate": 0.0001, "loss": 7.2643, "loss/crossentropy": 2.109933242201805, "loss/hidden": 3.362890625, "loss/jsd": 0.0, "loss/logits": 0.1951975781470537, "step": 14280 }, { "epoch": 0.35725, "grad_norm": 28.875, "grad_norm_var": 1.7747395833333333, "learning_rate": 0.0001, "loss": 7.2549, "loss/crossentropy": 2.104505704343319, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.19365676920861005, "step": 14290 }, { "epoch": 0.3575, "grad_norm": 32.25, "grad_norm_var": 11.114322916666667, "learning_rate": 0.0001, "loss": 7.247, "loss/crossentropy": 2.108228546380997, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.18031150791794062, "step": 14300 }, { "epoch": 0.35775, "grad_norm": 33.5, "grad_norm_var": 4.084375, "learning_rate": 0.0001, "loss": 7.2652, "loss/crossentropy": 2.11306362003088, "loss/hidden": 3.403125, "loss/jsd": 0.0, "loss/logits": 0.19340378735214472, "step": 14310 }, { "epoch": 0.358, "grad_norm": 31.625, "grad_norm_var": 2.2712890625, "learning_rate": 0.0001, "loss": 7.5154, "loss/crossentropy": 2.073298954963684, "loss/hidden": 3.4765625, "loss/jsd": 0.0, "loss/logits": 0.20225481800734996, "step": 14320 }, { "epoch": 0.35825, "grad_norm": 33.0, "grad_norm_var": 7.609309895833333, "learning_rate": 0.0001, "loss": 7.3172, "loss/crossentropy": 2.068447032570839, "loss/hidden": 3.360546875, "loss/jsd": 0.0, "loss/logits": 0.1848990023136139, "step": 14330 }, { "epoch": 0.3585, "grad_norm": 28.625, "grad_norm_var": 9.287434895833334, "learning_rate": 0.0001, "loss": 7.341, "loss/crossentropy": 2.0122443050146104, "loss/hidden": 3.270703125, "loss/jsd": 0.0, "loss/logits": 0.16750977858901023, "step": 14340 }, { "epoch": 0.35875, "grad_norm": 31.625, "grad_norm_var": 1.5587890625, "learning_rate": 0.0001, "loss": 7.4114, "loss/crossentropy": 1.9815908901393413, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.19162300042808056, "step": 14350 }, { "epoch": 0.359, "grad_norm": 34.5, "grad_norm_var": 1.75390625, "learning_rate": 0.0001, "loss": 7.4096, "loss/crossentropy": 2.0974107921123504, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.18985395636409522, "step": 14360 }, { "epoch": 0.35925, "grad_norm": 29.875, "grad_norm_var": 27.2291015625, "learning_rate": 0.0001, "loss": 7.2838, "loss/crossentropy": 2.18302740752697, "loss/hidden": 3.3375, "loss/jsd": 0.0, "loss/logits": 0.19919114038348198, "step": 14370 }, { "epoch": 0.3595, "grad_norm": 29.0, "grad_norm_var": 2.393684895833333, "learning_rate": 0.0001, "loss": 7.3591, "loss/crossentropy": 2.1828642159700395, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.1880602626129985, "step": 14380 }, { "epoch": 0.35975, "grad_norm": 28.25, "grad_norm_var": 1.8875, "learning_rate": 0.0001, "loss": 7.4149, "loss/crossentropy": 2.0250351071357726, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.21426690481603144, "step": 14390 }, { "epoch": 0.36, "grad_norm": 36.0, "grad_norm_var": 4.823893229166667, "learning_rate": 0.0001, "loss": 7.3843, "loss/crossentropy": 1.9927750542759894, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.18537398371845484, "step": 14400 }, { "epoch": 0.36025, "grad_norm": 48.0, "grad_norm_var": 20.270247395833334, "learning_rate": 0.0001, "loss": 7.3442, "loss/crossentropy": 2.2051343381404878, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.18966466654092073, "step": 14410 }, { "epoch": 0.3605, "grad_norm": 26.75, "grad_norm_var": 32.15598958333333, "learning_rate": 0.0001, "loss": 7.3599, "loss/crossentropy": 2.050907912105322, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.18331205267459155, "step": 14420 }, { "epoch": 0.36075, "grad_norm": 32.0, "grad_norm_var": 15.5244140625, "learning_rate": 0.0001, "loss": 7.4431, "loss/crossentropy": 1.9177762359380721, "loss/hidden": 3.507421875, "loss/jsd": 0.0, "loss/logits": 0.18733049537986518, "step": 14430 }, { "epoch": 0.361, "grad_norm": 31.625, "grad_norm_var": 3.370247395833333, "learning_rate": 0.0001, "loss": 7.3748, "loss/crossentropy": 2.1186861246824265, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.1846270103007555, "step": 14440 }, { "epoch": 0.36125, "grad_norm": 31.125, "grad_norm_var": 2.378059895833333, "learning_rate": 0.0001, "loss": 7.4001, "loss/crossentropy": 2.1338623888790607, "loss/hidden": 3.42890625, "loss/jsd": 0.0, "loss/logits": 0.1889139147475362, "step": 14450 }, { "epoch": 0.3615, "grad_norm": 29.125, "grad_norm_var": 5.045572916666667, "learning_rate": 0.0001, "loss": 7.3355, "loss/crossentropy": 1.9586334988474845, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.1830888209864497, "step": 14460 }, { "epoch": 0.36175, "grad_norm": 29.375, "grad_norm_var": 11.797916666666667, "learning_rate": 0.0001, "loss": 7.4646, "loss/crossentropy": 2.0122861742973326, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.2004035959020257, "step": 14470 }, { "epoch": 0.362, "grad_norm": 32.0, "grad_norm_var": 11.684309895833334, "learning_rate": 0.0001, "loss": 7.4154, "loss/crossentropy": 2.1907495334744453, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.2014760635793209, "step": 14480 }, { "epoch": 0.36225, "grad_norm": 32.75, "grad_norm_var": 14.495247395833333, "learning_rate": 0.0001, "loss": 7.3255, "loss/crossentropy": 2.0414177522063257, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.17212325502187015, "step": 14490 }, { "epoch": 0.3625, "grad_norm": 28.875, "grad_norm_var": 9.85390625, "learning_rate": 0.0001, "loss": 7.4188, "loss/crossentropy": 1.875846453011036, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.18002864755690098, "step": 14500 }, { "epoch": 0.36275, "grad_norm": 32.5, "grad_norm_var": 1.7400390625, "learning_rate": 0.0001, "loss": 7.3819, "loss/crossentropy": 2.2282170712947846, "loss/hidden": 3.43359375, "loss/jsd": 0.0, "loss/logits": 0.20617649108171462, "step": 14510 }, { "epoch": 0.363, "grad_norm": 29.0, "grad_norm_var": 26.622330729166666, "learning_rate": 0.0001, "loss": 7.3494, "loss/crossentropy": 2.1207677230238913, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.18537767957895995, "step": 14520 }, { "epoch": 0.36325, "grad_norm": 28.5, "grad_norm_var": 1.15, "learning_rate": 0.0001, "loss": 7.2906, "loss/crossentropy": 2.1179309889674185, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.18408581465482712, "step": 14530 }, { "epoch": 0.3635, "grad_norm": 31.125, "grad_norm_var": 2.851822916666667, "learning_rate": 0.0001, "loss": 7.3786, "loss/crossentropy": 2.0486052967607975, "loss/hidden": 3.35859375, "loss/jsd": 0.0, "loss/logits": 0.17906265445053576, "step": 14540 }, { "epoch": 0.36375, "grad_norm": 33.0, "grad_norm_var": 3.184830729166667, "learning_rate": 0.0001, "loss": 7.3423, "loss/crossentropy": 2.301698251068592, "loss/hidden": 3.3765625, "loss/jsd": 0.0, "loss/logits": 0.19077361291274428, "step": 14550 }, { "epoch": 0.364, "grad_norm": 33.25, "grad_norm_var": 2.2134765625, "learning_rate": 0.0001, "loss": 7.3047, "loss/crossentropy": 2.074369618296623, "loss/hidden": 3.293359375, "loss/jsd": 0.0, "loss/logits": 0.17446096520870924, "step": 14560 }, { "epoch": 0.36425, "grad_norm": 32.5, "grad_norm_var": 2.6910807291666665, "learning_rate": 0.0001, "loss": 7.412, "loss/crossentropy": 2.070549990236759, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.2114204354584217, "step": 14570 }, { "epoch": 0.3645, "grad_norm": 31.875, "grad_norm_var": 31.843489583333334, "learning_rate": 0.0001, "loss": 7.4461, "loss/crossentropy": 2.2021229028701783, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.1946948293596506, "step": 14580 }, { "epoch": 0.36475, "grad_norm": 28.875, "grad_norm_var": 32.17916666666667, "learning_rate": 0.0001, "loss": 7.33, "loss/crossentropy": 2.07574619948864, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.199923849850893, "step": 14590 }, { "epoch": 0.365, "grad_norm": 29.625, "grad_norm_var": 1.3020833333333333, "learning_rate": 0.0001, "loss": 7.3285, "loss/crossentropy": 2.236181080341339, "loss/hidden": 3.295703125, "loss/jsd": 0.0, "loss/logits": 0.18559221122413874, "step": 14600 }, { "epoch": 0.36525, "grad_norm": 27.875, "grad_norm_var": 1.6979166666666667, "learning_rate": 0.0001, "loss": 7.4122, "loss/crossentropy": 2.029696011543274, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.18932779133319855, "step": 14610 }, { "epoch": 0.3655, "grad_norm": 29.375, "grad_norm_var": 3.17265625, "learning_rate": 0.0001, "loss": 7.3296, "loss/crossentropy": 2.0676317110657694, "loss/hidden": 3.308984375, "loss/jsd": 0.0, "loss/logits": 0.1811735849827528, "step": 14620 }, { "epoch": 0.36575, "grad_norm": 31.5, "grad_norm_var": 3.8629557291666665, "learning_rate": 0.0001, "loss": 7.3105, "loss/crossentropy": 2.1635128699243067, "loss/hidden": 3.33671875, "loss/jsd": 0.0, "loss/logits": 0.19529270604252816, "step": 14630 }, { "epoch": 0.366, "grad_norm": 28.25, "grad_norm_var": 3.3955729166666666, "learning_rate": 0.0001, "loss": 7.3913, "loss/crossentropy": 2.0359733670949938, "loss/hidden": 3.5, "loss/jsd": 0.0, "loss/logits": 0.20129154790192844, "step": 14640 }, { "epoch": 0.36625, "grad_norm": 30.5, "grad_norm_var": 3.72265625, "learning_rate": 0.0001, "loss": 7.3418, "loss/crossentropy": 1.895400796085596, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.17521814415231346, "step": 14650 }, { "epoch": 0.3665, "grad_norm": 30.375, "grad_norm_var": 1.7302083333333333, "learning_rate": 0.0001, "loss": 7.3042, "loss/crossentropy": 2.0394834615290165, "loss/hidden": 3.21953125, "loss/jsd": 0.0, "loss/logits": 0.17233089366927742, "step": 14660 }, { "epoch": 0.36675, "grad_norm": 28.25, "grad_norm_var": 36.44479166666667, "learning_rate": 0.0001, "loss": 7.3335, "loss/crossentropy": 1.8397274687886238, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.17788148634135723, "step": 14670 }, { "epoch": 0.367, "grad_norm": 27.875, "grad_norm_var": 38.53639322916667, "learning_rate": 0.0001, "loss": 7.3786, "loss/crossentropy": 2.2247194588184356, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.19322688207030297, "step": 14680 }, { "epoch": 0.36725, "grad_norm": 40.0, "grad_norm_var": 13.45390625, "learning_rate": 0.0001, "loss": 7.3179, "loss/crossentropy": 1.9937927357852459, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.17893323805183173, "step": 14690 }, { "epoch": 0.3675, "grad_norm": 32.75, "grad_norm_var": 11.25, "learning_rate": 0.0001, "loss": 7.2325, "loss/crossentropy": 2.0396489590406417, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.1912191865965724, "step": 14700 }, { "epoch": 0.36775, "grad_norm": 33.0, "grad_norm_var": 7.471875, "learning_rate": 0.0001, "loss": 7.4509, "loss/crossentropy": 2.1277743950486183, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.20408784411847591, "step": 14710 }, { "epoch": 0.368, "grad_norm": 30.75, "grad_norm_var": 8.546875, "learning_rate": 0.0001, "loss": 7.3855, "loss/crossentropy": 2.2228190809488297, "loss/hidden": 3.362890625, "loss/jsd": 0.0, "loss/logits": 0.1884877322241664, "step": 14720 }, { "epoch": 0.36825, "grad_norm": 28.625, "grad_norm_var": 15.142708333333333, "learning_rate": 0.0001, "loss": 7.4076, "loss/crossentropy": 2.1207469016313554, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.17780339773744344, "step": 14730 }, { "epoch": 0.3685, "grad_norm": 31.125, "grad_norm_var": 3.004622395833333, "learning_rate": 0.0001, "loss": 7.3209, "loss/crossentropy": 2.0066810354590414, "loss/hidden": 3.519921875, "loss/jsd": 0.0, "loss/logits": 0.20999562088400126, "step": 14740 }, { "epoch": 0.36875, "grad_norm": 38.25, "grad_norm_var": 15.4259765625, "learning_rate": 0.0001, "loss": 7.4197, "loss/crossentropy": 2.0302488803863525, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.1997592320665717, "step": 14750 }, { "epoch": 0.369, "grad_norm": 32.5, "grad_norm_var": 22.669791666666665, "learning_rate": 0.0001, "loss": 7.3414, "loss/crossentropy": 2.006557123363018, "loss/hidden": 3.333203125, "loss/jsd": 0.0, "loss/logits": 0.18148684445768595, "step": 14760 }, { "epoch": 0.36925, "grad_norm": 29.5, "grad_norm_var": 22.362434895833335, "learning_rate": 0.0001, "loss": 7.347, "loss/crossentropy": 2.07253782749176, "loss/hidden": 3.274609375, "loss/jsd": 0.0, "loss/logits": 0.17341973297297955, "step": 14770 }, { "epoch": 0.3695, "grad_norm": 30.75, "grad_norm_var": 17.227018229166667, "learning_rate": 0.0001, "loss": 7.3839, "loss/crossentropy": 2.056871312856674, "loss/hidden": 3.367578125, "loss/jsd": 0.0, "loss/logits": 0.21048452276736498, "step": 14780 }, { "epoch": 0.36975, "grad_norm": 27.875, "grad_norm_var": 38.5947265625, "learning_rate": 0.0001, "loss": 7.4979, "loss/crossentropy": 2.1910998940467836, "loss/hidden": 3.480859375, "loss/jsd": 0.0, "loss/logits": 0.20320441015064716, "step": 14790 }, { "epoch": 0.37, "grad_norm": 31.25, "grad_norm_var": 2.081788902243793e+18, "learning_rate": 0.0001, "loss": 7.3396, "loss/crossentropy": 2.0854848250746727, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.19095470141619444, "step": 14800 }, { "epoch": 0.37025, "grad_norm": 34.75, "grad_norm_var": 2.0817889032778286e+18, "learning_rate": 0.0001, "loss": 7.349, "loss/crossentropy": 2.024295690655708, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.2073504414409399, "step": 14810 }, { "epoch": 0.3705, "grad_norm": 30.375, "grad_norm_var": 8.116080729166667, "learning_rate": 0.0001, "loss": 7.1984, "loss/crossentropy": 1.9738368421792984, "loss/hidden": 3.352734375, "loss/jsd": 0.0, "loss/logits": 0.175398519821465, "step": 14820 }, { "epoch": 0.37075, "grad_norm": 34.0, "grad_norm_var": 3.95390625, "learning_rate": 0.0001, "loss": 7.3269, "loss/crossentropy": 2.0109583623707294, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.1902273640036583, "step": 14830 }, { "epoch": 0.371, "grad_norm": 30.25, "grad_norm_var": 3.410416666666667, "learning_rate": 0.0001, "loss": 7.3309, "loss/crossentropy": 2.0899639569222925, "loss/hidden": 3.444921875, "loss/jsd": 0.0, "loss/logits": 0.18828139845281838, "step": 14840 }, { "epoch": 0.37125, "grad_norm": 31.375, "grad_norm_var": 51.5087890625, "learning_rate": 0.0001, "loss": 7.2761, "loss/crossentropy": 2.0561746567487718, "loss/hidden": 3.308203125, "loss/jsd": 0.0, "loss/logits": 0.17576684867963194, "step": 14850 }, { "epoch": 0.3715, "grad_norm": 32.25, "grad_norm_var": 54.873372395833336, "learning_rate": 0.0001, "loss": 7.2875, "loss/crossentropy": 2.1231576301157475, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.1990961253643036, "step": 14860 }, { "epoch": 0.37175, "grad_norm": 34.25, "grad_norm_var": 29.984309895833334, "learning_rate": 0.0001, "loss": 7.3424, "loss/crossentropy": 2.1051357120275496, "loss/hidden": 3.29453125, "loss/jsd": 0.0, "loss/logits": 0.17979057859629394, "step": 14870 }, { "epoch": 0.372, "grad_norm": 28.25, "grad_norm_var": 19850.327018229167, "learning_rate": 0.0001, "loss": 7.3823, "loss/crossentropy": 2.3242007076740263, "loss/hidden": 3.49609375, "loss/jsd": 0.0, "loss/logits": 0.2740444682538509, "step": 14880 }, { "epoch": 0.37225, "grad_norm": 27.25, "grad_norm_var": 35.0625, "learning_rate": 0.0001, "loss": 7.2273, "loss/crossentropy": 2.143503928184509, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.19005000293254853, "step": 14890 }, { "epoch": 0.3725, "grad_norm": 34.0, "grad_norm_var": 38.23125, "learning_rate": 0.0001, "loss": 7.2655, "loss/crossentropy": 2.1128626547753813, "loss/hidden": 3.40703125, "loss/jsd": 0.0, "loss/logits": 0.18479138296097516, "step": 14900 }, { "epoch": 0.37275, "grad_norm": 31.0, "grad_norm_var": 26.7478515625, "learning_rate": 0.0001, "loss": 7.3153, "loss/crossentropy": 1.860854334384203, "loss/hidden": 3.255859375, "loss/jsd": 0.0, "loss/logits": 0.1592037882655859, "step": 14910 }, { "epoch": 0.373, "grad_norm": 29.625, "grad_norm_var": 30.737239583333334, "learning_rate": 0.0001, "loss": 7.287, "loss/crossentropy": 2.067621612548828, "loss/hidden": 3.468359375, "loss/jsd": 0.0, "loss/logits": 0.1897974604740739, "step": 14920 }, { "epoch": 0.37325, "grad_norm": 29.0, "grad_norm_var": 51.01223958333333, "learning_rate": 0.0001, "loss": 7.4038, "loss/crossentropy": 2.0471679329872132, "loss/hidden": 3.41484375, "loss/jsd": 0.0, "loss/logits": 0.18869173359125851, "step": 14930 }, { "epoch": 0.3735, "grad_norm": 40.25, "grad_norm_var": 42.828059895833334, "learning_rate": 0.0001, "loss": 7.2638, "loss/crossentropy": 2.0969312518835066, "loss/hidden": 3.308984375, "loss/jsd": 0.0, "loss/logits": 0.19103059088811278, "step": 14940 }, { "epoch": 0.37375, "grad_norm": 27.625, "grad_norm_var": 16.3625, "learning_rate": 0.0001, "loss": 7.3022, "loss/crossentropy": 2.208647185564041, "loss/hidden": 3.323046875, "loss/jsd": 0.0, "loss/logits": 0.1936396975070238, "step": 14950 }, { "epoch": 0.374, "grad_norm": 28.625, "grad_norm_var": 16.837239583333332, "learning_rate": 0.0001, "loss": 7.2414, "loss/crossentropy": 2.1974997609853744, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.19469811543822288, "step": 14960 }, { "epoch": 0.37425, "grad_norm": 29.625, "grad_norm_var": 14.108072916666666, "learning_rate": 0.0001, "loss": 7.2849, "loss/crossentropy": 2.134792809188366, "loss/hidden": 3.20625, "loss/jsd": 0.0, "loss/logits": 0.17098608147352934, "step": 14970 }, { "epoch": 0.3745, "grad_norm": 33.25, "grad_norm_var": 27.1181640625, "learning_rate": 0.0001, "loss": 7.3014, "loss/crossentropy": 2.1276439666748046, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.18990135621279478, "step": 14980 }, { "epoch": 0.37475, "grad_norm": 29.75, "grad_norm_var": 16.033072916666665, "learning_rate": 0.0001, "loss": 7.3335, "loss/crossentropy": 2.0856949634850026, "loss/hidden": 3.3265625, "loss/jsd": 0.0, "loss/logits": 0.18325317203998565, "step": 14990 }, { "epoch": 0.375, "grad_norm": 31.625, "grad_norm_var": 3.321875, "learning_rate": 0.0001, "loss": 7.3642, "loss/crossentropy": 2.1136090487241743, "loss/hidden": 3.523828125, "loss/jsd": 0.0, "loss/logits": 0.20829910095781087, "step": 15000 } ], "logging_steps": 10, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.28626504801321e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }