diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,77033 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7, + "eval_steps": 2000, + "global_step": 70000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001, + "grad_norm": 62.5, + "learning_rate": 3.27e-05, + "loss": 96.4218, + "loss/aux_loss": 0.06346827149391174, + "loss/crossentropy": 10.520584106445312, + "loss/logits": 8.62325460910797, + "step": 10 + }, + { + "epoch": 0.0002, + "grad_norm": 64.0, + "grad_norm_var": 47.05416666666667, + "learning_rate": 3.54e-05, + "loss": 90.9983, + "loss/aux_loss": 0.05445752870291472, + "loss/crossentropy": 9.918135738372802, + "loss/logits": 8.018545007705688, + "step": 20 + }, + { + "epoch": 0.0003, + "grad_norm": 60.75, + "grad_norm_var": 3.05390625, + "learning_rate": 3.81e-05, + "loss": 84.3016, + "loss/aux_loss": 0.052422930113971235, + "loss/crossentropy": 9.341589832305909, + "loss/logits": 7.614971446990967, + "step": 30 + }, + { + "epoch": 0.0004, + "grad_norm": 59.0, + "grad_norm_var": 264.72682291666666, + "learning_rate": 4.08e-05, + "loss": 78.0706, + "loss/aux_loss": 0.05138566605746746, + "loss/crossentropy": 8.834511041641235, + "loss/logits": 6.946270298957825, + "step": 40 + }, + { + "epoch": 0.0005, + "grad_norm": 54.0, + "grad_norm_var": 295.6518229166667, + "learning_rate": 4.35e-05, + "loss": 73.2196, + "loss/aux_loss": 0.05028745252639055, + "loss/crossentropy": 8.335509753227234, + "loss/logits": 6.576072025299072, + "step": 50 + }, + { + "epoch": 0.0006, + "grad_norm": 49.5, + "grad_norm_var": 12.49765625, + "learning_rate": 4.62e-05, + "loss": 69.3149, + "loss/aux_loss": 0.04974018670618534, + "loss/crossentropy": 8.07826225757599, + "loss/logits": 6.17107310295105, + "step": 60 + }, + { + "epoch": 0.0007, + "grad_norm": 33.5, + "grad_norm_var": 37.90182291666667, + "learning_rate": 4.89e-05, + "loss": 65.9331, + "loss/aux_loss": 0.049574922397732736, + "loss/crossentropy": 7.850642085075378, + "loss/logits": 5.917052555084228, + "step": 70 + }, + { + "epoch": 0.0008, + "grad_norm": 24.75, + "grad_norm_var": 683.9268229166667, + "learning_rate": 5.16e-05, + "loss": 63.4084, + "loss/aux_loss": 0.0500051811337471, + "loss/crossentropy": 7.550819325447082, + "loss/logits": 5.728990888595581, + "step": 80 + }, + { + "epoch": 0.0009, + "grad_norm": 65.5, + "grad_norm_var": 863.5828125, + "learning_rate": 5.429999999999999e-05, + "loss": 61.9785, + "loss/aux_loss": 0.05104184336960316, + "loss/crossentropy": 7.397841739654541, + "loss/logits": 5.37682204246521, + "step": 90 + }, + { + "epoch": 0.001, + "grad_norm": 53.25, + "grad_norm_var": 342.37433268229165, + "learning_rate": 5.6999999999999996e-05, + "loss": 60.8601, + "loss/aux_loss": 0.05242748130112886, + "loss/crossentropy": 7.235720539093018, + "loss/logits": 5.34943995475769, + "step": 100 + }, + { + "epoch": 0.0011, + "grad_norm": 68.5, + "grad_norm_var": 300.8374348958333, + "learning_rate": 5.97e-05, + "loss": 59.9541, + "loss/aux_loss": 0.05149786453694105, + "loss/crossentropy": 7.193077087402344, + "loss/logits": 5.278037166595459, + "step": 110 + }, + { + "epoch": 0.0012, + "grad_norm": 29.375, + "grad_norm_var": 219.08326822916666, + "learning_rate": 6.24e-05, + "loss": 58.794, + "loss/aux_loss": 0.05140427742153406, + "loss/crossentropy": 7.130345010757447, + "loss/logits": 5.037704062461853, + "step": 120 + }, + { + "epoch": 0.0013, + "grad_norm": 15.0625, + "grad_norm_var": 210.59972330729167, + "learning_rate": 6.51e-05, + "loss": 58.1276, + "loss/aux_loss": 0.050786581449210645, + "loss/crossentropy": 6.900083208084107, + "loss/logits": 5.119465160369873, + "step": 130 + }, + { + "epoch": 0.0014, + "grad_norm": 15.4375, + "grad_norm_var": 242.2203125, + "learning_rate": 6.780000000000001e-05, + "loss": 57.251, + "loss/aux_loss": 0.050472350977361205, + "loss/crossentropy": 6.90623025894165, + "loss/logits": 4.989370441436767, + "step": 140 + }, + { + "epoch": 0.0015, + "grad_norm": 23.75, + "grad_norm_var": 53.895686848958334, + "learning_rate": 7.049999999999999e-05, + "loss": 55.9011, + "loss/aux_loss": 0.05010317321866751, + "loss/crossentropy": 6.878646898269653, + "loss/logits": 4.966757416725159, + "step": 150 + }, + { + "epoch": 0.0016, + "grad_norm": 22.875, + "grad_norm_var": 85.49140625, + "learning_rate": 7.32e-05, + "loss": 55.0174, + "loss/aux_loss": 0.05009230561554432, + "loss/crossentropy": 6.685335445404053, + "loss/logits": 4.792002511024475, + "step": 160 + }, + { + "epoch": 0.0017, + "grad_norm": 24.5, + "grad_norm_var": 172.98014322916666, + "learning_rate": 7.589999999999999e-05, + "loss": 54.2538, + "loss/aux_loss": 0.049807760119438174, + "loss/crossentropy": 6.676994824409485, + "loss/logits": 4.751176404953003, + "step": 170 + }, + { + "epoch": 0.0018, + "grad_norm": 49.0, + "grad_norm_var": 130.01295572916666, + "learning_rate": 7.86e-05, + "loss": 53.571, + "loss/aux_loss": 0.049537939578294755, + "loss/crossentropy": 6.596286082267762, + "loss/logits": 4.717176723480224, + "step": 180 + }, + { + "epoch": 0.0019, + "grad_norm": 22.0, + "grad_norm_var": 68.81145833333333, + "learning_rate": 8.13e-05, + "loss": 52.313, + "loss/aux_loss": 0.049018622189760205, + "loss/crossentropy": 6.518313908576966, + "loss/logits": 4.740558981895447, + "step": 190 + }, + { + "epoch": 0.002, + "grad_norm": 23.125, + "grad_norm_var": 39.942708333333336, + "learning_rate": 8.4e-05, + "loss": 51.4724, + "loss/aux_loss": 0.04901752769947052, + "loss/crossentropy": 6.31372344493866, + "loss/logits": 4.539055609703064, + "step": 200 + }, + { + "epoch": 0.0021, + "grad_norm": 25.5, + "grad_norm_var": 51.50584309895833, + "learning_rate": 8.67e-05, + "loss": 50.4864, + "loss/aux_loss": 0.049004881829023364, + "loss/crossentropy": 6.431901216506958, + "loss/logits": 4.541516590118408, + "step": 210 + }, + { + "epoch": 0.0022, + "grad_norm": 23.75, + "grad_norm_var": 45.890625, + "learning_rate": 8.94e-05, + "loss": 49.4348, + "loss/aux_loss": 0.0489825276657939, + "loss/crossentropy": 6.201497769355774, + "loss/logits": 4.2463623762130736, + "step": 220 + }, + { + "epoch": 0.0023, + "grad_norm": 32.75, + "grad_norm_var": 33.282291666666666, + "learning_rate": 9.21e-05, + "loss": 48.7892, + "loss/aux_loss": 0.04863291662186384, + "loss/crossentropy": 6.257090902328491, + "loss/logits": 4.189764153957367, + "step": 230 + }, + { + "epoch": 0.0024, + "grad_norm": 33.0, + "grad_norm_var": 26.2509765625, + "learning_rate": 9.479999999999999e-05, + "loss": 47.4078, + "loss/aux_loss": 0.04870687611401081, + "loss/crossentropy": 6.129881906509399, + "loss/logits": 4.127840185165406, + "step": 240 + }, + { + "epoch": 0.0025, + "grad_norm": 19.375, + "grad_norm_var": 26.835416666666667, + "learning_rate": 9.75e-05, + "loss": 46.931, + "loss/aux_loss": 0.04863628149032593, + "loss/crossentropy": 5.8494936466217045, + "loss/logits": 4.019280314445496, + "step": 250 + }, + { + "epoch": 0.0026, + "grad_norm": 37.75, + "grad_norm_var": 30.508072916666666, + "learning_rate": 0.0001002, + "loss": 45.6479, + "loss/aux_loss": 0.04853515811264515, + "loss/crossentropy": 5.952807331085205, + "loss/logits": 4.096065545082093, + "step": 260 + }, + { + "epoch": 0.0027, + "grad_norm": 29.25, + "grad_norm_var": 42.109309895833334, + "learning_rate": 0.0001029, + "loss": 45.4552, + "loss/aux_loss": 0.04864779394119978, + "loss/crossentropy": 5.901494193077087, + "loss/logits": 4.014258062839508, + "step": 270 + }, + { + "epoch": 0.0028, + "grad_norm": 30.75, + "grad_norm_var": 31.6494140625, + "learning_rate": 0.00010560000000000002, + "loss": 45.0348, + "loss/aux_loss": 0.04867569580674171, + "loss/crossentropy": 5.769275331497193, + "loss/logits": 3.9283937215805054, + "step": 280 + }, + { + "epoch": 0.0029, + "grad_norm": 24.0, + "grad_norm_var": 49.4306640625, + "learning_rate": 0.00010829999999999999, + "loss": 44.4484, + "loss/aux_loss": 0.048759896866977216, + "loss/crossentropy": 5.552310681343078, + "loss/logits": 3.828005838394165, + "step": 290 + }, + { + "epoch": 0.003, + "grad_norm": 23.5, + "grad_norm_var": 37.06666666666667, + "learning_rate": 0.00011099999999999999, + "loss": 44.2056, + "loss/aux_loss": 0.04860832653939724, + "loss/crossentropy": 5.736793255805969, + "loss/logits": 3.8119420647621154, + "step": 300 + }, + { + "epoch": 0.0031, + "grad_norm": 34.75, + "grad_norm_var": 25.449739583333333, + "learning_rate": 0.00011369999999999999, + "loss": 43.1406, + "loss/aux_loss": 0.04863522592931986, + "loss/crossentropy": 5.70694375038147, + "loss/logits": 3.7460230350494386, + "step": 310 + }, + { + "epoch": 0.0032, + "grad_norm": 28.75, + "grad_norm_var": 23.142643229166666, + "learning_rate": 0.0001164, + "loss": 43.2674, + "loss/aux_loss": 0.048594312928617, + "loss/crossentropy": 5.645468616485596, + "loss/logits": 3.795237183570862, + "step": 320 + }, + { + "epoch": 0.0033, + "grad_norm": 25.125, + "grad_norm_var": 27.248893229166665, + "learning_rate": 0.0001191, + "loss": 42.4531, + "loss/aux_loss": 0.048580970242619516, + "loss/crossentropy": 5.573257780075073, + "loss/logits": 3.625744652748108, + "step": 330 + }, + { + "epoch": 0.0034, + "grad_norm": 27.0, + "grad_norm_var": 18.6728515625, + "learning_rate": 0.00012179999999999999, + "loss": 42.3091, + "loss/aux_loss": 0.04864873345941305, + "loss/crossentropy": 5.666665482521057, + "loss/logits": 3.6989678740501404, + "step": 340 + }, + { + "epoch": 0.0035, + "grad_norm": 25.625, + "grad_norm_var": 14.792122395833333, + "learning_rate": 0.0001245, + "loss": 41.7634, + "loss/aux_loss": 0.048494835197925565, + "loss/crossentropy": 5.532446098327637, + "loss/logits": 3.6617549777030947, + "step": 350 + }, + { + "epoch": 0.0036, + "grad_norm": 27.125, + "grad_norm_var": 36.68274739583333, + "learning_rate": 0.0001272, + "loss": 41.3748, + "loss/aux_loss": 0.04851998519152403, + "loss/crossentropy": 5.461347937583923, + "loss/logits": 3.681316375732422, + "step": 360 + }, + { + "epoch": 0.0037, + "grad_norm": 32.25, + "grad_norm_var": 62.7072265625, + "learning_rate": 0.0001299, + "loss": 41.0029, + "loss/aux_loss": 0.0486336350440979, + "loss/crossentropy": 5.420117592811584, + "loss/logits": 3.614268946647644, + "step": 370 + }, + { + "epoch": 0.0038, + "grad_norm": 25.375, + "grad_norm_var": 29.937955729166667, + "learning_rate": 0.0001326, + "loss": 40.4483, + "loss/aux_loss": 0.048567987978458405, + "loss/crossentropy": 5.519010901451111, + "loss/logits": 3.4499247074127197, + "step": 380 + }, + { + "epoch": 0.0039, + "grad_norm": 28.375, + "grad_norm_var": 14.257291666666667, + "learning_rate": 0.0001353, + "loss": 39.9714, + "loss/aux_loss": 0.04847833849489689, + "loss/crossentropy": 5.376910948753357, + "loss/logits": 3.4180081248283387, + "step": 390 + }, + { + "epoch": 0.004, + "grad_norm": 25.25, + "grad_norm_var": 17.4041015625, + "learning_rate": 0.00013800000000000002, + "loss": 39.9151, + "loss/aux_loss": 0.048561175167560575, + "loss/crossentropy": 5.305628776550293, + "loss/logits": 3.4211841225624084, + "step": 400 + }, + { + "epoch": 0.0041, + "grad_norm": 26.625, + "grad_norm_var": 23.691080729166668, + "learning_rate": 0.00014069999999999998, + "loss": 39.9258, + "loss/aux_loss": 0.0485720319673419, + "loss/crossentropy": 5.2558026790618895, + "loss/logits": 3.477493929862976, + "step": 410 + }, + { + "epoch": 0.0042, + "grad_norm": 24.0, + "grad_norm_var": 22.3181640625, + "learning_rate": 0.0001434, + "loss": 39.5536, + "loss/aux_loss": 0.04851338397711515, + "loss/crossentropy": 5.359068250656128, + "loss/logits": 3.373235845565796, + "step": 420 + }, + { + "epoch": 0.0043, + "grad_norm": 23.25, + "grad_norm_var": 25.664518229166667, + "learning_rate": 0.00014609999999999997, + "loss": 38.9821, + "loss/aux_loss": 0.04848247561603784, + "loss/crossentropy": 5.391582441329956, + "loss/logits": 3.3668909788131716, + "step": 430 + }, + { + "epoch": 0.0044, + "grad_norm": 29.125, + "grad_norm_var": 27.742122395833334, + "learning_rate": 0.00014879999999999998, + "loss": 38.4838, + "loss/aux_loss": 0.048559782840311524, + "loss/crossentropy": 5.219124293327331, + "loss/logits": 3.2023038268089294, + "step": 440 + }, + { + "epoch": 0.0045, + "grad_norm": 24.125, + "grad_norm_var": 13.058268229166666, + "learning_rate": 0.0001515, + "loss": 37.7637, + "loss/aux_loss": 0.048486584424972536, + "loss/crossentropy": 5.257930779457093, + "loss/logits": 3.2549287557601927, + "step": 450 + }, + { + "epoch": 0.0046, + "grad_norm": 26.0, + "grad_norm_var": 13.198372395833333, + "learning_rate": 0.00015419999999999998, + "loss": 37.8807, + "loss/aux_loss": 0.048509182222187516, + "loss/crossentropy": 5.24229850769043, + "loss/logits": 3.2370536804199217, + "step": 460 + }, + { + "epoch": 0.0047, + "grad_norm": 26.375, + "grad_norm_var": 10.068489583333333, + "learning_rate": 0.0001569, + "loss": 37.1278, + "loss/aux_loss": 0.048433386348187925, + "loss/crossentropy": 5.20349223613739, + "loss/logits": 3.114039051532745, + "step": 470 + }, + { + "epoch": 0.0048, + "grad_norm": 20.875, + "grad_norm_var": 11.547916666666667, + "learning_rate": 0.0001596, + "loss": 36.9174, + "loss/aux_loss": 0.04847547374665737, + "loss/crossentropy": 4.9283219337463375, + "loss/logits": 3.213498628139496, + "step": 480 + }, + { + "epoch": 0.0049, + "grad_norm": 21.625, + "grad_norm_var": 7.476041666666666, + "learning_rate": 0.0001623, + "loss": 36.5318, + "loss/aux_loss": 0.048416960053145885, + "loss/crossentropy": 5.040558886528015, + "loss/logits": 3.13973091840744, + "step": 490 + }, + { + "epoch": 0.005, + "grad_norm": 22.5, + "grad_norm_var": 8.258072916666666, + "learning_rate": 0.000165, + "loss": 36.6402, + "loss/aux_loss": 0.048433396965265274, + "loss/crossentropy": 5.000589728355408, + "loss/logits": 3.196159243583679, + "step": 500 + }, + { + "epoch": 0.0051, + "grad_norm": 25.375, + "grad_norm_var": 17.601497395833334, + "learning_rate": 0.0001677, + "loss": 36.0775, + "loss/aux_loss": 0.048407428339123725, + "loss/crossentropy": 5.022399640083313, + "loss/logits": 3.1573411226272583, + "step": 510 + }, + { + "epoch": 0.0052, + "grad_norm": 26.25, + "grad_norm_var": 11.326497395833334, + "learning_rate": 0.0001704, + "loss": 35.8341, + "loss/aux_loss": 0.04850205350667238, + "loss/crossentropy": 5.029168057441711, + "loss/logits": 3.0023858308792115, + "step": 520 + }, + { + "epoch": 0.0053, + "grad_norm": 34.75, + "grad_norm_var": 26.2072265625, + "learning_rate": 0.0001731, + "loss": 35.7083, + "loss/aux_loss": 0.048469410836696626, + "loss/crossentropy": 4.937405061721802, + "loss/logits": 3.096103620529175, + "step": 530 + }, + { + "epoch": 0.0054, + "grad_norm": 26.5, + "grad_norm_var": 26.883333333333333, + "learning_rate": 0.00017580000000000002, + "loss": 35.1926, + "loss/aux_loss": 0.04851417765021324, + "loss/crossentropy": 4.968003535270691, + "loss/logits": 3.037220096588135, + "step": 540 + }, + { + "epoch": 0.0055, + "grad_norm": 21.75, + "grad_norm_var": 3.1494140625, + "learning_rate": 0.0001785, + "loss": 34.8693, + "loss/aux_loss": 0.048468691483139995, + "loss/crossentropy": 4.928069758415222, + "loss/logits": 3.0163326144218443, + "step": 550 + }, + { + "epoch": 0.0056, + "grad_norm": 21.125, + "grad_norm_var": 40.6625, + "learning_rate": 0.0001812, + "loss": 34.8376, + "loss/aux_loss": 0.04853217788040638, + "loss/crossentropy": 4.8081374049186705, + "loss/logits": 2.9309885263442994, + "step": 560 + }, + { + "epoch": 0.0057, + "grad_norm": 21.125, + "grad_norm_var": 9.170572916666666, + "learning_rate": 0.00018389999999999997, + "loss": 34.4132, + "loss/aux_loss": 0.04839835949242115, + "loss/crossentropy": 4.890771484375, + "loss/logits": 2.9162360787391663, + "step": 570 + }, + { + "epoch": 0.0058, + "grad_norm": 24.75, + "grad_norm_var": 5.412955729166667, + "learning_rate": 0.00018659999999999998, + "loss": 33.9858, + "loss/aux_loss": 0.04839918464422226, + "loss/crossentropy": 4.828824257850647, + "loss/logits": 2.9052307963371278, + "step": 580 + }, + { + "epoch": 0.0059, + "grad_norm": 102.5, + "grad_norm_var": 448.30104166666666, + "learning_rate": 0.0001893, + "loss": 34.3014, + "loss/aux_loss": 0.04844543803483248, + "loss/crossentropy": 4.836876845359802, + "loss/logits": 2.9816999673843383, + "step": 590 + }, + { + "epoch": 0.006, + "grad_norm": 23.0, + "grad_norm_var": 839.7497395833333, + "learning_rate": 0.00019199999999999998, + "loss": 34.1366, + "loss/aux_loss": 0.0485780967399478, + "loss/crossentropy": 4.918647742271423, + "loss/logits": 3.028424918651581, + "step": 600 + }, + { + "epoch": 0.0061, + "grad_norm": 19.5, + "grad_norm_var": 14.370247395833333, + "learning_rate": 0.0001947, + "loss": 33.7583, + "loss/aux_loss": 0.04842391442507506, + "loss/crossentropy": 4.706963205337525, + "loss/logits": 2.918571615219116, + "step": 610 + }, + { + "epoch": 0.0062, + "grad_norm": 16.0, + "grad_norm_var": 6.815625, + "learning_rate": 0.0001974, + "loss": 33.1779, + "loss/aux_loss": 0.04836068209260702, + "loss/crossentropy": 4.702796244621277, + "loss/logits": 2.8033588767051696, + "step": 620 + }, + { + "epoch": 0.0063, + "grad_norm": 24.75, + "grad_norm_var": 9.648030598958334, + "learning_rate": 0.00020009999999999998, + "loss": 32.6916, + "loss/aux_loss": 0.04836873207241297, + "loss/crossentropy": 4.663065433502197, + "loss/logits": 2.7192453861236574, + "step": 630 + }, + { + "epoch": 0.0064, + "grad_norm": 28.5, + "grad_norm_var": 13.41875, + "learning_rate": 0.0002028, + "loss": 32.5747, + "loss/aux_loss": 0.048406153731048104, + "loss/crossentropy": 4.850475025177002, + "loss/logits": 2.844682276248932, + "step": 640 + }, + { + "epoch": 0.0065, + "grad_norm": 15.1875, + "grad_norm_var": 10.483707682291667, + "learning_rate": 0.0002055, + "loss": 32.627, + "loss/aux_loss": 0.04839936923235655, + "loss/crossentropy": 4.642724204063415, + "loss/logits": 2.7970473051071165, + "step": 650 + }, + { + "epoch": 0.0066, + "grad_norm": 17.25, + "grad_norm_var": 9.181103515625, + "learning_rate": 0.0002082, + "loss": 31.9502, + "loss/aux_loss": 0.04840312860906124, + "loss/crossentropy": 4.64382244348526, + "loss/logits": 2.7651517271995543, + "step": 660 + }, + { + "epoch": 0.0067, + "grad_norm": 22.0, + "grad_norm_var": 6.279166666666667, + "learning_rate": 0.0002109, + "loss": 31.5068, + "loss/aux_loss": 0.048387892358005044, + "loss/crossentropy": 4.641875433921814, + "loss/logits": 2.7343064188957213, + "step": 670 + }, + { + "epoch": 0.0068, + "grad_norm": 20.0, + "grad_norm_var": 5.815348307291667, + "learning_rate": 0.00021360000000000001, + "loss": 30.7349, + "loss/aux_loss": 0.04838373064994812, + "loss/crossentropy": 4.57262305021286, + "loss/logits": 2.6575307488441466, + "step": 680 + }, + { + "epoch": 0.0069, + "grad_norm": 21.25, + "grad_norm_var": 3.3708170572916667, + "learning_rate": 0.00021629999999999997, + "loss": 30.9303, + "loss/aux_loss": 0.048367501422762874, + "loss/crossentropy": 4.517275846004486, + "loss/logits": 2.7227562189102175, + "step": 690 + }, + { + "epoch": 0.007, + "grad_norm": 17.75, + "grad_norm_var": 5.214322916666666, + "learning_rate": 0.00021899999999999998, + "loss": 30.7433, + "loss/aux_loss": 0.048321043699979783, + "loss/crossentropy": 4.465225088596344, + "loss/logits": 2.628221809864044, + "step": 700 + }, + { + "epoch": 0.0071, + "grad_norm": 18.125, + "grad_norm_var": 5.217643229166667, + "learning_rate": 0.00022169999999999997, + "loss": 30.6391, + "loss/aux_loss": 0.04836261495947838, + "loss/crossentropy": 4.5598583102226256, + "loss/logits": 2.5861354947090147, + "step": 710 + }, + { + "epoch": 0.0072, + "grad_norm": 18.625, + "grad_norm_var": 14.745247395833333, + "learning_rate": 0.00022439999999999998, + "loss": 30.0185, + "loss/aux_loss": 0.048368556424975395, + "loss/crossentropy": 4.439025247097016, + "loss/logits": 2.484178614616394, + "step": 720 + }, + { + "epoch": 0.0073, + "grad_norm": 20.625, + "grad_norm_var": 8.686458333333333, + "learning_rate": 0.0002271, + "loss": 29.7983, + "loss/aux_loss": 0.048322527296841146, + "loss/crossentropy": 4.3540124773979185, + "loss/logits": 2.446344316005707, + "step": 730 + }, + { + "epoch": 0.0074, + "grad_norm": 17.375, + "grad_norm_var": 5.894124348958333, + "learning_rate": 0.00022979999999999997, + "loss": 29.5599, + "loss/aux_loss": 0.04832367654889822, + "loss/crossentropy": 4.300390827655792, + "loss/logits": 2.501788628101349, + "step": 740 + }, + { + "epoch": 0.0075, + "grad_norm": 14.5, + "grad_norm_var": 6.899739583333333, + "learning_rate": 0.00023249999999999999, + "loss": 29.1483, + "loss/aux_loss": 0.04832951854914427, + "loss/crossentropy": 4.52186803817749, + "loss/logits": 2.4985528230667113, + "step": 750 + }, + { + "epoch": 0.0076, + "grad_norm": 17.875, + "grad_norm_var": 4.874332682291667, + "learning_rate": 0.0002352, + "loss": 29.0176, + "loss/aux_loss": 0.04831754751503468, + "loss/crossentropy": 4.319947266578675, + "loss/logits": 2.37314190864563, + "step": 760 + }, + { + "epoch": 0.0077, + "grad_norm": 18.75, + "grad_norm_var": 4.414518229166666, + "learning_rate": 0.00023789999999999998, + "loss": 28.4552, + "loss/aux_loss": 0.04835870675742626, + "loss/crossentropy": 4.228903424739838, + "loss/logits": 2.382171905040741, + "step": 770 + }, + { + "epoch": 0.0078, + "grad_norm": 17.875, + "grad_norm_var": 4.404622395833333, + "learning_rate": 0.0002406, + "loss": 27.9477, + "loss/aux_loss": 0.048351569660007955, + "loss/crossentropy": 4.279499888420105, + "loss/logits": 2.3113824844360353, + "step": 780 + }, + { + "epoch": 0.0079, + "grad_norm": 14.5, + "grad_norm_var": 4.849934895833333, + "learning_rate": 0.0002433, + "loss": 28.1858, + "loss/aux_loss": 0.04831267800182104, + "loss/crossentropy": 4.268010532855987, + "loss/logits": 2.357981026172638, + "step": 790 + }, + { + "epoch": 0.008, + "grad_norm": 17.875, + "grad_norm_var": 5.742822265625, + "learning_rate": 0.000246, + "loss": 27.944, + "loss/aux_loss": 0.04835358560085297, + "loss/crossentropy": 4.222308611869812, + "loss/logits": 2.316913056373596, + "step": 800 + }, + { + "epoch": 0.0081, + "grad_norm": 14.3125, + "grad_norm_var": 5.843343098958333, + "learning_rate": 0.0002487, + "loss": 27.446, + "loss/aux_loss": 0.048313943669199944, + "loss/crossentropy": 4.209726583957672, + "loss/logits": 2.374450123310089, + "step": 810 + }, + { + "epoch": 0.0082, + "grad_norm": 20.0, + "grad_norm_var": 14.9884765625, + "learning_rate": 0.0002514, + "loss": 27.673, + "loss/aux_loss": 0.04833365194499493, + "loss/crossentropy": 4.135816490650177, + "loss/logits": 2.3683163046836855, + "step": 820 + }, + { + "epoch": 0.0083, + "grad_norm": 20.0, + "grad_norm_var": 14.966650390625, + "learning_rate": 0.0002541, + "loss": 27.0603, + "loss/aux_loss": 0.04834430795162916, + "loss/crossentropy": 4.212264752388, + "loss/logits": 2.292864066362381, + "step": 830 + }, + { + "epoch": 0.0084, + "grad_norm": 13.0, + "grad_norm_var": 6.715869140625, + "learning_rate": 0.00025679999999999995, + "loss": 26.9205, + "loss/aux_loss": 0.048344089090824126, + "loss/crossentropy": 4.146613943576813, + "loss/logits": 2.337345379590988, + "step": 840 + }, + { + "epoch": 0.0085, + "grad_norm": 38.0, + "grad_norm_var": 36.98123372395833, + "learning_rate": 0.00025949999999999997, + "loss": 26.9794, + "loss/aux_loss": 0.04836427103728056, + "loss/crossentropy": 4.061418402194977, + "loss/logits": 2.285140597820282, + "step": 850 + }, + { + "epoch": 0.0086, + "grad_norm": 16.125, + "grad_norm_var": 29.924723307291668, + "learning_rate": 0.0002622, + "loss": 26.9251, + "loss/aux_loss": 0.048360053822398184, + "loss/crossentropy": 4.185706174373626, + "loss/logits": 2.221945381164551, + "step": 860 + }, + { + "epoch": 0.0087, + "grad_norm": 16.875, + "grad_norm_var": 4.328889973958334, + "learning_rate": 0.0002649, + "loss": 26.3776, + "loss/aux_loss": 0.04831914566457272, + "loss/crossentropy": 4.120305705070495, + "loss/logits": 2.215441507101059, + "step": 870 + }, + { + "epoch": 0.0088, + "grad_norm": 14.75, + "grad_norm_var": 1.937353515625, + "learning_rate": 0.0002676, + "loss": 26.4197, + "loss/aux_loss": 0.04831472560763359, + "loss/crossentropy": 4.091458034515381, + "loss/logits": 2.2590562105178833, + "step": 880 + }, + { + "epoch": 0.0089, + "grad_norm": 17.25, + "grad_norm_var": 5.08828125, + "learning_rate": 0.00027029999999999996, + "loss": 26.5735, + "loss/aux_loss": 0.048319687880575654, + "loss/crossentropy": 4.131750977039337, + "loss/logits": 2.2907270908355715, + "step": 890 + }, + { + "epoch": 0.009, + "grad_norm": 16.875, + "grad_norm_var": 2.8785807291666665, + "learning_rate": 0.00027299999999999997, + "loss": 25.9573, + "loss/aux_loss": 0.04829480424523354, + "loss/crossentropy": 4.1353423476219175, + "loss/logits": 2.2223441004753113, + "step": 900 + }, + { + "epoch": 0.0091, + "grad_norm": 15.8125, + "grad_norm_var": 1.7628743489583334, + "learning_rate": 0.0002757, + "loss": 25.6367, + "loss/aux_loss": 0.0482923174276948, + "loss/crossentropy": 4.0344107985496525, + "loss/logits": 2.1614388108253477, + "step": 910 + }, + { + "epoch": 0.0092, + "grad_norm": 19.125, + "grad_norm_var": 5.126822916666667, + "learning_rate": 0.0002784, + "loss": 25.4326, + "loss/aux_loss": 0.0483014602214098, + "loss/crossentropy": 3.857894313335419, + "loss/logits": 2.1173263430595397, + "step": 920 + }, + { + "epoch": 0.0093, + "grad_norm": 17.5, + "grad_norm_var": 3.0747395833333333, + "learning_rate": 0.0002811, + "loss": 24.9668, + "loss/aux_loss": 0.04832738190889359, + "loss/crossentropy": 3.821620452404022, + "loss/logits": 2.0369732558727263, + "step": 930 + }, + { + "epoch": 0.0094, + "grad_norm": 15.375, + "grad_norm_var": 3.349072265625, + "learning_rate": 0.00028379999999999996, + "loss": 25.2724, + "loss/aux_loss": 0.04831767976284027, + "loss/crossentropy": 4.015332496166229, + "loss/logits": 2.054348534345627, + "step": 940 + }, + { + "epoch": 0.0095, + "grad_norm": 13.875, + "grad_norm_var": 2.811572265625, + "learning_rate": 0.00028649999999999997, + "loss": 24.9269, + "loss/aux_loss": 0.04830477572977543, + "loss/crossentropy": 3.9600290179252626, + "loss/logits": 2.0728322982788088, + "step": 950 + }, + { + "epoch": 0.0096, + "grad_norm": 12.125, + "grad_norm_var": 2.8313639322916666, + "learning_rate": 0.0002892, + "loss": 24.9397, + "loss/aux_loss": 0.04828764032572508, + "loss/crossentropy": 3.9735502004623413, + "loss/logits": 2.0897044599056245, + "step": 960 + }, + { + "epoch": 0.0097, + "grad_norm": 12.9375, + "grad_norm_var": 3.466910807291667, + "learning_rate": 0.0002919, + "loss": 25.0229, + "loss/aux_loss": 0.04828515090048313, + "loss/crossentropy": 3.849641752243042, + "loss/logits": 2.0807200193405153, + "step": 970 + }, + { + "epoch": 0.0098, + "grad_norm": 17.125, + "grad_norm_var": 2.0829264322916665, + "learning_rate": 0.00029459999999999995, + "loss": 24.5474, + "loss/aux_loss": 0.04829510189592838, + "loss/crossentropy": 3.926040601730347, + "loss/logits": 1.9675580561161041, + "step": 980 + }, + { + "epoch": 0.0099, + "grad_norm": 14.1875, + "grad_norm_var": 2.063134765625, + "learning_rate": 0.00029729999999999996, + "loss": 24.7495, + "loss/aux_loss": 0.04827150721102953, + "loss/crossentropy": 3.920617640018463, + "loss/logits": 2.0571080267429354, + "step": 990 + }, + { + "epoch": 0.01, + "grad_norm": 23.125, + "grad_norm_var": 9.017041015625, + "learning_rate": 0.0003, + "loss": 24.6181, + "loss/aux_loss": 0.04830240122973919, + "loss/crossentropy": 3.960080420970917, + "loss/logits": 2.0806682467460633, + "step": 1000 + }, + { + "epoch": 0.0101, + "grad_norm": 13.625, + "grad_norm_var": 7.297330729166666, + "learning_rate": 0.0003, + "loss": 24.5401, + "loss/aux_loss": 0.048330770991742614, + "loss/crossentropy": 3.909256339073181, + "loss/logits": 2.0541693389415743, + "step": 1010 + }, + { + "epoch": 0.0102, + "grad_norm": 13.75, + "grad_norm_var": 2.5978515625, + "learning_rate": 0.0003, + "loss": 24.0457, + "loss/aux_loss": 0.04828005637973547, + "loss/crossentropy": 4.0945284247398375, + "loss/logits": 2.0571001410484313, + "step": 1020 + }, + { + "epoch": 0.0103, + "grad_norm": 12.0, + "grad_norm_var": 2.8195149739583334, + "learning_rate": 0.0003, + "loss": 23.9983, + "loss/aux_loss": 0.048290212824940684, + "loss/crossentropy": 3.792713475227356, + "loss/logits": 1.9736050605773925, + "step": 1030 + }, + { + "epoch": 0.0104, + "grad_norm": 14.4375, + "grad_norm_var": 45.916650390625, + "learning_rate": 0.0003, + "loss": 23.7592, + "loss/aux_loss": 0.048343191482126714, + "loss/crossentropy": 3.667546308040619, + "loss/logits": 1.9718676209449768, + "step": 1040 + }, + { + "epoch": 0.0105, + "grad_norm": 13.75, + "grad_norm_var": 4.7306640625, + "learning_rate": 0.0003, + "loss": 23.9655, + "loss/aux_loss": 0.04828641843050718, + "loss/crossentropy": 3.918486988544464, + "loss/logits": 2.0048129856586456, + "step": 1050 + }, + { + "epoch": 0.0106, + "grad_norm": 15.0, + "grad_norm_var": 1.9869140625, + "learning_rate": 0.0003, + "loss": 23.6091, + "loss/aux_loss": 0.048306448943912984, + "loss/crossentropy": 3.855974185466766, + "loss/logits": 1.956015944480896, + "step": 1060 + }, + { + "epoch": 0.0107, + "grad_norm": 13.125, + "grad_norm_var": 1.4332682291666667, + "learning_rate": 0.0003, + "loss": 23.576, + "loss/aux_loss": 0.048309461772441865, + "loss/crossentropy": 3.5664370179176332, + "loss/logits": 1.9399469137191772, + "step": 1070 + }, + { + "epoch": 0.0108, + "grad_norm": 16.875, + "grad_norm_var": 96.96066080729166, + "learning_rate": 0.0003, + "loss": 23.5042, + "loss/aux_loss": 0.04829024374485016, + "loss/crossentropy": 3.9391483783721926, + "loss/logits": 2.0180298566818236, + "step": 1080 + }, + { + "epoch": 0.0109, + "grad_norm": 14.25, + "grad_norm_var": 99.06608072916667, + "learning_rate": 0.0003, + "loss": 23.2801, + "loss/aux_loss": 0.04827498830854893, + "loss/crossentropy": 3.925715708732605, + "loss/logits": 1.9511402130126954, + "step": 1090 + }, + { + "epoch": 0.011, + "grad_norm": 11.875, + "grad_norm_var": 1.5015462239583333, + "learning_rate": 0.0003, + "loss": 23.2888, + "loss/aux_loss": 0.04827521629631519, + "loss/crossentropy": 4.049439036846161, + "loss/logits": 1.9741652667522431, + "step": 1100 + }, + { + "epoch": 0.0111, + "grad_norm": 13.5, + "grad_norm_var": 3.6861979166666665, + "learning_rate": 0.0003, + "loss": 22.8228, + "loss/aux_loss": 0.048285826854407785, + "loss/crossentropy": 3.7126415371894836, + "loss/logits": 1.8875436723232268, + "step": 1110 + }, + { + "epoch": 0.0112, + "grad_norm": 15.125, + "grad_norm_var": 3.2712076822916667, + "learning_rate": 0.0003, + "loss": 22.8436, + "loss/aux_loss": 0.048280049860477445, + "loss/crossentropy": 3.875200855731964, + "loss/logits": 1.8699533224105835, + "step": 1120 + }, + { + "epoch": 0.0113, + "grad_norm": 12.875, + "grad_norm_var": 1.5421712239583334, + "learning_rate": 0.0003, + "loss": 22.9724, + "loss/aux_loss": 0.04830393195152283, + "loss/crossentropy": 3.7354134917259216, + "loss/logits": 1.9599017381668091, + "step": 1130 + }, + { + "epoch": 0.0114, + "grad_norm": 11.1875, + "grad_norm_var": 1.6598307291666667, + "learning_rate": 0.0003, + "loss": 22.91, + "loss/aux_loss": 0.04829528890550137, + "loss/crossentropy": 3.832562971115112, + "loss/logits": 1.9021077275276184, + "step": 1140 + }, + { + "epoch": 0.0115, + "grad_norm": 12.0, + "grad_norm_var": 2.373551432291667, + "learning_rate": 0.0003, + "loss": 22.5944, + "loss/aux_loss": 0.04828084670007229, + "loss/crossentropy": 3.8583874821662905, + "loss/logits": 1.9061977505683898, + "step": 1150 + }, + { + "epoch": 0.0116, + "grad_norm": 11.6875, + "grad_norm_var": 4.650374348958334, + "learning_rate": 0.0003, + "loss": 22.6571, + "loss/aux_loss": 0.04829124473035336, + "loss/crossentropy": 3.729883003234863, + "loss/logits": 1.8983563661575318, + "step": 1160 + }, + { + "epoch": 0.0117, + "grad_norm": 12.75, + "grad_norm_var": 4.216080729166666, + "learning_rate": 0.0003, + "loss": 22.5304, + "loss/aux_loss": 0.0483067661523819, + "loss/crossentropy": 3.8876662373542787, + "loss/logits": 1.8905851602554322, + "step": 1170 + }, + { + "epoch": 0.0118, + "grad_norm": 12.1875, + "grad_norm_var": 1.5910807291666667, + "learning_rate": 0.0003, + "loss": 22.2809, + "loss/aux_loss": 0.048292340524494645, + "loss/crossentropy": 3.9721433520317078, + "loss/logits": 1.8897149801254272, + "step": 1180 + }, + { + "epoch": 0.0119, + "grad_norm": 14.4375, + "grad_norm_var": 7.739322916666667, + "learning_rate": 0.0003, + "loss": 22.4589, + "loss/aux_loss": 0.048297750391066076, + "loss/crossentropy": 3.6948838114738463, + "loss/logits": 1.8489306330680848, + "step": 1190 + }, + { + "epoch": 0.012, + "grad_norm": 10.5, + "grad_norm_var": 7.207666015625, + "learning_rate": 0.0003, + "loss": 22.2067, + "loss/aux_loss": 0.048272774554789066, + "loss/crossentropy": 3.913854885101318, + "loss/logits": 1.861431396007538, + "step": 1200 + }, + { + "epoch": 0.0121, + "grad_norm": 15.3125, + "grad_norm_var": 5.213655598958334, + "learning_rate": 0.0003, + "loss": 22.2212, + "loss/aux_loss": 0.04833245109766722, + "loss/crossentropy": 3.696351206302643, + "loss/logits": 1.8378067016601562, + "step": 1210 + }, + { + "epoch": 0.0122, + "grad_norm": 9.9375, + "grad_norm_var": 3.999853515625, + "learning_rate": 0.0003, + "loss": 22.0734, + "loss/aux_loss": 0.04830023720860481, + "loss/crossentropy": 3.807795548439026, + "loss/logits": 1.8107618153095246, + "step": 1220 + }, + { + "epoch": 0.0123, + "grad_norm": 12.5, + "grad_norm_var": 12.50078125, + "learning_rate": 0.0003, + "loss": 21.7587, + "loss/aux_loss": 0.04829862117767334, + "loss/crossentropy": 3.750839185714722, + "loss/logits": 1.794652533531189, + "step": 1230 + }, + { + "epoch": 0.0124, + "grad_norm": 12.375, + "grad_norm_var": 115.42858072916667, + "learning_rate": 0.0003, + "loss": 21.93, + "loss/aux_loss": 0.048292195051908494, + "loss/crossentropy": 3.7465561628341675, + "loss/logits": 1.797796505689621, + "step": 1240 + }, + { + "epoch": 0.0125, + "grad_norm": 12.25, + "grad_norm_var": 186.75416666666666, + "learning_rate": 0.0003, + "loss": 21.953, + "loss/aux_loss": 0.04834472518414259, + "loss/crossentropy": 3.6869328737258913, + "loss/logits": 1.797852247953415, + "step": 1250 + }, + { + "epoch": 0.0126, + "grad_norm": 9.75, + "grad_norm_var": 1.7327473958333333, + "learning_rate": 0.0003, + "loss": 21.9868, + "loss/aux_loss": 0.048277279175817964, + "loss/crossentropy": 3.6617552042007446, + "loss/logits": 1.7641812562942505, + "step": 1260 + }, + { + "epoch": 0.0127, + "grad_norm": 10.75, + "grad_norm_var": 1.4202473958333333, + "learning_rate": 0.0003, + "loss": 21.6879, + "loss/aux_loss": 0.04827971309423447, + "loss/crossentropy": 3.4563692212104797, + "loss/logits": 1.7538020849227904, + "step": 1270 + }, + { + "epoch": 0.0128, + "grad_norm": 9.6875, + "grad_norm_var": 0.5093098958333333, + "learning_rate": 0.0003, + "loss": 21.5679, + "loss/aux_loss": 0.048257603868842126, + "loss/crossentropy": 3.737559175491333, + "loss/logits": 1.8031953394412994, + "step": 1280 + }, + { + "epoch": 0.0129, + "grad_norm": 10.8125, + "grad_norm_var": 3.0283854166666666, + "learning_rate": 0.0003, + "loss": 21.801, + "loss/aux_loss": 0.0482830997556448, + "loss/crossentropy": 3.788530111312866, + "loss/logits": 1.8610890209674835, + "step": 1290 + }, + { + "epoch": 0.013, + "grad_norm": 10.3125, + "grad_norm_var": 0.363134765625, + "learning_rate": 0.0003, + "loss": 21.5052, + "loss/aux_loss": 0.04827403090894222, + "loss/crossentropy": 3.7146639943122866, + "loss/logits": 1.8084448158740998, + "step": 1300 + }, + { + "epoch": 0.0131, + "grad_norm": 10.3125, + "grad_norm_var": 0.35885416666666664, + "learning_rate": 0.0003, + "loss": 21.1896, + "loss/aux_loss": 0.048255456425249574, + "loss/crossentropy": 3.6508117794990538, + "loss/logits": 1.7647499084472655, + "step": 1310 + }, + { + "epoch": 0.0132, + "grad_norm": 10.75, + "grad_norm_var": 48.28553059895833, + "learning_rate": 0.0003, + "loss": 21.2771, + "loss/aux_loss": 0.04835470654070377, + "loss/crossentropy": 3.6754523515701294, + "loss/logits": 1.7459341287612915, + "step": 1320 + }, + { + "epoch": 0.0133, + "grad_norm": 9.75, + "grad_norm_var": 18.715348307291666, + "learning_rate": 0.0003, + "loss": 21.4809, + "loss/aux_loss": 0.04828258771449327, + "loss/crossentropy": 3.671703588962555, + "loss/logits": 1.753785401582718, + "step": 1330 + }, + { + "epoch": 0.0134, + "grad_norm": 10.375, + "grad_norm_var": 0.9012858072916666, + "learning_rate": 0.0003, + "loss": 21.3825, + "loss/aux_loss": 0.048258156329393384, + "loss/crossentropy": 3.691448616981506, + "loss/logits": 1.7658027529716491, + "step": 1340 + }, + { + "epoch": 0.0135, + "grad_norm": 8.4375, + "grad_norm_var": 0.6473307291666667, + "learning_rate": 0.0003, + "loss": 21.3845, + "loss/aux_loss": 0.04825436770915985, + "loss/crossentropy": 3.645776665210724, + "loss/logits": 1.7162048041820526, + "step": 1350 + }, + { + "epoch": 0.0136, + "grad_norm": 10.0625, + "grad_norm_var": 0.58984375, + "learning_rate": 0.0003, + "loss": 20.7527, + "loss/aux_loss": 0.04827475063502788, + "loss/crossentropy": 3.6466134548187257, + "loss/logits": 1.694813996553421, + "step": 1360 + }, + { + "epoch": 0.0137, + "grad_norm": 10.0625, + "grad_norm_var": 0.4046223958333333, + "learning_rate": 0.0003, + "loss": 21.017, + "loss/aux_loss": 0.04827818218618631, + "loss/crossentropy": 3.7088001132011414, + "loss/logits": 1.70878404378891, + "step": 1370 + }, + { + "epoch": 0.0138, + "grad_norm": 11.875, + "grad_norm_var": 0.70703125, + "learning_rate": 0.0003, + "loss": 20.9997, + "loss/aux_loss": 0.04826367888599634, + "loss/crossentropy": 3.7648239493370057, + "loss/logits": 1.7505885064601898, + "step": 1380 + }, + { + "epoch": 0.0139, + "grad_norm": 9.5625, + "grad_norm_var": 0.7249348958333334, + "learning_rate": 0.0003, + "loss": 20.798, + "loss/aux_loss": 0.04830121118575335, + "loss/crossentropy": 3.696249544620514, + "loss/logits": 1.7071794509887694, + "step": 1390 + }, + { + "epoch": 0.014, + "grad_norm": 8.875, + "grad_norm_var": 0.7769368489583334, + "learning_rate": 0.0003, + "loss": 21.0944, + "loss/aux_loss": 0.04826546385884285, + "loss/crossentropy": 3.6206825494766237, + "loss/logits": 1.766976636648178, + "step": 1400 + }, + { + "epoch": 0.0141, + "grad_norm": 9.0625, + "grad_norm_var": 0.6129557291666666, + "learning_rate": 0.0003, + "loss": 20.5101, + "loss/aux_loss": 0.048279773257672784, + "loss/crossentropy": 3.5249340176582336, + "loss/logits": 1.6939750254154204, + "step": 1410 + }, + { + "epoch": 0.0142, + "grad_norm": 9.25, + "grad_norm_var": 1.0231608072916667, + "learning_rate": 0.0003, + "loss": 20.5613, + "loss/aux_loss": 0.04827423859387636, + "loss/crossentropy": 3.45823814868927, + "loss/logits": 1.6488157391548157, + "step": 1420 + }, + { + "epoch": 0.0143, + "grad_norm": 9.125, + "grad_norm_var": 1.0061848958333333, + "learning_rate": 0.0003, + "loss": 20.6672, + "loss/aux_loss": 0.048273424990475176, + "loss/crossentropy": 3.5580545544624327, + "loss/logits": 1.6708523690700532, + "step": 1430 + }, + { + "epoch": 0.0144, + "grad_norm": 9.5625, + "grad_norm_var": 0.5536295572916666, + "learning_rate": 0.0003, + "loss": 20.7267, + "loss/aux_loss": 0.048252567276358606, + "loss/crossentropy": 3.50860835313797, + "loss/logits": 1.690982359647751, + "step": 1440 + }, + { + "epoch": 0.0145, + "grad_norm": 9.0, + "grad_norm_var": 0.4879557291666667, + "learning_rate": 0.0003, + "loss": 20.3679, + "loss/aux_loss": 0.048260470107197764, + "loss/crossentropy": 3.5979798078536986, + "loss/logits": 1.727076655626297, + "step": 1450 + }, + { + "epoch": 0.0146, + "grad_norm": 9.5625, + "grad_norm_var": 0.6821451822916667, + "learning_rate": 0.0003, + "loss": 20.5641, + "loss/aux_loss": 0.04824462234973907, + "loss/crossentropy": 3.473064345121384, + "loss/logits": 1.6370813488960265, + "step": 1460 + }, + { + "epoch": 0.0147, + "grad_norm": 9.25, + "grad_norm_var": 0.5143229166666666, + "learning_rate": 0.0003, + "loss": 20.4817, + "loss/aux_loss": 0.04825858902186155, + "loss/crossentropy": 3.680207347869873, + "loss/logits": 1.7349261403083802, + "step": 1470 + }, + { + "epoch": 0.0148, + "grad_norm": 10.0625, + "grad_norm_var": 168.77784830729166, + "learning_rate": 0.0003, + "loss": 20.4344, + "loss/aux_loss": 0.04829352758824825, + "loss/crossentropy": 3.643856203556061, + "loss/logits": 1.6838299632072449, + "step": 1480 + }, + { + "epoch": 0.0149, + "grad_norm": 10.0, + "grad_norm_var": 164.6166015625, + "learning_rate": 0.0003, + "loss": 20.2984, + "loss/aux_loss": 0.04828519467264414, + "loss/crossentropy": 3.491868484020233, + "loss/logits": 1.6502962768077851, + "step": 1490 + }, + { + "epoch": 0.015, + "grad_norm": 9.3125, + "grad_norm_var": 1.26796875, + "learning_rate": 0.0003, + "loss": 20.2734, + "loss/aux_loss": 0.0482696495950222, + "loss/crossentropy": 3.54322612285614, + "loss/logits": 1.6839805364608764, + "step": 1500 + }, + { + "epoch": 0.0151, + "grad_norm": 9.6875, + "grad_norm_var": 1.3467732747395833, + "learning_rate": 0.0003, + "loss": 20.223, + "loss/aux_loss": 0.048268103040754795, + "loss/crossentropy": 3.591710591316223, + "loss/logits": 1.6459056198596955, + "step": 1510 + }, + { + "epoch": 0.0152, + "grad_norm": 9.5625, + "grad_norm_var": 0.30625, + "learning_rate": 0.0003, + "loss": 19.9556, + "loss/aux_loss": 0.04826340805739164, + "loss/crossentropy": 3.663398194313049, + "loss/logits": 1.6375055193901062, + "step": 1520 + }, + { + "epoch": 0.0153, + "grad_norm": 9.6875, + "grad_norm_var": 8.3697265625, + "learning_rate": 0.0003, + "loss": 19.9875, + "loss/aux_loss": 0.048267958126962184, + "loss/crossentropy": 3.429060697555542, + "loss/logits": 1.5783089220523834, + "step": 1530 + }, + { + "epoch": 0.0154, + "grad_norm": 7.96875, + "grad_norm_var": 9.10787353515625, + "learning_rate": 0.0003, + "loss": 19.7039, + "loss/aux_loss": 0.048261369951069354, + "loss/crossentropy": 3.662845695018768, + "loss/logits": 1.6366191446781158, + "step": 1540 + }, + { + "epoch": 0.0155, + "grad_norm": 8.1875, + "grad_norm_var": 37.75735270182292, + "learning_rate": 0.0003, + "loss": 19.9279, + "loss/aux_loss": 0.048299112170934674, + "loss/crossentropy": 3.5160138845443725, + "loss/logits": 1.613296240568161, + "step": 1550 + }, + { + "epoch": 0.0156, + "grad_norm": 8.875, + "grad_norm_var": 0.6832967122395833, + "learning_rate": 0.0003, + "loss": 19.8753, + "loss/aux_loss": 0.04826027043163776, + "loss/crossentropy": 3.6247658729553223, + "loss/logits": 1.606148999929428, + "step": 1560 + }, + { + "epoch": 0.0157, + "grad_norm": 8.75, + "grad_norm_var": 6.793082682291667, + "learning_rate": 0.0003, + "loss": 19.7959, + "loss/aux_loss": 0.04825771022588014, + "loss/crossentropy": 3.3447017312049865, + "loss/logits": 1.5702200174331664, + "step": 1570 + }, + { + "epoch": 0.0158, + "grad_norm": 8.6875, + "grad_norm_var": 6.344645182291667, + "learning_rate": 0.0003, + "loss": 19.8071, + "loss/aux_loss": 0.048260610550642014, + "loss/crossentropy": 3.414109396934509, + "loss/logits": 1.6106845080852508, + "step": 1580 + }, + { + "epoch": 0.0159, + "grad_norm": 8.125, + "grad_norm_var": 0.25833333333333336, + "learning_rate": 0.0003, + "loss": 19.716, + "loss/aux_loss": 0.048247416689991954, + "loss/crossentropy": 3.4498027682304384, + "loss/logits": 1.621438193321228, + "step": 1590 + }, + { + "epoch": 0.016, + "grad_norm": 8.5625, + "grad_norm_var": 0.28177083333333336, + "learning_rate": 0.0003, + "loss": 19.6554, + "loss/aux_loss": 0.048260800912976264, + "loss/crossentropy": 3.564990556240082, + "loss/logits": 1.6194686591625214, + "step": 1600 + }, + { + "epoch": 0.0161, + "grad_norm": 9.75, + "grad_norm_var": 0.21692708333333333, + "learning_rate": 0.0003, + "loss": 19.6566, + "loss/aux_loss": 0.048259117268025876, + "loss/crossentropy": 3.4118771314620973, + "loss/logits": 1.6117245256900787, + "step": 1610 + }, + { + "epoch": 0.0162, + "grad_norm": 8.6875, + "grad_norm_var": 1.2012858072916666, + "learning_rate": 0.0003, + "loss": 19.2843, + "loss/aux_loss": 0.04827672149986029, + "loss/crossentropy": 3.3465479731559755, + "loss/logits": 1.5816888511180878, + "step": 1620 + }, + { + "epoch": 0.0163, + "grad_norm": 8.6875, + "grad_norm_var": 0.2676920572916667, + "learning_rate": 0.0003, + "loss": 19.4599, + "loss/aux_loss": 0.048250201344490054, + "loss/crossentropy": 3.515201151371002, + "loss/logits": 1.5413510143756866, + "step": 1630 + }, + { + "epoch": 0.0164, + "grad_norm": 8.4375, + "grad_norm_var": 0.42831624348958336, + "learning_rate": 0.0003, + "loss": 19.4, + "loss/aux_loss": 0.04824144206941128, + "loss/crossentropy": 3.4953723192214965, + "loss/logits": 1.6243361711502076, + "step": 1640 + }, + { + "epoch": 0.0165, + "grad_norm": 8.125, + "grad_norm_var": 0.42760009765625, + "learning_rate": 0.0003, + "loss": 19.5616, + "loss/aux_loss": 0.04824843630194664, + "loss/crossentropy": 3.373341774940491, + "loss/logits": 1.5757689416408538, + "step": 1650 + }, + { + "epoch": 0.0166, + "grad_norm": 9.3125, + "grad_norm_var": 0.18840738932291667, + "learning_rate": 0.0003, + "loss": 19.2866, + "loss/aux_loss": 0.04825681522488594, + "loss/crossentropy": 3.339651143550873, + "loss/logits": 1.5976973354816437, + "step": 1660 + }, + { + "epoch": 0.0167, + "grad_norm": 9.125, + "grad_norm_var": 5.00054931640625, + "learning_rate": 0.0003, + "loss": 19.5115, + "loss/aux_loss": 0.04826808106154203, + "loss/crossentropy": 3.5275720238685606, + "loss/logits": 1.5411208510398864, + "step": 1670 + }, + { + "epoch": 0.0168, + "grad_norm": 10.1875, + "grad_norm_var": 47.442952473958336, + "learning_rate": 0.0003, + "loss": 19.3194, + "loss/aux_loss": 0.048269005678594115, + "loss/crossentropy": 3.509286916255951, + "loss/logits": 1.5299911737442016, + "step": 1680 + }, + { + "epoch": 0.0169, + "grad_norm": 8.0625, + "grad_norm_var": 45.85546875, + "learning_rate": 0.0003, + "loss": 19.2647, + "loss/aux_loss": 0.048268322832882404, + "loss/crossentropy": 3.4770318508148192, + "loss/logits": 1.552085292339325, + "step": 1690 + }, + { + "epoch": 0.017, + "grad_norm": 8.375, + "grad_norm_var": 0.31067301432291666, + "learning_rate": 0.0003, + "loss": 19.1182, + "loss/aux_loss": 0.048229466564953326, + "loss/crossentropy": 3.3933613896369934, + "loss/logits": 1.5347690343856812, + "step": 1700 + }, + { + "epoch": 0.0171, + "grad_norm": 8.125, + "grad_norm_var": 0.16428629557291666, + "learning_rate": 0.0003, + "loss": 19.4514, + "loss/aux_loss": 0.04824534244835377, + "loss/crossentropy": 3.460851287841797, + "loss/logits": 1.5554963886737823, + "step": 1710 + }, + { + "epoch": 0.0172, + "grad_norm": 8.5625, + "grad_norm_var": 0.14095052083333334, + "learning_rate": 0.0003, + "loss": 19.0598, + "loss/aux_loss": 0.0482347022742033, + "loss/crossentropy": 3.4911609292030334, + "loss/logits": 1.6111477196216584, + "step": 1720 + }, + { + "epoch": 0.0173, + "grad_norm": 10.1875, + "grad_norm_var": 8.088505045572917, + "learning_rate": 0.0003, + "loss": 19.2344, + "loss/aux_loss": 0.04825140796601772, + "loss/crossentropy": 3.6284345746040345, + "loss/logits": 1.5853043377399445, + "step": 1730 + }, + { + "epoch": 0.0174, + "grad_norm": 8.4375, + "grad_norm_var": 8.165690104166666, + "learning_rate": 0.0003, + "loss": 18.7796, + "loss/aux_loss": 0.04823643118143082, + "loss/crossentropy": 3.4696247935295106, + "loss/logits": 1.61273393034935, + "step": 1740 + }, + { + "epoch": 0.0175, + "grad_norm": 7.84375, + "grad_norm_var": 0.20670166015625, + "learning_rate": 0.0003, + "loss": 18.8781, + "loss/aux_loss": 0.04823619779199362, + "loss/crossentropy": 3.4937520623207092, + "loss/logits": 1.5296509921550752, + "step": 1750 + }, + { + "epoch": 0.0176, + "grad_norm": 8.375, + "grad_norm_var": 0.24099934895833333, + "learning_rate": 0.0003, + "loss": 18.7339, + "loss/aux_loss": 0.04824729897081852, + "loss/crossentropy": 3.497152864933014, + "loss/logits": 1.575348162651062, + "step": 1760 + }, + { + "epoch": 0.0177, + "grad_norm": 8.5625, + "grad_norm_var": 0.38118082682291665, + "learning_rate": 0.0003, + "loss": 18.8957, + "loss/aux_loss": 0.04825843013823032, + "loss/crossentropy": 3.3257681131362915, + "loss/logits": 1.5223451495170592, + "step": 1770 + }, + { + "epoch": 0.0178, + "grad_norm": 8.1875, + "grad_norm_var": 0.24722900390625, + "learning_rate": 0.0003, + "loss": 19.0113, + "loss/aux_loss": 0.04823619592934847, + "loss/crossentropy": 3.5192960858345033, + "loss/logits": 1.6035509347915649, + "step": 1780 + }, + { + "epoch": 0.0179, + "grad_norm": 8.0625, + "grad_norm_var": 0.14099934895833333, + "learning_rate": 0.0003, + "loss": 18.8621, + "loss/aux_loss": 0.04824311789125204, + "loss/crossentropy": 3.4132414937019346, + "loss/logits": 1.5165437579154968, + "step": 1790 + }, + { + "epoch": 0.018, + "grad_norm": 8.6875, + "grad_norm_var": 3.5010050455729167, + "learning_rate": 0.0003, + "loss": 18.6503, + "loss/aux_loss": 0.04824370257556439, + "loss/crossentropy": 3.402624809741974, + "loss/logits": 1.53942152261734, + "step": 1800 + }, + { + "epoch": 0.0181, + "grad_norm": 8.5, + "grad_norm_var": 58.00753580729167, + "learning_rate": 0.0003, + "loss": 19.3208, + "loss/aux_loss": 0.04827347882091999, + "loss/crossentropy": 3.6006906509399412, + "loss/logits": 1.5737698435783387, + "step": 1810 + }, + { + "epoch": 0.0182, + "grad_norm": 8.5625, + "grad_norm_var": 60.507405598958336, + "learning_rate": 0.0003, + "loss": 18.8721, + "loss/aux_loss": 0.04824970234185457, + "loss/crossentropy": 3.5692302942276, + "loss/logits": 1.5307071149349212, + "step": 1820 + }, + { + "epoch": 0.0183, + "grad_norm": 11.25, + "grad_norm_var": 6.178238932291666, + "learning_rate": 0.0003, + "loss": 18.5814, + "loss/aux_loss": 0.04826509784907103, + "loss/crossentropy": 3.40536652803421, + "loss/logits": 1.5309065878391266, + "step": 1830 + }, + { + "epoch": 0.0184, + "grad_norm": 8.125, + "grad_norm_var": 6.129427083333334, + "learning_rate": 0.0003, + "loss": 18.7299, + "loss/aux_loss": 0.0482620395720005, + "loss/crossentropy": 3.196433222293854, + "loss/logits": 1.4907720267772675, + "step": 1840 + }, + { + "epoch": 0.0185, + "grad_norm": 8.3125, + "grad_norm_var": 0.17825520833333333, + "learning_rate": 0.0003, + "loss": 18.7455, + "loss/aux_loss": 0.04824581053107977, + "loss/crossentropy": 3.423103988170624, + "loss/logits": 1.5436949849128723, + "step": 1850 + }, + { + "epoch": 0.0186, + "grad_norm": 8.375, + "grad_norm_var": 2.1146484375, + "learning_rate": 0.0003, + "loss": 18.6511, + "loss/aux_loss": 0.048259328678250314, + "loss/crossentropy": 3.3652100563049316, + "loss/logits": 1.4386920034885406, + "step": 1860 + }, + { + "epoch": 0.0187, + "grad_norm": 8.5625, + "grad_norm_var": 1.8754191080729166, + "learning_rate": 0.0003, + "loss": 18.607, + "loss/aux_loss": 0.04825924132019281, + "loss/crossentropy": 3.448017656803131, + "loss/logits": 1.536920464038849, + "step": 1870 + }, + { + "epoch": 0.0188, + "grad_norm": 8.4375, + "grad_norm_var": 0.24664306640625, + "learning_rate": 0.0003, + "loss": 18.5559, + "loss/aux_loss": 0.048245815001428126, + "loss/crossentropy": 3.3453487277030947, + "loss/logits": 1.5264363229274749, + "step": 1880 + }, + { + "epoch": 0.0189, + "grad_norm": 7.875, + "grad_norm_var": 0.310791015625, + "learning_rate": 0.0003, + "loss": 18.5365, + "loss/aux_loss": 0.04823654443025589, + "loss/crossentropy": 3.373169445991516, + "loss/logits": 1.5205184280872346, + "step": 1890 + }, + { + "epoch": 0.019, + "grad_norm": 7.75, + "grad_norm_var": 0.32105712890625, + "learning_rate": 0.0003, + "loss": 18.496, + "loss/aux_loss": 0.048227923549711706, + "loss/crossentropy": 3.2882206797599793, + "loss/logits": 1.566467821598053, + "step": 1900 + }, + { + "epoch": 0.0191, + "grad_norm": 8.0, + "grad_norm_var": 0.301416015625, + "learning_rate": 0.0003, + "loss": 18.2065, + "loss/aux_loss": 0.04824189618229866, + "loss/crossentropy": 3.397510600090027, + "loss/logits": 1.4857994496822358, + "step": 1910 + }, + { + "epoch": 0.0192, + "grad_norm": 7.78125, + "grad_norm_var": 0.6359212239583333, + "learning_rate": 0.0003, + "loss": 18.5913, + "loss/aux_loss": 0.04824940506368876, + "loss/crossentropy": 3.396523857116699, + "loss/logits": 1.516043496131897, + "step": 1920 + }, + { + "epoch": 0.0193, + "grad_norm": 7.71875, + "grad_norm_var": 0.7240885416666667, + "learning_rate": 0.0003, + "loss": 18.9066, + "loss/aux_loss": 0.04823889695107937, + "loss/crossentropy": 3.5669950366020204, + "loss/logits": 1.5904681384563446, + "step": 1930 + }, + { + "epoch": 0.0194, + "grad_norm": 8.5, + "grad_norm_var": 0.16226806640625, + "learning_rate": 0.0003, + "loss": 18.5454, + "loss/aux_loss": 0.04824310019612312, + "loss/crossentropy": 3.4322083711624147, + "loss/logits": 1.4962215423583984, + "step": 1940 + }, + { + "epoch": 0.0195, + "grad_norm": 7.8125, + "grad_norm_var": 0.13033854166666667, + "learning_rate": 0.0003, + "loss": 18.4341, + "loss/aux_loss": 0.048227659240365026, + "loss/crossentropy": 3.4103391289711, + "loss/logits": 1.486283725500107, + "step": 1950 + }, + { + "epoch": 0.0196, + "grad_norm": 7.71875, + "grad_norm_var": 0.22428385416666666, + "learning_rate": 0.0003, + "loss": 18.2887, + "loss/aux_loss": 0.04824389982968569, + "loss/crossentropy": 3.2727973818778993, + "loss/logits": 1.4560063302516937, + "step": 1960 + }, + { + "epoch": 0.0197, + "grad_norm": 13.8125, + "grad_norm_var": 2.357926432291667, + "learning_rate": 0.0003, + "loss": 18.2356, + "loss/aux_loss": 0.04824434258043766, + "loss/crossentropy": 3.459345853328705, + "loss/logits": 1.460297852754593, + "step": 1970 + }, + { + "epoch": 0.0198, + "grad_norm": 11.4375, + "grad_norm_var": 18.870817057291667, + "learning_rate": 0.0003, + "loss": 18.5238, + "loss/aux_loss": 0.04825907479971647, + "loss/crossentropy": 3.424750554561615, + "loss/logits": 1.5167337119579316, + "step": 1980 + }, + { + "epoch": 0.0199, + "grad_norm": 7.90625, + "grad_norm_var": 18.401546223958334, + "learning_rate": 0.0003, + "loss": 18.2073, + "loss/aux_loss": 0.04825407154858112, + "loss/crossentropy": 3.2709259510040285, + "loss/logits": 1.4685286700725555, + "step": 1990 + }, + { + "epoch": 0.02, + "grad_norm": 7.96875, + "grad_norm_var": 0.3753865559895833, + "learning_rate": 0.0003, + "loss": 18.0374, + "loss/aux_loss": 0.04822464138269424, + "loss/crossentropy": 3.350880300998688, + "loss/logits": 1.4842880725860597, + "step": 2000 + }, + { + "epoch": 0.0201, + "grad_norm": 9.0, + "grad_norm_var": 54.53902587890625, + "learning_rate": 0.0003, + "loss": 18.1583, + "loss/aux_loss": 0.04824762139469385, + "loss/crossentropy": 3.2653831958770754, + "loss/logits": 1.4564920663833618, + "step": 2010 + }, + { + "epoch": 0.0202, + "grad_norm": 13.9375, + "grad_norm_var": 54.16106363932292, + "learning_rate": 0.0003, + "loss": 17.7872, + "loss/aux_loss": 0.04823515806347132, + "loss/crossentropy": 3.265163505077362, + "loss/logits": 1.410541558265686, + "step": 2020 + }, + { + "epoch": 0.0203, + "grad_norm": 8.3125, + "grad_norm_var": 2.7471964518229166, + "learning_rate": 0.0003, + "loss": 18.125, + "loss/aux_loss": 0.048241624422371385, + "loss/crossentropy": 3.389461839199066, + "loss/logits": 1.46819948554039, + "step": 2030 + }, + { + "epoch": 0.0204, + "grad_norm": 7.71875, + "grad_norm_var": 0.14091389973958332, + "learning_rate": 0.0003, + "loss": 18.0986, + "loss/aux_loss": 0.04822319280356169, + "loss/crossentropy": 3.543317806720734, + "loss/logits": 1.5373745501041411, + "step": 2040 + }, + { + "epoch": 0.0205, + "grad_norm": 7.625, + "grad_norm_var": 0.67301025390625, + "learning_rate": 0.0003, + "loss": 18.102, + "loss/aux_loss": 0.04826016817241907, + "loss/crossentropy": 3.3688653111457825, + "loss/logits": 1.4797544002532959, + "step": 2050 + }, + { + "epoch": 0.0206, + "grad_norm": 17.5, + "grad_norm_var": 5.9224609375, + "learning_rate": 0.0003, + "loss": 18.1704, + "loss/aux_loss": 0.04823215901851654, + "loss/crossentropy": 3.3886295437812803, + "loss/logits": 1.4773655652999877, + "step": 2060 + }, + { + "epoch": 0.0207, + "grad_norm": 8.5, + "grad_norm_var": 5.833333333333333, + "learning_rate": 0.0003, + "loss": 17.9943, + "loss/aux_loss": 0.04824898187071085, + "loss/crossentropy": 3.1112423300743104, + "loss/logits": 1.4438789427280425, + "step": 2070 + }, + { + "epoch": 0.0208, + "grad_norm": 7.46875, + "grad_norm_var": 0.5363932291666667, + "learning_rate": 0.0003, + "loss": 18.0344, + "loss/aux_loss": 0.0482429688796401, + "loss/crossentropy": 3.352174973487854, + "loss/logits": 1.4526370763778687, + "step": 2080 + }, + { + "epoch": 0.0209, + "grad_norm": 7.65625, + "grad_norm_var": 0.14293212890625, + "learning_rate": 0.0003, + "loss": 18.11, + "loss/aux_loss": 0.048234878666698934, + "loss/crossentropy": 3.240985023975372, + "loss/logits": 1.4991967618465423, + "step": 2090 + }, + { + "epoch": 0.021, + "grad_norm": 8.5625, + "grad_norm_var": 0.07864176432291667, + "learning_rate": 0.0003, + "loss": 17.8796, + "loss/aux_loss": 0.04822954386472702, + "loss/crossentropy": 3.5183646202087404, + "loss/logits": 1.4771794497966766, + "step": 2100 + }, + { + "epoch": 0.0211, + "grad_norm": 8.0, + "grad_norm_var": 0.15666910807291667, + "learning_rate": 0.0003, + "loss": 17.8422, + "loss/aux_loss": 0.048239548690617085, + "loss/crossentropy": 3.3616485238075255, + "loss/logits": 1.4091070950031281, + "step": 2110 + }, + { + "epoch": 0.0212, + "grad_norm": 7.9375, + "grad_norm_var": 11.716109212239584, + "learning_rate": 0.0003, + "loss": 17.7719, + "loss/aux_loss": 0.04824555143713951, + "loss/crossentropy": 3.439083182811737, + "loss/logits": 1.4733355700969697, + "step": 2120 + }, + { + "epoch": 0.0213, + "grad_norm": 8.5625, + "grad_norm_var": 84.65826822916667, + "learning_rate": 0.0003, + "loss": 17.7924, + "loss/aux_loss": 0.048250272311270236, + "loss/crossentropy": 3.4830735325813293, + "loss/logits": 1.4258549392223359, + "step": 2130 + }, + { + "epoch": 0.0214, + "grad_norm": 8.125, + "grad_norm_var": 53.116520182291666, + "learning_rate": 0.0003, + "loss": 17.8883, + "loss/aux_loss": 0.048237613029778005, + "loss/crossentropy": 3.2555914759635924, + "loss/logits": 1.4283065259456635, + "step": 2140 + }, + { + "epoch": 0.0215, + "grad_norm": 8.125, + "grad_norm_var": 0.3610310872395833, + "learning_rate": 0.0003, + "loss": 17.6419, + "loss/aux_loss": 0.04823284205049276, + "loss/crossentropy": 3.3793551921844482, + "loss/logits": 1.4274089336395264, + "step": 2150 + }, + { + "epoch": 0.0216, + "grad_norm": 7.21875, + "grad_norm_var": 1.8485677083333334, + "learning_rate": 0.0003, + "loss": 17.6442, + "loss/aux_loss": 0.048232033289968966, + "loss/crossentropy": 3.2164774179458617, + "loss/logits": 1.4371109902858734, + "step": 2160 + }, + { + "epoch": 0.0217, + "grad_norm": 7.71875, + "grad_norm_var": 1.8079264322916666, + "learning_rate": 0.0003, + "loss": 17.7263, + "loss/aux_loss": 0.04821321051567793, + "loss/crossentropy": 3.3871463894844056, + "loss/logits": 1.4453998267650605, + "step": 2170 + }, + { + "epoch": 0.0218, + "grad_norm": 8.875, + "grad_norm_var": 0.3880859375, + "learning_rate": 0.0003, + "loss": 17.6607, + "loss/aux_loss": 0.048226891085505486, + "loss/crossentropy": 3.213350570201874, + "loss/logits": 1.4165163397789002, + "step": 2180 + }, + { + "epoch": 0.0219, + "grad_norm": 8.125, + "grad_norm_var": 0.4266764322916667, + "learning_rate": 0.0003, + "loss": 17.7184, + "loss/aux_loss": 0.04822581373155117, + "loss/crossentropy": 3.582288146018982, + "loss/logits": 1.4369397819042207, + "step": 2190 + }, + { + "epoch": 0.022, + "grad_norm": 8.3125, + "grad_norm_var": 0.914306640625, + "learning_rate": 0.0003, + "loss": 17.7448, + "loss/aux_loss": 0.04825443848967552, + "loss/crossentropy": 3.306717586517334, + "loss/logits": 1.4155293583869935, + "step": 2200 + }, + { + "epoch": 0.0221, + "grad_norm": 9.0625, + "grad_norm_var": 0.8086222330729167, + "learning_rate": 0.0003, + "loss": 17.6038, + "loss/aux_loss": 0.04823794979602099, + "loss/crossentropy": 3.26631623506546, + "loss/logits": 1.3826520234346389, + "step": 2210 + }, + { + "epoch": 0.0222, + "grad_norm": 7.75, + "grad_norm_var": 0.35058186848958334, + "learning_rate": 0.0003, + "loss": 17.5963, + "loss/aux_loss": 0.048243265226483346, + "loss/crossentropy": 3.28060497045517, + "loss/logits": 1.4131011009216308, + "step": 2220 + }, + { + "epoch": 0.0223, + "grad_norm": 7.625, + "grad_norm_var": 0.18853759765625, + "learning_rate": 0.0003, + "loss": 17.6245, + "loss/aux_loss": 0.04822924640029669, + "loss/crossentropy": 3.287998414039612, + "loss/logits": 1.4267325103282928, + "step": 2230 + }, + { + "epoch": 0.0224, + "grad_norm": 7.875, + "grad_norm_var": 0.257421875, + "learning_rate": 0.0003, + "loss": 17.6129, + "loss/aux_loss": 0.04821740183979273, + "loss/crossentropy": 3.378717005252838, + "loss/logits": 1.4063582181930543, + "step": 2240 + }, + { + "epoch": 0.0225, + "grad_norm": 7.59375, + "grad_norm_var": 0.20963541666666666, + "learning_rate": 0.0003, + "loss": 17.6276, + "loss/aux_loss": 0.04823744297027588, + "loss/crossentropy": 3.4024731159210204, + "loss/logits": 1.4467666923999787, + "step": 2250 + }, + { + "epoch": 0.0226, + "grad_norm": 8.875, + "grad_norm_var": 0.246484375, + "learning_rate": 0.0003, + "loss": 17.6328, + "loss/aux_loss": 0.04822640102356672, + "loss/crossentropy": 3.333562135696411, + "loss/logits": 1.4312103688716888, + "step": 2260 + }, + { + "epoch": 0.0227, + "grad_norm": 8.0625, + "grad_norm_var": 0.67398681640625, + "learning_rate": 0.0003, + "loss": 17.5135, + "loss/aux_loss": 0.04822310116142035, + "loss/crossentropy": 3.320937788486481, + "loss/logits": 1.418309098482132, + "step": 2270 + }, + { + "epoch": 0.0228, + "grad_norm": 6.90625, + "grad_norm_var": 0.44390869140625, + "learning_rate": 0.0003, + "loss": 17.3436, + "loss/aux_loss": 0.04823301304131746, + "loss/crossentropy": 3.12678724527359, + "loss/logits": 1.379988819360733, + "step": 2280 + }, + { + "epoch": 0.0229, + "grad_norm": 11.625, + "grad_norm_var": 1.3313761393229167, + "learning_rate": 0.0003, + "loss": 17.6211, + "loss/aux_loss": 0.04822282623499632, + "loss/crossentropy": 3.2918911814689635, + "loss/logits": 1.4198866367340088, + "step": 2290 + }, + { + "epoch": 0.023, + "grad_norm": 8.0, + "grad_norm_var": 1.1663899739583334, + "learning_rate": 0.0003, + "loss": 17.4747, + "loss/aux_loss": 0.048235368356108664, + "loss/crossentropy": 3.306833505630493, + "loss/logits": 1.392419272661209, + "step": 2300 + }, + { + "epoch": 0.0231, + "grad_norm": 7.3125, + "grad_norm_var": 0.33331705729166666, + "learning_rate": 0.0003, + "loss": 17.2234, + "loss/aux_loss": 0.048222755640745164, + "loss/crossentropy": 3.361720085144043, + "loss/logits": 1.421975213289261, + "step": 2310 + }, + { + "epoch": 0.0232, + "grad_norm": 9.75, + "grad_norm_var": 112.17330322265624, + "learning_rate": 0.0003, + "loss": 17.2352, + "loss/aux_loss": 0.048219884373247625, + "loss/crossentropy": 3.3393725872039797, + "loss/logits": 1.4201282680034637, + "step": 2320 + }, + { + "epoch": 0.0233, + "grad_norm": 8.1875, + "grad_norm_var": 110.715625, + "learning_rate": 0.0003, + "loss": 17.2496, + "loss/aux_loss": 0.04821732547134161, + "loss/crossentropy": 3.389024722576141, + "loss/logits": 1.435178142786026, + "step": 2330 + }, + { + "epoch": 0.0234, + "grad_norm": 6.96875, + "grad_norm_var": 0.2806640625, + "learning_rate": 0.0003, + "loss": 17.331, + "loss/aux_loss": 0.04822313766926527, + "loss/crossentropy": 3.232159233093262, + "loss/logits": 1.3994586706161498, + "step": 2340 + }, + { + "epoch": 0.0235, + "grad_norm": 7.96875, + "grad_norm_var": 0.2126953125, + "learning_rate": 0.0003, + "loss": 17.2108, + "loss/aux_loss": 0.04821410346776247, + "loss/crossentropy": 3.443037581443787, + "loss/logits": 1.4070569813251494, + "step": 2350 + }, + { + "epoch": 0.0236, + "grad_norm": 8.0625, + "grad_norm_var": 0.13619384765625, + "learning_rate": 0.0003, + "loss": 17.301, + "loss/aux_loss": 0.048248034156858924, + "loss/crossentropy": 3.349116563796997, + "loss/logits": 1.3975160002708436, + "step": 2360 + }, + { + "epoch": 0.0237, + "grad_norm": 7.84375, + "grad_norm_var": 2.121415201822917, + "learning_rate": 0.0003, + "loss": 17.1628, + "loss/aux_loss": 0.04823202043771744, + "loss/crossentropy": 3.340719926357269, + "loss/logits": 1.3724242806434632, + "step": 2370 + }, + { + "epoch": 0.0238, + "grad_norm": 7.9375, + "grad_norm_var": 1.950390625, + "learning_rate": 0.0003, + "loss": 17.2873, + "loss/aux_loss": 0.0482368228957057, + "loss/crossentropy": 3.295661818981171, + "loss/logits": 1.4170908331871033, + "step": 2380 + }, + { + "epoch": 0.0239, + "grad_norm": 7.84375, + "grad_norm_var": 0.28982747395833336, + "learning_rate": 0.0003, + "loss": 17.1479, + "loss/aux_loss": 0.048220128566026685, + "loss/crossentropy": 3.149917113780975, + "loss/logits": 1.3780086159706115, + "step": 2390 + }, + { + "epoch": 0.024, + "grad_norm": 7.53125, + "grad_norm_var": 0.56021728515625, + "learning_rate": 0.0003, + "loss": 16.9571, + "loss/aux_loss": 0.04820647966116667, + "loss/crossentropy": 3.203866708278656, + "loss/logits": 1.3228682637214662, + "step": 2400 + }, + { + "epoch": 0.0241, + "grad_norm": 7.6875, + "grad_norm_var": 0.18140869140625, + "learning_rate": 0.0003, + "loss": 17.225, + "loss/aux_loss": 0.04822152461856603, + "loss/crossentropy": 3.219542622566223, + "loss/logits": 1.3637619763612747, + "step": 2410 + }, + { + "epoch": 0.0242, + "grad_norm": 7.28125, + "grad_norm_var": 0.19696858723958333, + "learning_rate": 0.0003, + "loss": 17.3445, + "loss/aux_loss": 0.04821507520973682, + "loss/crossentropy": 3.437433052062988, + "loss/logits": 1.4493371307849885, + "step": 2420 + }, + { + "epoch": 0.0243, + "grad_norm": 9.5, + "grad_norm_var": 0.36900634765625, + "learning_rate": 0.0003, + "loss": 17.2769, + "loss/aux_loss": 0.048233349435031415, + "loss/crossentropy": 3.327606177330017, + "loss/logits": 1.3730829060077667, + "step": 2430 + }, + { + "epoch": 0.0244, + "grad_norm": 7.0625, + "grad_norm_var": 0.58326416015625, + "learning_rate": 0.0003, + "loss": 17.1716, + "loss/aux_loss": 0.04821010734885931, + "loss/crossentropy": 3.442364740371704, + "loss/logits": 1.3955539762973785, + "step": 2440 + }, + { + "epoch": 0.0245, + "grad_norm": 7.4375, + "grad_norm_var": 0.19250895182291666, + "learning_rate": 0.0003, + "loss": 17.0851, + "loss/aux_loss": 0.04822715688496828, + "loss/crossentropy": 3.262503242492676, + "loss/logits": 1.3460212230682373, + "step": 2450 + }, + { + "epoch": 0.0246, + "grad_norm": 7.125, + "grad_norm_var": 0.09498697916666667, + "learning_rate": 0.0003, + "loss": 16.9652, + "loss/aux_loss": 0.04821744803339243, + "loss/crossentropy": 3.3741399884223937, + "loss/logits": 1.4004681944847106, + "step": 2460 + }, + { + "epoch": 0.0247, + "grad_norm": 7.875, + "grad_norm_var": 0.21073811848958332, + "learning_rate": 0.0003, + "loss": 16.7236, + "loss/aux_loss": 0.048226969130337236, + "loss/crossentropy": 3.1367203831672668, + "loss/logits": 1.3466423988342284, + "step": 2470 + }, + { + "epoch": 0.0248, + "grad_norm": 9.3125, + "grad_norm_var": 0.46910400390625, + "learning_rate": 0.0003, + "loss": 17.0984, + "loss/aux_loss": 0.048214548453688624, + "loss/crossentropy": 3.2789533734321594, + "loss/logits": 1.3584135174751282, + "step": 2480 + }, + { + "epoch": 0.0249, + "grad_norm": 8.4375, + "grad_norm_var": 0.721337890625, + "learning_rate": 0.0003, + "loss": 17.1085, + "loss/aux_loss": 0.048216362856328486, + "loss/crossentropy": 3.315259212255478, + "loss/logits": 1.396775197982788, + "step": 2490 + }, + { + "epoch": 0.025, + "grad_norm": 7.46875, + "grad_norm_var": 1.4252604166666667, + "learning_rate": 0.0003, + "loss": 16.9872, + "loss/aux_loss": 0.048228930495679376, + "loss/crossentropy": 3.373861300945282, + "loss/logits": 1.3967225074768066, + "step": 2500 + }, + { + "epoch": 0.0251, + "grad_norm": 9.625, + "grad_norm_var": 1.22525634765625, + "learning_rate": 0.0003, + "loss": 17.1017, + "loss/aux_loss": 0.04823280908167362, + "loss/crossentropy": 3.2639551222324372, + "loss/logits": 1.3296611040830613, + "step": 2510 + }, + { + "epoch": 0.0252, + "grad_norm": 8.25, + "grad_norm_var": 18.701493326822916, + "learning_rate": 0.0003, + "loss": 16.9072, + "loss/aux_loss": 0.04824296310544014, + "loss/crossentropy": 3.264930558204651, + "loss/logits": 1.3745314061641694, + "step": 2520 + }, + { + "epoch": 0.0253, + "grad_norm": 7.03125, + "grad_norm_var": 18.965034993489585, + "learning_rate": 0.0003, + "loss": 16.7434, + "loss/aux_loss": 0.048214029893279074, + "loss/crossentropy": 3.231077790260315, + "loss/logits": 1.3421391308307649, + "step": 2530 + }, + { + "epoch": 0.0254, + "grad_norm": 8.375, + "grad_norm_var": 0.3060546875, + "learning_rate": 0.0003, + "loss": 16.8817, + "loss/aux_loss": 0.048227564059197904, + "loss/crossentropy": 3.341559386253357, + "loss/logits": 1.367657434940338, + "step": 2540 + }, + { + "epoch": 0.0255, + "grad_norm": 8.0625, + "grad_norm_var": 0.450634765625, + "learning_rate": 0.0003, + "loss": 16.6762, + "loss/aux_loss": 0.04822465777397156, + "loss/crossentropy": 3.1764264702796936, + "loss/logits": 1.2923425018787384, + "step": 2550 + }, + { + "epoch": 0.0256, + "grad_norm": 8.6875, + "grad_norm_var": 0.9891560872395834, + "learning_rate": 0.0003, + "loss": 16.982, + "loss/aux_loss": 0.04822836928069592, + "loss/crossentropy": 3.1380072832107544, + "loss/logits": 1.329880553483963, + "step": 2560 + }, + { + "epoch": 0.0257, + "grad_norm": 7.0625, + "grad_norm_var": 6.76246337890625, + "learning_rate": 0.0003, + "loss": 16.8472, + "loss/aux_loss": 0.048224599473178385, + "loss/crossentropy": 3.171788203716278, + "loss/logits": 1.3234851002693175, + "step": 2570 + }, + { + "epoch": 0.0258, + "grad_norm": 7.75, + "grad_norm_var": 7.284305826822917, + "learning_rate": 0.0003, + "loss": 16.7384, + "loss/aux_loss": 0.04820049479603768, + "loss/crossentropy": 3.2456391513347627, + "loss/logits": 1.3257298290729522, + "step": 2580 + }, + { + "epoch": 0.0259, + "grad_norm": 8.6875, + "grad_norm_var": 0.70699462890625, + "learning_rate": 0.0003, + "loss": 16.7462, + "loss/aux_loss": 0.04820205494761467, + "loss/crossentropy": 3.2381733298301696, + "loss/logits": 1.3446310222148896, + "step": 2590 + }, + { + "epoch": 0.026, + "grad_norm": 6.875, + "grad_norm_var": 0.3619140625, + "learning_rate": 0.0003, + "loss": 16.7997, + "loss/aux_loss": 0.0482096241787076, + "loss/crossentropy": 3.317233157157898, + "loss/logits": 1.3122055113315583, + "step": 2600 + }, + { + "epoch": 0.0261, + "grad_norm": 7.25, + "grad_norm_var": 0.3204264322916667, + "learning_rate": 0.0003, + "loss": 16.7259, + "loss/aux_loss": 0.04821378495544195, + "loss/crossentropy": 3.224324756860733, + "loss/logits": 1.362189695239067, + "step": 2610 + }, + { + "epoch": 0.0262, + "grad_norm": 7.84375, + "grad_norm_var": 2.25601806640625, + "learning_rate": 0.0003, + "loss": 16.4682, + "loss/aux_loss": 0.04823370911180973, + "loss/crossentropy": 3.174444782733917, + "loss/logits": 1.3091946482658385, + "step": 2620 + }, + { + "epoch": 0.0263, + "grad_norm": 7.4375, + "grad_norm_var": 0.55625, + "learning_rate": 0.0003, + "loss": 16.8187, + "loss/aux_loss": 0.048213068023324014, + "loss/crossentropy": 3.3021878719329836, + "loss/logits": 1.3765088856220244, + "step": 2630 + }, + { + "epoch": 0.0264, + "grad_norm": 7.6875, + "grad_norm_var": 0.21092122395833332, + "learning_rate": 0.0003, + "loss": 16.6867, + "loss/aux_loss": 0.0482132213190198, + "loss/crossentropy": 3.2036616921424867, + "loss/logits": 1.3353200852870941, + "step": 2640 + }, + { + "epoch": 0.0265, + "grad_norm": 7.65625, + "grad_norm_var": 0.17981770833333333, + "learning_rate": 0.0003, + "loss": 16.6249, + "loss/aux_loss": 0.048217184469103815, + "loss/crossentropy": 3.1017094254493713, + "loss/logits": 1.293505471944809, + "step": 2650 + }, + { + "epoch": 0.0266, + "grad_norm": 7.28125, + "grad_norm_var": 0.24312744140625, + "learning_rate": 0.0003, + "loss": 16.8997, + "loss/aux_loss": 0.04821909796446562, + "loss/crossentropy": 3.3471083879470824, + "loss/logits": 1.4033735275268555, + "step": 2660 + }, + { + "epoch": 0.0267, + "grad_norm": 8.25, + "grad_norm_var": 0.14654541015625, + "learning_rate": 0.0003, + "loss": 16.6097, + "loss/aux_loss": 0.04820960406213999, + "loss/crossentropy": 3.306285870075226, + "loss/logits": 1.3414114236831665, + "step": 2670 + }, + { + "epoch": 0.0268, + "grad_norm": 7.53125, + "grad_norm_var": 0.12222900390625, + "learning_rate": 0.0003, + "loss": 16.7267, + "loss/aux_loss": 0.04820468667894602, + "loss/crossentropy": 3.1302775621414183, + "loss/logits": 1.3628006160259247, + "step": 2680 + }, + { + "epoch": 0.0269, + "grad_norm": 7.5, + "grad_norm_var": 0.09217122395833334, + "learning_rate": 0.0003, + "loss": 16.6414, + "loss/aux_loss": 0.04819583874195814, + "loss/crossentropy": 3.252695155143738, + "loss/logits": 1.3152020871639252, + "step": 2690 + }, + { + "epoch": 0.027, + "grad_norm": 6.9375, + "grad_norm_var": 0.1095703125, + "learning_rate": 0.0003, + "loss": 16.5657, + "loss/aux_loss": 0.04820863176137209, + "loss/crossentropy": 3.3271077156066893, + "loss/logits": 1.3572327196598053, + "step": 2700 + }, + { + "epoch": 0.0271, + "grad_norm": 7.125, + "grad_norm_var": 0.19192301432291667, + "learning_rate": 0.0003, + "loss": 16.5538, + "loss/aux_loss": 0.048196819797158244, + "loss/crossentropy": 3.245091903209686, + "loss/logits": 1.3349639832973481, + "step": 2710 + }, + { + "epoch": 0.0272, + "grad_norm": 7.75, + "grad_norm_var": 0.09881184895833334, + "learning_rate": 0.0003, + "loss": 16.6612, + "loss/aux_loss": 0.048208712972700594, + "loss/crossentropy": 3.2764087319374084, + "loss/logits": 1.3150906205177306, + "step": 2720 + }, + { + "epoch": 0.0273, + "grad_norm": 7.78125, + "grad_norm_var": 0.103125, + "learning_rate": 0.0003, + "loss": 16.6158, + "loss/aux_loss": 0.04819247759878635, + "loss/crossentropy": 3.273999774456024, + "loss/logits": 1.3284586131572724, + "step": 2730 + }, + { + "epoch": 0.0274, + "grad_norm": 7.875, + "grad_norm_var": 0.20220947265625, + "learning_rate": 0.0003, + "loss": 16.4436, + "loss/aux_loss": 0.04821184277534485, + "loss/crossentropy": 3.188088583946228, + "loss/logits": 1.3358671367168427, + "step": 2740 + }, + { + "epoch": 0.0275, + "grad_norm": 7.5625, + "grad_norm_var": 0.21302083333333333, + "learning_rate": 0.0003, + "loss": 16.3073, + "loss/aux_loss": 0.04821435939520598, + "loss/crossentropy": 3.136403810977936, + "loss/logits": 1.287468433380127, + "step": 2750 + }, + { + "epoch": 0.0276, + "grad_norm": 7.71875, + "grad_norm_var": 0.320947265625, + "learning_rate": 0.0003, + "loss": 16.5717, + "loss/aux_loss": 0.04819820411503315, + "loss/crossentropy": 3.159669041633606, + "loss/logits": 1.3258511304855347, + "step": 2760 + }, + { + "epoch": 0.0277, + "grad_norm": 7.5625, + "grad_norm_var": 0.45631510416666665, + "learning_rate": 0.0003, + "loss": 16.3568, + "loss/aux_loss": 0.04820672180503607, + "loss/crossentropy": 3.243663287162781, + "loss/logits": 1.2874810814857482, + "step": 2770 + }, + { + "epoch": 0.0278, + "grad_norm": 8.5625, + "grad_norm_var": 0.17890218098958333, + "learning_rate": 0.0003, + "loss": 16.5081, + "loss/aux_loss": 0.04820285327732563, + "loss/crossentropy": 3.083403432369232, + "loss/logits": 1.3088326066732408, + "step": 2780 + }, + { + "epoch": 0.0279, + "grad_norm": 7.09375, + "grad_norm_var": 0.27081705729166666, + "learning_rate": 0.0003, + "loss": 16.4457, + "loss/aux_loss": 0.04822454117238521, + "loss/crossentropy": 3.147365128993988, + "loss/logits": 1.2816300868988038, + "step": 2790 + }, + { + "epoch": 0.028, + "grad_norm": 7.53125, + "grad_norm_var": 0.17732747395833334, + "learning_rate": 0.0003, + "loss": 16.361, + "loss/aux_loss": 0.0482009943574667, + "loss/crossentropy": 3.2210754632949827, + "loss/logits": 1.3175399780273438, + "step": 2800 + }, + { + "epoch": 0.0281, + "grad_norm": 8.625, + "grad_norm_var": 0.15832926432291666, + "learning_rate": 0.0003, + "loss": 16.4841, + "loss/aux_loss": 0.04820490088313818, + "loss/crossentropy": 3.225731301307678, + "loss/logits": 1.3222549259662628, + "step": 2810 + }, + { + "epoch": 0.0282, + "grad_norm": 7.125, + "grad_norm_var": 0.30517171223958334, + "learning_rate": 0.0003, + "loss": 16.3537, + "loss/aux_loss": 0.04823195319622755, + "loss/crossentropy": 3.0282162189483643, + "loss/logits": 1.2549142867326737, + "step": 2820 + }, + { + "epoch": 0.0283, + "grad_norm": 7.03125, + "grad_norm_var": 0.2861328125, + "learning_rate": 0.0003, + "loss": 16.1414, + "loss/aux_loss": 0.04820448886603117, + "loss/crossentropy": 3.1825570702552795, + "loss/logits": 1.2594711065292359, + "step": 2830 + }, + { + "epoch": 0.0284, + "grad_norm": 7.40625, + "grad_norm_var": 0.32847900390625, + "learning_rate": 0.0003, + "loss": 16.3758, + "loss/aux_loss": 0.04820435829460621, + "loss/crossentropy": 3.3334421873092652, + "loss/logits": 1.3914376556873322, + "step": 2840 + }, + { + "epoch": 0.0285, + "grad_norm": 8.0, + "grad_norm_var": 0.41412353515625, + "learning_rate": 0.0003, + "loss": 16.3215, + "loss/aux_loss": 0.048213552497327325, + "loss/crossentropy": 3.167293357849121, + "loss/logits": 1.3143129229545594, + "step": 2850 + }, + { + "epoch": 0.0286, + "grad_norm": 8.5625, + "grad_norm_var": 0.34501546223958335, + "learning_rate": 0.0003, + "loss": 16.343, + "loss/aux_loss": 0.048203857988119124, + "loss/crossentropy": 3.168602633476257, + "loss/logits": 1.3595575094223022, + "step": 2860 + }, + { + "epoch": 0.0287, + "grad_norm": 7.1875, + "grad_norm_var": 26.682645670572917, + "learning_rate": 0.0003, + "loss": 16.2912, + "loss/aux_loss": 0.04821203649044037, + "loss/crossentropy": 3.1967454075813295, + "loss/logits": 1.3017989635467528, + "step": 2870 + }, + { + "epoch": 0.0288, + "grad_norm": 7.40625, + "grad_norm_var": 0.23925374348958334, + "learning_rate": 0.0003, + "loss": 16.4016, + "loss/aux_loss": 0.04820753578096628, + "loss/crossentropy": 3.1777406215667723, + "loss/logits": 1.3188459992408752, + "step": 2880 + }, + { + "epoch": 0.0289, + "grad_norm": 8.5, + "grad_norm_var": 0.16041259765625, + "learning_rate": 0.0003, + "loss": 16.317, + "loss/aux_loss": 0.048205715417861936, + "loss/crossentropy": 3.2767266154289247, + "loss/logits": 1.2999807298183441, + "step": 2890 + }, + { + "epoch": 0.029, + "grad_norm": 7.53125, + "grad_norm_var": 0.20714518229166667, + "learning_rate": 0.0003, + "loss": 16.2915, + "loss/aux_loss": 0.04820862989872694, + "loss/crossentropy": 3.1509845733642576, + "loss/logits": 1.2746458113193513, + "step": 2900 + }, + { + "epoch": 0.0291, + "grad_norm": 9.0625, + "grad_norm_var": 0.45621337890625, + "learning_rate": 0.0003, + "loss": 16.3776, + "loss/aux_loss": 0.0481941731646657, + "loss/crossentropy": 3.201290011405945, + "loss/logits": 1.2829424917697907, + "step": 2910 + }, + { + "epoch": 0.0292, + "grad_norm": 7.34375, + "grad_norm_var": 0.30279541015625, + "learning_rate": 0.0003, + "loss": 16.4304, + "loss/aux_loss": 0.048205789737403394, + "loss/crossentropy": 3.203648090362549, + "loss/logits": 1.3133781254291534, + "step": 2920 + }, + { + "epoch": 0.0293, + "grad_norm": 8.0, + "grad_norm_var": 0.14918212890625, + "learning_rate": 0.0003, + "loss": 16.2822, + "loss/aux_loss": 0.048185784742236136, + "loss/crossentropy": 3.1988660097122192, + "loss/logits": 1.2729051291942597, + "step": 2930 + }, + { + "epoch": 0.0294, + "grad_norm": 6.6875, + "grad_norm_var": 0.37200520833333334, + "learning_rate": 0.0003, + "loss": 16.0591, + "loss/aux_loss": 0.04820053558796644, + "loss/crossentropy": 3.2543280601501463, + "loss/logits": 1.2817346930503846, + "step": 2940 + }, + { + "epoch": 0.0295, + "grad_norm": 8.875, + "grad_norm_var": 0.5007161458333333, + "learning_rate": 0.0003, + "loss": 16.1596, + "loss/aux_loss": 0.048183665983378886, + "loss/crossentropy": 3.2304122924804686, + "loss/logits": 1.3286712884902954, + "step": 2950 + }, + { + "epoch": 0.0296, + "grad_norm": 7.8125, + "grad_norm_var": 13.414176432291667, + "learning_rate": 0.0003, + "loss": 16.1869, + "loss/aux_loss": 0.048213465884327886, + "loss/crossentropy": 3.363071584701538, + "loss/logits": 1.3079668641090394, + "step": 2960 + }, + { + "epoch": 0.0297, + "grad_norm": 7.375, + "grad_norm_var": 14.020833333333334, + "learning_rate": 0.0003, + "loss": 15.9104, + "loss/aux_loss": 0.048193281330168244, + "loss/crossentropy": 3.13505756855011, + "loss/logits": 1.331637018918991, + "step": 2970 + }, + { + "epoch": 0.0298, + "grad_norm": 7.40625, + "grad_norm_var": 0.18229166666666666, + "learning_rate": 0.0003, + "loss": 16.3034, + "loss/aux_loss": 0.04818958211690187, + "loss/crossentropy": 3.2140108823776243, + "loss/logits": 1.3344007432460785, + "step": 2980 + }, + { + "epoch": 0.0299, + "grad_norm": 7.46875, + "grad_norm_var": 0.428369140625, + "learning_rate": 0.0003, + "loss": 16.2893, + "loss/aux_loss": 0.048207861743867396, + "loss/crossentropy": 3.2560169219970705, + "loss/logits": 1.279381561279297, + "step": 2990 + }, + { + "epoch": 0.03, + "grad_norm": 8.75, + "grad_norm_var": 2.956734212239583, + "learning_rate": 0.0003, + "loss": 16.2927, + "loss/aux_loss": 0.04818707294762135, + "loss/crossentropy": 3.217176949977875, + "loss/logits": 1.3047203302383423, + "step": 3000 + }, + { + "epoch": 0.0301, + "grad_norm": 9.125, + "grad_norm_var": 0.6278645833333333, + "learning_rate": 0.0003, + "loss": 16.2956, + "loss/aux_loss": 0.048182461969554426, + "loss/crossentropy": 3.2268913865089415, + "loss/logits": 1.2893227458000183, + "step": 3010 + }, + { + "epoch": 0.0302, + "grad_norm": 7.0625, + "grad_norm_var": 0.35689697265625, + "learning_rate": 0.0003, + "loss": 15.8782, + "loss/aux_loss": 0.04818679504096508, + "loss/crossentropy": 3.0483377814292907, + "loss/logits": 1.3037330031394958, + "step": 3020 + }, + { + "epoch": 0.0303, + "grad_norm": 6.65625, + "grad_norm_var": 0.13655192057291668, + "learning_rate": 0.0003, + "loss": 16.2123, + "loss/aux_loss": 0.04818045124411583, + "loss/crossentropy": 3.0745912194252014, + "loss/logits": 1.2567619979381561, + "step": 3030 + }, + { + "epoch": 0.0304, + "grad_norm": 7.3125, + "grad_norm_var": 0.1279296875, + "learning_rate": 0.0003, + "loss": 16.2275, + "loss/aux_loss": 0.04818618576973677, + "loss/crossentropy": 3.313616728782654, + "loss/logits": 1.3123571872711182, + "step": 3040 + }, + { + "epoch": 0.0305, + "grad_norm": 6.8125, + "grad_norm_var": 0.306640625, + "learning_rate": 0.0003, + "loss": 15.9426, + "loss/aux_loss": 0.048186902329325675, + "loss/crossentropy": 3.066525948047638, + "loss/logits": 1.2751049637794494, + "step": 3050 + }, + { + "epoch": 0.0306, + "grad_norm": 7.53125, + "grad_norm_var": 9.796805826822917, + "learning_rate": 0.0003, + "loss": 16.1247, + "loss/aux_loss": 0.048196819610893726, + "loss/crossentropy": 3.2423496723175047, + "loss/logits": 1.305676233768463, + "step": 3060 + }, + { + "epoch": 0.0307, + "grad_norm": 6.78125, + "grad_norm_var": 2.33648681640625, + "learning_rate": 0.0003, + "loss": 15.9665, + "loss/aux_loss": 0.04817989952862263, + "loss/crossentropy": 3.178650379180908, + "loss/logits": 1.2622127085924149, + "step": 3070 + }, + { + "epoch": 0.0308, + "grad_norm": 8.125, + "grad_norm_var": 0.16197916666666667, + "learning_rate": 0.0003, + "loss": 15.8411, + "loss/aux_loss": 0.048175792768597604, + "loss/crossentropy": 3.2178627133369444, + "loss/logits": 1.2681122601032258, + "step": 3080 + }, + { + "epoch": 0.0309, + "grad_norm": 7.90625, + "grad_norm_var": 0.19068603515625, + "learning_rate": 0.0003, + "loss": 15.9343, + "loss/aux_loss": 0.04819290656596422, + "loss/crossentropy": 3.0956594944000244, + "loss/logits": 1.2836836636066438, + "step": 3090 + }, + { + "epoch": 0.031, + "grad_norm": 6.8125, + "grad_norm_var": 0.12760009765625, + "learning_rate": 0.0003, + "loss": 16.1508, + "loss/aux_loss": 0.04819896165281534, + "loss/crossentropy": 3.242776608467102, + "loss/logits": 1.260975569486618, + "step": 3100 + }, + { + "epoch": 0.0311, + "grad_norm": 6.75, + "grad_norm_var": 0.20302327473958334, + "learning_rate": 0.0003, + "loss": 15.8037, + "loss/aux_loss": 0.048188280686736105, + "loss/crossentropy": 3.0742504239082336, + "loss/logits": 1.247715598344803, + "step": 3110 + }, + { + "epoch": 0.0312, + "grad_norm": 7.5, + "grad_norm_var": 0.09726155598958333, + "learning_rate": 0.0003, + "loss": 16.1331, + "loss/aux_loss": 0.048201543465256694, + "loss/crossentropy": 3.1597721457481383, + "loss/logits": 1.2908858835697175, + "step": 3120 + }, + { + "epoch": 0.0313, + "grad_norm": 7.6875, + "grad_norm_var": 0.5277303059895834, + "learning_rate": 0.0003, + "loss": 15.8831, + "loss/aux_loss": 0.04819868616759777, + "loss/crossentropy": 3.1610820293426514, + "loss/logits": 1.2815950632095336, + "step": 3130 + }, + { + "epoch": 0.0314, + "grad_norm": 7.3125, + "grad_norm_var": 0.3472493489583333, + "learning_rate": 0.0003, + "loss": 16.038, + "loss/aux_loss": 0.048184423707425594, + "loss/crossentropy": 3.1954041719436646, + "loss/logits": 1.278364223241806, + "step": 3140 + }, + { + "epoch": 0.0315, + "grad_norm": 7.1875, + "grad_norm_var": 0.039567057291666666, + "learning_rate": 0.0003, + "loss": 15.916, + "loss/aux_loss": 0.048186035640537736, + "loss/crossentropy": 3.1560078144073485, + "loss/logits": 1.3042196780443192, + "step": 3150 + }, + { + "epoch": 0.0316, + "grad_norm": 7.75, + "grad_norm_var": 0.35128580729166664, + "learning_rate": 0.0003, + "loss": 15.9454, + "loss/aux_loss": 0.04819800220429897, + "loss/crossentropy": 3.1910813629627226, + "loss/logits": 1.2715374946594238, + "step": 3160 + }, + { + "epoch": 0.0317, + "grad_norm": 8.4375, + "grad_norm_var": 0.6532389322916666, + "learning_rate": 0.0003, + "loss": 15.6996, + "loss/aux_loss": 0.048211091198027134, + "loss/crossentropy": 3.116113305091858, + "loss/logits": 1.2663143903017045, + "step": 3170 + }, + { + "epoch": 0.0318, + "grad_norm": 7.53125, + "grad_norm_var": 5.35582275390625, + "learning_rate": 0.0003, + "loss": 16.1554, + "loss/aux_loss": 0.048189323768019675, + "loss/crossentropy": 3.1312987327575685, + "loss/logits": 1.2820051074028016, + "step": 3180 + }, + { + "epoch": 0.0319, + "grad_norm": 7.65625, + "grad_norm_var": 5.358837890625, + "learning_rate": 0.0003, + "loss": 15.9255, + "loss/aux_loss": 0.04818729739636183, + "loss/crossentropy": 3.0685723185539246, + "loss/logits": 1.2486902892589569, + "step": 3190 + }, + { + "epoch": 0.032, + "grad_norm": 7.09375, + "grad_norm_var": 0.20826416015625, + "learning_rate": 0.0003, + "loss": 15.8965, + "loss/aux_loss": 0.04819039478898048, + "loss/crossentropy": 3.235087752342224, + "loss/logits": 1.253222393989563, + "step": 3200 + }, + { + "epoch": 0.0321, + "grad_norm": 7.53125, + "grad_norm_var": 0.9037760416666667, + "learning_rate": 0.0003, + "loss": 16.0786, + "loss/aux_loss": 0.0481830982491374, + "loss/crossentropy": 3.1696943759918215, + "loss/logits": 1.2844607293605805, + "step": 3210 + }, + { + "epoch": 0.0322, + "grad_norm": 7.4375, + "grad_norm_var": 0.9649739583333333, + "learning_rate": 0.0003, + "loss": 15.7072, + "loss/aux_loss": 0.04817593917250633, + "loss/crossentropy": 3.0222031831741334, + "loss/logits": 1.2671060264110565, + "step": 3220 + }, + { + "epoch": 0.0323, + "grad_norm": 7.4375, + "grad_norm_var": 0.08889567057291667, + "learning_rate": 0.0003, + "loss": 15.7409, + "loss/aux_loss": 0.04818395711481571, + "loss/crossentropy": 3.200468099117279, + "loss/logits": 1.3267085552215576, + "step": 3230 + }, + { + "epoch": 0.0324, + "grad_norm": 10.4375, + "grad_norm_var": 188.03019205729166, + "learning_rate": 0.0003, + "loss": 15.8984, + "loss/aux_loss": 0.0481902739033103, + "loss/crossentropy": 3.126142477989197, + "loss/logits": 1.2616908073425293, + "step": 3240 + }, + { + "epoch": 0.0325, + "grad_norm": 7.34375, + "grad_norm_var": 188.55543212890626, + "learning_rate": 0.0003, + "loss": 15.9279, + "loss/aux_loss": 0.048199089244008064, + "loss/crossentropy": 3.1426218867301943, + "loss/logits": 1.2768731236457824, + "step": 3250 + }, + { + "epoch": 0.0326, + "grad_norm": 7.0, + "grad_norm_var": 0.15623372395833332, + "learning_rate": 0.0003, + "loss": 15.6804, + "loss/aux_loss": 0.04819824192672968, + "loss/crossentropy": 3.130205762386322, + "loss/logits": 1.2509568214416504, + "step": 3260 + }, + { + "epoch": 0.0327, + "grad_norm": 7.03125, + "grad_norm_var": 0.19342041015625, + "learning_rate": 0.0003, + "loss": 15.7794, + "loss/aux_loss": 0.048181666433811186, + "loss/crossentropy": 3.203247845172882, + "loss/logits": 1.248792153596878, + "step": 3270 + }, + { + "epoch": 0.0328, + "grad_norm": 6.90625, + "grad_norm_var": 0.3446451822916667, + "learning_rate": 0.0003, + "loss": 15.6809, + "loss/aux_loss": 0.04819200746715069, + "loss/crossentropy": 3.2286102890968325, + "loss/logits": 1.2171394854784012, + "step": 3280 + }, + { + "epoch": 0.0329, + "grad_norm": 6.75, + "grad_norm_var": 0.19798177083333332, + "learning_rate": 0.0003, + "loss": 15.677, + "loss/aux_loss": 0.048186296969652175, + "loss/crossentropy": 3.0580495953559876, + "loss/logits": 1.2661554515361786, + "step": 3290 + }, + { + "epoch": 0.033, + "grad_norm": 7.0, + "grad_norm_var": 76.57779541015626, + "learning_rate": 0.0003, + "loss": 15.86, + "loss/aux_loss": 0.04821221027523279, + "loss/crossentropy": 2.969318687915802, + "loss/logits": 1.2122164875268937, + "step": 3300 + }, + { + "epoch": 0.0331, + "grad_norm": 7.09375, + "grad_norm_var": 0.06282145182291667, + "learning_rate": 0.0003, + "loss": 15.7174, + "loss/aux_loss": 0.048201913200318816, + "loss/crossentropy": 3.2775181770324706, + "loss/logits": 1.2739900410175324, + "step": 3310 + }, + { + "epoch": 0.0332, + "grad_norm": 7.375, + "grad_norm_var": 0.08489583333333334, + "learning_rate": 0.0003, + "loss": 15.6614, + "loss/aux_loss": 0.048193711787462234, + "loss/crossentropy": 3.079313504695892, + "loss/logits": 1.2485756576061249, + "step": 3320 + }, + { + "epoch": 0.0333, + "grad_norm": 7.125, + "grad_norm_var": 0.068994140625, + "learning_rate": 0.0003, + "loss": 15.8517, + "loss/aux_loss": 0.048206409066915513, + "loss/crossentropy": 3.030042564868927, + "loss/logits": 1.1876587241888046, + "step": 3330 + }, + { + "epoch": 0.0334, + "grad_norm": 7.625, + "grad_norm_var": 0.196337890625, + "learning_rate": 0.0003, + "loss": 15.6156, + "loss/aux_loss": 0.04819293972104788, + "loss/crossentropy": 3.024798274040222, + "loss/logits": 1.2279581785202027, + "step": 3340 + }, + { + "epoch": 0.0335, + "grad_norm": 8.1875, + "grad_norm_var": 0.192822265625, + "learning_rate": 0.0003, + "loss": 15.6715, + "loss/aux_loss": 0.048190113715827466, + "loss/crossentropy": 3.1095346808433533, + "loss/logits": 1.2218665778636932, + "step": 3350 + }, + { + "epoch": 0.0336, + "grad_norm": 7.875, + "grad_norm_var": 0.8559733072916667, + "learning_rate": 0.0003, + "loss": 15.4338, + "loss/aux_loss": 0.04819390587508678, + "loss/crossentropy": 3.0634355664253237, + "loss/logits": 1.2227939546108246, + "step": 3360 + }, + { + "epoch": 0.0337, + "grad_norm": 6.84375, + "grad_norm_var": 0.8440104166666667, + "learning_rate": 0.0003, + "loss": 15.6555, + "loss/aux_loss": 0.04817529227584601, + "loss/crossentropy": 3.2624236226081846, + "loss/logits": 1.2496430993080139, + "step": 3370 + }, + { + "epoch": 0.0338, + "grad_norm": 7.25, + "grad_norm_var": 0.08365478515625, + "learning_rate": 0.0003, + "loss": 15.7475, + "loss/aux_loss": 0.048181839287281036, + "loss/crossentropy": 3.000634413957596, + "loss/logits": 1.2105579853057862, + "step": 3380 + }, + { + "epoch": 0.0339, + "grad_norm": 7.71875, + "grad_norm_var": 0.118212890625, + "learning_rate": 0.0003, + "loss": 15.878, + "loss/aux_loss": 0.04818106349557638, + "loss/crossentropy": 3.1646122694015504, + "loss/logits": 1.2451474606990813, + "step": 3390 + }, + { + "epoch": 0.034, + "grad_norm": 8.125, + "grad_norm_var": 0.15712483723958334, + "learning_rate": 0.0003, + "loss": 15.5974, + "loss/aux_loss": 0.04819212630391121, + "loss/crossentropy": 3.0883402824401855, + "loss/logits": 1.231631088256836, + "step": 3400 + }, + { + "epoch": 0.0341, + "grad_norm": 7.90625, + "grad_norm_var": 43.145894368489586, + "learning_rate": 0.0003, + "loss": 15.6349, + "loss/aux_loss": 0.048203271254897116, + "loss/crossentropy": 3.0851239562034607, + "loss/logits": 1.2473519384860992, + "step": 3410 + }, + { + "epoch": 0.0342, + "grad_norm": 8.125, + "grad_norm_var": 42.346354166666664, + "learning_rate": 0.0003, + "loss": 15.6873, + "loss/aux_loss": 0.04819300062954426, + "loss/crossentropy": 3.2211881279945374, + "loss/logits": 1.2318155229091645, + "step": 3420 + }, + { + "epoch": 0.0343, + "grad_norm": 7.28125, + "grad_norm_var": 0.29765218098958335, + "learning_rate": 0.0003, + "loss": 15.6778, + "loss/aux_loss": 0.04818199146538973, + "loss/crossentropy": 3.163746166229248, + "loss/logits": 1.2294625520706177, + "step": 3430 + }, + { + "epoch": 0.0344, + "grad_norm": 6.875, + "grad_norm_var": 0.31027018229166664, + "learning_rate": 0.0003, + "loss": 15.859, + "loss/aux_loss": 0.04818481933325529, + "loss/crossentropy": 3.2157267451286318, + "loss/logits": 1.2203275740146637, + "step": 3440 + }, + { + "epoch": 0.0345, + "grad_norm": 7.46875, + "grad_norm_var": 0.4372355143229167, + "learning_rate": 0.0003, + "loss": 15.4943, + "loss/aux_loss": 0.04818508345633745, + "loss/crossentropy": 2.981385588645935, + "loss/logits": 1.2513148784637451, + "step": 3450 + }, + { + "epoch": 0.0346, + "grad_norm": 7.96875, + "grad_norm_var": 0.27421875, + "learning_rate": 0.0003, + "loss": 15.6128, + "loss/aux_loss": 0.048185240291059014, + "loss/crossentropy": 3.065082919597626, + "loss/logits": 1.2668458700180054, + "step": 3460 + }, + { + "epoch": 0.0347, + "grad_norm": 7.34375, + "grad_norm_var": 0.0998046875, + "learning_rate": 0.0003, + "loss": 15.6479, + "loss/aux_loss": 0.04818276725709438, + "loss/crossentropy": 3.196527397632599, + "loss/logits": 1.2704426288604735, + "step": 3470 + }, + { + "epoch": 0.0348, + "grad_norm": 7.53125, + "grad_norm_var": 77.90299072265626, + "learning_rate": 0.0003, + "loss": 15.213, + "loss/aux_loss": 0.04820560179650783, + "loss/crossentropy": 3.0156648635864256, + "loss/logits": 1.1708497077226638, + "step": 3480 + }, + { + "epoch": 0.0349, + "grad_norm": 6.5, + "grad_norm_var": 0.52232666015625, + "learning_rate": 0.0003, + "loss": 15.5807, + "loss/aux_loss": 0.04817814268171787, + "loss/crossentropy": 3.3053033113479615, + "loss/logits": 1.2551276683807373, + "step": 3490 + }, + { + "epoch": 0.035, + "grad_norm": 7.25, + "grad_norm_var": 0.22096354166666668, + "learning_rate": 0.0003, + "loss": 15.4642, + "loss/aux_loss": 0.048177143558859825, + "loss/crossentropy": 3.1717012524604797, + "loss/logits": 1.2090398788452148, + "step": 3500 + }, + { + "epoch": 0.0351, + "grad_norm": 7.34375, + "grad_norm_var": 0.17125244140625, + "learning_rate": 0.0003, + "loss": 15.469, + "loss/aux_loss": 0.04816991053521633, + "loss/crossentropy": 3.0794217944145204, + "loss/logits": 1.2378638923168181, + "step": 3510 + }, + { + "epoch": 0.0352, + "grad_norm": 7.34375, + "grad_norm_var": 0.5747233072916667, + "learning_rate": 0.0003, + "loss": 15.3886, + "loss/aux_loss": 0.04817315954715014, + "loss/crossentropy": 3.1408966541290284, + "loss/logits": 1.1839520275592803, + "step": 3520 + }, + { + "epoch": 0.0353, + "grad_norm": 8.375, + "grad_norm_var": 0.579541015625, + "learning_rate": 0.0003, + "loss": 15.4735, + "loss/aux_loss": 0.04817787241190672, + "loss/crossentropy": 3.1789328932762144, + "loss/logits": 1.241087591648102, + "step": 3530 + }, + { + "epoch": 0.0354, + "grad_norm": 7.1875, + "grad_norm_var": 0.35953369140625, + "learning_rate": 0.0003, + "loss": 15.4254, + "loss/aux_loss": 0.04818295389413833, + "loss/crossentropy": 3.1350058197975157, + "loss/logits": 1.218758872151375, + "step": 3540 + }, + { + "epoch": 0.0355, + "grad_norm": 7.5, + "grad_norm_var": 0.14218343098958333, + "learning_rate": 0.0003, + "loss": 15.4481, + "loss/aux_loss": 0.048182402923703196, + "loss/crossentropy": 3.1021770238876343, + "loss/logits": 1.157341206073761, + "step": 3550 + }, + { + "epoch": 0.0356, + "grad_norm": 7.5625, + "grad_norm_var": 0.220947265625, + "learning_rate": 0.0003, + "loss": 15.6278, + "loss/aux_loss": 0.048180959187448025, + "loss/crossentropy": 3.1709139943122864, + "loss/logits": 1.2263819336891175, + "step": 3560 + }, + { + "epoch": 0.0357, + "grad_norm": 7.375, + "grad_norm_var": 0.3221638997395833, + "learning_rate": 0.0003, + "loss": 15.3394, + "loss/aux_loss": 0.04818211700767279, + "loss/crossentropy": 3.2739925384521484, + "loss/logits": 1.2379388093948365, + "step": 3570 + }, + { + "epoch": 0.0358, + "grad_norm": 6.90625, + "grad_norm_var": 0.8153605143229167, + "learning_rate": 0.0003, + "loss": 15.5107, + "loss/aux_loss": 0.04817323740571737, + "loss/crossentropy": 3.136904263496399, + "loss/logits": 1.2624209761619567, + "step": 3580 + }, + { + "epoch": 0.0359, + "grad_norm": 7.4375, + "grad_norm_var": 0.128125, + "learning_rate": 0.0003, + "loss": 15.654, + "loss/aux_loss": 0.04819139763712883, + "loss/crossentropy": 3.1375385880470277, + "loss/logits": 1.2516057163476944, + "step": 3590 + }, + { + "epoch": 0.036, + "grad_norm": 6.75, + "grad_norm_var": 0.09338785807291666, + "learning_rate": 0.0003, + "loss": 15.5777, + "loss/aux_loss": 0.04818712417036295, + "loss/crossentropy": 3.3446964859962462, + "loss/logits": 1.214635932445526, + "step": 3600 + }, + { + "epoch": 0.0361, + "grad_norm": 7.46875, + "grad_norm_var": 0.10930582682291666, + "learning_rate": 0.0003, + "loss": 15.4192, + "loss/aux_loss": 0.04818251971155405, + "loss/crossentropy": 3.177507519721985, + "loss/logits": 1.2218758046627045, + "step": 3610 + }, + { + "epoch": 0.0362, + "grad_norm": 8.75, + "grad_norm_var": 0.21578369140625, + "learning_rate": 0.0003, + "loss": 15.6698, + "loss/aux_loss": 0.048180416226387024, + "loss/crossentropy": 3.1352601170539858, + "loss/logits": 1.2688252985477448, + "step": 3620 + }, + { + "epoch": 0.0363, + "grad_norm": 7.75, + "grad_norm_var": 0.240625, + "learning_rate": 0.0003, + "loss": 15.4033, + "loss/aux_loss": 0.04819142427295446, + "loss/crossentropy": 3.1684580206871034, + "loss/logits": 1.2518825322389602, + "step": 3630 + }, + { + "epoch": 0.0364, + "grad_norm": 7.25, + "grad_norm_var": 0.072509765625, + "learning_rate": 0.0003, + "loss": 15.3378, + "loss/aux_loss": 0.04819103125482797, + "loss/crossentropy": 3.210425066947937, + "loss/logits": 1.2049184322357178, + "step": 3640 + }, + { + "epoch": 0.0365, + "grad_norm": 7.0625, + "grad_norm_var": 49.72636311848958, + "learning_rate": 0.0003, + "loss": 15.2234, + "loss/aux_loss": 0.048194903507828714, + "loss/crossentropy": 3.2832493662834166, + "loss/logits": 1.221068474650383, + "step": 3650 + }, + { + "epoch": 0.0366, + "grad_norm": 9.125, + "grad_norm_var": 0.3234375, + "learning_rate": 0.0003, + "loss": 15.3403, + "loss/aux_loss": 0.04818172939121723, + "loss/crossentropy": 3.280470097064972, + "loss/logits": 1.229696273803711, + "step": 3660 + }, + { + "epoch": 0.0367, + "grad_norm": 7.5, + "grad_norm_var": 0.39843343098958334, + "learning_rate": 0.0003, + "loss": 15.3494, + "loss/aux_loss": 0.048176801204681395, + "loss/crossentropy": 3.0316362023353576, + "loss/logits": 1.2116109132766724, + "step": 3670 + }, + { + "epoch": 0.0368, + "grad_norm": 7.84375, + "grad_norm_var": 0.3272786458333333, + "learning_rate": 0.0003, + "loss": 15.4476, + "loss/aux_loss": 0.04817160293459892, + "loss/crossentropy": 3.2188750505447388, + "loss/logits": 1.2286329954862594, + "step": 3680 + }, + { + "epoch": 0.0369, + "grad_norm": 8.5625, + "grad_norm_var": 0.23899332682291666, + "learning_rate": 0.0003, + "loss": 15.3283, + "loss/aux_loss": 0.04817473813891411, + "loss/crossentropy": 3.1428863406181335, + "loss/logits": 1.2700409144163132, + "step": 3690 + }, + { + "epoch": 0.037, + "grad_norm": 7.375, + "grad_norm_var": 0.1796875, + "learning_rate": 0.0003, + "loss": 15.2114, + "loss/aux_loss": 0.048194908909499644, + "loss/crossentropy": 2.9049007534980773, + "loss/logits": 1.209693717956543, + "step": 3700 + }, + { + "epoch": 0.0371, + "grad_norm": 7.15625, + "grad_norm_var": 4.603153483072917, + "learning_rate": 0.0003, + "loss": 15.4273, + "loss/aux_loss": 0.04818948246538639, + "loss/crossentropy": 3.1089539527893066, + "loss/logits": 1.231841367483139, + "step": 3710 + }, + { + "epoch": 0.0372, + "grad_norm": 7.5625, + "grad_norm_var": 0.122119140625, + "learning_rate": 0.0003, + "loss": 15.3983, + "loss/aux_loss": 0.04818321000784635, + "loss/crossentropy": 2.9950480341911314, + "loss/logits": 1.1607284903526307, + "step": 3720 + }, + { + "epoch": 0.0373, + "grad_norm": 7.4375, + "grad_norm_var": 0.15904947916666667, + "learning_rate": 0.0003, + "loss": 15.1891, + "loss/aux_loss": 0.04817815236747265, + "loss/crossentropy": 3.161143660545349, + "loss/logits": 1.238188961148262, + "step": 3730 + }, + { + "epoch": 0.0374, + "grad_norm": 7.03125, + "grad_norm_var": 0.3502604166666667, + "learning_rate": 0.0003, + "loss": 15.3511, + "loss/aux_loss": 0.04818747155368328, + "loss/crossentropy": 3.086538052558899, + "loss/logits": 1.1786428213119506, + "step": 3740 + }, + { + "epoch": 0.0375, + "grad_norm": 7.46875, + "grad_norm_var": 0.11708577473958333, + "learning_rate": 0.0003, + "loss": 15.4379, + "loss/aux_loss": 0.04817507416009903, + "loss/crossentropy": 3.203218102455139, + "loss/logits": 1.2379136860370636, + "step": 3750 + }, + { + "epoch": 0.0376, + "grad_norm": 7.03125, + "grad_norm_var": 0.09334309895833333, + "learning_rate": 0.0003, + "loss": 15.1209, + "loss/aux_loss": 0.048195258155465125, + "loss/crossentropy": 2.8767175674438477, + "loss/logits": 1.135795423388481, + "step": 3760 + }, + { + "epoch": 0.0377, + "grad_norm": 8.0, + "grad_norm_var": 0.5704386393229167, + "learning_rate": 0.0003, + "loss": 15.1931, + "loss/aux_loss": 0.04818210508674383, + "loss/crossentropy": 3.0543838024139403, + "loss/logits": 1.1589192599058151, + "step": 3770 + }, + { + "epoch": 0.0378, + "grad_norm": 8.6875, + "grad_norm_var": 0.9666015625, + "learning_rate": 0.0003, + "loss": 15.3202, + "loss/aux_loss": 0.04818203579634428, + "loss/crossentropy": 3.1746195673942568, + "loss/logits": 1.2106265246868133, + "step": 3780 + }, + { + "epoch": 0.0379, + "grad_norm": 9.125, + "grad_norm_var": 54.760921223958334, + "learning_rate": 0.0003, + "loss": 15.0472, + "loss/aux_loss": 0.04817959927022457, + "loss/crossentropy": 3.1303970336914064, + "loss/logits": 1.1739704608917236, + "step": 3790 + }, + { + "epoch": 0.038, + "grad_norm": 7.28125, + "grad_norm_var": 55.79407552083333, + "learning_rate": 0.0003, + "loss": 15.335, + "loss/aux_loss": 0.048162929527461526, + "loss/crossentropy": 3.1775804996490478, + "loss/logits": 1.2157859086990357, + "step": 3800 + }, + { + "epoch": 0.0381, + "grad_norm": 7.4375, + "grad_norm_var": 0.09794514973958333, + "learning_rate": 0.0003, + "loss": 15.4016, + "loss/aux_loss": 0.048171533085405824, + "loss/crossentropy": 3.184191071987152, + "loss/logits": 1.200016838312149, + "step": 3810 + }, + { + "epoch": 0.0382, + "grad_norm": 8.125, + "grad_norm_var": 0.15972900390625, + "learning_rate": 0.0003, + "loss": 15.2867, + "loss/aux_loss": 0.048174711503088476, + "loss/crossentropy": 3.093715155124664, + "loss/logits": 1.1898449569940568, + "step": 3820 + }, + { + "epoch": 0.0383, + "grad_norm": 7.53125, + "grad_norm_var": 0.28216145833333334, + "learning_rate": 0.0003, + "loss": 15.179, + "loss/aux_loss": 0.04817448034882545, + "loss/crossentropy": 3.1919341683387756, + "loss/logits": 1.2327097624540329, + "step": 3830 + }, + { + "epoch": 0.0384, + "grad_norm": 16.25, + "grad_norm_var": 4.76041259765625, + "learning_rate": 0.0003, + "loss": 15.1992, + "loss/aux_loss": 0.048165909759700296, + "loss/crossentropy": 3.132761836051941, + "loss/logits": 1.1952956855297088, + "step": 3840 + }, + { + "epoch": 0.0385, + "grad_norm": 7.96875, + "grad_norm_var": 5.12109375, + "learning_rate": 0.0003, + "loss": 15.2487, + "loss/aux_loss": 0.0481891430914402, + "loss/crossentropy": 2.9917232036590575, + "loss/logits": 1.1668372660875321, + "step": 3850 + }, + { + "epoch": 0.0386, + "grad_norm": 7.9375, + "grad_norm_var": 0.13531494140625, + "learning_rate": 0.0003, + "loss": 15.1914, + "loss/aux_loss": 0.0481577729806304, + "loss/crossentropy": 3.071371626853943, + "loss/logits": 1.2018966376781464, + "step": 3860 + }, + { + "epoch": 0.0387, + "grad_norm": 7.46875, + "grad_norm_var": 0.12994791666666666, + "learning_rate": 0.0003, + "loss": 15.0261, + "loss/aux_loss": 0.04815917555242777, + "loss/crossentropy": 3.0264050543308256, + "loss/logits": 1.1740915864706039, + "step": 3870 + }, + { + "epoch": 0.0388, + "grad_norm": 7.4375, + "grad_norm_var": 0.2575154622395833, + "learning_rate": 0.0003, + "loss": 15.1013, + "loss/aux_loss": 0.04817144125699997, + "loss/crossentropy": 3.0360918402671815, + "loss/logits": 1.152667647600174, + "step": 3880 + }, + { + "epoch": 0.0389, + "grad_norm": 8.1875, + "grad_norm_var": 0.21964518229166666, + "learning_rate": 0.0003, + "loss": 15.0667, + "loss/aux_loss": 0.04816736020147801, + "loss/crossentropy": 3.1190317153930662, + "loss/logits": 1.185880446434021, + "step": 3890 + }, + { + "epoch": 0.039, + "grad_norm": 9.0625, + "grad_norm_var": 0.39088134765625, + "learning_rate": 0.0003, + "loss": 15.1793, + "loss/aux_loss": 0.04816778711974621, + "loss/crossentropy": 3.00163277387619, + "loss/logits": 1.1773586809635161, + "step": 3900 + }, + { + "epoch": 0.0391, + "grad_norm": 7.65625, + "grad_norm_var": 0.35870768229166666, + "learning_rate": 0.0003, + "loss": 15.1278, + "loss/aux_loss": 0.04818731751292944, + "loss/crossentropy": 2.939511752128601, + "loss/logits": 1.2299345314502717, + "step": 3910 + }, + { + "epoch": 0.0392, + "grad_norm": 7.5, + "grad_norm_var": 0.09436442057291666, + "learning_rate": 0.0003, + "loss": 15.0897, + "loss/aux_loss": 0.0481788320466876, + "loss/crossentropy": 3.0653002142906187, + "loss/logits": 1.1968173742294312, + "step": 3920 + }, + { + "epoch": 0.0393, + "grad_norm": 7.28125, + "grad_norm_var": 0.16145426432291668, + "learning_rate": 0.0003, + "loss": 15.0164, + "loss/aux_loss": 0.04818780794739723, + "loss/crossentropy": 2.995146155357361, + "loss/logits": 1.155534029006958, + "step": 3930 + }, + { + "epoch": 0.0394, + "grad_norm": 7.78125, + "grad_norm_var": 0.11588134765625, + "learning_rate": 0.0003, + "loss": 15.0909, + "loss/aux_loss": 0.04817194156348705, + "loss/crossentropy": 2.9778522849082947, + "loss/logits": 1.188610589504242, + "step": 3940 + }, + { + "epoch": 0.0395, + "grad_norm": 8.5, + "grad_norm_var": 0.22825520833333332, + "learning_rate": 0.0003, + "loss": 14.9485, + "loss/aux_loss": 0.04817061126232147, + "loss/crossentropy": 3.0706464409828187, + "loss/logits": 1.1276392668485642, + "step": 3950 + }, + { + "epoch": 0.0396, + "grad_norm": 7.59375, + "grad_norm_var": 0.1765625, + "learning_rate": 0.0003, + "loss": 15.1761, + "loss/aux_loss": 0.04817082397639751, + "loss/crossentropy": 3.0294368386268617, + "loss/logits": 1.1730817139148713, + "step": 3960 + }, + { + "epoch": 0.0397, + "grad_norm": 7.78125, + "grad_norm_var": 0.37659098307291666, + "learning_rate": 0.0003, + "loss": 14.9544, + "loss/aux_loss": 0.048177217692136766, + "loss/crossentropy": 3.003262734413147, + "loss/logits": 1.1717498630285264, + "step": 3970 + }, + { + "epoch": 0.0398, + "grad_norm": 7.59375, + "grad_norm_var": 0.46226806640625, + "learning_rate": 0.0003, + "loss": 15.0399, + "loss/aux_loss": 0.04816052261739969, + "loss/crossentropy": 3.108014762401581, + "loss/logits": 1.1962302416563033, + "step": 3980 + }, + { + "epoch": 0.0399, + "grad_norm": 8.0625, + "grad_norm_var": 0.15533854166666666, + "learning_rate": 0.0003, + "loss": 15.0799, + "loss/aux_loss": 0.04815749432891607, + "loss/crossentropy": 3.185481405258179, + "loss/logits": 1.2161674737930297, + "step": 3990 + }, + { + "epoch": 0.04, + "grad_norm": 8.25, + "grad_norm_var": 0.155712890625, + "learning_rate": 0.0003, + "loss": 15.0788, + "loss/aux_loss": 0.0481644194573164, + "loss/crossentropy": 3.1585240364074707, + "loss/logits": 1.1965474605560302, + "step": 4000 + }, + { + "epoch": 0.0401, + "grad_norm": 7.21875, + "grad_norm_var": 0.21419270833333334, + "learning_rate": 0.0003, + "loss": 15.0548, + "loss/aux_loss": 0.04816298447549343, + "loss/crossentropy": 3.1302665889263155, + "loss/logits": 1.1530128061771392, + "step": 4010 + }, + { + "epoch": 0.0402, + "grad_norm": 7.53125, + "grad_norm_var": 0.13173421223958334, + "learning_rate": 0.0003, + "loss": 15.0497, + "loss/aux_loss": 0.04816345106810331, + "loss/crossentropy": 3.142714560031891, + "loss/logits": 1.1879263758659362, + "step": 4020 + }, + { + "epoch": 0.0403, + "grad_norm": 7.6875, + "grad_norm_var": 0.06339518229166667, + "learning_rate": 0.0003, + "loss": 14.9433, + "loss/aux_loss": 0.04815952125936747, + "loss/crossentropy": 3.0843304634094237, + "loss/logits": 1.2199938654899598, + "step": 4030 + }, + { + "epoch": 0.0404, + "grad_norm": 6.875, + "grad_norm_var": 0.15402018229166667, + "learning_rate": 0.0003, + "loss": 15.19, + "loss/aux_loss": 0.04817276708781719, + "loss/crossentropy": 3.128165376186371, + "loss/logits": 1.1765313237905501, + "step": 4040 + }, + { + "epoch": 0.0405, + "grad_norm": 30.375, + "grad_norm_var": 32.932275390625, + "learning_rate": 0.0003, + "loss": 14.8461, + "loss/aux_loss": 0.048168274387717244, + "loss/crossentropy": 3.005221629142761, + "loss/logits": 1.1654815077781677, + "step": 4050 + }, + { + "epoch": 0.0406, + "grad_norm": 7.59375, + "grad_norm_var": 32.006754557291664, + "learning_rate": 0.0003, + "loss": 15.0802, + "loss/aux_loss": 0.04817748311907053, + "loss/crossentropy": 3.095579755306244, + "loss/logits": 1.1656386017799378, + "step": 4060 + }, + { + "epoch": 0.0407, + "grad_norm": 8.6875, + "grad_norm_var": 1.2711873372395834, + "learning_rate": 0.0003, + "loss": 15.1126, + "loss/aux_loss": 0.04816674739122391, + "loss/crossentropy": 3.1798906683921815, + "loss/logits": 1.207847249507904, + "step": 4070 + }, + { + "epoch": 0.0408, + "grad_norm": 7.875, + "grad_norm_var": 1.26412353515625, + "learning_rate": 0.0003, + "loss": 15.1471, + "loss/aux_loss": 0.04817005805671215, + "loss/crossentropy": 3.125558304786682, + "loss/logits": 1.1919424772262572, + "step": 4080 + }, + { + "epoch": 0.0409, + "grad_norm": 7.5625, + "grad_norm_var": 0.2809529622395833, + "learning_rate": 0.0003, + "loss": 14.853, + "loss/aux_loss": 0.048167549446225165, + "loss/crossentropy": 3.097035455703735, + "loss/logits": 1.1527066469192504, + "step": 4090 + }, + { + "epoch": 0.041, + "grad_norm": 7.4375, + "grad_norm_var": 0.690087890625, + "learning_rate": 0.0003, + "loss": 14.8294, + "loss/aux_loss": 0.04817544762045145, + "loss/crossentropy": 2.9536795616149902, + "loss/logits": 1.1623014092445374, + "step": 4100 + }, + { + "epoch": 0.0411, + "grad_norm": 7.90625, + "grad_norm_var": 1.3970011393229167, + "learning_rate": 0.0003, + "loss": 15.0244, + "loss/aux_loss": 0.048174246400594714, + "loss/crossentropy": 3.1965074062347414, + "loss/logits": 1.162026983499527, + "step": 4110 + }, + { + "epoch": 0.0412, + "grad_norm": 8.1875, + "grad_norm_var": 0.08495686848958334, + "learning_rate": 0.0003, + "loss": 14.9105, + "loss/aux_loss": 0.048172399029135705, + "loss/crossentropy": 3.0823826670646666, + "loss/logits": 1.199626660346985, + "step": 4120 + }, + { + "epoch": 0.0413, + "grad_norm": 7.40625, + "grad_norm_var": 0.15282796223958334, + "learning_rate": 0.0003, + "loss": 15.0299, + "loss/aux_loss": 0.048171821609139444, + "loss/crossentropy": 3.1277252316474913, + "loss/logits": 1.1875766038894653, + "step": 4130 + }, + { + "epoch": 0.0414, + "grad_norm": 7.875, + "grad_norm_var": 0.18984375, + "learning_rate": 0.0003, + "loss": 14.9988, + "loss/aux_loss": 0.04816291127353907, + "loss/crossentropy": 2.999015522003174, + "loss/logits": 1.1738766431808472, + "step": 4140 + }, + { + "epoch": 0.0415, + "grad_norm": 7.59375, + "grad_norm_var": 0.2892578125, + "learning_rate": 0.0003, + "loss": 15.1225, + "loss/aux_loss": 0.04815590269863605, + "loss/crossentropy": 3.1540396094322203, + "loss/logits": 1.2201361060142517, + "step": 4150 + }, + { + "epoch": 0.0416, + "grad_norm": 8.0625, + "grad_norm_var": 0.25598551432291666, + "learning_rate": 0.0003, + "loss": 14.6316, + "loss/aux_loss": 0.04817016571760178, + "loss/crossentropy": 3.0777355790138246, + "loss/logits": 1.148938202857971, + "step": 4160 + }, + { + "epoch": 0.0417, + "grad_norm": 7.21875, + "grad_norm_var": 0.8997395833333334, + "learning_rate": 0.0003, + "loss": 14.8675, + "loss/aux_loss": 0.04817523639649153, + "loss/crossentropy": 3.154482841491699, + "loss/logits": 1.1807423561811448, + "step": 4170 + }, + { + "epoch": 0.0418, + "grad_norm": 7.875, + "grad_norm_var": 0.19582926432291667, + "learning_rate": 0.0003, + "loss": 14.7081, + "loss/aux_loss": 0.048159117065370086, + "loss/crossentropy": 3.1314535260200502, + "loss/logits": 1.1639139771461486, + "step": 4180 + }, + { + "epoch": 0.0419, + "grad_norm": 7.90625, + "grad_norm_var": 0.18684488932291668, + "learning_rate": 0.0003, + "loss": 14.9338, + "loss/aux_loss": 0.048169083148241046, + "loss/crossentropy": 3.0862425684928896, + "loss/logits": 1.1751366287469864, + "step": 4190 + }, + { + "epoch": 0.042, + "grad_norm": 7.625, + "grad_norm_var": 0.11756184895833334, + "learning_rate": 0.0003, + "loss": 14.87, + "loss/aux_loss": 0.04817427862435579, + "loss/crossentropy": 3.106311786174774, + "loss/logits": 1.1633146226406097, + "step": 4200 + }, + { + "epoch": 0.0421, + "grad_norm": 7.5625, + "grad_norm_var": 13.382124837239584, + "learning_rate": 0.0003, + "loss": 14.8649, + "loss/aux_loss": 0.0481806568801403, + "loss/crossentropy": 2.915513515472412, + "loss/logits": 1.1206313014030456, + "step": 4210 + }, + { + "epoch": 0.0422, + "grad_norm": 7.4375, + "grad_norm_var": 13.166520182291666, + "learning_rate": 0.0003, + "loss": 14.8359, + "loss/aux_loss": 0.04816134050488472, + "loss/crossentropy": 3.0834913730621336, + "loss/logits": 1.1465971380472184, + "step": 4220 + }, + { + "epoch": 0.0423, + "grad_norm": 7.34375, + "grad_norm_var": 0.16795247395833332, + "learning_rate": 0.0003, + "loss": 14.9675, + "loss/aux_loss": 0.04815997164696455, + "loss/crossentropy": 3.181843435764313, + "loss/logits": 1.1813198417425155, + "step": 4230 + }, + { + "epoch": 0.0424, + "grad_norm": 8.1875, + "grad_norm_var": 0.26461181640625, + "learning_rate": 0.0003, + "loss": 14.9965, + "loss/aux_loss": 0.048162421025335786, + "loss/crossentropy": 3.0605735301971437, + "loss/logits": 1.2021290510892868, + "step": 4240 + }, + { + "epoch": 0.0425, + "grad_norm": 9.8125, + "grad_norm_var": 0.52867431640625, + "learning_rate": 0.0003, + "loss": 14.8701, + "loss/aux_loss": 0.04816258866339922, + "loss/crossentropy": 3.0808632254600523, + "loss/logits": 1.213870882987976, + "step": 4250 + }, + { + "epoch": 0.0426, + "grad_norm": 7.3125, + "grad_norm_var": 0.35133056640625, + "learning_rate": 0.0003, + "loss": 14.9504, + "loss/aux_loss": 0.04816483333706856, + "loss/crossentropy": 3.2533419847488405, + "loss/logits": 1.2143194258213044, + "step": 4260 + }, + { + "epoch": 0.0427, + "grad_norm": 8.25, + "grad_norm_var": 0.18931884765625, + "learning_rate": 0.0003, + "loss": 14.8317, + "loss/aux_loss": 0.04816996194422245, + "loss/crossentropy": 2.921747499704361, + "loss/logits": 1.1640391945838928, + "step": 4270 + }, + { + "epoch": 0.0428, + "grad_norm": 8.3125, + "grad_norm_var": 0.29081624348958335, + "learning_rate": 0.0003, + "loss": 15.0626, + "loss/aux_loss": 0.04816504456102848, + "loss/crossentropy": 3.0316648125648498, + "loss/logits": 1.1759659737348556, + "step": 4280 + }, + { + "epoch": 0.0429, + "grad_norm": 7.125, + "grad_norm_var": 0.26145833333333335, + "learning_rate": 0.0003, + "loss": 14.7015, + "loss/aux_loss": 0.04817175418138504, + "loss/crossentropy": 3.0507488489151, + "loss/logits": 1.181325948238373, + "step": 4290 + }, + { + "epoch": 0.043, + "grad_norm": 7.78125, + "grad_norm_var": 0.21343994140625, + "learning_rate": 0.0003, + "loss": 14.8985, + "loss/aux_loss": 0.04816540405154228, + "loss/crossentropy": 3.0950448393821715, + "loss/logits": 1.2072101056575775, + "step": 4300 + }, + { + "epoch": 0.0431, + "grad_norm": 8.1875, + "grad_norm_var": 0.12737223307291667, + "learning_rate": 0.0003, + "loss": 14.9397, + "loss/aux_loss": 0.048161011561751364, + "loss/crossentropy": 2.983470690250397, + "loss/logits": 1.1625974208116532, + "step": 4310 + }, + { + "epoch": 0.0432, + "grad_norm": 8.375, + "grad_norm_var": 0.17089436848958334, + "learning_rate": 0.0003, + "loss": 14.9934, + "loss/aux_loss": 0.04816465843468905, + "loss/crossentropy": 3.100528526306152, + "loss/logits": 1.2168638974428176, + "step": 4320 + }, + { + "epoch": 0.0433, + "grad_norm": 7.09375, + "grad_norm_var": 0.15452067057291666, + "learning_rate": 0.0003, + "loss": 14.8999, + "loss/aux_loss": 0.04815626051276922, + "loss/crossentropy": 3.1541409373283384, + "loss/logits": 1.1997069358825683, + "step": 4330 + }, + { + "epoch": 0.0434, + "grad_norm": 7.9375, + "grad_norm_var": 1.5087198893229166, + "learning_rate": 0.0003, + "loss": 14.7317, + "loss/aux_loss": 0.04816477261483669, + "loss/crossentropy": 3.048540270328522, + "loss/logits": 1.2071462273597717, + "step": 4340 + }, + { + "epoch": 0.0435, + "grad_norm": 7.8125, + "grad_norm_var": 0.10623372395833333, + "learning_rate": 0.0003, + "loss": 14.713, + "loss/aux_loss": 0.04816347248852253, + "loss/crossentropy": 2.9770607709884644, + "loss/logits": 1.1550648272037507, + "step": 4350 + }, + { + "epoch": 0.0436, + "grad_norm": 8.0625, + "grad_norm_var": 0.13566080729166666, + "learning_rate": 0.0003, + "loss": 15.0412, + "loss/aux_loss": 0.04816342815756798, + "loss/crossentropy": 2.9482832670211794, + "loss/logits": 1.1551517724990845, + "step": 4360 + }, + { + "epoch": 0.0437, + "grad_norm": 8.0625, + "grad_norm_var": 0.10624593098958333, + "learning_rate": 0.0003, + "loss": 14.8035, + "loss/aux_loss": 0.04816410057246685, + "loss/crossentropy": 3.0711460292339323, + "loss/logits": 1.1584541529417038, + "step": 4370 + }, + { + "epoch": 0.0438, + "grad_norm": 7.375, + "grad_norm_var": 0.20885009765625, + "learning_rate": 0.0003, + "loss": 14.7389, + "loss/aux_loss": 0.0481667784973979, + "loss/crossentropy": 2.9650609135627746, + "loss/logits": 1.1401590436697007, + "step": 4380 + }, + { + "epoch": 0.0439, + "grad_norm": 7.75, + "grad_norm_var": 0.15338134765625, + "learning_rate": 0.0003, + "loss": 14.5523, + "loss/aux_loss": 0.04817138686776161, + "loss/crossentropy": 3.0582551836967466, + "loss/logits": 1.0985677868127823, + "step": 4390 + }, + { + "epoch": 0.044, + "grad_norm": 7.875, + "grad_norm_var": 0.12655843098958333, + "learning_rate": 0.0003, + "loss": 14.7145, + "loss/aux_loss": 0.04816395286470652, + "loss/crossentropy": 3.0119667410850526, + "loss/logits": 1.1839350372552873, + "step": 4400 + }, + { + "epoch": 0.0441, + "grad_norm": 8.5, + "grad_norm_var": 0.13058268229166667, + "learning_rate": 0.0003, + "loss": 14.7486, + "loss/aux_loss": 0.04816783182322979, + "loss/crossentropy": 2.9910679340362547, + "loss/logits": 1.1801847249269486, + "step": 4410 + }, + { + "epoch": 0.0442, + "grad_norm": 18.625, + "grad_norm_var": 7.329557291666666, + "learning_rate": 0.0003, + "loss": 14.7453, + "loss/aux_loss": 0.04816215075552464, + "loss/crossentropy": 2.981612813472748, + "loss/logits": 1.183125939965248, + "step": 4420 + }, + { + "epoch": 0.0443, + "grad_norm": 7.0625, + "grad_norm_var": 7.44742431640625, + "learning_rate": 0.0003, + "loss": 14.626, + "loss/aux_loss": 0.04817271661013365, + "loss/crossentropy": 2.9835289478302003, + "loss/logits": 1.158128410577774, + "step": 4430 + }, + { + "epoch": 0.0444, + "grad_norm": 7.90625, + "grad_norm_var": 0.14615478515625, + "learning_rate": 0.0003, + "loss": 14.5175, + "loss/aux_loss": 0.04816505704075098, + "loss/crossentropy": 3.097472053766251, + "loss/logits": 1.168840977549553, + "step": 4440 + }, + { + "epoch": 0.0445, + "grad_norm": 12.4375, + "grad_norm_var": 1.418994140625, + "learning_rate": 0.0003, + "loss": 14.561, + "loss/aux_loss": 0.04816557168960571, + "loss/crossentropy": 3.1079689621925355, + "loss/logits": 1.1443527430295943, + "step": 4450 + }, + { + "epoch": 0.0446, + "grad_norm": 8.625, + "grad_norm_var": 1.31578369140625, + "learning_rate": 0.0003, + "loss": 14.674, + "loss/aux_loss": 0.04816855322569609, + "loss/crossentropy": 2.927138316631317, + "loss/logits": 1.148129415512085, + "step": 4460 + }, + { + "epoch": 0.0447, + "grad_norm": 7.75, + "grad_norm_var": 0.18723551432291666, + "learning_rate": 0.0003, + "loss": 14.5317, + "loss/aux_loss": 0.04815917164087295, + "loss/crossentropy": 3.1104054749011993, + "loss/logits": 1.1480105966329575, + "step": 4470 + }, + { + "epoch": 0.0448, + "grad_norm": 8.25, + "grad_norm_var": 0.16847330729166668, + "learning_rate": 0.0003, + "loss": 14.4452, + "loss/aux_loss": 0.048180781118571755, + "loss/crossentropy": 2.762672412395477, + "loss/logits": 1.0904426872730255, + "step": 4480 + }, + { + "epoch": 0.0449, + "grad_norm": 7.40625, + "grad_norm_var": 0.16334635416666668, + "learning_rate": 0.0003, + "loss": 14.6466, + "loss/aux_loss": 0.0481576981022954, + "loss/crossentropy": 3.10465407371521, + "loss/logits": 1.152064311504364, + "step": 4490 + }, + { + "epoch": 0.045, + "grad_norm": 7.125, + "grad_norm_var": 0.09772135416666666, + "learning_rate": 0.0003, + "loss": 14.4935, + "loss/aux_loss": 0.048153439350426194, + "loss/crossentropy": 2.9880860924720762, + "loss/logits": 1.1234510779380797, + "step": 4500 + }, + { + "epoch": 0.0451, + "grad_norm": 8.1875, + "grad_norm_var": 0.47294514973958335, + "learning_rate": 0.0003, + "loss": 14.5779, + "loss/aux_loss": 0.048160174302756785, + "loss/crossentropy": 3.0695066869258882, + "loss/logits": 1.1693350702524186, + "step": 4510 + }, + { + "epoch": 0.0452, + "grad_norm": 8.0625, + "grad_norm_var": 0.54361572265625, + "learning_rate": 0.0003, + "loss": 14.5628, + "loss/aux_loss": 0.04815432522445917, + "loss/crossentropy": 3.1152522921562196, + "loss/logits": 1.1614423453807832, + "step": 4520 + }, + { + "epoch": 0.0453, + "grad_norm": 7.34375, + "grad_norm_var": 0.43072509765625, + "learning_rate": 0.0003, + "loss": 14.578, + "loss/aux_loss": 0.04815590996295214, + "loss/crossentropy": 3.122402215003967, + "loss/logits": 1.191055852174759, + "step": 4530 + }, + { + "epoch": 0.0454, + "grad_norm": 8.25, + "grad_norm_var": 5.175028483072917, + "learning_rate": 0.0003, + "loss": 14.7617, + "loss/aux_loss": 0.048156161420047285, + "loss/crossentropy": 3.0762326240539553, + "loss/logits": 1.1874168932437896, + "step": 4540 + }, + { + "epoch": 0.0455, + "grad_norm": 9.125, + "grad_norm_var": 1.2244099934895833, + "learning_rate": 0.0003, + "loss": 14.7042, + "loss/aux_loss": 0.04815144389867783, + "loss/crossentropy": 3.053194510936737, + "loss/logits": 1.1742142677307128, + "step": 4550 + }, + { + "epoch": 0.0456, + "grad_norm": 8.125, + "grad_norm_var": 0.3241170247395833, + "learning_rate": 0.0003, + "loss": 14.5613, + "loss/aux_loss": 0.04815953467041254, + "loss/crossentropy": 3.0894832491874693, + "loss/logits": 1.123066246509552, + "step": 4560 + }, + { + "epoch": 0.0457, + "grad_norm": 7.71875, + "grad_norm_var": 0.22732747395833333, + "learning_rate": 0.0003, + "loss": 14.6751, + "loss/aux_loss": 0.048164136707782745, + "loss/crossentropy": 3.278604805469513, + "loss/logits": 1.1765309482812882, + "step": 4570 + }, + { + "epoch": 0.0458, + "grad_norm": 7.875, + "grad_norm_var": 0.2938761393229167, + "learning_rate": 0.0003, + "loss": 14.5222, + "loss/aux_loss": 0.04816136136651039, + "loss/crossentropy": 3.122606945037842, + "loss/logits": 1.1583560228347778, + "step": 4580 + }, + { + "epoch": 0.0459, + "grad_norm": 8.25, + "grad_norm_var": 0.24804280598958334, + "learning_rate": 0.0003, + "loss": 14.666, + "loss/aux_loss": 0.048167569935321806, + "loss/crossentropy": 2.8724692463874817, + "loss/logits": 1.1007645279169083, + "step": 4590 + }, + { + "epoch": 0.046, + "grad_norm": 7.53125, + "grad_norm_var": 0.08318684895833334, + "learning_rate": 0.0003, + "loss": 14.5749, + "loss/aux_loss": 0.048159733042120935, + "loss/crossentropy": 3.096747946739197, + "loss/logits": 1.1529816329479217, + "step": 4600 + }, + { + "epoch": 0.0461, + "grad_norm": 7.6875, + "grad_norm_var": 0.08440348307291666, + "learning_rate": 0.0003, + "loss": 14.6826, + "loss/aux_loss": 0.048148723877966405, + "loss/crossentropy": 3.0729199647903442, + "loss/logits": 1.2091837465763091, + "step": 4610 + }, + { + "epoch": 0.0462, + "grad_norm": 8.125, + "grad_norm_var": 0.05917561848958333, + "learning_rate": 0.0003, + "loss": 14.5881, + "loss/aux_loss": 0.04815642535686493, + "loss/crossentropy": 3.0322453498840334, + "loss/logits": 1.1239049285650253, + "step": 4620 + }, + { + "epoch": 0.0463, + "grad_norm": 7.4375, + "grad_norm_var": 0.5194010416666667, + "learning_rate": 0.0003, + "loss": 14.5686, + "loss/aux_loss": 0.048170761205255985, + "loss/crossentropy": 3.0770667433738708, + "loss/logits": 1.1368163347244262, + "step": 4630 + }, + { + "epoch": 0.0464, + "grad_norm": 8.125, + "grad_norm_var": 0.6132649739583333, + "learning_rate": 0.0003, + "loss": 14.3923, + "loss/aux_loss": 0.048156014271080495, + "loss/crossentropy": 3.1633099794387816, + "loss/logits": 1.1499724864959717, + "step": 4640 + }, + { + "epoch": 0.0465, + "grad_norm": 7.96875, + "grad_norm_var": 1.9275349934895833, + "learning_rate": 0.0003, + "loss": 14.3876, + "loss/aux_loss": 0.048154591023921965, + "loss/crossentropy": 3.066124379634857, + "loss/logits": 1.1568672150373458, + "step": 4650 + }, + { + "epoch": 0.0466, + "grad_norm": 8.625, + "grad_norm_var": 7.602848307291667, + "learning_rate": 0.0003, + "loss": 14.6209, + "loss/aux_loss": 0.048150830902159214, + "loss/crossentropy": 2.938699722290039, + "loss/logits": 1.1516984760761262, + "step": 4660 + }, + { + "epoch": 0.0467, + "grad_norm": 7.46875, + "grad_norm_var": 6.106083170572917, + "learning_rate": 0.0003, + "loss": 14.6674, + "loss/aux_loss": 0.04815071895718574, + "loss/crossentropy": 3.0229847908020018, + "loss/logits": 1.136454886198044, + "step": 4670 + }, + { + "epoch": 0.0468, + "grad_norm": 7.5625, + "grad_norm_var": 0.10810139973958334, + "learning_rate": 0.0003, + "loss": 14.8643, + "loss/aux_loss": 0.04815872758626938, + "loss/crossentropy": 2.9929285645484924, + "loss/logits": 1.1554243832826614, + "step": 4680 + }, + { + "epoch": 0.0469, + "grad_norm": 7.875, + "grad_norm_var": 0.1236328125, + "learning_rate": 0.0003, + "loss": 14.5369, + "loss/aux_loss": 0.04814963173121214, + "loss/crossentropy": 3.0233195781707765, + "loss/logits": 1.1462786018848419, + "step": 4690 + }, + { + "epoch": 0.047, + "grad_norm": 7.96875, + "grad_norm_var": 0.14599202473958334, + "learning_rate": 0.0003, + "loss": 14.6454, + "loss/aux_loss": 0.04816053248941898, + "loss/crossentropy": 2.9162982583045958, + "loss/logits": 1.1257199048995972, + "step": 4700 + }, + { + "epoch": 0.0471, + "grad_norm": 8.875, + "grad_norm_var": 0.27884114583333336, + "learning_rate": 0.0003, + "loss": 14.4935, + "loss/aux_loss": 0.04815351460129023, + "loss/crossentropy": 3.166695535182953, + "loss/logits": 1.173658263683319, + "step": 4710 + }, + { + "epoch": 0.0472, + "grad_norm": 7.4375, + "grad_norm_var": 0.27346598307291664, + "learning_rate": 0.0003, + "loss": 14.4925, + "loss/aux_loss": 0.048155609704554084, + "loss/crossentropy": 3.00724972486496, + "loss/logits": 1.147817325592041, + "step": 4720 + }, + { + "epoch": 0.0473, + "grad_norm": 7.53125, + "grad_norm_var": 0.2710774739583333, + "learning_rate": 0.0003, + "loss": 14.3945, + "loss/aux_loss": 0.04815868772566319, + "loss/crossentropy": 3.056724321842194, + "loss/logits": 1.1427915573120118, + "step": 4730 + }, + { + "epoch": 0.0474, + "grad_norm": 7.78125, + "grad_norm_var": 0.20940348307291667, + "learning_rate": 0.0003, + "loss": 14.5173, + "loss/aux_loss": 0.048164534568786624, + "loss/crossentropy": 2.8747935056686402, + "loss/logits": 1.09987430870533, + "step": 4740 + }, + { + "epoch": 0.0475, + "grad_norm": 8.0, + "grad_norm_var": 0.23528645833333334, + "learning_rate": 0.0003, + "loss": 14.49, + "loss/aux_loss": 0.048151783645153046, + "loss/crossentropy": 2.8358932733535767, + "loss/logits": 1.0616690814495087, + "step": 4750 + }, + { + "epoch": 0.0476, + "grad_norm": 7.625, + "grad_norm_var": 0.36178385416666664, + "learning_rate": 0.0003, + "loss": 14.4691, + "loss/aux_loss": 0.0481619393453002, + "loss/crossentropy": 3.1596203804016114, + "loss/logits": 1.210114187002182, + "step": 4760 + }, + { + "epoch": 0.0477, + "grad_norm": 8.0, + "grad_norm_var": 0.15462239583333334, + "learning_rate": 0.0003, + "loss": 14.5602, + "loss/aux_loss": 0.048146472126245496, + "loss/crossentropy": 3.001967716217041, + "loss/logits": 1.096169427037239, + "step": 4770 + }, + { + "epoch": 0.0478, + "grad_norm": 7.6875, + "grad_norm_var": 0.18140869140625, + "learning_rate": 0.0003, + "loss": 14.4163, + "loss/aux_loss": 0.04816320165991783, + "loss/crossentropy": 2.876752531528473, + "loss/logits": 1.1474198400974274, + "step": 4780 + }, + { + "epoch": 0.0479, + "grad_norm": 9.0625, + "grad_norm_var": 0.24347330729166666, + "learning_rate": 0.0003, + "loss": 14.2293, + "loss/aux_loss": 0.0481548685580492, + "loss/crossentropy": 2.9930427193641664, + "loss/logits": 1.1371810525655746, + "step": 4790 + }, + { + "epoch": 0.048, + "grad_norm": 8.6875, + "grad_norm_var": 0.22493082682291668, + "learning_rate": 0.0003, + "loss": 14.6348, + "loss/aux_loss": 0.048159336857497695, + "loss/crossentropy": 3.0536611795425417, + "loss/logits": 1.1425227701663971, + "step": 4800 + }, + { + "epoch": 0.0481, + "grad_norm": 8.1875, + "grad_norm_var": 0.19347330729166667, + "learning_rate": 0.0003, + "loss": 14.3975, + "loss/aux_loss": 0.04815634544938803, + "loss/crossentropy": 2.9617689490318297, + "loss/logits": 1.1070881575345992, + "step": 4810 + }, + { + "epoch": 0.0482, + "grad_norm": 7.6875, + "grad_norm_var": 0.27467447916666665, + "learning_rate": 0.0003, + "loss": 14.404, + "loss/aux_loss": 0.04816010873764753, + "loss/crossentropy": 2.945191979408264, + "loss/logits": 1.1223448246717453, + "step": 4820 + }, + { + "epoch": 0.0483, + "grad_norm": 8.1875, + "grad_norm_var": 0.27291666666666664, + "learning_rate": 0.0003, + "loss": 14.4795, + "loss/aux_loss": 0.048158070631325246, + "loss/crossentropy": 2.93973708152771, + "loss/logits": 1.1405175089836121, + "step": 4830 + }, + { + "epoch": 0.0484, + "grad_norm": 7.8125, + "grad_norm_var": 0.269384765625, + "learning_rate": 0.0003, + "loss": 14.6711, + "loss/aux_loss": 0.048154968209564684, + "loss/crossentropy": 3.004188358783722, + "loss/logits": 1.1400604486465453, + "step": 4840 + }, + { + "epoch": 0.0485, + "grad_norm": 9.375, + "grad_norm_var": 1.53570556640625, + "learning_rate": 0.0003, + "loss": 14.5167, + "loss/aux_loss": 0.04815916530787945, + "loss/crossentropy": 2.9153899431228636, + "loss/logits": 1.1129061222076415, + "step": 4850 + }, + { + "epoch": 0.0486, + "grad_norm": 7.65625, + "grad_norm_var": 1.5292805989583333, + "learning_rate": 0.0003, + "loss": 14.3376, + "loss/aux_loss": 0.04816344752907753, + "loss/crossentropy": 3.0024606227874755, + "loss/logits": 1.1566831320524216, + "step": 4860 + }, + { + "epoch": 0.0487, + "grad_norm": 8.0, + "grad_norm_var": 0.24295247395833333, + "learning_rate": 0.0003, + "loss": 14.1824, + "loss/aux_loss": 0.0481420386582613, + "loss/crossentropy": 2.8804391503334044, + "loss/logits": 1.098368188738823, + "step": 4870 + }, + { + "epoch": 0.0488, + "grad_norm": 7.78125, + "grad_norm_var": 0.20292561848958332, + "learning_rate": 0.0003, + "loss": 14.4489, + "loss/aux_loss": 0.04815696161240339, + "loss/crossentropy": 2.9788331627845763, + "loss/logits": 1.1000428795814514, + "step": 4880 + }, + { + "epoch": 0.0489, + "grad_norm": 7.71875, + "grad_norm_var": 0.38917643229166665, + "learning_rate": 0.0003, + "loss": 14.6589, + "loss/aux_loss": 0.04814700428396464, + "loss/crossentropy": 3.02801970243454, + "loss/logits": 1.1341104060411453, + "step": 4890 + }, + { + "epoch": 0.049, + "grad_norm": 8.375, + "grad_norm_var": 0.40823160807291664, + "learning_rate": 0.0003, + "loss": 14.4838, + "loss/aux_loss": 0.048148921132087706, + "loss/crossentropy": 3.061317926645279, + "loss/logits": 1.1178199291229247, + "step": 4900 + }, + { + "epoch": 0.0491, + "grad_norm": 8.125, + "grad_norm_var": 0.31573893229166666, + "learning_rate": 0.0003, + "loss": 14.3403, + "loss/aux_loss": 0.04815245717763901, + "loss/crossentropy": 3.0220317125320433, + "loss/logits": 1.0949908673763276, + "step": 4910 + }, + { + "epoch": 0.0492, + "grad_norm": 8.4375, + "grad_norm_var": 0.13892822265625, + "learning_rate": 0.0003, + "loss": 14.4711, + "loss/aux_loss": 0.04815434459596872, + "loss/crossentropy": 3.0331790328025816, + "loss/logits": 1.0993872165679932, + "step": 4920 + }, + { + "epoch": 0.0493, + "grad_norm": 7.65625, + "grad_norm_var": 0.26901041666666664, + "learning_rate": 0.0003, + "loss": 14.2343, + "loss/aux_loss": 0.048155249655246736, + "loss/crossentropy": 2.97544447183609, + "loss/logits": 1.1062311738729478, + "step": 4930 + }, + { + "epoch": 0.0494, + "grad_norm": 14.875, + "grad_norm_var": 3.21763916015625, + "learning_rate": 0.0003, + "loss": 14.4118, + "loss/aux_loss": 0.048153795301914215, + "loss/crossentropy": 3.121039032936096, + "loss/logits": 1.1289394974708558, + "step": 4940 + }, + { + "epoch": 0.0495, + "grad_norm": 7.71875, + "grad_norm_var": 3.0484212239583335, + "learning_rate": 0.0003, + "loss": 14.3417, + "loss/aux_loss": 0.04815581478178501, + "loss/crossentropy": 3.015563631057739, + "loss/logits": 1.1119945228099823, + "step": 4950 + }, + { + "epoch": 0.0496, + "grad_norm": 13.0625, + "grad_norm_var": 3.7396484375, + "learning_rate": 0.0003, + "loss": 14.3192, + "loss/aux_loss": 0.048152280040085316, + "loss/crossentropy": 2.8887117922306063, + "loss/logits": 1.096293193101883, + "step": 4960 + }, + { + "epoch": 0.0497, + "grad_norm": 8.0, + "grad_norm_var": 3.763016764322917, + "learning_rate": 0.0003, + "loss": 14.274, + "loss/aux_loss": 0.04814924951642752, + "loss/crossentropy": 3.0876585960388185, + "loss/logits": 1.1102905184030534, + "step": 4970 + }, + { + "epoch": 0.0498, + "grad_norm": 7.5, + "grad_norm_var": 0.10857747395833334, + "learning_rate": 0.0003, + "loss": 14.2513, + "loss/aux_loss": 0.0481419550254941, + "loss/crossentropy": 3.1604169964790345, + "loss/logits": 1.135578241944313, + "step": 4980 + }, + { + "epoch": 0.0499, + "grad_norm": 7.84375, + "grad_norm_var": 0.10129801432291667, + "learning_rate": 0.0003, + "loss": 14.1859, + "loss/aux_loss": 0.0481536041945219, + "loss/crossentropy": 2.9750654339790343, + "loss/logits": 1.1200665444135667, + "step": 4990 + }, + { + "epoch": 0.05, + "grad_norm": 7.875, + "grad_norm_var": 0.1974609375, + "learning_rate": 0.0003, + "loss": 14.2076, + "loss/aux_loss": 0.048159463331103325, + "loss/crossentropy": 2.9379626870155335, + "loss/logits": 1.0767972767353058, + "step": 5000 + }, + { + "epoch": 0.0501, + "grad_norm": 8.0625, + "grad_norm_var": 0.13033854166666667, + "learning_rate": 0.0003, + "loss": 14.3938, + "loss/aux_loss": 0.04816003683954477, + "loss/crossentropy": 2.8308571100234987, + "loss/logits": 1.1243964433670044, + "step": 5010 + }, + { + "epoch": 0.0502, + "grad_norm": 8.375, + "grad_norm_var": 0.18127848307291666, + "learning_rate": 0.0003, + "loss": 14.3241, + "loss/aux_loss": 0.048151292651891706, + "loss/crossentropy": 2.9098775744438172, + "loss/logits": 1.1258880913257598, + "step": 5020 + }, + { + "epoch": 0.0503, + "grad_norm": 7.59375, + "grad_norm_var": 0.74088134765625, + "learning_rate": 0.0003, + "loss": 14.5341, + "loss/aux_loss": 0.048141808994114396, + "loss/crossentropy": 3.010620355606079, + "loss/logits": 1.1356734812259675, + "step": 5030 + }, + { + "epoch": 0.0504, + "grad_norm": 8.5, + "grad_norm_var": 0.177587890625, + "learning_rate": 0.0003, + "loss": 14.4808, + "loss/aux_loss": 0.048152133263647555, + "loss/crossentropy": 2.956807887554169, + "loss/logits": 1.0851380228996277, + "step": 5040 + }, + { + "epoch": 0.0505, + "grad_norm": 8.375, + "grad_norm_var": 0.22001546223958332, + "learning_rate": 0.0003, + "loss": 14.4873, + "loss/aux_loss": 0.04815137479454279, + "loss/crossentropy": 3.0215251445770264, + "loss/logits": 1.166229221224785, + "step": 5050 + }, + { + "epoch": 0.0506, + "grad_norm": 7.53125, + "grad_norm_var": 0.19120686848958332, + "learning_rate": 0.0003, + "loss": 14.2421, + "loss/aux_loss": 0.04814422242343426, + "loss/crossentropy": 3.028605377674103, + "loss/logits": 1.1260013222694396, + "step": 5060 + }, + { + "epoch": 0.0507, + "grad_norm": 7.4375, + "grad_norm_var": 0.13407796223958332, + "learning_rate": 0.0003, + "loss": 14.4855, + "loss/aux_loss": 0.04815457910299301, + "loss/crossentropy": 3.0295214653015137, + "loss/logits": 1.1266607105731965, + "step": 5070 + }, + { + "epoch": 0.0508, + "grad_norm": 8.125, + "grad_norm_var": 16.2919921875, + "learning_rate": 0.0003, + "loss": 14.278, + "loss/aux_loss": 0.048153937235474585, + "loss/crossentropy": 3.0743547797203066, + "loss/logits": 1.1398055493831634, + "step": 5080 + }, + { + "epoch": 0.0509, + "grad_norm": 8.3125, + "grad_norm_var": 0.3312459309895833, + "learning_rate": 0.0003, + "loss": 14.3122, + "loss/aux_loss": 0.048145625926554206, + "loss/crossentropy": 2.9891109347343443, + "loss/logits": 1.144765716791153, + "step": 5090 + }, + { + "epoch": 0.051, + "grad_norm": 8.375, + "grad_norm_var": 0.25514322916666665, + "learning_rate": 0.0003, + "loss": 14.0619, + "loss/aux_loss": 0.048147767595946786, + "loss/crossentropy": 2.923846483230591, + "loss/logits": 1.104039838910103, + "step": 5100 + }, + { + "epoch": 0.0511, + "grad_norm": 8.4375, + "grad_norm_var": 0.18834228515625, + "learning_rate": 0.0003, + "loss": 14.3523, + "loss/aux_loss": 0.04815386533737183, + "loss/crossentropy": 2.9420456171035765, + "loss/logits": 1.110900694131851, + "step": 5110 + }, + { + "epoch": 0.0512, + "grad_norm": 8.4375, + "grad_norm_var": 0.348291015625, + "learning_rate": 0.0003, + "loss": 14.21, + "loss/aux_loss": 0.04815777577459812, + "loss/crossentropy": 2.9808182954788207, + "loss/logits": 1.1418810188770294, + "step": 5120 + }, + { + "epoch": 0.0513, + "grad_norm": 8.1875, + "grad_norm_var": 0.24459635416666667, + "learning_rate": 0.0003, + "loss": 14.1038, + "loss/aux_loss": 0.048149819299578664, + "loss/crossentropy": 2.9293219327926634, + "loss/logits": 1.151758760213852, + "step": 5130 + }, + { + "epoch": 0.0514, + "grad_norm": 7.78125, + "grad_norm_var": 0.5398274739583333, + "learning_rate": 0.0003, + "loss": 14.0253, + "loss/aux_loss": 0.04813775867223739, + "loss/crossentropy": 3.086165702342987, + "loss/logits": 1.1042977631092072, + "step": 5140 + }, + { + "epoch": 0.0515, + "grad_norm": 7.96875, + "grad_norm_var": 0.27209879557291666, + "learning_rate": 0.0003, + "loss": 14.2102, + "loss/aux_loss": 0.04814709778875113, + "loss/crossentropy": 3.16923828125, + "loss/logits": 1.1310043185949326, + "step": 5150 + }, + { + "epoch": 0.0516, + "grad_norm": 7.53125, + "grad_norm_var": 0.10705973307291666, + "learning_rate": 0.0003, + "loss": 14.2875, + "loss/aux_loss": 0.04815742298960686, + "loss/crossentropy": 2.9624265909194945, + "loss/logits": 1.1060597985982894, + "step": 5160 + }, + { + "epoch": 0.0517, + "grad_norm": 8.375, + "grad_norm_var": 0.072509765625, + "learning_rate": 0.0003, + "loss": 14.4538, + "loss/aux_loss": 0.04815982095897198, + "loss/crossentropy": 2.942893236875534, + "loss/logits": 1.113595375418663, + "step": 5170 + }, + { + "epoch": 0.0518, + "grad_norm": 8.125, + "grad_norm_var": 0.13105061848958333, + "learning_rate": 0.0003, + "loss": 14.0981, + "loss/aux_loss": 0.04815513715147972, + "loss/crossentropy": 3.05008624792099, + "loss/logits": 1.1298416316509248, + "step": 5180 + }, + { + "epoch": 0.0519, + "grad_norm": 7.59375, + "grad_norm_var": 0.17190348307291667, + "learning_rate": 0.0003, + "loss": 14.2019, + "loss/aux_loss": 0.04814068842679262, + "loss/crossentropy": 2.9983504891395567, + "loss/logits": 1.1261755168437957, + "step": 5190 + }, + { + "epoch": 0.052, + "grad_norm": 8.25, + "grad_norm_var": 0.10623372395833333, + "learning_rate": 0.0003, + "loss": 14.0924, + "loss/aux_loss": 0.04815409407019615, + "loss/crossentropy": 2.815429699420929, + "loss/logits": 1.1029013335704803, + "step": 5200 + }, + { + "epoch": 0.0521, + "grad_norm": 7.96875, + "grad_norm_var": 0.075634765625, + "learning_rate": 0.0003, + "loss": 14.0729, + "loss/aux_loss": 0.04814719296991825, + "loss/crossentropy": 3.0798101305961607, + "loss/logits": 1.1071963399648665, + "step": 5210 + }, + { + "epoch": 0.0522, + "grad_norm": 7.5, + "grad_norm_var": 0.10396728515625, + "learning_rate": 0.0003, + "loss": 14.2266, + "loss/aux_loss": 0.04813891816884279, + "loss/crossentropy": 3.0311917304992675, + "loss/logits": 1.1385094463825225, + "step": 5220 + }, + { + "epoch": 0.0523, + "grad_norm": 7.59375, + "grad_norm_var": 0.09295247395833334, + "learning_rate": 0.0003, + "loss": 14.0168, + "loss/aux_loss": 0.04814713895320892, + "loss/crossentropy": 2.8070708096027372, + "loss/logits": 1.037602314352989, + "step": 5230 + }, + { + "epoch": 0.0524, + "grad_norm": 8.5625, + "grad_norm_var": 0.23748372395833334, + "learning_rate": 0.0003, + "loss": 14.206, + "loss/aux_loss": 0.048150969482958314, + "loss/crossentropy": 2.9220390915870667, + "loss/logits": 1.1021725416183472, + "step": 5240 + }, + { + "epoch": 0.0525, + "grad_norm": 7.96875, + "grad_norm_var": 0.3568644205729167, + "learning_rate": 0.0003, + "loss": 14.2243, + "loss/aux_loss": 0.048154151812195775, + "loss/crossentropy": 3.0725671291351317, + "loss/logits": 1.1407492518424989, + "step": 5250 + }, + { + "epoch": 0.0526, + "grad_norm": 8.25, + "grad_norm_var": 1.03599853515625, + "learning_rate": 0.0003, + "loss": 14.2113, + "loss/aux_loss": 0.04815595541149378, + "loss/crossentropy": 3.073868250846863, + "loss/logits": 1.086431348323822, + "step": 5260 + }, + { + "epoch": 0.0527, + "grad_norm": 8.6875, + "grad_norm_var": 0.8241495768229167, + "learning_rate": 0.0003, + "loss": 14.2052, + "loss/aux_loss": 0.04815474133938551, + "loss/crossentropy": 2.942746305465698, + "loss/logits": 1.085268846154213, + "step": 5270 + }, + { + "epoch": 0.0528, + "grad_norm": 8.125, + "grad_norm_var": 0.457421875, + "learning_rate": 0.0003, + "loss": 14.1014, + "loss/aux_loss": 0.0481459453701973, + "loss/crossentropy": 2.9510623097419737, + "loss/logits": 1.086976206302643, + "step": 5280 + }, + { + "epoch": 0.0529, + "grad_norm": 9.8125, + "grad_norm_var": 0.6368448893229167, + "learning_rate": 0.0003, + "loss": 14.0806, + "loss/aux_loss": 0.04814429916441441, + "loss/crossentropy": 2.91622234582901, + "loss/logits": 1.1365332275629043, + "step": 5290 + }, + { + "epoch": 0.053, + "grad_norm": 8.125, + "grad_norm_var": 0.32281494140625, + "learning_rate": 0.0003, + "loss": 14.1434, + "loss/aux_loss": 0.04813830778002739, + "loss/crossentropy": 2.9429489850997923, + "loss/logits": 1.1115789502859115, + "step": 5300 + }, + { + "epoch": 0.0531, + "grad_norm": 7.46875, + "grad_norm_var": 0.2721638997395833, + "learning_rate": 0.0003, + "loss": 14.1278, + "loss/aux_loss": 0.04815119802951813, + "loss/crossentropy": 3.0424102902412415, + "loss/logits": 1.137840673327446, + "step": 5310 + }, + { + "epoch": 0.0532, + "grad_norm": 8.1875, + "grad_norm_var": 0.20178629557291666, + "learning_rate": 0.0003, + "loss": 14.2472, + "loss/aux_loss": 0.048156299628317356, + "loss/crossentropy": 2.9693280339241026, + "loss/logits": 1.1287171095609665, + "step": 5320 + }, + { + "epoch": 0.0533, + "grad_norm": 8.25, + "grad_norm_var": 0.07245686848958334, + "learning_rate": 0.0003, + "loss": 14.3863, + "loss/aux_loss": 0.04813747089356184, + "loss/crossentropy": 3.1067948579788207, + "loss/logits": 1.1718181252479554, + "step": 5330 + }, + { + "epoch": 0.0534, + "grad_norm": 10.875, + "grad_norm_var": 0.6001139322916667, + "learning_rate": 0.0003, + "loss": 14.1949, + "loss/aux_loss": 0.0481356767937541, + "loss/crossentropy": 2.9768314242362974, + "loss/logits": 1.1081100910902024, + "step": 5340 + }, + { + "epoch": 0.0535, + "grad_norm": 8.0625, + "grad_norm_var": 0.6735677083333333, + "learning_rate": 0.0003, + "loss": 14.1155, + "loss/aux_loss": 0.04814883153885603, + "loss/crossentropy": 3.0978642463684083, + "loss/logits": 1.112101286649704, + "step": 5350 + }, + { + "epoch": 0.0536, + "grad_norm": 8.375, + "grad_norm_var": 0.17733968098958333, + "learning_rate": 0.0003, + "loss": 14.2689, + "loss/aux_loss": 0.048155237548053266, + "loss/crossentropy": 2.9267095983028413, + "loss/logits": 1.1321902126073837, + "step": 5360 + }, + { + "epoch": 0.0537, + "grad_norm": 8.1875, + "grad_norm_var": 0.123828125, + "learning_rate": 0.0003, + "loss": 14.0068, + "loss/aux_loss": 0.048150830715894696, + "loss/crossentropy": 3.0328433394432066, + "loss/logits": 1.0583814442157746, + "step": 5370 + }, + { + "epoch": 0.0538, + "grad_norm": 8.4375, + "grad_norm_var": 0.160791015625, + "learning_rate": 0.0003, + "loss": 14.2637, + "loss/aux_loss": 0.04814900886267424, + "loss/crossentropy": 2.8612841725349427, + "loss/logits": 1.0983431458473205, + "step": 5380 + }, + { + "epoch": 0.0539, + "grad_norm": 7.75, + "grad_norm_var": 0.16663004557291666, + "learning_rate": 0.0003, + "loss": 14.0972, + "loss/aux_loss": 0.04815038740634918, + "loss/crossentropy": 2.872392749786377, + "loss/logits": 1.062236163020134, + "step": 5390 + }, + { + "epoch": 0.054, + "grad_norm": 8.5, + "grad_norm_var": 0.20662434895833334, + "learning_rate": 0.0003, + "loss": 14.0276, + "loss/aux_loss": 0.048151925951242444, + "loss/crossentropy": 2.777138501405716, + "loss/logits": 1.043939945101738, + "step": 5400 + }, + { + "epoch": 0.0541, + "grad_norm": 9.125, + "grad_norm_var": 0.20220947265625, + "learning_rate": 0.0003, + "loss": 14.1898, + "loss/aux_loss": 0.04814861789345741, + "loss/crossentropy": 2.9948280215263368, + "loss/logits": 1.0816247820854188, + "step": 5410 + }, + { + "epoch": 0.0542, + "grad_norm": 8.125, + "grad_norm_var": 0.16243082682291668, + "learning_rate": 0.0003, + "loss": 14.1748, + "loss/aux_loss": 0.04814356118440628, + "loss/crossentropy": 2.984057831764221, + "loss/logits": 1.1022383213043212, + "step": 5420 + }, + { + "epoch": 0.0543, + "grad_norm": 8.375, + "grad_norm_var": 0.21638997395833334, + "learning_rate": 0.0003, + "loss": 14.1883, + "loss/aux_loss": 0.04814845807850361, + "loss/crossentropy": 2.9097337126731873, + "loss/logits": 1.0658887088298798, + "step": 5430 + }, + { + "epoch": 0.0544, + "grad_norm": 46.75, + "grad_norm_var": 91.66760660807292, + "learning_rate": 0.0003, + "loss": 14.2163, + "loss/aux_loss": 0.04814211465418339, + "loss/crossentropy": 3.150989270210266, + "loss/logits": 1.1435310065746307, + "step": 5440 + }, + { + "epoch": 0.0545, + "grad_norm": 7.90625, + "grad_norm_var": 91.24908854166667, + "learning_rate": 0.0003, + "loss": 14.2637, + "loss/aux_loss": 0.04814708679914474, + "loss/crossentropy": 3.065591824054718, + "loss/logits": 1.1185233294963837, + "step": 5450 + }, + { + "epoch": 0.0546, + "grad_norm": 8.75, + "grad_norm_var": 0.31951497395833334, + "learning_rate": 0.0003, + "loss": 14.3104, + "loss/aux_loss": 0.04814552329480648, + "loss/crossentropy": 3.0562121748924254, + "loss/logits": 1.1377945810556411, + "step": 5460 + }, + { + "epoch": 0.0547, + "grad_norm": 7.78125, + "grad_norm_var": 0.33352457682291664, + "learning_rate": 0.0003, + "loss": 14.0194, + "loss/aux_loss": 0.048139683343470095, + "loss/crossentropy": 3.186306917667389, + "loss/logits": 1.0923507630825042, + "step": 5470 + }, + { + "epoch": 0.0548, + "grad_norm": 8.5, + "grad_norm_var": 0.11419270833333334, + "learning_rate": 0.0003, + "loss": 14.0885, + "loss/aux_loss": 0.0481383940204978, + "loss/crossentropy": 3.0000529527664184, + "loss/logits": 1.0960578143596649, + "step": 5480 + }, + { + "epoch": 0.0549, + "grad_norm": 8.4375, + "grad_norm_var": 0.14837239583333334, + "learning_rate": 0.0003, + "loss": 14.1311, + "loss/aux_loss": 0.048138375580310824, + "loss/crossentropy": 3.0034351110458375, + "loss/logits": 1.079491952061653, + "step": 5490 + }, + { + "epoch": 0.055, + "grad_norm": 8.3125, + "grad_norm_var": 0.12092692057291667, + "learning_rate": 0.0003, + "loss": 14.1602, + "loss/aux_loss": 0.04813902676105499, + "loss/crossentropy": 3.0370962262153625, + "loss/logits": 1.071971568465233, + "step": 5500 + }, + { + "epoch": 0.0551, + "grad_norm": 9.125, + "grad_norm_var": 0.159619140625, + "learning_rate": 0.0003, + "loss": 14.1168, + "loss/aux_loss": 0.04815224166959524, + "loss/crossentropy": 2.94165198802948, + "loss/logits": 1.09517442882061, + "step": 5510 + }, + { + "epoch": 0.0552, + "grad_norm": 8.75, + "grad_norm_var": 0.38424479166666664, + "learning_rate": 0.0003, + "loss": 14.1283, + "loss/aux_loss": 0.048148746229708196, + "loss/crossentropy": 2.889024776220322, + "loss/logits": 1.0823973000049592, + "step": 5520 + }, + { + "epoch": 0.0553, + "grad_norm": 7.90625, + "grad_norm_var": 0.37433268229166666, + "learning_rate": 0.0003, + "loss": 14.2453, + "loss/aux_loss": 0.04814521931111813, + "loss/crossentropy": 2.9829455733299257, + "loss/logits": 1.1254934877157212, + "step": 5530 + }, + { + "epoch": 0.0554, + "grad_norm": 8.1875, + "grad_norm_var": 0.23541259765625, + "learning_rate": 0.0003, + "loss": 14.1698, + "loss/aux_loss": 0.04814098011702299, + "loss/crossentropy": 2.950876700878143, + "loss/logits": 1.1378295987844467, + "step": 5540 + }, + { + "epoch": 0.0555, + "grad_norm": 7.90625, + "grad_norm_var": 0.186572265625, + "learning_rate": 0.0003, + "loss": 14.1457, + "loss/aux_loss": 0.04814466387033463, + "loss/crossentropy": 2.9882196366786955, + "loss/logits": 1.0835947006940843, + "step": 5550 + }, + { + "epoch": 0.0556, + "grad_norm": 8.125, + "grad_norm_var": 0.25572509765625, + "learning_rate": 0.0003, + "loss": 14.0392, + "loss/aux_loss": 0.04815062917768955, + "loss/crossentropy": 2.829884684085846, + "loss/logits": 1.0776374101638795, + "step": 5560 + }, + { + "epoch": 0.0557, + "grad_norm": 20.25, + "grad_norm_var": 8.858072916666666, + "learning_rate": 0.0003, + "loss": 14.0187, + "loss/aux_loss": 0.04814136177301407, + "loss/crossentropy": 3.0365766048431397, + "loss/logits": 1.1214863985776902, + "step": 5570 + }, + { + "epoch": 0.0558, + "grad_norm": 8.875, + "grad_norm_var": 71.89733072916667, + "learning_rate": 0.0003, + "loss": 14.0507, + "loss/aux_loss": 0.048154527135193345, + "loss/crossentropy": 2.925475996732712, + "loss/logits": 1.0984899312257768, + "step": 5580 + }, + { + "epoch": 0.0559, + "grad_norm": 9.1875, + "grad_norm_var": 68.73677978515624, + "learning_rate": 0.0003, + "loss": 14.0769, + "loss/aux_loss": 0.048137818835675715, + "loss/crossentropy": 3.0427648425102234, + "loss/logits": 1.1238386183977127, + "step": 5590 + }, + { + "epoch": 0.056, + "grad_norm": 8.625, + "grad_norm_var": 0.35065104166666666, + "learning_rate": 0.0003, + "loss": 14.018, + "loss/aux_loss": 0.04814153481274843, + "loss/crossentropy": 2.9993926525115966, + "loss/logits": 1.0859254390001296, + "step": 5600 + }, + { + "epoch": 0.0561, + "grad_norm": 7.84375, + "grad_norm_var": 0.24605712890625, + "learning_rate": 0.0003, + "loss": 14.0132, + "loss/aux_loss": 0.04814342502504587, + "loss/crossentropy": 3.1588930010795595, + "loss/logits": 1.1444143801927567, + "step": 5610 + }, + { + "epoch": 0.0562, + "grad_norm": 8.3125, + "grad_norm_var": 0.13964436848958334, + "learning_rate": 0.0003, + "loss": 13.9644, + "loss/aux_loss": 0.04813936911523342, + "loss/crossentropy": 2.9891305387020113, + "loss/logits": 1.08986476957798, + "step": 5620 + }, + { + "epoch": 0.0563, + "grad_norm": 7.71875, + "grad_norm_var": 0.15133056640625, + "learning_rate": 0.0003, + "loss": 14.0305, + "loss/aux_loss": 0.04814296532422304, + "loss/crossentropy": 3.094134247303009, + "loss/logits": 1.1330428838729858, + "step": 5630 + }, + { + "epoch": 0.0564, + "grad_norm": 8.0, + "grad_norm_var": 0.22073160807291667, + "learning_rate": 0.0003, + "loss": 14.0265, + "loss/aux_loss": 0.04816127121448517, + "loss/crossentropy": 2.90863493680954, + "loss/logits": 1.0908836662769317, + "step": 5640 + }, + { + "epoch": 0.0565, + "grad_norm": 8.75, + "grad_norm_var": 0.20128580729166667, + "learning_rate": 0.0003, + "loss": 14.1691, + "loss/aux_loss": 0.048136289790272714, + "loss/crossentropy": 3.045944094657898, + "loss/logits": 1.1683479130268097, + "step": 5650 + }, + { + "epoch": 0.0566, + "grad_norm": 7.59375, + "grad_norm_var": 0.221484375, + "learning_rate": 0.0003, + "loss": 13.8956, + "loss/aux_loss": 0.0481420211493969, + "loss/crossentropy": 2.9611165285110475, + "loss/logits": 1.1002487033605575, + "step": 5660 + }, + { + "epoch": 0.0567, + "grad_norm": 7.78125, + "grad_norm_var": 0.1322265625, + "learning_rate": 0.0003, + "loss": 13.9722, + "loss/aux_loss": 0.04813913106918335, + "loss/crossentropy": 2.832181286811829, + "loss/logits": 1.085351037979126, + "step": 5670 + }, + { + "epoch": 0.0568, + "grad_norm": 8.4375, + "grad_norm_var": 0.09976806640625, + "learning_rate": 0.0003, + "loss": 13.9409, + "loss/aux_loss": 0.04813998658210039, + "loss/crossentropy": 3.0723737359046934, + "loss/logits": 1.1221662908792496, + "step": 5680 + }, + { + "epoch": 0.0569, + "grad_norm": 8.75, + "grad_norm_var": 0.18485921223958332, + "learning_rate": 0.0003, + "loss": 13.9103, + "loss/aux_loss": 0.048149769008159635, + "loss/crossentropy": 2.8909295797348022, + "loss/logits": 1.0669385582208633, + "step": 5690 + }, + { + "epoch": 0.057, + "grad_norm": 8.1875, + "grad_norm_var": 0.164306640625, + "learning_rate": 0.0003, + "loss": 14.2382, + "loss/aux_loss": 0.04814224392175674, + "loss/crossentropy": 3.028742825984955, + "loss/logits": 1.1198367089033128, + "step": 5700 + }, + { + "epoch": 0.0571, + "grad_norm": 7.9375, + "grad_norm_var": 0.16002197265625, + "learning_rate": 0.0003, + "loss": 13.9939, + "loss/aux_loss": 0.04813466928899288, + "loss/crossentropy": 3.091606914997101, + "loss/logits": 1.111482274532318, + "step": 5710 + }, + { + "epoch": 0.0572, + "grad_norm": 8.1875, + "grad_norm_var": 0.13919270833333333, + "learning_rate": 0.0003, + "loss": 13.8898, + "loss/aux_loss": 0.04814035035669804, + "loss/crossentropy": 2.9719881653785705, + "loss/logits": 1.1058259099721908, + "step": 5720 + }, + { + "epoch": 0.0573, + "grad_norm": 8.0625, + "grad_norm_var": 20.40621337890625, + "learning_rate": 0.0003, + "loss": 14.125, + "loss/aux_loss": 0.048147077485918996, + "loss/crossentropy": 2.940553843975067, + "loss/logits": 1.0541133284568787, + "step": 5730 + }, + { + "epoch": 0.0574, + "grad_norm": 9.125, + "grad_norm_var": 0.126416015625, + "learning_rate": 0.0003, + "loss": 14.0709, + "loss/aux_loss": 0.04814092367887497, + "loss/crossentropy": 2.9801509261131285, + "loss/logits": 1.1179528176784514, + "step": 5740 + }, + { + "epoch": 0.0575, + "grad_norm": 8.0625, + "grad_norm_var": 0.21633707682291667, + "learning_rate": 0.0003, + "loss": 13.8988, + "loss/aux_loss": 0.048131171986460684, + "loss/crossentropy": 3.1363558411598205, + "loss/logits": 1.1174342811107636, + "step": 5750 + }, + { + "epoch": 0.0576, + "grad_norm": 8.1875, + "grad_norm_var": 0.42724202473958334, + "learning_rate": 0.0003, + "loss": 13.8693, + "loss/aux_loss": 0.048153743520379065, + "loss/crossentropy": 3.0531252682209016, + "loss/logits": 1.0992789357900619, + "step": 5760 + }, + { + "epoch": 0.0577, + "grad_norm": 7.96875, + "grad_norm_var": 0.39302978515625, + "learning_rate": 0.0003, + "loss": 13.758, + "loss/aux_loss": 0.048135829716920854, + "loss/crossentropy": 3.001027262210846, + "loss/logits": 1.0745349794626236, + "step": 5770 + }, + { + "epoch": 0.0578, + "grad_norm": 8.6875, + "grad_norm_var": 0.33866780598958335, + "learning_rate": 0.0003, + "loss": 13.7585, + "loss/aux_loss": 0.04814055394381285, + "loss/crossentropy": 2.9369577765464783, + "loss/logits": 1.0799493759870529, + "step": 5780 + }, + { + "epoch": 0.0579, + "grad_norm": 8.1875, + "grad_norm_var": 0.6395833333333333, + "learning_rate": 0.0003, + "loss": 14.1834, + "loss/aux_loss": 0.04815905783325434, + "loss/crossentropy": 3.069815826416016, + "loss/logits": 1.1343096286058425, + "step": 5790 + }, + { + "epoch": 0.058, + "grad_norm": 8.1875, + "grad_norm_var": 0.27545572916666666, + "learning_rate": 0.0003, + "loss": 13.9437, + "loss/aux_loss": 0.04814049322158098, + "loss/crossentropy": 2.8448895037174227, + "loss/logits": 1.0605036556720733, + "step": 5800 + }, + { + "epoch": 0.0581, + "grad_norm": 8.625, + "grad_norm_var": 0.33121337890625, + "learning_rate": 0.0003, + "loss": 13.9217, + "loss/aux_loss": 0.048141079396009444, + "loss/crossentropy": 2.959049415588379, + "loss/logits": 1.1173090249300004, + "step": 5810 + }, + { + "epoch": 0.0582, + "grad_norm": 8.5625, + "grad_norm_var": 0.582666015625, + "learning_rate": 0.0003, + "loss": 13.9177, + "loss/aux_loss": 0.04814137741923332, + "loss/crossentropy": 2.9781831741333007, + "loss/logits": 1.088547134399414, + "step": 5820 + }, + { + "epoch": 0.0583, + "grad_norm": 8.5, + "grad_norm_var": 0.52457275390625, + "learning_rate": 0.0003, + "loss": 13.8936, + "loss/aux_loss": 0.04813809935003519, + "loss/crossentropy": 2.920023334026337, + "loss/logits": 1.0897945940494538, + "step": 5830 + }, + { + "epoch": 0.0584, + "grad_norm": 8.5, + "grad_norm_var": 0.13635660807291666, + "learning_rate": 0.0003, + "loss": 13.929, + "loss/aux_loss": 0.04814415480941534, + "loss/crossentropy": 2.9571971893310547, + "loss/logits": 1.0757667511701583, + "step": 5840 + }, + { + "epoch": 0.0585, + "grad_norm": 8.125, + "grad_norm_var": 0.13433837890625, + "learning_rate": 0.0003, + "loss": 13.9993, + "loss/aux_loss": 0.04814188275486231, + "loss/crossentropy": 2.9641053080558777, + "loss/logits": 1.0629219651222228, + "step": 5850 + }, + { + "epoch": 0.0586, + "grad_norm": 8.4375, + "grad_norm_var": 0.4556640625, + "learning_rate": 0.0003, + "loss": 13.9047, + "loss/aux_loss": 0.04814865179359913, + "loss/crossentropy": 2.838666582107544, + "loss/logits": 1.08486467897892, + "step": 5860 + }, + { + "epoch": 0.0587, + "grad_norm": 8.25, + "grad_norm_var": 27.4416015625, + "learning_rate": 0.0003, + "loss": 13.7725, + "loss/aux_loss": 0.048147336766123774, + "loss/crossentropy": 2.966917932033539, + "loss/logits": 1.0681630432605744, + "step": 5870 + }, + { + "epoch": 0.0588, + "grad_norm": 8.3125, + "grad_norm_var": 18.5572265625, + "learning_rate": 0.0003, + "loss": 13.805, + "loss/aux_loss": 0.04814778696745634, + "loss/crossentropy": 2.870664370059967, + "loss/logits": 1.0493683815002441, + "step": 5880 + }, + { + "epoch": 0.0589, + "grad_norm": 8.5, + "grad_norm_var": 0.5998331705729166, + "learning_rate": 0.0003, + "loss": 13.8099, + "loss/aux_loss": 0.04814050365239382, + "loss/crossentropy": 2.922038221359253, + "loss/logits": 1.0779344737529755, + "step": 5890 + }, + { + "epoch": 0.059, + "grad_norm": 9.375, + "grad_norm_var": 0.202978515625, + "learning_rate": 0.0003, + "loss": 13.9627, + "loss/aux_loss": 0.04813796691596508, + "loss/crossentropy": 3.1271554470062255, + "loss/logits": 1.1131070137023926, + "step": 5900 + }, + { + "epoch": 0.0591, + "grad_norm": 8.625, + "grad_norm_var": 0.20284830729166667, + "learning_rate": 0.0003, + "loss": 13.7172, + "loss/aux_loss": 0.048142065107822415, + "loss/crossentropy": 2.9341515243053435, + "loss/logits": 1.0991775900125504, + "step": 5910 + }, + { + "epoch": 0.0592, + "grad_norm": 8.375, + "grad_norm_var": 0.32784830729166664, + "learning_rate": 0.0003, + "loss": 13.9532, + "loss/aux_loss": 0.04815160110592842, + "loss/crossentropy": 2.963713300228119, + "loss/logits": 1.0947488635778426, + "step": 5920 + }, + { + "epoch": 0.0593, + "grad_norm": 8.1875, + "grad_norm_var": 0.3666015625, + "learning_rate": 0.0003, + "loss": 13.8793, + "loss/aux_loss": 0.048143844306468966, + "loss/crossentropy": 2.8118023216724395, + "loss/logits": 1.0895264118909835, + "step": 5930 + }, + { + "epoch": 0.0594, + "grad_norm": 8.375, + "grad_norm_var": 0.08274739583333333, + "learning_rate": 0.0003, + "loss": 13.7795, + "loss/aux_loss": 0.04813539497554302, + "loss/crossentropy": 2.8907833218574526, + "loss/logits": 1.0538031846284865, + "step": 5940 + }, + { + "epoch": 0.0595, + "grad_norm": 8.5625, + "grad_norm_var": 0.24547119140625, + "learning_rate": 0.0003, + "loss": 14.0992, + "loss/aux_loss": 0.048147369921207425, + "loss/crossentropy": 2.9670627653598785, + "loss/logits": 1.1421405851840973, + "step": 5950 + }, + { + "epoch": 0.0596, + "grad_norm": 9.375, + "grad_norm_var": 0.67174072265625, + "learning_rate": 0.0003, + "loss": 13.8134, + "loss/aux_loss": 0.04814773909747601, + "loss/crossentropy": 2.9220254778862, + "loss/logits": 1.0881559133529664, + "step": 5960 + }, + { + "epoch": 0.0597, + "grad_norm": 8.8125, + "grad_norm_var": 68.9384765625, + "learning_rate": 0.0003, + "loss": 14.0155, + "loss/aux_loss": 0.048162427730858326, + "loss/crossentropy": 2.977382260560989, + "loss/logits": 1.0755089968442917, + "step": 5970 + }, + { + "epoch": 0.0598, + "grad_norm": 8.0625, + "grad_norm_var": 68.26399739583333, + "learning_rate": 0.0003, + "loss": 14.0255, + "loss/aux_loss": 0.048142471350729465, + "loss/crossentropy": 2.8615992307662963, + "loss/logits": 1.0675591200590133, + "step": 5980 + }, + { + "epoch": 0.0599, + "grad_norm": 8.6875, + "grad_norm_var": 0.53717041015625, + "learning_rate": 0.0003, + "loss": 13.6566, + "loss/aux_loss": 0.048138993233442305, + "loss/crossentropy": 3.0715150594711305, + "loss/logits": 1.0647211134433747, + "step": 5990 + }, + { + "epoch": 0.06, + "grad_norm": 8.5, + "grad_norm_var": 0.13583577473958333, + "learning_rate": 0.0003, + "loss": 13.8894, + "loss/aux_loss": 0.04813880603760481, + "loss/crossentropy": 2.99127779006958, + "loss/logits": 1.0782989412546158, + "step": 6000 + }, + { + "epoch": 0.0601, + "grad_norm": 8.375, + "grad_norm_var": 0.13151041666666666, + "learning_rate": 0.0003, + "loss": 13.9352, + "loss/aux_loss": 0.04813908338546753, + "loss/crossentropy": 2.9843607366085054, + "loss/logits": 1.0743216931819917, + "step": 6010 + }, + { + "epoch": 0.0602, + "grad_norm": 8.375, + "grad_norm_var": 1.2416015625, + "learning_rate": 0.0003, + "loss": 13.9324, + "loss/aux_loss": 0.04814204126596451, + "loss/crossentropy": 3.001486176252365, + "loss/logits": 1.0827761620283127, + "step": 6020 + }, + { + "epoch": 0.0603, + "grad_norm": 9.0625, + "grad_norm_var": 0.3578125, + "learning_rate": 0.0003, + "loss": 13.5495, + "loss/aux_loss": 0.048134736530482766, + "loss/crossentropy": 2.8943534910678865, + "loss/logits": 1.089774450659752, + "step": 6030 + }, + { + "epoch": 0.0604, + "grad_norm": 8.25, + "grad_norm_var": 0.21990559895833334, + "learning_rate": 0.0003, + "loss": 13.91, + "loss/aux_loss": 0.0481480710208416, + "loss/crossentropy": 3.125103998184204, + "loss/logits": 1.1064673095941544, + "step": 6040 + }, + { + "epoch": 0.0605, + "grad_norm": 9.5625, + "grad_norm_var": 0.5637980143229167, + "learning_rate": 0.0003, + "loss": 13.7693, + "loss/aux_loss": 0.04813214130699635, + "loss/crossentropy": 2.9569589614868166, + "loss/logits": 1.0811177968978882, + "step": 6050 + }, + { + "epoch": 0.0606, + "grad_norm": 9.0, + "grad_norm_var": 0.32428385416666666, + "learning_rate": 0.0003, + "loss": 13.737, + "loss/aux_loss": 0.048133809491991995, + "loss/crossentropy": 2.873425018787384, + "loss/logits": 1.066912430524826, + "step": 6060 + }, + { + "epoch": 0.0607, + "grad_norm": 8.8125, + "grad_norm_var": 0.12480061848958333, + "learning_rate": 0.0003, + "loss": 13.7512, + "loss/aux_loss": 0.048146062158048154, + "loss/crossentropy": 2.874787837266922, + "loss/logits": 0.99142906665802, + "step": 6070 + }, + { + "epoch": 0.0608, + "grad_norm": 9.5625, + "grad_norm_var": 0.5244140625, + "learning_rate": 0.0003, + "loss": 13.7192, + "loss/aux_loss": 0.04813813380897045, + "loss/crossentropy": 2.8974472165107725, + "loss/logits": 1.0416524529457092, + "step": 6080 + }, + { + "epoch": 0.0609, + "grad_norm": 11.25, + "grad_norm_var": 0.8244140625, + "learning_rate": 0.0003, + "loss": 14.0021, + "loss/aux_loss": 0.04813482966274023, + "loss/crossentropy": 2.9082088649272917, + "loss/logits": 1.1170342415571213, + "step": 6090 + }, + { + "epoch": 0.061, + "grad_norm": 7.90625, + "grad_norm_var": 0.6071248372395833, + "learning_rate": 0.0003, + "loss": 13.8868, + "loss/aux_loss": 0.048123362846672534, + "loss/crossentropy": 3.10910404920578, + "loss/logits": 1.1065054565668107, + "step": 6100 + }, + { + "epoch": 0.0611, + "grad_norm": 9.0625, + "grad_norm_var": 0.14185791015625, + "learning_rate": 0.0003, + "loss": 13.9858, + "loss/aux_loss": 0.04813080281019211, + "loss/crossentropy": 2.9956843733787535, + "loss/logits": 1.0740672290325164, + "step": 6110 + }, + { + "epoch": 0.0612, + "grad_norm": 8.875, + "grad_norm_var": 0.11495768229166667, + "learning_rate": 0.0003, + "loss": 13.8486, + "loss/aux_loss": 0.048135568387806416, + "loss/crossentropy": 3.0476453006267548, + "loss/logits": 1.074926945567131, + "step": 6120 + }, + { + "epoch": 0.0613, + "grad_norm": 11.0, + "grad_norm_var": 44.31295572916667, + "learning_rate": 0.0003, + "loss": 13.7639, + "loss/aux_loss": 0.04814387541264296, + "loss/crossentropy": 3.015053462982178, + "loss/logits": 1.0650121331214906, + "step": 6130 + }, + { + "epoch": 0.0614, + "grad_norm": 9.3125, + "grad_norm_var": 18.512353515625, + "learning_rate": 0.0003, + "loss": 13.6984, + "loss/aux_loss": 0.04813182633370161, + "loss/crossentropy": 2.9327735245227813, + "loss/logits": 1.062555307149887, + "step": 6140 + }, + { + "epoch": 0.0615, + "grad_norm": 7.84375, + "grad_norm_var": 0.5481404622395833, + "learning_rate": 0.0003, + "loss": 13.7344, + "loss/aux_loss": 0.048124780878424644, + "loss/crossentropy": 2.884230363368988, + "loss/logits": 1.0680664718151092, + "step": 6150 + }, + { + "epoch": 0.0616, + "grad_norm": 8.9375, + "grad_norm_var": 0.13401285807291666, + "learning_rate": 0.0003, + "loss": 14.0286, + "loss/aux_loss": 0.04813941400498152, + "loss/crossentropy": 2.973123300075531, + "loss/logits": 1.0950632393360138, + "step": 6160 + }, + { + "epoch": 0.0617, + "grad_norm": 7.96875, + "grad_norm_var": 0.15256754557291666, + "learning_rate": 0.0003, + "loss": 13.4949, + "loss/aux_loss": 0.04813013020902872, + "loss/crossentropy": 3.0717917561531065, + "loss/logits": 1.096383735537529, + "step": 6170 + }, + { + "epoch": 0.0618, + "grad_norm": 8.8125, + "grad_norm_var": 0.3224894205729167, + "learning_rate": 0.0003, + "loss": 13.6958, + "loss/aux_loss": 0.04813482668250799, + "loss/crossentropy": 2.905612015724182, + "loss/logits": 1.0801061391830444, + "step": 6180 + }, + { + "epoch": 0.0619, + "grad_norm": 8.125, + "grad_norm_var": 1.2854166666666667, + "learning_rate": 0.0003, + "loss": 13.8524, + "loss/aux_loss": 0.04814470838755369, + "loss/crossentropy": 3.090296733379364, + "loss/logits": 1.1191903114318849, + "step": 6190 + }, + { + "epoch": 0.062, + "grad_norm": 8.5625, + "grad_norm_var": 0.2548014322916667, + "learning_rate": 0.0003, + "loss": 13.8103, + "loss/aux_loss": 0.04813280999660492, + "loss/crossentropy": 2.9565974533557893, + "loss/logits": 1.0762405812740325, + "step": 6200 + }, + { + "epoch": 0.0621, + "grad_norm": 8.8125, + "grad_norm_var": 0.12838541666666667, + "learning_rate": 0.0003, + "loss": 13.9951, + "loss/aux_loss": 0.04814301636070013, + "loss/crossentropy": 3.0025951147079466, + "loss/logits": 1.094373619556427, + "step": 6210 + }, + { + "epoch": 0.0622, + "grad_norm": 8.8125, + "grad_norm_var": 0.15935872395833334, + "learning_rate": 0.0003, + "loss": 13.7959, + "loss/aux_loss": 0.04813591837882995, + "loss/crossentropy": 2.890333390235901, + "loss/logits": 1.1366484671831132, + "step": 6220 + }, + { + "epoch": 0.0623, + "grad_norm": 8.0, + "grad_norm_var": 0.483056640625, + "learning_rate": 0.0003, + "loss": 13.9138, + "loss/aux_loss": 0.04814350325614214, + "loss/crossentropy": 2.989057755470276, + "loss/logits": 1.0995355397462845, + "step": 6230 + }, + { + "epoch": 0.0624, + "grad_norm": 8.4375, + "grad_norm_var": 0.23398030598958333, + "learning_rate": 0.0003, + "loss": 13.7247, + "loss/aux_loss": 0.04812443405389786, + "loss/crossentropy": 2.929747235774994, + "loss/logits": 1.070769226551056, + "step": 6240 + }, + { + "epoch": 0.0625, + "grad_norm": 8.9375, + "grad_norm_var": 0.19888916015625, + "learning_rate": 0.0003, + "loss": 13.8281, + "loss/aux_loss": 0.04813152626156807, + "loss/crossentropy": 2.9640577673912047, + "loss/logits": 1.0986740648746491, + "step": 6250 + }, + { + "epoch": 0.0626, + "grad_norm": 8.8125, + "grad_norm_var": 0.21834309895833334, + "learning_rate": 0.0003, + "loss": 13.693, + "loss/aux_loss": 0.048149599321186544, + "loss/crossentropy": 3.0804611802101136, + "loss/logits": 1.0116279065608977, + "step": 6260 + }, + { + "epoch": 0.0627, + "grad_norm": 18.0, + "grad_norm_var": 5.355322265625, + "learning_rate": 0.0003, + "loss": 13.5479, + "loss/aux_loss": 0.04814065471291542, + "loss/crossentropy": 2.9952731311321257, + "loss/logits": 1.1052643030881881, + "step": 6270 + }, + { + "epoch": 0.0628, + "grad_norm": 9.1875, + "grad_norm_var": 5.497359212239584, + "learning_rate": 0.0003, + "loss": 13.7811, + "loss/aux_loss": 0.048146970197558404, + "loss/crossentropy": 3.019810402393341, + "loss/logits": 1.0878846973180771, + "step": 6280 + }, + { + "epoch": 0.0629, + "grad_norm": 9.375, + "grad_norm_var": 0.2244140625, + "learning_rate": 0.0003, + "loss": 13.7007, + "loss/aux_loss": 0.04813486896455288, + "loss/crossentropy": 3.0539894580841063, + "loss/logits": 1.1094782143831252, + "step": 6290 + }, + { + "epoch": 0.063, + "grad_norm": 8.75, + "grad_norm_var": 0.15198160807291666, + "learning_rate": 0.0003, + "loss": 13.6997, + "loss/aux_loss": 0.048145148530602457, + "loss/crossentropy": 2.8999388575553895, + "loss/logits": 1.0792778134346008, + "step": 6300 + }, + { + "epoch": 0.0631, + "grad_norm": 8.5, + "grad_norm_var": 111.25478108723958, + "learning_rate": 0.0003, + "loss": 13.7056, + "loss/aux_loss": 0.048147874511778356, + "loss/crossentropy": 2.960583436489105, + "loss/logits": 1.0709624886512756, + "step": 6310 + }, + { + "epoch": 0.0632, + "grad_norm": 9.125, + "grad_norm_var": 109.709228515625, + "learning_rate": 0.0003, + "loss": 13.8739, + "loss/aux_loss": 0.04814481791108847, + "loss/crossentropy": 2.9161580562591554, + "loss/logits": 1.0524902671575547, + "step": 6320 + }, + { + "epoch": 0.0633, + "grad_norm": 8.8125, + "grad_norm_var": 0.32526041666666666, + "learning_rate": 0.0003, + "loss": 13.9297, + "loss/aux_loss": 0.048139688558876514, + "loss/crossentropy": 3.0711183190345763, + "loss/logits": 1.1227669954299926, + "step": 6330 + }, + { + "epoch": 0.0634, + "grad_norm": 9.0625, + "grad_norm_var": 0.3661295572916667, + "learning_rate": 0.0003, + "loss": 13.6939, + "loss/aux_loss": 0.048136590234935286, + "loss/crossentropy": 2.935191023349762, + "loss/logits": 1.071747088432312, + "step": 6340 + }, + { + "epoch": 0.0635, + "grad_norm": 8.9375, + "grad_norm_var": 0.06131184895833333, + "learning_rate": 0.0003, + "loss": 13.8008, + "loss/aux_loss": 0.04813410900533199, + "loss/crossentropy": 3.015982925891876, + "loss/logits": 1.0928217798471451, + "step": 6350 + }, + { + "epoch": 0.0636, + "grad_norm": 8.3125, + "grad_norm_var": 0.12537434895833333, + "learning_rate": 0.0003, + "loss": 13.7574, + "loss/aux_loss": 0.0481348292902112, + "loss/crossentropy": 2.796919822692871, + "loss/logits": 1.064060640335083, + "step": 6360 + }, + { + "epoch": 0.0637, + "grad_norm": 8.8125, + "grad_norm_var": 0.23995768229166667, + "learning_rate": 0.0003, + "loss": 13.8537, + "loss/aux_loss": 0.04814393315464258, + "loss/crossentropy": 3.1008806109428404, + "loss/logits": 1.0501255184412002, + "step": 6370 + }, + { + "epoch": 0.0638, + "grad_norm": 8.625, + "grad_norm_var": 0.116259765625, + "learning_rate": 0.0003, + "loss": 13.6849, + "loss/aux_loss": 0.048127164505422114, + "loss/crossentropy": 2.8864944219589233, + "loss/logits": 1.0921163856983185, + "step": 6380 + }, + { + "epoch": 0.0639, + "grad_norm": 8.625, + "grad_norm_var": 76.34386393229167, + "learning_rate": 0.0003, + "loss": 13.6183, + "loss/aux_loss": 0.048148133978247645, + "loss/crossentropy": 2.8359330534934997, + "loss/logits": 1.0765960454940795, + "step": 6390 + }, + { + "epoch": 0.064, + "grad_norm": 8.3125, + "grad_norm_var": 0.277587890625, + "learning_rate": 0.0003, + "loss": 13.6039, + "loss/aux_loss": 0.04812461007386446, + "loss/crossentropy": 2.999741852283478, + "loss/logits": 1.0552677452564239, + "step": 6400 + }, + { + "epoch": 0.0641, + "grad_norm": 8.8125, + "grad_norm_var": 0.2833333333333333, + "learning_rate": 0.0003, + "loss": 13.7492, + "loss/aux_loss": 0.04812895692884922, + "loss/crossentropy": 3.226720857620239, + "loss/logits": 1.0827998757362365, + "step": 6410 + }, + { + "epoch": 0.0642, + "grad_norm": 8.6875, + "grad_norm_var": 0.13553059895833333, + "learning_rate": 0.0003, + "loss": 13.7557, + "loss/aux_loss": 0.04812961965799332, + "loss/crossentropy": 2.979693388938904, + "loss/logits": 1.0688404828310012, + "step": 6420 + }, + { + "epoch": 0.0643, + "grad_norm": 8.5, + "grad_norm_var": 0.28761393229166665, + "learning_rate": 0.0003, + "loss": 13.6565, + "loss/aux_loss": 0.04814809542149305, + "loss/crossentropy": 2.8194834649562837, + "loss/logits": 1.0419757306575774, + "step": 6430 + }, + { + "epoch": 0.0644, + "grad_norm": 8.5, + "grad_norm_var": 0.25193684895833335, + "learning_rate": 0.0003, + "loss": 13.7414, + "loss/aux_loss": 0.048131111077964306, + "loss/crossentropy": 2.894507110118866, + "loss/logits": 1.0287611424922942, + "step": 6440 + }, + { + "epoch": 0.0645, + "grad_norm": 8.25, + "grad_norm_var": 23.7390625, + "learning_rate": 0.0003, + "loss": 13.6572, + "loss/aux_loss": 0.04814545251429081, + "loss/crossentropy": 2.9162492036819456, + "loss/logits": 1.0510101735591888, + "step": 6450 + }, + { + "epoch": 0.0646, + "grad_norm": 8.0625, + "grad_norm_var": 0.353369140625, + "learning_rate": 0.0003, + "loss": 13.6879, + "loss/aux_loss": 0.04813457876443863, + "loss/crossentropy": 3.00765939950943, + "loss/logits": 1.0992391586303711, + "step": 6460 + }, + { + "epoch": 0.0647, + "grad_norm": 9.0, + "grad_norm_var": 0.2699055989583333, + "learning_rate": 0.0003, + "loss": 13.7296, + "loss/aux_loss": 0.04814710468053818, + "loss/crossentropy": 2.933130156993866, + "loss/logits": 1.0464091002941132, + "step": 6470 + }, + { + "epoch": 0.0648, + "grad_norm": 8.75, + "grad_norm_var": 0.21276041666666667, + "learning_rate": 0.0003, + "loss": 13.5071, + "loss/aux_loss": 0.0481443403288722, + "loss/crossentropy": 2.893195056915283, + "loss/logits": 1.0626126766204833, + "step": 6480 + }, + { + "epoch": 0.0649, + "grad_norm": 9.875, + "grad_norm_var": 0.32551676432291665, + "learning_rate": 0.0003, + "loss": 13.7133, + "loss/aux_loss": 0.04813146814703941, + "loss/crossentropy": 3.026914322376251, + "loss/logits": 1.1285286754369737, + "step": 6490 + }, + { + "epoch": 0.065, + "grad_norm": 8.8125, + "grad_norm_var": 0.19479166666666667, + "learning_rate": 0.0003, + "loss": 13.7583, + "loss/aux_loss": 0.048133007064461705, + "loss/crossentropy": 3.017659032344818, + "loss/logits": 1.064196562767029, + "step": 6500 + }, + { + "epoch": 0.0651, + "grad_norm": 9.5625, + "grad_norm_var": 0.2625, + "learning_rate": 0.0003, + "loss": 13.6455, + "loss/aux_loss": 0.04813391268253327, + "loss/crossentropy": 2.9791279196739198, + "loss/logits": 1.068275386095047, + "step": 6510 + }, + { + "epoch": 0.0652, + "grad_norm": 8.0, + "grad_norm_var": 0.26712239583333336, + "learning_rate": 0.0003, + "loss": 13.7033, + "loss/aux_loss": 0.04812156092375517, + "loss/crossentropy": 2.9347316145896913, + "loss/logits": 1.0693759769201279, + "step": 6520 + }, + { + "epoch": 0.0653, + "grad_norm": 8.125, + "grad_norm_var": 0.15130208333333334, + "learning_rate": 0.0003, + "loss": 13.4842, + "loss/aux_loss": 0.04813071470707655, + "loss/crossentropy": 3.036532533168793, + "loss/logits": 1.07112657725811, + "step": 6530 + }, + { + "epoch": 0.0654, + "grad_norm": 8.4375, + "grad_norm_var": 0.08318684895833334, + "learning_rate": 0.0003, + "loss": 13.6518, + "loss/aux_loss": 0.04812001138925552, + "loss/crossentropy": 3.02432986497879, + "loss/logits": 1.0842196673154831, + "step": 6540 + }, + { + "epoch": 0.0655, + "grad_norm": 8.1875, + "grad_norm_var": 0.14270833333333333, + "learning_rate": 0.0003, + "loss": 13.5997, + "loss/aux_loss": 0.04814151749014854, + "loss/crossentropy": 2.8338264346122743, + "loss/logits": 1.0342780292034148, + "step": 6550 + }, + { + "epoch": 0.0656, + "grad_norm": 8.3125, + "grad_norm_var": 0.245166015625, + "learning_rate": 0.0003, + "loss": 13.6701, + "loss/aux_loss": 0.04813636671751738, + "loss/crossentropy": 2.8755680441856386, + "loss/logits": 1.0737797766923904, + "step": 6560 + }, + { + "epoch": 0.0657, + "grad_norm": 9.3125, + "grad_norm_var": 5.57515869140625, + "learning_rate": 0.0003, + "loss": 13.7123, + "loss/aux_loss": 0.048146852850914, + "loss/crossentropy": 2.824841636419296, + "loss/logits": 1.0282084316015243, + "step": 6570 + }, + { + "epoch": 0.0658, + "grad_norm": 9.75, + "grad_norm_var": 4.969559733072916, + "learning_rate": 0.0003, + "loss": 13.5714, + "loss/aux_loss": 0.04814463872462511, + "loss/crossentropy": 2.800820177793503, + "loss/logits": 1.0070704787969589, + "step": 6580 + }, + { + "epoch": 0.0659, + "grad_norm": 8.9375, + "grad_norm_var": 0.8249837239583333, + "learning_rate": 0.0003, + "loss": 13.8031, + "loss/aux_loss": 0.04813964460045099, + "loss/crossentropy": 2.9267017126083372, + "loss/logits": 1.0605516761541367, + "step": 6590 + }, + { + "epoch": 0.066, + "grad_norm": 8.625, + "grad_norm_var": 0.20974934895833333, + "learning_rate": 0.0003, + "loss": 13.6457, + "loss/aux_loss": 0.04813900291919708, + "loss/crossentropy": 2.8911925733089445, + "loss/logits": 1.0161655098199844, + "step": 6600 + }, + { + "epoch": 0.0661, + "grad_norm": 9.375, + "grad_norm_var": 0.08899739583333334, + "learning_rate": 0.0003, + "loss": 13.6958, + "loss/aux_loss": 0.048127881996333596, + "loss/crossentropy": 3.0655489921569825, + "loss/logits": 1.0680898874998093, + "step": 6610 + }, + { + "epoch": 0.0662, + "grad_norm": 8.4375, + "grad_norm_var": 0.26223958333333336, + "learning_rate": 0.0003, + "loss": 13.6129, + "loss/aux_loss": 0.04813065193593502, + "loss/crossentropy": 3.0288997888565063, + "loss/logits": 1.0894548326730729, + "step": 6620 + }, + { + "epoch": 0.0663, + "grad_norm": 8.8125, + "grad_norm_var": 0.22980143229166666, + "learning_rate": 0.0003, + "loss": 13.5437, + "loss/aux_loss": 0.04813928250223398, + "loss/crossentropy": 2.8766731202602385, + "loss/logits": 1.0580507218837738, + "step": 6630 + }, + { + "epoch": 0.0664, + "grad_norm": 9.25, + "grad_norm_var": 0.6728515625, + "learning_rate": 0.0003, + "loss": 13.5473, + "loss/aux_loss": 0.04812793843448162, + "loss/crossentropy": 3.049793744087219, + "loss/logits": 1.0934020727872849, + "step": 6640 + }, + { + "epoch": 0.0665, + "grad_norm": 8.9375, + "grad_norm_var": 0.7403483072916667, + "learning_rate": 0.0003, + "loss": 13.6928, + "loss/aux_loss": 0.04813400413841009, + "loss/crossentropy": 3.0123080134391786, + "loss/logits": 1.0466331481933593, + "step": 6650 + }, + { + "epoch": 0.0666, + "grad_norm": 8.3125, + "grad_norm_var": 0.11365559895833334, + "learning_rate": 0.0003, + "loss": 13.6429, + "loss/aux_loss": 0.04813118148595095, + "loss/crossentropy": 2.941323435306549, + "loss/logits": 1.0748949706554414, + "step": 6660 + }, + { + "epoch": 0.0667, + "grad_norm": 10.5, + "grad_norm_var": 22.969010416666666, + "learning_rate": 0.0003, + "loss": 13.6803, + "loss/aux_loss": 0.04812760762870312, + "loss/crossentropy": 2.996757823228836, + "loss/logits": 1.0776958972215653, + "step": 6670 + }, + { + "epoch": 0.0668, + "grad_norm": 8.3125, + "grad_norm_var": 22.934830729166666, + "learning_rate": 0.0003, + "loss": 13.4982, + "loss/aux_loss": 0.04812924452126026, + "loss/crossentropy": 3.0282610774040224, + "loss/logits": 1.0759372055530547, + "step": 6680 + }, + { + "epoch": 0.0669, + "grad_norm": 8.5625, + "grad_norm_var": 0.09557291666666666, + "learning_rate": 0.0003, + "loss": 13.5785, + "loss/aux_loss": 0.04813319090753794, + "loss/crossentropy": 3.0203604459762574, + "loss/logits": 1.0731590211391449, + "step": 6690 + }, + { + "epoch": 0.067, + "grad_norm": 9.1875, + "grad_norm_var": 0.20358072916666667, + "learning_rate": 0.0003, + "loss": 13.567, + "loss/aux_loss": 0.04814446251839399, + "loss/crossentropy": 2.9507773220539093, + "loss/logits": 1.070712435245514, + "step": 6700 + }, + { + "epoch": 0.0671, + "grad_norm": 9.3125, + "grad_norm_var": 0.06041666666666667, + "learning_rate": 0.0003, + "loss": 13.7238, + "loss/aux_loss": 0.04814040027558804, + "loss/crossentropy": 2.8811200976371767, + "loss/logits": 1.0566608518362046, + "step": 6710 + }, + { + "epoch": 0.0672, + "grad_norm": 8.4375, + "grad_norm_var": 0.111962890625, + "learning_rate": 0.0003, + "loss": 13.5595, + "loss/aux_loss": 0.048139039613306525, + "loss/crossentropy": 3.0208721280097963, + "loss/logits": 1.0752500742673874, + "step": 6720 + }, + { + "epoch": 0.0673, + "grad_norm": 10.25, + "grad_norm_var": 0.47317708333333336, + "learning_rate": 0.0003, + "loss": 13.5703, + "loss/aux_loss": 0.04813670702278614, + "loss/crossentropy": 3.0942620396614076, + "loss/logits": 1.0718396067619325, + "step": 6730 + }, + { + "epoch": 0.0674, + "grad_norm": 8.5625, + "grad_norm_var": 0.3111979166666667, + "learning_rate": 0.0003, + "loss": 13.7648, + "loss/aux_loss": 0.048137816973030566, + "loss/crossentropy": 3.032657301425934, + "loss/logits": 1.0898617118597032, + "step": 6740 + }, + { + "epoch": 0.0675, + "grad_norm": 8.6875, + "grad_norm_var": 0.3921875, + "learning_rate": 0.0003, + "loss": 13.682, + "loss/aux_loss": 0.048137097433209416, + "loss/crossentropy": 3.14407594203949, + "loss/logits": 1.0231775403022767, + "step": 6750 + }, + { + "epoch": 0.0676, + "grad_norm": 8.3125, + "grad_norm_var": 0.31942952473958336, + "learning_rate": 0.0003, + "loss": 13.5635, + "loss/aux_loss": 0.048129927739501, + "loss/crossentropy": 3.025140118598938, + "loss/logits": 1.051462560892105, + "step": 6760 + }, + { + "epoch": 0.0677, + "grad_norm": 9.1875, + "grad_norm_var": 0.3827433268229167, + "learning_rate": 0.0003, + "loss": 13.4952, + "loss/aux_loss": 0.04813184943050146, + "loss/crossentropy": 3.036393105983734, + "loss/logits": 1.0355240046977996, + "step": 6770 + }, + { + "epoch": 0.0678, + "grad_norm": 9.6875, + "grad_norm_var": 0.45315348307291664, + "learning_rate": 0.0003, + "loss": 13.6149, + "loss/aux_loss": 0.048135831765830514, + "loss/crossentropy": 3.0204987287521363, + "loss/logits": 1.0412966758012772, + "step": 6780 + }, + { + "epoch": 0.0679, + "grad_norm": 9.375, + "grad_norm_var": 0.19273681640625, + "learning_rate": 0.0003, + "loss": 13.5112, + "loss/aux_loss": 0.048138886131346224, + "loss/crossentropy": 2.7807726860046387, + "loss/logits": 1.0488616794347763, + "step": 6790 + }, + { + "epoch": 0.068, + "grad_norm": 8.8125, + "grad_norm_var": 0.2384765625, + "learning_rate": 0.0003, + "loss": 13.8943, + "loss/aux_loss": 0.048131432943046094, + "loss/crossentropy": 3.068627381324768, + "loss/logits": 1.097953936457634, + "step": 6800 + }, + { + "epoch": 0.0681, + "grad_norm": 8.6875, + "grad_norm_var": 0.16027018229166667, + "learning_rate": 0.0003, + "loss": 13.4602, + "loss/aux_loss": 0.04812794364988804, + "loss/crossentropy": 2.918659710884094, + "loss/logits": 1.0384095519781114, + "step": 6810 + }, + { + "epoch": 0.0682, + "grad_norm": 9.4375, + "grad_norm_var": 0.13854166666666667, + "learning_rate": 0.0003, + "loss": 13.5068, + "loss/aux_loss": 0.04812986459583044, + "loss/crossentropy": 2.9638909816741945, + "loss/logits": 1.0766464948654175, + "step": 6820 + }, + { + "epoch": 0.0683, + "grad_norm": 8.6875, + "grad_norm_var": 0.30514322916666664, + "learning_rate": 0.0003, + "loss": 13.4987, + "loss/aux_loss": 0.04813037347048521, + "loss/crossentropy": 2.929929780960083, + "loss/logits": 1.0736204475164413, + "step": 6830 + }, + { + "epoch": 0.0684, + "grad_norm": 8.625, + "grad_norm_var": 0.1087890625, + "learning_rate": 0.0003, + "loss": 13.5562, + "loss/aux_loss": 0.048125201091170314, + "loss/crossentropy": 2.925602376461029, + "loss/logits": 1.0603425681591034, + "step": 6840 + }, + { + "epoch": 0.0685, + "grad_norm": 10.625, + "grad_norm_var": 32.565608723958334, + "learning_rate": 0.0003, + "loss": 13.612, + "loss/aux_loss": 0.048129927739501, + "loss/crossentropy": 2.9750213265419005, + "loss/logits": 1.0510219603776931, + "step": 6850 + }, + { + "epoch": 0.0686, + "grad_norm": 8.4375, + "grad_norm_var": 32.53943684895833, + "learning_rate": 0.0003, + "loss": 13.5853, + "loss/aux_loss": 0.048130680806934834, + "loss/crossentropy": 2.939654362201691, + "loss/logits": 1.081050756573677, + "step": 6860 + }, + { + "epoch": 0.0687, + "grad_norm": 10.25, + "grad_norm_var": 2.6824055989583333, + "learning_rate": 0.0003, + "loss": 13.6429, + "loss/aux_loss": 0.048120100237429145, + "loss/crossentropy": 3.081457090377808, + "loss/logits": 1.042839017510414, + "step": 6870 + }, + { + "epoch": 0.0688, + "grad_norm": 8.875, + "grad_norm_var": 0.37109375, + "learning_rate": 0.0003, + "loss": 13.5743, + "loss/aux_loss": 0.04812851026654243, + "loss/crossentropy": 2.9788452863693236, + "loss/logits": 1.082206028699875, + "step": 6880 + }, + { + "epoch": 0.0689, + "grad_norm": 8.8125, + "grad_norm_var": 0.12545572916666667, + "learning_rate": 0.0003, + "loss": 13.3157, + "loss/aux_loss": 0.048133143596351145, + "loss/crossentropy": 2.8911037921905516, + "loss/logits": 1.064788919687271, + "step": 6890 + }, + { + "epoch": 0.069, + "grad_norm": 9.1875, + "grad_norm_var": 0.22213541666666667, + "learning_rate": 0.0003, + "loss": 13.6541, + "loss/aux_loss": 0.04813574869185686, + "loss/crossentropy": 3.002419984340668, + "loss/logits": 1.031218209862709, + "step": 6900 + }, + { + "epoch": 0.0691, + "grad_norm": 10.0, + "grad_norm_var": 0.33274739583333335, + "learning_rate": 0.0003, + "loss": 13.4968, + "loss/aux_loss": 0.048126774840056896, + "loss/crossentropy": 2.7656411051750185, + "loss/logits": 1.0477019995450974, + "step": 6910 + }, + { + "epoch": 0.0692, + "grad_norm": 8.25, + "grad_norm_var": 0.7972493489583333, + "learning_rate": 0.0003, + "loss": 13.4553, + "loss/aux_loss": 0.04813210777938366, + "loss/crossentropy": 2.996220147609711, + "loss/logits": 1.076385298371315, + "step": 6920 + }, + { + "epoch": 0.0693, + "grad_norm": 8.375, + "grad_norm_var": 0.7956990559895833, + "learning_rate": 0.0003, + "loss": 13.6957, + "loss/aux_loss": 0.04813826754689217, + "loss/crossentropy": 3.052574133872986, + "loss/logits": 1.0720904529094697, + "step": 6930 + }, + { + "epoch": 0.0694, + "grad_norm": 9.5, + "grad_norm_var": 0.28631184895833334, + "learning_rate": 0.0003, + "loss": 13.2369, + "loss/aux_loss": 0.048143592104315756, + "loss/crossentropy": 2.9044690668582915, + "loss/logits": 1.0308396130800248, + "step": 6940 + }, + { + "epoch": 0.0695, + "grad_norm": 9.0, + "grad_norm_var": 7.226416015625, + "learning_rate": 0.0003, + "loss": 13.4, + "loss/aux_loss": 0.04815136883407831, + "loss/crossentropy": 2.968579125404358, + "loss/logits": 1.0896869003772736, + "step": 6950 + }, + { + "epoch": 0.0696, + "grad_norm": 8.9375, + "grad_norm_var": 0.508447265625, + "learning_rate": 0.0003, + "loss": 13.505, + "loss/aux_loss": 0.048129218816757205, + "loss/crossentropy": 2.9993494272232057, + "loss/logits": 1.0352261871099473, + "step": 6960 + }, + { + "epoch": 0.0697, + "grad_norm": 9.875, + "grad_norm_var": 0.6348307291666667, + "learning_rate": 0.0003, + "loss": 13.6564, + "loss/aux_loss": 0.04812279660254717, + "loss/crossentropy": 2.970927131175995, + "loss/logits": 1.0954313904047013, + "step": 6970 + }, + { + "epoch": 0.0698, + "grad_norm": 9.5, + "grad_norm_var": 0.2203125, + "learning_rate": 0.0003, + "loss": 13.5257, + "loss/aux_loss": 0.04813241846859455, + "loss/crossentropy": 3.085184133052826, + "loss/logits": 1.0751633316278457, + "step": 6980 + }, + { + "epoch": 0.0699, + "grad_norm": 9.625, + "grad_norm_var": 0.24140625, + "learning_rate": 0.0003, + "loss": 13.5025, + "loss/aux_loss": 0.048134620860219, + "loss/crossentropy": 2.9781801462173463, + "loss/logits": 1.073571789264679, + "step": 6990 + }, + { + "epoch": 0.07, + "grad_norm": 9.0, + "grad_norm_var": 0.211572265625, + "learning_rate": 0.0003, + "loss": 13.3884, + "loss/aux_loss": 0.048129561357200146, + "loss/crossentropy": 2.836885952949524, + "loss/logits": 1.0355841994285584, + "step": 7000 + }, + { + "epoch": 0.0701, + "grad_norm": 8.9375, + "grad_norm_var": 0.05572916666666667, + "learning_rate": 0.0003, + "loss": 13.3751, + "loss/aux_loss": 0.04813597537577152, + "loss/crossentropy": 3.124106729030609, + "loss/logits": 1.087377232313156, + "step": 7010 + }, + { + "epoch": 0.0702, + "grad_norm": 10.0625, + "grad_norm_var": 0.28932291666666665, + "learning_rate": 0.0003, + "loss": 13.6506, + "loss/aux_loss": 0.04812758322805166, + "loss/crossentropy": 3.0415536522865296, + "loss/logits": 1.0985166609287262, + "step": 7020 + }, + { + "epoch": 0.0703, + "grad_norm": 9.125, + "grad_norm_var": 0.32928059895833334, + "learning_rate": 0.0003, + "loss": 13.5101, + "loss/aux_loss": 0.048123250156641005, + "loss/crossentropy": 2.903241181373596, + "loss/logits": 1.0201388955116273, + "step": 7030 + }, + { + "epoch": 0.0704, + "grad_norm": 11.0625, + "grad_norm_var": 0.6508951822916667, + "learning_rate": 0.0003, + "loss": 13.5161, + "loss/aux_loss": 0.048136004246771336, + "loss/crossentropy": 2.9222341775894165, + "loss/logits": 1.0136090040206909, + "step": 7040 + }, + { + "epoch": 0.0705, + "grad_norm": 8.875, + "grad_norm_var": 0.5880208333333333, + "learning_rate": 0.0003, + "loss": 13.5017, + "loss/aux_loss": 0.048121250979602334, + "loss/crossentropy": 2.9636539459228515, + "loss/logits": 1.079690435528755, + "step": 7050 + }, + { + "epoch": 0.0706, + "grad_norm": 9.0625, + "grad_norm_var": 0.231884765625, + "learning_rate": 0.0003, + "loss": 13.4343, + "loss/aux_loss": 0.048114245571196076, + "loss/crossentropy": 3.0601498603820803, + "loss/logits": 1.066567412018776, + "step": 7060 + }, + { + "epoch": 0.0707, + "grad_norm": 8.375, + "grad_norm_var": 0.17994791666666668, + "learning_rate": 0.0003, + "loss": 13.3876, + "loss/aux_loss": 0.048132015578448775, + "loss/crossentropy": 2.9693622946739198, + "loss/logits": 1.0554438531398773, + "step": 7070 + }, + { + "epoch": 0.0708, + "grad_norm": 9.1875, + "grad_norm_var": 0.4327473958333333, + "learning_rate": 0.0003, + "loss": 13.495, + "loss/aux_loss": 0.04813415054231882, + "loss/crossentropy": 2.8446732878685, + "loss/logits": 1.0407306522130966, + "step": 7080 + }, + { + "epoch": 0.0709, + "grad_norm": 9.1875, + "grad_norm_var": 0.16599934895833332, + "learning_rate": 0.0003, + "loss": 13.4971, + "loss/aux_loss": 0.04813690483570099, + "loss/crossentropy": 2.961193633079529, + "loss/logits": 1.04742229282856, + "step": 7090 + }, + { + "epoch": 0.071, + "grad_norm": 9.1875, + "grad_norm_var": 0.207666015625, + "learning_rate": 0.0003, + "loss": 13.3663, + "loss/aux_loss": 0.04812674857676029, + "loss/crossentropy": 2.898916572332382, + "loss/logits": 1.0212170660495759, + "step": 7100 + }, + { + "epoch": 0.0711, + "grad_norm": 9.875, + "grad_norm_var": 0.1853515625, + "learning_rate": 0.0003, + "loss": 13.5333, + "loss/aux_loss": 0.0481316477060318, + "loss/crossentropy": 2.9347579002380373, + "loss/logits": 0.9982910871505737, + "step": 7110 + }, + { + "epoch": 0.0712, + "grad_norm": 8.5, + "grad_norm_var": 0.17381184895833332, + "learning_rate": 0.0003, + "loss": 13.4628, + "loss/aux_loss": 0.048122935183346274, + "loss/crossentropy": 2.993864929676056, + "loss/logits": 1.0623649686574936, + "step": 7120 + }, + { + "epoch": 0.0713, + "grad_norm": 9.625, + "grad_norm_var": 6.217643229166667, + "learning_rate": 0.0003, + "loss": 13.3297, + "loss/aux_loss": 0.048131784237921235, + "loss/crossentropy": 2.932550811767578, + "loss/logits": 1.0577648341655732, + "step": 7130 + }, + { + "epoch": 0.0714, + "grad_norm": 11.3125, + "grad_norm_var": 18.408072916666665, + "learning_rate": 0.0003, + "loss": 13.6702, + "loss/aux_loss": 0.04816462509334087, + "loss/crossentropy": 3.037775385379791, + "loss/logits": 1.0722199440002442, + "step": 7140 + }, + { + "epoch": 0.0715, + "grad_norm": 10.125, + "grad_norm_var": 5.566910807291666, + "learning_rate": 0.0003, + "loss": 13.5737, + "loss/aux_loss": 0.048116024024784564, + "loss/crossentropy": 2.9626861453056335, + "loss/logits": 1.0311239361763, + "step": 7150 + }, + { + "epoch": 0.0716, + "grad_norm": 10.375, + "grad_norm_var": 2.6786295572916665, + "learning_rate": 0.0003, + "loss": 13.4786, + "loss/aux_loss": 0.04811809528619051, + "loss/crossentropy": 2.9172492921352386, + "loss/logits": 1.056332242488861, + "step": 7160 + }, + { + "epoch": 0.0717, + "grad_norm": 9.8125, + "grad_norm_var": 2.5603515625, + "learning_rate": 0.0003, + "loss": 13.4434, + "loss/aux_loss": 0.04812202490866184, + "loss/crossentropy": 3.057690107822418, + "loss/logits": 1.0905145525932312, + "step": 7170 + }, + { + "epoch": 0.0718, + "grad_norm": 11.8125, + "grad_norm_var": 26.733707682291666, + "learning_rate": 0.0003, + "loss": 13.5006, + "loss/aux_loss": 0.04812044147402048, + "loss/crossentropy": 2.9608686804771422, + "loss/logits": 1.0904987782239914, + "step": 7180 + }, + { + "epoch": 0.0719, + "grad_norm": 9.625, + "grad_norm_var": 25.960921223958334, + "learning_rate": 0.0003, + "loss": 13.4463, + "loss/aux_loss": 0.048137583397328855, + "loss/crossentropy": 2.895015776157379, + "loss/logits": 1.0172715038061142, + "step": 7190 + }, + { + "epoch": 0.072, + "grad_norm": 8.25, + "grad_norm_var": 0.31417643229166664, + "learning_rate": 0.0003, + "loss": 13.5728, + "loss/aux_loss": 0.04812595229595899, + "loss/crossentropy": 2.8322587251663207, + "loss/logits": 1.015550658106804, + "step": 7200 + }, + { + "epoch": 0.0721, + "grad_norm": 9.875, + "grad_norm_var": 0.2728515625, + "learning_rate": 0.0003, + "loss": 13.6334, + "loss/aux_loss": 0.048122029192745684, + "loss/crossentropy": 3.037608253955841, + "loss/logits": 1.059059676527977, + "step": 7210 + }, + { + "epoch": 0.0722, + "grad_norm": 10.5625, + "grad_norm_var": 0.31951497395833334, + "learning_rate": 0.0003, + "loss": 13.4644, + "loss/aux_loss": 0.04813063070178032, + "loss/crossentropy": 2.9392677783966064, + "loss/logits": 1.026180136203766, + "step": 7220 + }, + { + "epoch": 0.0723, + "grad_norm": 8.5, + "grad_norm_var": 0.5988118489583333, + "learning_rate": 0.0003, + "loss": 13.4826, + "loss/aux_loss": 0.04812701418995857, + "loss/crossentropy": 2.9874269366264343, + "loss/logits": 1.0393612265586853, + "step": 7230 + }, + { + "epoch": 0.0724, + "grad_norm": 8.4375, + "grad_norm_var": 0.6067708333333334, + "learning_rate": 0.0003, + "loss": 13.2889, + "loss/aux_loss": 0.04812222328037023, + "loss/crossentropy": 2.7734349012374877, + "loss/logits": 1.0329697102308273, + "step": 7240 + }, + { + "epoch": 0.0725, + "grad_norm": 8.9375, + "grad_norm_var": 0.18592122395833333, + "learning_rate": 0.0003, + "loss": 13.5025, + "loss/aux_loss": 0.04812582526355982, + "loss/crossentropy": 2.9962441444396974, + "loss/logits": 1.024059322476387, + "step": 7250 + }, + { + "epoch": 0.0726, + "grad_norm": 9.4375, + "grad_norm_var": 0.06796875, + "learning_rate": 0.0003, + "loss": 13.2807, + "loss/aux_loss": 0.048132246173918244, + "loss/crossentropy": 2.820746290683746, + "loss/logits": 1.0364280879497527, + "step": 7260 + }, + { + "epoch": 0.0727, + "grad_norm": 8.5625, + "grad_norm_var": 0.19034830729166666, + "learning_rate": 0.0003, + "loss": 13.5448, + "loss/aux_loss": 0.048121783323585986, + "loss/crossentropy": 3.010144531726837, + "loss/logits": 1.0459368169307708, + "step": 7270 + }, + { + "epoch": 0.0728, + "grad_norm": 9.0, + "grad_norm_var": 5.168733723958334, + "learning_rate": 0.0003, + "loss": 13.3248, + "loss/aux_loss": 0.048125391267240046, + "loss/crossentropy": 2.889055919647217, + "loss/logits": 1.0278723955154419, + "step": 7280 + }, + { + "epoch": 0.0729, + "grad_norm": 9.875, + "grad_norm_var": 5.200634765625, + "learning_rate": 0.0003, + "loss": 13.2574, + "loss/aux_loss": 0.04812876787036657, + "loss/crossentropy": 2.8185496270656585, + "loss/logits": 1.0287230491638184, + "step": 7290 + }, + { + "epoch": 0.073, + "grad_norm": 8.875, + "grad_norm_var": 0.32233072916666666, + "learning_rate": 0.0003, + "loss": 13.3965, + "loss/aux_loss": 0.04812293406575918, + "loss/crossentropy": 2.804865860939026, + "loss/logits": 1.0147784382104874, + "step": 7300 + }, + { + "epoch": 0.0731, + "grad_norm": 8.5, + "grad_norm_var": 0.18904622395833334, + "learning_rate": 0.0003, + "loss": 13.3527, + "loss/aux_loss": 0.04813094306737185, + "loss/crossentropy": 2.918304455280304, + "loss/logits": 1.079040315747261, + "step": 7310 + }, + { + "epoch": 0.0732, + "grad_norm": 10.8125, + "grad_norm_var": 49.551936848958334, + "learning_rate": 0.0003, + "loss": 13.3824, + "loss/aux_loss": 0.04812980853021145, + "loss/crossentropy": 2.9288637161254885, + "loss/logits": 1.0903980165719986, + "step": 7320 + }, + { + "epoch": 0.0733, + "grad_norm": 9.0, + "grad_norm_var": 49.57076822916667, + "learning_rate": 0.0003, + "loss": 13.2886, + "loss/aux_loss": 0.04812570326030254, + "loss/crossentropy": 3.002810549736023, + "loss/logits": 1.0841778188943862, + "step": 7330 + }, + { + "epoch": 0.0734, + "grad_norm": 8.9375, + "grad_norm_var": 3.1300618489583334, + "learning_rate": 0.0003, + "loss": 13.5399, + "loss/aux_loss": 0.04813100174069405, + "loss/crossentropy": 2.796613943576813, + "loss/logits": 1.0452454775571822, + "step": 7340 + }, + { + "epoch": 0.0735, + "grad_norm": 9.5, + "grad_norm_var": 8.736442057291667, + "learning_rate": 0.0003, + "loss": 13.5117, + "loss/aux_loss": 0.04813167788088322, + "loss/crossentropy": 2.962803506851196, + "loss/logits": 1.0180111587047578, + "step": 7350 + }, + { + "epoch": 0.0736, + "grad_norm": 8.5625, + "grad_norm_var": 7.845768229166667, + "learning_rate": 0.0003, + "loss": 13.3754, + "loss/aux_loss": 0.04813878424465656, + "loss/crossentropy": 2.81993590593338, + "loss/logits": 1.0155292719602584, + "step": 7360 + }, + { + "epoch": 0.0737, + "grad_norm": 9.875, + "grad_norm_var": 0.30514322916666664, + "learning_rate": 0.0003, + "loss": 13.3653, + "loss/aux_loss": 0.04812592975795269, + "loss/crossentropy": 2.8093133509159087, + "loss/logits": 1.0710052281618119, + "step": 7370 + }, + { + "epoch": 0.0738, + "grad_norm": 8.6875, + "grad_norm_var": 0.24295247395833333, + "learning_rate": 0.0003, + "loss": 13.5517, + "loss/aux_loss": 0.04812061432749033, + "loss/crossentropy": 2.838201379776001, + "loss/logits": 1.0604495793581008, + "step": 7380 + }, + { + "epoch": 0.0739, + "grad_norm": 9.4375, + "grad_norm_var": 0.20167643229166668, + "learning_rate": 0.0003, + "loss": 13.5058, + "loss/aux_loss": 0.048129613324999806, + "loss/crossentropy": 3.0717398285865785, + "loss/logits": 1.068494337797165, + "step": 7390 + }, + { + "epoch": 0.074, + "grad_norm": 9.0, + "grad_norm_var": 0.10050455729166667, + "learning_rate": 0.0003, + "loss": 13.3541, + "loss/aux_loss": 0.048131177015602586, + "loss/crossentropy": 2.886682081222534, + "loss/logits": 1.030799898505211, + "step": 7400 + }, + { + "epoch": 0.0741, + "grad_norm": 8.75, + "grad_norm_var": 0.44505208333333335, + "learning_rate": 0.0003, + "loss": 13.336, + "loss/aux_loss": 0.04812990296632051, + "loss/crossentropy": 2.886744201183319, + "loss/logits": 1.0507986098527908, + "step": 7410 + }, + { + "epoch": 0.0742, + "grad_norm": 9.5625, + "grad_norm_var": 0.338525390625, + "learning_rate": 0.0003, + "loss": 13.5748, + "loss/aux_loss": 0.04813168831169605, + "loss/crossentropy": 2.8818042397499086, + "loss/logits": 1.0512411534786223, + "step": 7420 + }, + { + "epoch": 0.0743, + "grad_norm": 8.6875, + "grad_norm_var": 0.3155598958333333, + "learning_rate": 0.0003, + "loss": 13.4908, + "loss/aux_loss": 0.04812338091433048, + "loss/crossentropy": 2.8881842494010925, + "loss/logits": 1.0324068903923034, + "step": 7430 + }, + { + "epoch": 0.0744, + "grad_norm": 9.6875, + "grad_norm_var": 0.24993489583333334, + "learning_rate": 0.0003, + "loss": 13.3201, + "loss/aux_loss": 0.04813193250447512, + "loss/crossentropy": 2.707760387659073, + "loss/logits": 1.041485771536827, + "step": 7440 + }, + { + "epoch": 0.0745, + "grad_norm": 9.3125, + "grad_norm_var": 0.5744140625, + "learning_rate": 0.0003, + "loss": 13.1496, + "loss/aux_loss": 0.04813508708029986, + "loss/crossentropy": 2.864105689525604, + "loss/logits": 0.9952063351869583, + "step": 7450 + }, + { + "epoch": 0.0746, + "grad_norm": 9.1875, + "grad_norm_var": 0.17433268229166668, + "learning_rate": 0.0003, + "loss": 13.6981, + "loss/aux_loss": 0.04812800846993923, + "loss/crossentropy": 3.035504865646362, + "loss/logits": 1.0508066952228545, + "step": 7460 + }, + { + "epoch": 0.0747, + "grad_norm": 8.75, + "grad_norm_var": 0.115869140625, + "learning_rate": 0.0003, + "loss": 13.3485, + "loss/aux_loss": 0.048122041299939154, + "loss/crossentropy": 2.7956653356552126, + "loss/logits": 1.0159823626279831, + "step": 7470 + }, + { + "epoch": 0.0748, + "grad_norm": 9.0, + "grad_norm_var": 0.3651041666666667, + "learning_rate": 0.0003, + "loss": 13.4521, + "loss/aux_loss": 0.048129369504749775, + "loss/crossentropy": 2.969923257827759, + "loss/logits": 1.0386756300926208, + "step": 7480 + }, + { + "epoch": 0.0749, + "grad_norm": 9.125, + "grad_norm_var": 1.2169270833333334, + "learning_rate": 0.0003, + "loss": 13.3878, + "loss/aux_loss": 0.04812902975827456, + "loss/crossentropy": 3.011993145942688, + "loss/logits": 1.042042750120163, + "step": 7490 + }, + { + "epoch": 0.075, + "grad_norm": 9.0, + "grad_norm_var": 0.242431640625, + "learning_rate": 0.0003, + "loss": 13.4144, + "loss/aux_loss": 0.04812177959829569, + "loss/crossentropy": 3.07977237701416, + "loss/logits": 1.0765836715698243, + "step": 7500 + }, + { + "epoch": 0.0751, + "grad_norm": 9.75, + "grad_norm_var": 0.33645833333333336, + "learning_rate": 0.0003, + "loss": 13.4444, + "loss/aux_loss": 0.04811902064830065, + "loss/crossentropy": 2.9798253655433653, + "loss/logits": 1.0538707852363587, + "step": 7510 + }, + { + "epoch": 0.0752, + "grad_norm": 9.0, + "grad_norm_var": 0.18904622395833334, + "learning_rate": 0.0003, + "loss": 13.3292, + "loss/aux_loss": 0.048129689320921897, + "loss/crossentropy": 2.9677703261375425, + "loss/logits": 1.0421554505825044, + "step": 7520 + }, + { + "epoch": 0.0753, + "grad_norm": 9.0625, + "grad_norm_var": 0.10859375, + "learning_rate": 0.0003, + "loss": 13.1599, + "loss/aux_loss": 0.0481248639523983, + "loss/crossentropy": 2.9143801808357237, + "loss/logits": 1.0279333680868148, + "step": 7530 + }, + { + "epoch": 0.0754, + "grad_norm": 12.0625, + "grad_norm_var": 0.9742024739583334, + "learning_rate": 0.0003, + "loss": 13.2219, + "loss/aux_loss": 0.048115496151149274, + "loss/crossentropy": 2.980224275588989, + "loss/logits": 1.039728471636772, + "step": 7540 + }, + { + "epoch": 0.0755, + "grad_norm": 9.125, + "grad_norm_var": 0.9067057291666667, + "learning_rate": 0.0003, + "loss": 13.3821, + "loss/aux_loss": 0.04812954906374216, + "loss/crossentropy": 2.9692449450492857, + "loss/logits": 1.047850751876831, + "step": 7550 + }, + { + "epoch": 0.0756, + "grad_norm": 9.3125, + "grad_norm_var": 0.07805989583333334, + "learning_rate": 0.0003, + "loss": 13.3762, + "loss/aux_loss": 0.048129818961024286, + "loss/crossentropy": 2.990733635425568, + "loss/logits": 1.035420474410057, + "step": 7560 + }, + { + "epoch": 0.0757, + "grad_norm": 9.375, + "grad_norm_var": 5.944514973958333, + "learning_rate": 0.0003, + "loss": 13.5087, + "loss/aux_loss": 0.048126287385821344, + "loss/crossentropy": 3.0128140330314634, + "loss/logits": 1.0564094483852386, + "step": 7570 + }, + { + "epoch": 0.0758, + "grad_norm": 9.1875, + "grad_norm_var": 0.4749348958333333, + "learning_rate": 0.0003, + "loss": 13.2836, + "loss/aux_loss": 0.048132631182670596, + "loss/crossentropy": 2.837554985284805, + "loss/logits": 1.0422370553016662, + "step": 7580 + }, + { + "epoch": 0.0759, + "grad_norm": 9.0625, + "grad_norm_var": 0.45388997395833336, + "learning_rate": 0.0003, + "loss": 13.1942, + "loss/aux_loss": 0.048142952285706996, + "loss/crossentropy": 2.6931034505367277, + "loss/logits": 0.96292115598917, + "step": 7590 + }, + { + "epoch": 0.076, + "grad_norm": 22.125, + "grad_norm_var": 10.807275390625, + "learning_rate": 0.0003, + "loss": 13.1736, + "loss/aux_loss": 0.04812065456062555, + "loss/crossentropy": 2.9220955312252044, + "loss/logits": 1.0179326832294464, + "step": 7600 + }, + { + "epoch": 0.0761, + "grad_norm": 10.125, + "grad_norm_var": 10.003629557291667, + "learning_rate": 0.0003, + "loss": 13.4376, + "loss/aux_loss": 0.04812586084008217, + "loss/crossentropy": 3.021818733215332, + "loss/logits": 1.0483725011348723, + "step": 7610 + }, + { + "epoch": 0.0762, + "grad_norm": 8.6875, + "grad_norm_var": 1.119384765625, + "learning_rate": 0.0003, + "loss": 13.2549, + "loss/aux_loss": 0.04812179896980524, + "loss/crossentropy": 2.780340301990509, + "loss/logits": 1.0378026425838471, + "step": 7620 + }, + { + "epoch": 0.0763, + "grad_norm": 9.875, + "grad_norm_var": 1.0983723958333333, + "learning_rate": 0.0003, + "loss": 13.22, + "loss/aux_loss": 0.04812696985900402, + "loss/crossentropy": 2.8523794054985045, + "loss/logits": 1.0297119617462158, + "step": 7630 + }, + { + "epoch": 0.0764, + "grad_norm": 9.0, + "grad_norm_var": 4.044645182291666, + "learning_rate": 0.0003, + "loss": 13.391, + "loss/aux_loss": 0.04813089091330767, + "loss/crossentropy": 2.98396714925766, + "loss/logits": 1.0715709984302522, + "step": 7640 + }, + { + "epoch": 0.0765, + "grad_norm": 8.8125, + "grad_norm_var": 4.178759765625, + "learning_rate": 0.0003, + "loss": 13.1596, + "loss/aux_loss": 0.048119811527431014, + "loss/crossentropy": 2.919600564241409, + "loss/logits": 0.9985195219516754, + "step": 7650 + }, + { + "epoch": 0.0766, + "grad_norm": 10.3125, + "grad_norm_var": 0.21808268229166666, + "learning_rate": 0.0003, + "loss": 13.4931, + "loss/aux_loss": 0.048118037171661854, + "loss/crossentropy": 2.9762576520442963, + "loss/logits": 1.0814913272857667, + "step": 7660 + }, + { + "epoch": 0.0767, + "grad_norm": 9.6875, + "grad_norm_var": 0.18014322916666667, + "learning_rate": 0.0003, + "loss": 13.3496, + "loss/aux_loss": 0.04812421500682831, + "loss/crossentropy": 2.9494404554367066, + "loss/logits": 1.0210811465978622, + "step": 7670 + }, + { + "epoch": 0.0768, + "grad_norm": 9.5625, + "grad_norm_var": 0.19524739583333334, + "learning_rate": 0.0003, + "loss": 13.3235, + "loss/aux_loss": 0.04811930097639561, + "loss/crossentropy": 2.8925601482391357, + "loss/logits": 1.0253148704767228, + "step": 7680 + }, + { + "epoch": 0.0769, + "grad_norm": 9.5, + "grad_norm_var": 0.3277180989583333, + "learning_rate": 0.0003, + "loss": 13.4136, + "loss/aux_loss": 0.04813026450574398, + "loss/crossentropy": 2.9719626665115357, + "loss/logits": 1.1240254521369935, + "step": 7690 + }, + { + "epoch": 0.077, + "grad_norm": 9.3125, + "grad_norm_var": 0.28932291666666665, + "learning_rate": 0.0003, + "loss": 13.3069, + "loss/aux_loss": 0.048117564991116524, + "loss/crossentropy": 2.9367773652076723, + "loss/logits": 1.0887930393218994, + "step": 7700 + }, + { + "epoch": 0.0771, + "grad_norm": 9.3125, + "grad_norm_var": 0.16608072916666666, + "learning_rate": 0.0003, + "loss": 13.4119, + "loss/aux_loss": 0.04813154824078083, + "loss/crossentropy": 2.821038991212845, + "loss/logits": 1.0327024161815643, + "step": 7710 + }, + { + "epoch": 0.0772, + "grad_norm": 9.6875, + "grad_norm_var": 0.1103515625, + "learning_rate": 0.0003, + "loss": 13.3759, + "loss/aux_loss": 0.04812257084995508, + "loss/crossentropy": 2.922965955734253, + "loss/logits": 1.0887499898672104, + "step": 7720 + }, + { + "epoch": 0.0773, + "grad_norm": 8.75, + "grad_norm_var": 0.39212239583333336, + "learning_rate": 0.0003, + "loss": 13.1938, + "loss/aux_loss": 0.04812620896846056, + "loss/crossentropy": 2.873304957151413, + "loss/logits": 1.0040156990289688, + "step": 7730 + }, + { + "epoch": 0.0774, + "grad_norm": 9.75, + "grad_norm_var": 0.46608072916666665, + "learning_rate": 0.0003, + "loss": 13.5227, + "loss/aux_loss": 0.048138899728655815, + "loss/crossentropy": 2.9522013902664184, + "loss/logits": 1.0542542576789855, + "step": 7740 + }, + { + "epoch": 0.0775, + "grad_norm": 8.9375, + "grad_norm_var": 0.5067057291666667, + "learning_rate": 0.0003, + "loss": 13.2829, + "loss/aux_loss": 0.04812390860170126, + "loss/crossentropy": 2.9055544257164003, + "loss/logits": 1.0098161727190018, + "step": 7750 + }, + { + "epoch": 0.0776, + "grad_norm": 9.0625, + "grad_norm_var": 0.22941080729166666, + "learning_rate": 0.0003, + "loss": 13.3498, + "loss/aux_loss": 0.04813548941165209, + "loss/crossentropy": 2.980088174343109, + "loss/logits": 1.0641280621290208, + "step": 7760 + }, + { + "epoch": 0.0777, + "grad_norm": 10.5, + "grad_norm_var": 0.3078125, + "learning_rate": 0.0003, + "loss": 13.3331, + "loss/aux_loss": 0.048122762329876424, + "loss/crossentropy": 2.9135417342185974, + "loss/logits": 1.0203843981027603, + "step": 7770 + }, + { + "epoch": 0.0778, + "grad_norm": 9.25, + "grad_norm_var": 7.404150390625, + "learning_rate": 0.0003, + "loss": 13.2873, + "loss/aux_loss": 0.04813399352133274, + "loss/crossentropy": 2.841181445121765, + "loss/logits": 1.055801859498024, + "step": 7780 + }, + { + "epoch": 0.0779, + "grad_norm": 10.6875, + "grad_norm_var": 6.986197916666667, + "learning_rate": 0.0003, + "loss": 13.1383, + "loss/aux_loss": 0.04812779631465673, + "loss/crossentropy": 2.8845179080963135, + "loss/logits": 1.0134330958127975, + "step": 7790 + }, + { + "epoch": 0.078, + "grad_norm": 34.0, + "grad_norm_var": 37.05651041666667, + "learning_rate": 0.0003, + "loss": 13.2773, + "loss/aux_loss": 0.04813188221305609, + "loss/crossentropy": 2.9372805774211885, + "loss/logits": 0.9858429193496704, + "step": 7800 + }, + { + "epoch": 0.0781, + "grad_norm": 8.5625, + "grad_norm_var": 37.1462890625, + "learning_rate": 0.0003, + "loss": 13.3157, + "loss/aux_loss": 0.04813784416764975, + "loss/crossentropy": 2.767670226097107, + "loss/logits": 1.0314590692520142, + "step": 7810 + }, + { + "epoch": 0.0782, + "grad_norm": 9.1875, + "grad_norm_var": 0.728759765625, + "learning_rate": 0.0003, + "loss": 13.071, + "loss/aux_loss": 0.048109129257500174, + "loss/crossentropy": 2.9309176981449125, + "loss/logits": 1.0385325998067856, + "step": 7820 + }, + { + "epoch": 0.0783, + "grad_norm": 9.6875, + "grad_norm_var": 0.3277180989583333, + "learning_rate": 0.0003, + "loss": 13.1624, + "loss/aux_loss": 0.04811421576887369, + "loss/crossentropy": 2.97960284948349, + "loss/logits": 1.0207297384738923, + "step": 7830 + }, + { + "epoch": 0.0784, + "grad_norm": 9.9375, + "grad_norm_var": 1.3139973958333333, + "learning_rate": 0.0003, + "loss": 13.1052, + "loss/aux_loss": 0.048128409497439864, + "loss/crossentropy": 2.8389533042907713, + "loss/logits": 1.035956397652626, + "step": 7840 + }, + { + "epoch": 0.0785, + "grad_norm": 9.125, + "grad_norm_var": 0.7958333333333333, + "learning_rate": 0.0003, + "loss": 13.2657, + "loss/aux_loss": 0.04812541268765926, + "loss/crossentropy": 2.9305792689323424, + "loss/logits": 1.0182174772024155, + "step": 7850 + }, + { + "epoch": 0.0786, + "grad_norm": 8.9375, + "grad_norm_var": 0.4175618489583333, + "learning_rate": 0.0003, + "loss": 13.3332, + "loss/aux_loss": 0.0481265714392066, + "loss/crossentropy": 3.0935042262077332, + "loss/logits": 1.0543415069580078, + "step": 7860 + }, + { + "epoch": 0.0787, + "grad_norm": 9.3125, + "grad_norm_var": 0.5536295572916666, + "learning_rate": 0.0003, + "loss": 13.2632, + "loss/aux_loss": 0.048121869936585425, + "loss/crossentropy": 2.859709286689758, + "loss/logits": 1.0271731585264205, + "step": 7870 + }, + { + "epoch": 0.0788, + "grad_norm": 9.375, + "grad_norm_var": 0.32180989583333336, + "learning_rate": 0.0003, + "loss": 13.3159, + "loss/aux_loss": 0.04812927972525358, + "loss/crossentropy": 2.855903148651123, + "loss/logits": 1.0336874067783355, + "step": 7880 + }, + { + "epoch": 0.0789, + "grad_norm": 9.75, + "grad_norm_var": 0.236572265625, + "learning_rate": 0.0003, + "loss": 13.2051, + "loss/aux_loss": 0.048123538866639136, + "loss/crossentropy": 2.887688386440277, + "loss/logits": 1.0365424662828446, + "step": 7890 + }, + { + "epoch": 0.079, + "grad_norm": 11.25, + "grad_norm_var": 0.37473958333333335, + "learning_rate": 0.0003, + "loss": 13.265, + "loss/aux_loss": 0.04812454991042614, + "loss/crossentropy": 2.9836980283260344, + "loss/logits": 1.014005294442177, + "step": 7900 + }, + { + "epoch": 0.0791, + "grad_norm": 9.5, + "grad_norm_var": 0.369384765625, + "learning_rate": 0.0003, + "loss": 13.2299, + "loss/aux_loss": 0.048119301721453664, + "loss/crossentropy": 3.002364158630371, + "loss/logits": 1.0333095729351043, + "step": 7910 + }, + { + "epoch": 0.0792, + "grad_norm": 8.8125, + "grad_norm_var": 0.18326822916666666, + "learning_rate": 0.0003, + "loss": 13.2434, + "loss/aux_loss": 0.048124428279697894, + "loss/crossentropy": 2.854734891653061, + "loss/logits": 1.014777159690857, + "step": 7920 + }, + { + "epoch": 0.0793, + "grad_norm": 9.5, + "grad_norm_var": 0.18014322916666667, + "learning_rate": 0.0003, + "loss": 13.3392, + "loss/aux_loss": 0.04812102187424898, + "loss/crossentropy": 2.9866424322128298, + "loss/logits": 1.04144589304924, + "step": 7930 + }, + { + "epoch": 0.0794, + "grad_norm": 8.9375, + "grad_norm_var": 6.890625, + "learning_rate": 0.0003, + "loss": 13.2693, + "loss/aux_loss": 0.048126323707401754, + "loss/crossentropy": 2.918100368976593, + "loss/logits": 1.0156398355960845, + "step": 7940 + }, + { + "epoch": 0.0795, + "grad_norm": 9.0625, + "grad_norm_var": 0.5878743489583333, + "learning_rate": 0.0003, + "loss": 13.0763, + "loss/aux_loss": 0.0481284249573946, + "loss/crossentropy": 2.915715491771698, + "loss/logits": 1.0164682030677796, + "step": 7950 + }, + { + "epoch": 0.0796, + "grad_norm": 9.25, + "grad_norm_var": 0.1791015625, + "learning_rate": 0.0003, + "loss": 13.0751, + "loss/aux_loss": 0.04813295528292656, + "loss/crossentropy": 2.8354081392288206, + "loss/logits": 0.978666540980339, + "step": 7960 + }, + { + "epoch": 0.0797, + "grad_norm": 8.8125, + "grad_norm_var": 0.2955729166666667, + "learning_rate": 0.0003, + "loss": 13.3034, + "loss/aux_loss": 0.0481256989762187, + "loss/crossentropy": 2.8299093306064607, + "loss/logits": 1.0147087454795838, + "step": 7970 + }, + { + "epoch": 0.0798, + "grad_norm": 9.0625, + "grad_norm_var": 0.25286458333333334, + "learning_rate": 0.0003, + "loss": 13.1301, + "loss/aux_loss": 0.04812098871916533, + "loss/crossentropy": 2.9957703232765196, + "loss/logits": 1.0261031478643416, + "step": 7980 + }, + { + "epoch": 0.0799, + "grad_norm": 9.25, + "grad_norm_var": 0.09308268229166666, + "learning_rate": 0.0003, + "loss": 13.1294, + "loss/aux_loss": 0.048122938722372055, + "loss/crossentropy": 2.8062502324581144, + "loss/logits": 1.0008294701576232, + "step": 7990 + }, + { + "epoch": 0.08, + "grad_norm": 9.3125, + "grad_norm_var": 0.074072265625, + "learning_rate": 0.0003, + "loss": 13.2716, + "loss/aux_loss": 0.04812256768345833, + "loss/crossentropy": 2.9094210386276247, + "loss/logits": 1.0217216283082962, + "step": 8000 + }, + { + "epoch": 0.0801, + "grad_norm": 9.25, + "grad_norm_var": 0.5249348958333333, + "learning_rate": 0.0003, + "loss": 13.2679, + "loss/aux_loss": 0.048118163272738455, + "loss/crossentropy": 2.896920144557953, + "loss/logits": 1.0116204470396042, + "step": 8010 + }, + { + "epoch": 0.0802, + "grad_norm": 9.125, + "grad_norm_var": 0.2696451822916667, + "learning_rate": 0.0003, + "loss": 13.1435, + "loss/aux_loss": 0.048120760917663576, + "loss/crossentropy": 2.954100179672241, + "loss/logits": 1.0261077135801315, + "step": 8020 + }, + { + "epoch": 0.0803, + "grad_norm": 10.375, + "grad_norm_var": 590.15078125, + "learning_rate": 0.0003, + "loss": 13.296, + "loss/aux_loss": 0.04813782777637243, + "loss/crossentropy": 2.9724114894866944, + "loss/logits": 1.0246656686067581, + "step": 8030 + }, + { + "epoch": 0.0804, + "grad_norm": 9.3125, + "grad_norm_var": 588.4619140625, + "learning_rate": 0.0003, + "loss": 13.3131, + "loss/aux_loss": 0.04812064114958048, + "loss/crossentropy": 2.873293662071228, + "loss/logits": 1.0408964782953263, + "step": 8040 + }, + { + "epoch": 0.0805, + "grad_norm": 9.5, + "grad_norm_var": 0.21927083333333333, + "learning_rate": 0.0003, + "loss": 13.4035, + "loss/aux_loss": 0.04812377672642469, + "loss/crossentropy": 3.0599602937698362, + "loss/logits": 1.0257072687149047, + "step": 8050 + }, + { + "epoch": 0.0806, + "grad_norm": 9.125, + "grad_norm_var": 1.2195149739583333, + "learning_rate": 0.0003, + "loss": 13.1163, + "loss/aux_loss": 0.0481159932911396, + "loss/crossentropy": 2.8620250105857847, + "loss/logits": 0.9948093295097351, + "step": 8060 + }, + { + "epoch": 0.0807, + "grad_norm": 9.0, + "grad_norm_var": 18.907405598958334, + "learning_rate": 0.0003, + "loss": 13.4342, + "loss/aux_loss": 0.048125050216913226, + "loss/crossentropy": 2.892456221580505, + "loss/logits": 1.0422109365463257, + "step": 8070 + }, + { + "epoch": 0.0808, + "grad_norm": 10.5, + "grad_norm_var": 6.577457682291667, + "learning_rate": 0.0003, + "loss": 13.336, + "loss/aux_loss": 0.04813191127032042, + "loss/crossentropy": 2.735273379087448, + "loss/logits": 1.033458188176155, + "step": 8080 + }, + { + "epoch": 0.0809, + "grad_norm": 9.125, + "grad_norm_var": 54.12708333333333, + "learning_rate": 0.0003, + "loss": 13.3489, + "loss/aux_loss": 0.04812964014708996, + "loss/crossentropy": 2.843802607059479, + "loss/logits": 1.0632713794708253, + "step": 8090 + }, + { + "epoch": 0.081, + "grad_norm": 9.625, + "grad_norm_var": 0.4376139322916667, + "learning_rate": 0.0003, + "loss": 13.0564, + "loss/aux_loss": 0.04813598971813917, + "loss/crossentropy": 2.841505432128906, + "loss/logits": 1.0077117711305619, + "step": 8100 + }, + { + "epoch": 0.0811, + "grad_norm": 9.5, + "grad_norm_var": 0.24088541666666666, + "learning_rate": 0.0003, + "loss": 13.1167, + "loss/aux_loss": 0.04812768436968327, + "loss/crossentropy": 2.8955862760543822, + "loss/logits": 0.997417938709259, + "step": 8110 + }, + { + "epoch": 0.0812, + "grad_norm": 9.875, + "grad_norm_var": 0.233837890625, + "learning_rate": 0.0003, + "loss": 13.1671, + "loss/aux_loss": 0.04812203329056501, + "loss/crossentropy": 2.961570382118225, + "loss/logits": 1.0406351834535599, + "step": 8120 + }, + { + "epoch": 0.0813, + "grad_norm": 9.375, + "grad_norm_var": 0.32864583333333336, + "learning_rate": 0.0003, + "loss": 13.2364, + "loss/aux_loss": 0.04812574498355389, + "loss/crossentropy": 2.8037814855575562, + "loss/logits": 1.0229216545820237, + "step": 8130 + }, + { + "epoch": 0.0814, + "grad_norm": 9.6875, + "grad_norm_var": 0.20677083333333332, + "learning_rate": 0.0003, + "loss": 13.1272, + "loss/aux_loss": 0.04812443684786558, + "loss/crossentropy": 2.901040017604828, + "loss/logits": 1.0405553728342056, + "step": 8140 + }, + { + "epoch": 0.0815, + "grad_norm": 9.1875, + "grad_norm_var": 0.42962239583333334, + "learning_rate": 0.0003, + "loss": 13.2754, + "loss/aux_loss": 0.04811316020786762, + "loss/crossentropy": 3.041601026058197, + "loss/logits": 1.049459946155548, + "step": 8150 + }, + { + "epoch": 0.0816, + "grad_norm": 9.125, + "grad_norm_var": 0.41354166666666664, + "learning_rate": 0.0003, + "loss": 13.1756, + "loss/aux_loss": 0.048122165724635124, + "loss/crossentropy": 2.9084804534912108, + "loss/logits": 1.028309690952301, + "step": 8160 + }, + { + "epoch": 0.0817, + "grad_norm": 9.625, + "grad_norm_var": 0.23566080729166666, + "learning_rate": 0.0003, + "loss": 13.2451, + "loss/aux_loss": 0.04811309780925512, + "loss/crossentropy": 2.7368631422519685, + "loss/logits": 1.002194732427597, + "step": 8170 + }, + { + "epoch": 0.0818, + "grad_norm": 9.8125, + "grad_norm_var": 0.187744140625, + "learning_rate": 0.0003, + "loss": 13.0183, + "loss/aux_loss": 0.0481279119849205, + "loss/crossentropy": 2.908668839931488, + "loss/logits": 1.0175655782222748, + "step": 8180 + }, + { + "epoch": 0.0819, + "grad_norm": 9.0, + "grad_norm_var": 0.21927083333333333, + "learning_rate": 0.0003, + "loss": 13.3305, + "loss/aux_loss": 0.0481220519170165, + "loss/crossentropy": 2.986106610298157, + "loss/logits": 1.025682133436203, + "step": 8190 + }, + { + "epoch": 0.082, + "grad_norm": 9.125, + "grad_norm_var": 0.20115559895833332, + "learning_rate": 0.0003, + "loss": 13.0415, + "loss/aux_loss": 0.04811887349933386, + "loss/crossentropy": 2.9409547805786134, + "loss/logits": 0.9966282039880753, + "step": 8200 + }, + { + "epoch": 0.0821, + "grad_norm": 9.6875, + "grad_norm_var": 9.998811848958333, + "learning_rate": 0.0003, + "loss": 13.2195, + "loss/aux_loss": 0.04813245311379433, + "loss/crossentropy": 2.8335422039031983, + "loss/logits": 0.9939737856388092, + "step": 8210 + }, + { + "epoch": 0.0822, + "grad_norm": 9.4375, + "grad_norm_var": 9.541129557291667, + "learning_rate": 0.0003, + "loss": 13.2792, + "loss/aux_loss": 0.04812641255557537, + "loss/crossentropy": 2.935366129875183, + "loss/logits": 0.9935110956430435, + "step": 8220 + }, + { + "epoch": 0.0823, + "grad_norm": 10.4375, + "grad_norm_var": 1.395947265625, + "learning_rate": 0.0003, + "loss": 13.4096, + "loss/aux_loss": 0.048128544352948666, + "loss/crossentropy": 3.0038156509399414, + "loss/logits": 1.00513653755188, + "step": 8230 + }, + { + "epoch": 0.0824, + "grad_norm": 11.1875, + "grad_norm_var": 1.0632649739583333, + "learning_rate": 0.0003, + "loss": 13.2308, + "loss/aux_loss": 0.048123649694025515, + "loss/crossentropy": 2.852214002609253, + "loss/logits": 0.9939478904008865, + "step": 8240 + }, + { + "epoch": 0.0825, + "grad_norm": 9.875, + "grad_norm_var": 1.1158854166666667, + "learning_rate": 0.0003, + "loss": 13.1413, + "loss/aux_loss": 0.0481396097689867, + "loss/crossentropy": 2.907454788684845, + "loss/logits": 1.0276815801858903, + "step": 8250 + }, + { + "epoch": 0.0826, + "grad_norm": 9.8125, + "grad_norm_var": 0.37224934895833334, + "learning_rate": 0.0003, + "loss": 13.1788, + "loss/aux_loss": 0.04811225663870573, + "loss/crossentropy": 2.921643829345703, + "loss/logits": 1.0124903351068497, + "step": 8260 + }, + { + "epoch": 0.0827, + "grad_norm": 10.125, + "grad_norm_var": 1.4671875, + "learning_rate": 0.0003, + "loss": 13.2313, + "loss/aux_loss": 0.0481251984834671, + "loss/crossentropy": 2.755663204193115, + "loss/logits": 1.007522416114807, + "step": 8270 + }, + { + "epoch": 0.0828, + "grad_norm": 10.125, + "grad_norm_var": 1.0781087239583333, + "learning_rate": 0.0003, + "loss": 13.1676, + "loss/aux_loss": 0.048118762858212, + "loss/crossentropy": 2.948284614086151, + "loss/logits": 1.0121029019355774, + "step": 8280 + }, + { + "epoch": 0.0829, + "grad_norm": 11.4375, + "grad_norm_var": 0.3949055989583333, + "learning_rate": 0.0003, + "loss": 13.1264, + "loss/aux_loss": 0.04811645671725273, + "loss/crossentropy": 3.065406286716461, + "loss/logits": 1.0118898630142212, + "step": 8290 + }, + { + "epoch": 0.083, + "grad_norm": 10.6875, + "grad_norm_var": 3.4449055989583335, + "learning_rate": 0.0003, + "loss": 13.2246, + "loss/aux_loss": 0.04812601022422314, + "loss/crossentropy": 2.9024933516979217, + "loss/logits": 1.0029625982046126, + "step": 8300 + }, + { + "epoch": 0.0831, + "grad_norm": 10.625, + "grad_norm_var": 0.6285807291666666, + "learning_rate": 0.0003, + "loss": 13.2829, + "loss/aux_loss": 0.04812156446278095, + "loss/crossentropy": 3.0828394651412965, + "loss/logits": 1.098636594414711, + "step": 8310 + }, + { + "epoch": 0.0832, + "grad_norm": 8.6875, + "grad_norm_var": 0.43748372395833335, + "learning_rate": 0.0003, + "loss": 13.0539, + "loss/aux_loss": 0.048117737844586374, + "loss/crossentropy": 2.9925745487213136, + "loss/logits": 1.0268135398626328, + "step": 8320 + }, + { + "epoch": 0.0833, + "grad_norm": 8.6875, + "grad_norm_var": 0.29010416666666666, + "learning_rate": 0.0003, + "loss": 12.9795, + "loss/aux_loss": 0.048119370639324185, + "loss/crossentropy": 2.8304718136787415, + "loss/logits": 0.9774332970380784, + "step": 8330 + }, + { + "epoch": 0.0834, + "grad_norm": 10.75, + "grad_norm_var": 2.7471354166666666, + "learning_rate": 0.0003, + "loss": 13.2265, + "loss/aux_loss": 0.04812915232032537, + "loss/crossentropy": 2.805083268880844, + "loss/logits": 1.0131800711154937, + "step": 8340 + }, + { + "epoch": 0.0835, + "grad_norm": 10.25, + "grad_norm_var": 2.8114583333333334, + "learning_rate": 0.0003, + "loss": 13.1295, + "loss/aux_loss": 0.04811917226761579, + "loss/crossentropy": 3.0019118428230285, + "loss/logits": 1.0275006771087647, + "step": 8350 + }, + { + "epoch": 0.0836, + "grad_norm": 10.0625, + "grad_norm_var": 0.3181640625, + "learning_rate": 0.0003, + "loss": 13.2826, + "loss/aux_loss": 0.048113813251256944, + "loss/crossentropy": 2.8711145401000975, + "loss/logits": 1.0059622257947922, + "step": 8360 + }, + { + "epoch": 0.0837, + "grad_norm": 9.1875, + "grad_norm_var": 0.19607747395833333, + "learning_rate": 0.0003, + "loss": 13.2408, + "loss/aux_loss": 0.04812101162970066, + "loss/crossentropy": 3.0431210875511168, + "loss/logits": 1.0434471309185027, + "step": 8370 + }, + { + "epoch": 0.0838, + "grad_norm": 9.75, + "grad_norm_var": 0.2228515625, + "learning_rate": 0.0003, + "loss": 13.0946, + "loss/aux_loss": 0.04811821822077036, + "loss/crossentropy": 2.918779957294464, + "loss/logits": 1.0080697566270829, + "step": 8380 + }, + { + "epoch": 0.0839, + "grad_norm": 9.125, + "grad_norm_var": 113.34347330729166, + "learning_rate": 0.0003, + "loss": 12.9875, + "loss/aux_loss": 0.04812317434698343, + "loss/crossentropy": 2.8696110606193543, + "loss/logits": 1.021797129511833, + "step": 8390 + }, + { + "epoch": 0.084, + "grad_norm": 9.5625, + "grad_norm_var": 113.546728515625, + "learning_rate": 0.0003, + "loss": 13.0979, + "loss/aux_loss": 0.048132005520164965, + "loss/crossentropy": 2.8620001435279847, + "loss/logits": 1.0312078952789308, + "step": 8400 + }, + { + "epoch": 0.0841, + "grad_norm": 8.8125, + "grad_norm_var": 0.22537434895833333, + "learning_rate": 0.0003, + "loss": 13.1934, + "loss/aux_loss": 0.048122233152389525, + "loss/crossentropy": 2.7849833965301514, + "loss/logits": 1.0247801810503006, + "step": 8410 + }, + { + "epoch": 0.0842, + "grad_norm": 8.5, + "grad_norm_var": 0.118994140625, + "learning_rate": 0.0003, + "loss": 13.2404, + "loss/aux_loss": 0.048128989338874814, + "loss/crossentropy": 2.9442156195640563, + "loss/logits": 1.0153923511505127, + "step": 8420 + }, + { + "epoch": 0.0843, + "grad_norm": 9.8125, + "grad_norm_var": 0.2554524739583333, + "learning_rate": 0.0003, + "loss": 13.0926, + "loss/aux_loss": 0.04813152328133583, + "loss/crossentropy": 2.9320022106170653, + "loss/logits": 1.0175166606903077, + "step": 8430 + }, + { + "epoch": 0.0844, + "grad_norm": 9.875, + "grad_norm_var": 5.853238932291666, + "learning_rate": 0.0003, + "loss": 13.2954, + "loss/aux_loss": 0.04813098907470703, + "loss/crossentropy": 2.9264755129814146, + "loss/logits": 1.0203221708536148, + "step": 8440 + }, + { + "epoch": 0.0845, + "grad_norm": 9.375, + "grad_norm_var": 6.127197265625, + "learning_rate": 0.0003, + "loss": 13.1144, + "loss/aux_loss": 0.04811909180134535, + "loss/crossentropy": 2.916581463813782, + "loss/logits": 0.9956386595964432, + "step": 8450 + }, + { + "epoch": 0.0846, + "grad_norm": 8.75, + "grad_norm_var": 0.160009765625, + "learning_rate": 0.0003, + "loss": 13.178, + "loss/aux_loss": 0.04812520742416382, + "loss/crossentropy": 2.9652814626693726, + "loss/logits": 1.02629674077034, + "step": 8460 + }, + { + "epoch": 0.0847, + "grad_norm": 9.4375, + "grad_norm_var": 0.116650390625, + "learning_rate": 0.0003, + "loss": 13.1988, + "loss/aux_loss": 0.048121622577309606, + "loss/crossentropy": 2.971061831712723, + "loss/logits": 1.0064853310585022, + "step": 8470 + }, + { + "epoch": 0.0848, + "grad_norm": 10.5625, + "grad_norm_var": 0.15701497395833333, + "learning_rate": 0.0003, + "loss": 13.1627, + "loss/aux_loss": 0.048127346113324164, + "loss/crossentropy": 2.9350649237632753, + "loss/logits": 0.9827002599835396, + "step": 8480 + }, + { + "epoch": 0.0849, + "grad_norm": 8.25, + "grad_norm_var": 0.310009765625, + "learning_rate": 0.0003, + "loss": 12.9945, + "loss/aux_loss": 0.048121594451367856, + "loss/crossentropy": 2.8912373781204224, + "loss/logits": 0.9815872967243194, + "step": 8490 + }, + { + "epoch": 0.085, + "grad_norm": 10.125, + "grad_norm_var": 0.8640625, + "learning_rate": 0.0003, + "loss": 13.3333, + "loss/aux_loss": 0.04812703672796488, + "loss/crossentropy": 3.0496490001678467, + "loss/logits": 1.0207342118024827, + "step": 8500 + }, + { + "epoch": 0.0851, + "grad_norm": 9.6875, + "grad_norm_var": 16.475244140625, + "learning_rate": 0.0003, + "loss": 13.2066, + "loss/aux_loss": 0.048138654045760634, + "loss/crossentropy": 2.858214247226715, + "loss/logits": 1.0198309272527695, + "step": 8510 + }, + { + "epoch": 0.0852, + "grad_norm": 9.375, + "grad_norm_var": 16.990738932291666, + "learning_rate": 0.0003, + "loss": 12.9225, + "loss/aux_loss": 0.04811470378190279, + "loss/crossentropy": 2.6653155386447906, + "loss/logits": 0.9890996038913726, + "step": 8520 + }, + { + "epoch": 0.0853, + "grad_norm": 10.0625, + "grad_norm_var": 0.29217122395833334, + "learning_rate": 0.0003, + "loss": 13.0286, + "loss/aux_loss": 0.04812881331890821, + "loss/crossentropy": 2.818387824296951, + "loss/logits": 1.0328501909971237, + "step": 8530 + }, + { + "epoch": 0.0854, + "grad_norm": 9.9375, + "grad_norm_var": 0.4891764322916667, + "learning_rate": 0.0003, + "loss": 12.9661, + "loss/aux_loss": 0.04812416769564152, + "loss/crossentropy": 2.7885517358779905, + "loss/logits": 1.0058355391025544, + "step": 8540 + }, + { + "epoch": 0.0855, + "grad_norm": 9.3125, + "grad_norm_var": 0.6669108072916666, + "learning_rate": 0.0003, + "loss": 13.1039, + "loss/aux_loss": 0.04812269229441881, + "loss/crossentropy": 2.9664511680603027, + "loss/logits": 1.0393647104501724, + "step": 8550 + }, + { + "epoch": 0.0856, + "grad_norm": 9.0625, + "grad_norm_var": 0.323681640625, + "learning_rate": 0.0003, + "loss": 13.211, + "loss/aux_loss": 0.04812395125627518, + "loss/crossentropy": 2.9345888257026673, + "loss/logits": 1.017241859436035, + "step": 8560 + }, + { + "epoch": 0.0857, + "grad_norm": 9.625, + "grad_norm_var": 37.13743489583333, + "learning_rate": 0.0003, + "loss": 13.1819, + "loss/aux_loss": 0.04813410099595785, + "loss/crossentropy": 2.9240545988082887, + "loss/logits": 1.0288849472999573, + "step": 8570 + }, + { + "epoch": 0.0858, + "grad_norm": 10.5, + "grad_norm_var": 0.5333333333333333, + "learning_rate": 0.0003, + "loss": 13.2227, + "loss/aux_loss": 0.048121962882578376, + "loss/crossentropy": 2.855889308452606, + "loss/logits": 1.0060656636953353, + "step": 8580 + }, + { + "epoch": 0.0859, + "grad_norm": 12.0625, + "grad_norm_var": 0.6429524739583333, + "learning_rate": 0.0003, + "loss": 13.1099, + "loss/aux_loss": 0.048110452853143214, + "loss/crossentropy": 2.8173590660095216, + "loss/logits": 1.0144326239824295, + "step": 8590 + }, + { + "epoch": 0.086, + "grad_norm": 90.5, + "grad_norm_var": 409.363134765625, + "learning_rate": 0.0003, + "loss": 13.2486, + "loss/aux_loss": 0.048120449669659136, + "loss/crossentropy": 2.889569455385208, + "loss/logits": 1.0104403495788574, + "step": 8600 + }, + { + "epoch": 0.0861, + "grad_norm": 10.0625, + "grad_norm_var": 405.5660807291667, + "learning_rate": 0.0003, + "loss": 12.9023, + "loss/aux_loss": 0.04813555497676134, + "loss/crossentropy": 2.891407001018524, + "loss/logits": 0.9928454220294952, + "step": 8610 + }, + { + "epoch": 0.0862, + "grad_norm": 10.3125, + "grad_norm_var": 1.1833333333333333, + "learning_rate": 0.0003, + "loss": 13.1671, + "loss/aux_loss": 0.048119562491774556, + "loss/crossentropy": 2.9349429488182066, + "loss/logits": 1.0234294265508652, + "step": 8620 + }, + { + "epoch": 0.0863, + "grad_norm": 9.8125, + "grad_norm_var": 1.1183430989583334, + "learning_rate": 0.0003, + "loss": 13.1522, + "loss/aux_loss": 0.048119149915874, + "loss/crossentropy": 2.953269922733307, + "loss/logits": 1.029870542883873, + "step": 8630 + }, + { + "epoch": 0.0864, + "grad_norm": 21.625, + "grad_norm_var": 8.992952473958333, + "learning_rate": 0.0003, + "loss": 13.223, + "loss/aux_loss": 0.048137610964477065, + "loss/crossentropy": 2.854456979036331, + "loss/logits": 1.0149786740541458, + "step": 8640 + }, + { + "epoch": 0.0865, + "grad_norm": 9.125, + "grad_norm_var": 8.875455729166667, + "learning_rate": 0.0003, + "loss": 12.8707, + "loss/aux_loss": 0.048116568848490714, + "loss/crossentropy": 2.9507800936698914, + "loss/logits": 1.0633498966693877, + "step": 8650 + }, + { + "epoch": 0.0866, + "grad_norm": 9.75, + "grad_norm_var": 0.17433268229166668, + "learning_rate": 0.0003, + "loss": 13.2224, + "loss/aux_loss": 0.048123492300510405, + "loss/crossentropy": 2.8969777107238768, + "loss/logits": 0.9931401669979095, + "step": 8660 + }, + { + "epoch": 0.0867, + "grad_norm": 9.0, + "grad_norm_var": 8.690885416666667, + "learning_rate": 0.0003, + "loss": 13.1396, + "loss/aux_loss": 0.048116271197795865, + "loss/crossentropy": 2.907929790019989, + "loss/logits": 1.0244786828756332, + "step": 8670 + }, + { + "epoch": 0.0868, + "grad_norm": 9.625, + "grad_norm_var": 8.547135416666666, + "learning_rate": 0.0003, + "loss": 12.9869, + "loss/aux_loss": 0.04812566060572863, + "loss/crossentropy": 2.813516306877136, + "loss/logits": 0.9591111838817596, + "step": 8680 + }, + { + "epoch": 0.0869, + "grad_norm": 10.5625, + "grad_norm_var": 0.22706705729166668, + "learning_rate": 0.0003, + "loss": 13.0031, + "loss/aux_loss": 0.04811654146760702, + "loss/crossentropy": 2.7499096274375914, + "loss/logits": 0.9870797544717789, + "step": 8690 + }, + { + "epoch": 0.087, + "grad_norm": 9.625, + "grad_norm_var": 0.4505208333333333, + "learning_rate": 0.0003, + "loss": 13.0701, + "loss/aux_loss": 0.04811768177896738, + "loss/crossentropy": 2.812624078989029, + "loss/logits": 0.9721063941717147, + "step": 8700 + }, + { + "epoch": 0.0871, + "grad_norm": 9.6875, + "grad_norm_var": 0.13357747395833333, + "learning_rate": 0.0003, + "loss": 13.2135, + "loss/aux_loss": 0.048116148076951505, + "loss/crossentropy": 2.9922009468078614, + "loss/logits": 1.015498149394989, + "step": 8710 + }, + { + "epoch": 0.0872, + "grad_norm": 9.75, + "grad_norm_var": 0.13326822916666667, + "learning_rate": 0.0003, + "loss": 13.0671, + "loss/aux_loss": 0.048121779784560205, + "loss/crossentropy": 2.827225810289383, + "loss/logits": 0.9838965624570847, + "step": 8720 + }, + { + "epoch": 0.0873, + "grad_norm": 9.6875, + "grad_norm_var": 0.26764322916666666, + "learning_rate": 0.0003, + "loss": 13.1019, + "loss/aux_loss": 0.04811729565262794, + "loss/crossentropy": 2.959608232975006, + "loss/logits": 0.9972497940063476, + "step": 8730 + }, + { + "epoch": 0.0874, + "grad_norm": 9.3125, + "grad_norm_var": 0.16920572916666668, + "learning_rate": 0.0003, + "loss": 12.7734, + "loss/aux_loss": 0.048125034943223, + "loss/crossentropy": 2.6938082754611967, + "loss/logits": 0.9462698817253112, + "step": 8740 + }, + { + "epoch": 0.0875, + "grad_norm": 9.5625, + "grad_norm_var": 0.21560872395833333, + "learning_rate": 0.0003, + "loss": 13.0511, + "loss/aux_loss": 0.04812650829553604, + "loss/crossentropy": 2.7083167552948, + "loss/logits": 0.989093354344368, + "step": 8750 + }, + { + "epoch": 0.0876, + "grad_norm": 9.625, + "grad_norm_var": 0.198291015625, + "learning_rate": 0.0003, + "loss": 12.9391, + "loss/aux_loss": 0.0481193732470274, + "loss/crossentropy": 2.9418309926986694, + "loss/logits": 0.9726715385913849, + "step": 8760 + }, + { + "epoch": 0.0877, + "grad_norm": 9.375, + "grad_norm_var": 0.22213541666666667, + "learning_rate": 0.0003, + "loss": 13.1485, + "loss/aux_loss": 0.048114814795553684, + "loss/crossentropy": 2.9309451580047607, + "loss/logits": 1.0644985824823379, + "step": 8770 + }, + { + "epoch": 0.0878, + "grad_norm": 9.75, + "grad_norm_var": 92.121728515625, + "learning_rate": 0.0003, + "loss": 13.0324, + "loss/aux_loss": 0.04812544099986553, + "loss/crossentropy": 2.960424965620041, + "loss/logits": 0.9863006621599197, + "step": 8780 + }, + { + "epoch": 0.0879, + "grad_norm": 9.125, + "grad_norm_var": 0.211572265625, + "learning_rate": 0.0003, + "loss": 13.2247, + "loss/aux_loss": 0.048114399425685406, + "loss/crossentropy": 2.848669397830963, + "loss/logits": 1.0225479423999786, + "step": 8790 + }, + { + "epoch": 0.088, + "grad_norm": 9.125, + "grad_norm_var": 0.250634765625, + "learning_rate": 0.0003, + "loss": 13.0081, + "loss/aux_loss": 0.0481090260669589, + "loss/crossentropy": 3.036766457557678, + "loss/logits": 1.005482006072998, + "step": 8800 + }, + { + "epoch": 0.0881, + "grad_norm": 9.75, + "grad_norm_var": 0.10545247395833333, + "learning_rate": 0.0003, + "loss": 13.0766, + "loss/aux_loss": 0.048118382692337036, + "loss/crossentropy": 2.9599334478378294, + "loss/logits": 1.022737380862236, + "step": 8810 + }, + { + "epoch": 0.0882, + "grad_norm": 9.6875, + "grad_norm_var": 0.33639322916666664, + "learning_rate": 0.0003, + "loss": 13.0431, + "loss/aux_loss": 0.04812664575874805, + "loss/crossentropy": 2.761233627796173, + "loss/logits": 1.0341258555650712, + "step": 8820 + }, + { + "epoch": 0.0883, + "grad_norm": 10.125, + "grad_norm_var": 0.3087076822916667, + "learning_rate": 0.0003, + "loss": 12.9998, + "loss/aux_loss": 0.048121739737689496, + "loss/crossentropy": 2.9523282289505004, + "loss/logits": 1.0163812279701232, + "step": 8830 + }, + { + "epoch": 0.0884, + "grad_norm": 10.25, + "grad_norm_var": 49.53359375, + "learning_rate": 0.0003, + "loss": 13.0433, + "loss/aux_loss": 0.048123051226139066, + "loss/crossentropy": 2.9619523882865906, + "loss/logits": 1.0154429644346237, + "step": 8840 + }, + { + "epoch": 0.0885, + "grad_norm": 9.0625, + "grad_norm_var": 48.044010416666666, + "learning_rate": 0.0003, + "loss": 13.0594, + "loss/aux_loss": 0.048118606954813, + "loss/crossentropy": 2.768035900592804, + "loss/logits": 1.0090958893299102, + "step": 8850 + }, + { + "epoch": 0.0886, + "grad_norm": 9.625, + "grad_norm_var": 0.24021809895833332, + "learning_rate": 0.0003, + "loss": 13.0136, + "loss/aux_loss": 0.04812074787914753, + "loss/crossentropy": 2.864524757862091, + "loss/logits": 0.9908095836639405, + "step": 8860 + }, + { + "epoch": 0.0887, + "grad_norm": 10.75, + "grad_norm_var": 0.5041015625, + "learning_rate": 0.0003, + "loss": 13.0489, + "loss/aux_loss": 0.04811746664345264, + "loss/crossentropy": 2.8981815814971923, + "loss/logits": 0.9988605201244354, + "step": 8870 + }, + { + "epoch": 0.0888, + "grad_norm": 9.3125, + "grad_norm_var": 0.303125, + "learning_rate": 0.0003, + "loss": 13.0495, + "loss/aux_loss": 0.04811958447098732, + "loss/crossentropy": 2.953895443677902, + "loss/logits": 0.9865518122911453, + "step": 8880 + }, + { + "epoch": 0.0889, + "grad_norm": 9.5625, + "grad_norm_var": 0.25636393229166665, + "learning_rate": 0.0003, + "loss": 13.0068, + "loss/aux_loss": 0.048110640980303286, + "loss/crossentropy": 2.844312059879303, + "loss/logits": 0.999048775434494, + "step": 8890 + }, + { + "epoch": 0.089, + "grad_norm": 9.125, + "grad_norm_var": 0.3324055989583333, + "learning_rate": 0.0003, + "loss": 13.1248, + "loss/aux_loss": 0.048114532604813576, + "loss/crossentropy": 2.8329219222068787, + "loss/logits": 1.0227496713399886, + "step": 8900 + }, + { + "epoch": 0.0891, + "grad_norm": 9.4375, + "grad_norm_var": 0.695166015625, + "learning_rate": 0.0003, + "loss": 13.2069, + "loss/aux_loss": 0.048122035898268224, + "loss/crossentropy": 2.9590111494064333, + "loss/logits": 1.020371201634407, + "step": 8910 + }, + { + "epoch": 0.0892, + "grad_norm": 8.625, + "grad_norm_var": 0.4691243489583333, + "learning_rate": 0.0003, + "loss": 12.9333, + "loss/aux_loss": 0.04812424685806036, + "loss/crossentropy": 2.803478956222534, + "loss/logits": 0.9767372757196426, + "step": 8920 + }, + { + "epoch": 0.0893, + "grad_norm": 9.0625, + "grad_norm_var": 0.2598958333333333, + "learning_rate": 0.0003, + "loss": 13.2306, + "loss/aux_loss": 0.04812055286020041, + "loss/crossentropy": 2.90518371462822, + "loss/logits": 0.9993892073631286, + "step": 8930 + }, + { + "epoch": 0.0894, + "grad_norm": 10.0, + "grad_norm_var": 0.15462239583333334, + "learning_rate": 0.0003, + "loss": 13.2241, + "loss/aux_loss": 0.04811331238597631, + "loss/crossentropy": 2.9701803803443907, + "loss/logits": 0.9940306186676026, + "step": 8940 + }, + { + "epoch": 0.0895, + "grad_norm": 10.375, + "grad_norm_var": 0.246337890625, + "learning_rate": 0.0003, + "loss": 13.0112, + "loss/aux_loss": 0.04811495840549469, + "loss/crossentropy": 3.0566563248634337, + "loss/logits": 1.0184517830610276, + "step": 8950 + }, + { + "epoch": 0.0896, + "grad_norm": 9.4375, + "grad_norm_var": 4.417952473958334, + "learning_rate": 0.0003, + "loss": 12.9823, + "loss/aux_loss": 0.04812169056385755, + "loss/crossentropy": 2.948707568645477, + "loss/logits": 0.9956671565771102, + "step": 8960 + }, + { + "epoch": 0.0897, + "grad_norm": 9.125, + "grad_norm_var": 4.491520182291667, + "learning_rate": 0.0003, + "loss": 13.0456, + "loss/aux_loss": 0.04811549689620733, + "loss/crossentropy": 2.8826889276504515, + "loss/logits": 1.0055119961500167, + "step": 8970 + }, + { + "epoch": 0.0898, + "grad_norm": 9.5625, + "grad_norm_var": 4.914306640625, + "learning_rate": 0.0003, + "loss": 13.3204, + "loss/aux_loss": 0.04811891969293356, + "loss/crossentropy": 3.0966086268424986, + "loss/logits": 1.0039119273424149, + "step": 8980 + }, + { + "epoch": 0.0899, + "grad_norm": 9.5625, + "grad_norm_var": 0.9282389322916667, + "learning_rate": 0.0003, + "loss": 13.0716, + "loss/aux_loss": 0.04812802001833916, + "loss/crossentropy": 2.773033380508423, + "loss/logits": 1.0084805130958556, + "step": 8990 + }, + { + "epoch": 0.09, + "grad_norm": 10.1875, + "grad_norm_var": 0.8994791666666667, + "learning_rate": 0.0003, + "loss": 13.2718, + "loss/aux_loss": 0.0481186056509614, + "loss/crossentropy": 2.934514182806015, + "loss/logits": 1.0663816720247268, + "step": 9000 + }, + { + "epoch": 0.0901, + "grad_norm": 9.1875, + "grad_norm_var": 0.2916015625, + "learning_rate": 0.0003, + "loss": 13.0098, + "loss/aux_loss": 0.04811523351818323, + "loss/crossentropy": 2.8817059993743896, + "loss/logits": 1.0227952599525452, + "step": 9010 + }, + { + "epoch": 0.0902, + "grad_norm": 10.8125, + "grad_norm_var": 1.7858723958333333, + "learning_rate": 0.0003, + "loss": 12.9215, + "loss/aux_loss": 0.04812025129795074, + "loss/crossentropy": 2.9280009150505064, + "loss/logits": 1.008087882399559, + "step": 9020 + }, + { + "epoch": 0.0903, + "grad_norm": 10.625, + "grad_norm_var": 1.5988932291666667, + "learning_rate": 0.0003, + "loss": 12.9675, + "loss/aux_loss": 0.048126825504004954, + "loss/crossentropy": 2.752195543050766, + "loss/logits": 0.938539656996727, + "step": 9030 + }, + { + "epoch": 0.0904, + "grad_norm": 9.6875, + "grad_norm_var": 0.17838541666666666, + "learning_rate": 0.0003, + "loss": 13.1815, + "loss/aux_loss": 0.04811705574393273, + "loss/crossentropy": 3.056387519836426, + "loss/logits": 1.0307760834693909, + "step": 9040 + }, + { + "epoch": 0.0905, + "grad_norm": 9.5625, + "grad_norm_var": 0.14724934895833333, + "learning_rate": 0.0003, + "loss": 13.2356, + "loss/aux_loss": 0.04812313225120306, + "loss/crossentropy": 2.984651046991348, + "loss/logits": 1.0027798056602477, + "step": 9050 + }, + { + "epoch": 0.0906, + "grad_norm": 8.8125, + "grad_norm_var": 0.325634765625, + "learning_rate": 0.0003, + "loss": 13.0133, + "loss/aux_loss": 0.04811926949769259, + "loss/crossentropy": 2.916082763671875, + "loss/logits": 0.9860832780599594, + "step": 9060 + }, + { + "epoch": 0.0907, + "grad_norm": 10.8125, + "grad_norm_var": 0.266259765625, + "learning_rate": 0.0003, + "loss": 12.9402, + "loss/aux_loss": 0.048123452626168725, + "loss/crossentropy": 2.843355292081833, + "loss/logits": 0.9923195570707322, + "step": 9070 + }, + { + "epoch": 0.0908, + "grad_norm": 9.625, + "grad_norm_var": 0.5714680989583333, + "learning_rate": 0.0003, + "loss": 12.7962, + "loss/aux_loss": 0.04811744131147862, + "loss/crossentropy": 2.929332971572876, + "loss/logits": 1.011452180147171, + "step": 9080 + }, + { + "epoch": 0.0909, + "grad_norm": 10.4375, + "grad_norm_var": 0.22369791666666666, + "learning_rate": 0.0003, + "loss": 13.0572, + "loss/aux_loss": 0.04812127202749252, + "loss/crossentropy": 2.9542043566703797, + "loss/logits": 0.9913775563240051, + "step": 9090 + }, + { + "epoch": 0.091, + "grad_norm": 10.0, + "grad_norm_var": 0.4495930989583333, + "learning_rate": 0.0003, + "loss": 13.0991, + "loss/aux_loss": 0.048116521537303926, + "loss/crossentropy": 2.845492494106293, + "loss/logits": 1.0074622273445129, + "step": 9100 + }, + { + "epoch": 0.0911, + "grad_norm": 10.0625, + "grad_norm_var": 0.633837890625, + "learning_rate": 0.0003, + "loss": 12.9897, + "loss/aux_loss": 0.048106766492128375, + "loss/crossentropy": 2.902200919389725, + "loss/logits": 1.0262346029281617, + "step": 9110 + }, + { + "epoch": 0.0912, + "grad_norm": 10.375, + "grad_norm_var": 5.352718098958333, + "learning_rate": 0.0003, + "loss": 13.0402, + "loss/aux_loss": 0.04812852665781975, + "loss/crossentropy": 2.9274023175239563, + "loss/logits": 0.9989449590444565, + "step": 9120 + }, + { + "epoch": 0.0913, + "grad_norm": 10.125, + "grad_norm_var": 5.371077473958334, + "learning_rate": 0.0003, + "loss": 13.1259, + "loss/aux_loss": 0.048123916052281855, + "loss/crossentropy": 2.957271945476532, + "loss/logits": 1.0392587214708329, + "step": 9130 + }, + { + "epoch": 0.0914, + "grad_norm": 11.0, + "grad_norm_var": 0.8235514322916667, + "learning_rate": 0.0003, + "loss": 12.8346, + "loss/aux_loss": 0.04811377823352814, + "loss/crossentropy": 2.7599571704864503, + "loss/logits": 0.9800522536039352, + "step": 9140 + }, + { + "epoch": 0.0915, + "grad_norm": 9.0625, + "grad_norm_var": 14.900764973958333, + "learning_rate": 0.0003, + "loss": 13.0363, + "loss/aux_loss": 0.048123881407082084, + "loss/crossentropy": 2.8104595303535462, + "loss/logits": 0.9722192943096161, + "step": 9150 + }, + { + "epoch": 0.0916, + "grad_norm": 10.8125, + "grad_norm_var": 14.4884765625, + "learning_rate": 0.0003, + "loss": 13.0768, + "loss/aux_loss": 0.04811329320073128, + "loss/crossentropy": 2.85021288394928, + "loss/logits": 1.0288948625326158, + "step": 9160 + }, + { + "epoch": 0.0917, + "grad_norm": 9.25, + "grad_norm_var": 0.5411295572916667, + "learning_rate": 0.0003, + "loss": 13.0842, + "loss/aux_loss": 0.04812074415385723, + "loss/crossentropy": 2.9284089267253877, + "loss/logits": 1.0179531484842301, + "step": 9170 + }, + { + "epoch": 0.0918, + "grad_norm": 9.9375, + "grad_norm_var": 0.6243326822916667, + "learning_rate": 0.0003, + "loss": 13.0484, + "loss/aux_loss": 0.04811491388827562, + "loss/crossentropy": 2.8634442031383514, + "loss/logits": 0.988609355688095, + "step": 9180 + }, + { + "epoch": 0.0919, + "grad_norm": 15.25, + "grad_norm_var": 2.2025390625, + "learning_rate": 0.0003, + "loss": 12.9292, + "loss/aux_loss": 0.04811589177697897, + "loss/crossentropy": 3.035586249828339, + "loss/logits": 1.017078360915184, + "step": 9190 + }, + { + "epoch": 0.092, + "grad_norm": 10.3125, + "grad_norm_var": 2.242447916666667, + "learning_rate": 0.0003, + "loss": 13.0149, + "loss/aux_loss": 0.04812207706272602, + "loss/crossentropy": 2.962714272737503, + "loss/logits": 0.9997862339019775, + "step": 9200 + }, + { + "epoch": 0.0921, + "grad_norm": 11.625, + "grad_norm_var": 0.5791666666666667, + "learning_rate": 0.0003, + "loss": 12.9794, + "loss/aux_loss": 0.04811257142573595, + "loss/crossentropy": 2.904304379224777, + "loss/logits": 0.9970894068479538, + "step": 9210 + }, + { + "epoch": 0.0922, + "grad_norm": 9.75, + "grad_norm_var": 0.455322265625, + "learning_rate": 0.0003, + "loss": 12.997, + "loss/aux_loss": 0.048116791248321536, + "loss/crossentropy": 2.9704554200172426, + "loss/logits": 1.009730476140976, + "step": 9220 + }, + { + "epoch": 0.0923, + "grad_norm": 9.3125, + "grad_norm_var": 0.22263997395833332, + "learning_rate": 0.0003, + "loss": 13.0321, + "loss/aux_loss": 0.04812146797776222, + "loss/crossentropy": 2.927646744251251, + "loss/logits": 0.9740961879491806, + "step": 9230 + }, + { + "epoch": 0.0924, + "grad_norm": 10.25, + "grad_norm_var": 0.6384765625, + "learning_rate": 0.0003, + "loss": 13.0843, + "loss/aux_loss": 0.048116331547498704, + "loss/crossentropy": 2.9682451248168946, + "loss/logits": 1.0054449021816254, + "step": 9240 + }, + { + "epoch": 0.0925, + "grad_norm": 9.25, + "grad_norm_var": 0.81796875, + "learning_rate": 0.0003, + "loss": 12.8984, + "loss/aux_loss": 0.0481177942827344, + "loss/crossentropy": 2.9605862140655517, + "loss/logits": 0.9988209009170532, + "step": 9250 + }, + { + "epoch": 0.0926, + "grad_norm": 9.9375, + "grad_norm_var": 7.1212890625, + "learning_rate": 0.0003, + "loss": 13.126, + "loss/aux_loss": 0.04811736159026623, + "loss/crossentropy": 2.968954026699066, + "loss/logits": 0.9968051850795746, + "step": 9260 + }, + { + "epoch": 0.0927, + "grad_norm": 10.5, + "grad_norm_var": 7.165999348958334, + "learning_rate": 0.0003, + "loss": 12.9006, + "loss/aux_loss": 0.04812134802341461, + "loss/crossentropy": 2.95685738325119, + "loss/logits": 1.017841598391533, + "step": 9270 + }, + { + "epoch": 0.0928, + "grad_norm": 10.1875, + "grad_norm_var": 302.6844889322917, + "learning_rate": 0.0003, + "loss": 13.0568, + "loss/aux_loss": 0.04813589584082365, + "loss/crossentropy": 2.8788455188274384, + "loss/logits": 0.998471787571907, + "step": 9280 + }, + { + "epoch": 0.0929, + "grad_norm": 9.8125, + "grad_norm_var": 304.08396809895834, + "learning_rate": 0.0003, + "loss": 12.9694, + "loss/aux_loss": 0.04811546951532364, + "loss/crossentropy": 2.8760639309883116, + "loss/logits": 0.9897254168987274, + "step": 9290 + }, + { + "epoch": 0.093, + "grad_norm": 10.0625, + "grad_norm_var": 0.14270833333333333, + "learning_rate": 0.0003, + "loss": 12.875, + "loss/aux_loss": 0.0481147637590766, + "loss/crossentropy": 2.9088239908218383, + "loss/logits": 0.9841889888048172, + "step": 9300 + }, + { + "epoch": 0.0931, + "grad_norm": 9.8125, + "grad_norm_var": 0.21131184895833333, + "learning_rate": 0.0003, + "loss": 12.9544, + "loss/aux_loss": 0.04811472594738007, + "loss/crossentropy": 3.0154574632644655, + "loss/logits": 0.9868688434362411, + "step": 9310 + }, + { + "epoch": 0.0932, + "grad_norm": 10.1875, + "grad_norm_var": 0.297509765625, + "learning_rate": 0.0003, + "loss": 13.0221, + "loss/aux_loss": 0.04811571668833494, + "loss/crossentropy": 2.744140291213989, + "loss/logits": 0.9741410970687866, + "step": 9320 + }, + { + "epoch": 0.0933, + "grad_norm": 9.625, + "grad_norm_var": 0.41354166666666664, + "learning_rate": 0.0003, + "loss": 13.0847, + "loss/aux_loss": 0.04812242966145277, + "loss/crossentropy": 2.8483268916606903, + "loss/logits": 1.0017479300498962, + "step": 9330 + }, + { + "epoch": 0.0934, + "grad_norm": 9.5, + "grad_norm_var": 16.938916015625, + "learning_rate": 0.0003, + "loss": 12.8932, + "loss/aux_loss": 0.048115167394280435, + "loss/crossentropy": 2.951818656921387, + "loss/logits": 1.034898152947426, + "step": 9340 + }, + { + "epoch": 0.0935, + "grad_norm": 9.3125, + "grad_norm_var": 0.141259765625, + "learning_rate": 0.0003, + "loss": 13.0014, + "loss/aux_loss": 0.048115997575223446, + "loss/crossentropy": 2.835058981180191, + "loss/logits": 0.9820165306329727, + "step": 9350 + }, + { + "epoch": 0.0936, + "grad_norm": 10.1875, + "grad_norm_var": 0.151806640625, + "learning_rate": 0.0003, + "loss": 12.8639, + "loss/aux_loss": 0.04810713436454535, + "loss/crossentropy": 2.9016472816467287, + "loss/logits": 1.0063132762908935, + "step": 9360 + }, + { + "epoch": 0.0937, + "grad_norm": 9.9375, + "grad_norm_var": 0.9284993489583333, + "learning_rate": 0.0003, + "loss": 12.858, + "loss/aux_loss": 0.04812375083565712, + "loss/crossentropy": 2.984380769729614, + "loss/logits": 1.0249317467212677, + "step": 9370 + }, + { + "epoch": 0.0938, + "grad_norm": 10.5, + "grad_norm_var": 0.8635416666666667, + "learning_rate": 0.0003, + "loss": 12.9143, + "loss/aux_loss": 0.0481270782649517, + "loss/crossentropy": 3.0072665452957152, + "loss/logits": 0.9971794277429581, + "step": 9380 + }, + { + "epoch": 0.0939, + "grad_norm": 10.0625, + "grad_norm_var": 0.22902018229166668, + "learning_rate": 0.0003, + "loss": 12.9288, + "loss/aux_loss": 0.04811225328594446, + "loss/crossentropy": 2.952876567840576, + "loss/logits": 0.981144642829895, + "step": 9390 + }, + { + "epoch": 0.094, + "grad_norm": 9.5, + "grad_norm_var": 0.20546875, + "learning_rate": 0.0003, + "loss": 12.9573, + "loss/aux_loss": 0.04811703842133284, + "loss/crossentropy": 2.9657641530036924, + "loss/logits": 1.008799707889557, + "step": 9400 + }, + { + "epoch": 0.0941, + "grad_norm": 10.125, + "grad_norm_var": 0.28020833333333334, + "learning_rate": 0.0003, + "loss": 13.0344, + "loss/aux_loss": 0.04812249001115561, + "loss/crossentropy": 2.868061417341232, + "loss/logits": 0.9425824016332627, + "step": 9410 + }, + { + "epoch": 0.0942, + "grad_norm": 9.0625, + "grad_norm_var": 0.2880045572916667, + "learning_rate": 0.0003, + "loss": 12.8889, + "loss/aux_loss": 0.04811691902577877, + "loss/crossentropy": 2.810444962978363, + "loss/logits": 0.9671340584754944, + "step": 9420 + }, + { + "epoch": 0.0943, + "grad_norm": 9.25, + "grad_norm_var": 0.250244140625, + "learning_rate": 0.0003, + "loss": 12.8051, + "loss/aux_loss": 0.04814633168280125, + "loss/crossentropy": 2.7531135201454164, + "loss/logits": 0.9443521648645401, + "step": 9430 + }, + { + "epoch": 0.0944, + "grad_norm": 9.5, + "grad_norm_var": 0.11144205729166666, + "learning_rate": 0.0003, + "loss": 12.9351, + "loss/aux_loss": 0.04811623003333807, + "loss/crossentropy": 2.773250675201416, + "loss/logits": 0.9573301702737809, + "step": 9440 + }, + { + "epoch": 0.0945, + "grad_norm": 10.0625, + "grad_norm_var": 0.14998372395833334, + "learning_rate": 0.0003, + "loss": 12.9976, + "loss/aux_loss": 0.048125031776726244, + "loss/crossentropy": 2.843584269285202, + "loss/logits": 0.9623809665441513, + "step": 9450 + }, + { + "epoch": 0.0946, + "grad_norm": 10.75, + "grad_norm_var": 0.35740559895833335, + "learning_rate": 0.0003, + "loss": 12.9905, + "loss/aux_loss": 0.0481179354712367, + "loss/crossentropy": 3.025428628921509, + "loss/logits": 1.0071224570274353, + "step": 9460 + }, + { + "epoch": 0.0947, + "grad_norm": 9.25, + "grad_norm_var": 0.340625, + "learning_rate": 0.0003, + "loss": 12.9006, + "loss/aux_loss": 0.0481242848560214, + "loss/crossentropy": 2.919004487991333, + "loss/logits": 1.0092800080776214, + "step": 9470 + }, + { + "epoch": 0.0948, + "grad_norm": 9.4375, + "grad_norm_var": 0.364306640625, + "learning_rate": 0.0003, + "loss": 12.8888, + "loss/aux_loss": 0.04811065457761288, + "loss/crossentropy": 3.0337927043437958, + "loss/logits": 0.970859882235527, + "step": 9480 + }, + { + "epoch": 0.0949, + "grad_norm": 9.4375, + "grad_norm_var": 0.2561848958333333, + "learning_rate": 0.0003, + "loss": 12.969, + "loss/aux_loss": 0.048123066686093806, + "loss/crossentropy": 2.9421743154525757, + "loss/logits": 1.0259678810834885, + "step": 9490 + }, + { + "epoch": 0.095, + "grad_norm": 9.75, + "grad_norm_var": 0.2704264322916667, + "learning_rate": 0.0003, + "loss": 12.9057, + "loss/aux_loss": 0.04810989499092102, + "loss/crossentropy": 2.908745914697647, + "loss/logits": 1.004162722826004, + "step": 9500 + }, + { + "epoch": 0.0951, + "grad_norm": 9.8125, + "grad_norm_var": 0.354150390625, + "learning_rate": 0.0003, + "loss": 12.8508, + "loss/aux_loss": 0.04811614695936441, + "loss/crossentropy": 2.8484590649604797, + "loss/logits": 0.9944918006658554, + "step": 9510 + }, + { + "epoch": 0.0952, + "grad_norm": 10.0625, + "grad_norm_var": 0.296728515625, + "learning_rate": 0.0003, + "loss": 12.8238, + "loss/aux_loss": 0.04812110308557749, + "loss/crossentropy": 2.9715175151824953, + "loss/logits": 0.9781792253255844, + "step": 9520 + }, + { + "epoch": 0.0953, + "grad_norm": 9.9375, + "grad_norm_var": 0.26170247395833335, + "learning_rate": 0.0003, + "loss": 12.8021, + "loss/aux_loss": 0.04812615159898996, + "loss/crossentropy": 2.8001496493816376, + "loss/logits": 0.943726196885109, + "step": 9530 + }, + { + "epoch": 0.0954, + "grad_norm": 9.125, + "grad_norm_var": 0.2950520833333333, + "learning_rate": 0.0003, + "loss": 13.0212, + "loss/aux_loss": 0.048111764900386336, + "loss/crossentropy": 2.9262121081352235, + "loss/logits": 1.0509262353181839, + "step": 9540 + }, + { + "epoch": 0.0955, + "grad_norm": 10.3125, + "grad_norm_var": 0.11717122395833333, + "learning_rate": 0.0003, + "loss": 12.7972, + "loss/aux_loss": 0.04811500422656536, + "loss/crossentropy": 2.7417452692985536, + "loss/logits": 0.963932403922081, + "step": 9550 + }, + { + "epoch": 0.0956, + "grad_norm": 10.6875, + "grad_norm_var": 0.4046223958333333, + "learning_rate": 0.0003, + "loss": 12.7335, + "loss/aux_loss": 0.04812417142093182, + "loss/crossentropy": 2.8524417519569396, + "loss/logits": 0.9906006306409836, + "step": 9560 + }, + { + "epoch": 0.0957, + "grad_norm": 10.375, + "grad_norm_var": 0.7884765625, + "learning_rate": 0.0003, + "loss": 12.6479, + "loss/aux_loss": 0.048113958537578584, + "loss/crossentropy": 2.860063922405243, + "loss/logits": 0.9770903497934341, + "step": 9570 + }, + { + "epoch": 0.0958, + "grad_norm": 9.25, + "grad_norm_var": 0.14869791666666668, + "learning_rate": 0.0003, + "loss": 12.97, + "loss/aux_loss": 0.048113430850207806, + "loss/crossentropy": 2.7825845539569856, + "loss/logits": 0.9913632333278656, + "step": 9580 + }, + { + "epoch": 0.0959, + "grad_norm": 10.6875, + "grad_norm_var": 1.1207682291666667, + "learning_rate": 0.0003, + "loss": 13.0485, + "loss/aux_loss": 0.04811343587934971, + "loss/crossentropy": 2.7735751450061796, + "loss/logits": 0.9879475176334381, + "step": 9590 + }, + { + "epoch": 0.096, + "grad_norm": 9.5, + "grad_norm_var": 1.1030598958333333, + "learning_rate": 0.0003, + "loss": 13.0665, + "loss/aux_loss": 0.048116713762283325, + "loss/crossentropy": 2.8584636390209197, + "loss/logits": 0.9740468025207519, + "step": 9600 + }, + { + "epoch": 0.0961, + "grad_norm": 11.0625, + "grad_norm_var": 0.21712239583333334, + "learning_rate": 0.0003, + "loss": 13.0707, + "loss/aux_loss": 0.04812497589737177, + "loss/crossentropy": 2.8642295002937317, + "loss/logits": 1.0438130795955658, + "step": 9610 + }, + { + "epoch": 0.0962, + "grad_norm": 10.375, + "grad_norm_var": 78.78743489583333, + "learning_rate": 0.0003, + "loss": 12.9392, + "loss/aux_loss": 0.04811538271605968, + "loss/crossentropy": 2.8932973623275755, + "loss/logits": 1.0000649869441987, + "step": 9620 + }, + { + "epoch": 0.0963, + "grad_norm": 10.1875, + "grad_norm_var": 0.401025390625, + "learning_rate": 0.0003, + "loss": 12.8071, + "loss/aux_loss": 0.04812538847327232, + "loss/crossentropy": 2.641858923435211, + "loss/logits": 0.9451945751905442, + "step": 9630 + }, + { + "epoch": 0.0964, + "grad_norm": 9.5625, + "grad_norm_var": 0.15206705729166667, + "learning_rate": 0.0003, + "loss": 12.8081, + "loss/aux_loss": 0.04811387863010168, + "loss/crossentropy": 2.752705854177475, + "loss/logits": 0.9918626010417938, + "step": 9640 + }, + { + "epoch": 0.0965, + "grad_norm": 10.75, + "grad_norm_var": 0.32810872395833335, + "learning_rate": 0.0003, + "loss": 12.8976, + "loss/aux_loss": 0.04811955615878105, + "loss/crossentropy": 2.823894906044006, + "loss/logits": 0.9711399942636489, + "step": 9650 + }, + { + "epoch": 0.0966, + "grad_norm": 9.1875, + "grad_norm_var": 0.3337890625, + "learning_rate": 0.0003, + "loss": 12.9509, + "loss/aux_loss": 0.0481190113350749, + "loss/crossentropy": 2.993069517612457, + "loss/logits": 0.9871428191661835, + "step": 9660 + }, + { + "epoch": 0.0967, + "grad_norm": 9.5625, + "grad_norm_var": 0.5012858072916667, + "learning_rate": 0.0003, + "loss": 12.7698, + "loss/aux_loss": 0.048111490719020364, + "loss/crossentropy": 2.8160251498222353, + "loss/logits": 0.9605364561080932, + "step": 9670 + }, + { + "epoch": 0.0968, + "grad_norm": 9.25, + "grad_norm_var": 0.17667643229166666, + "learning_rate": 0.0003, + "loss": 12.8989, + "loss/aux_loss": 0.04811663068830967, + "loss/crossentropy": 2.9415274262428284, + "loss/logits": 0.9684463948011398, + "step": 9680 + }, + { + "epoch": 0.0969, + "grad_norm": 10.5, + "grad_norm_var": 53.12389322916667, + "learning_rate": 0.0003, + "loss": 12.8548, + "loss/aux_loss": 0.048127869702875616, + "loss/crossentropy": 2.8381851077079774, + "loss/logits": 0.9528964549303055, + "step": 9690 + }, + { + "epoch": 0.097, + "grad_norm": 10.625, + "grad_norm_var": 51.1869140625, + "learning_rate": 0.0003, + "loss": 12.8864, + "loss/aux_loss": 0.04811157062649727, + "loss/crossentropy": 2.917622911930084, + "loss/logits": 1.0014064520597459, + "step": 9700 + }, + { + "epoch": 0.0971, + "grad_norm": 9.5625, + "grad_norm_var": 0.356884765625, + "learning_rate": 0.0003, + "loss": 12.97, + "loss/aux_loss": 0.04812054745852947, + "loss/crossentropy": 2.870450019836426, + "loss/logits": 0.990039375424385, + "step": 9710 + }, + { + "epoch": 0.0972, + "grad_norm": 9.75, + "grad_norm_var": 0.4280598958333333, + "learning_rate": 0.0003, + "loss": 12.8376, + "loss/aux_loss": 0.048113865032792094, + "loss/crossentropy": 2.947874927520752, + "loss/logits": 1.01834077835083, + "step": 9720 + }, + { + "epoch": 0.0973, + "grad_norm": 9.75, + "grad_norm_var": 0.202587890625, + "learning_rate": 0.0003, + "loss": 12.6772, + "loss/aux_loss": 0.04811162706464529, + "loss/crossentropy": 2.6616825222969056, + "loss/logits": 0.922445324063301, + "step": 9730 + }, + { + "epoch": 0.0974, + "grad_norm": 9.3125, + "grad_norm_var": 0.23567708333333334, + "learning_rate": 0.0003, + "loss": 12.8276, + "loss/aux_loss": 0.048115427419543264, + "loss/crossentropy": 2.8596638798713685, + "loss/logits": 0.9671652972698211, + "step": 9740 + }, + { + "epoch": 0.0975, + "grad_norm": 9.4375, + "grad_norm_var": 0.27980143229166665, + "learning_rate": 0.0003, + "loss": 12.8823, + "loss/aux_loss": 0.048122276365756986, + "loss/crossentropy": 2.9232805013656615, + "loss/logits": 0.9951166033744812, + "step": 9750 + }, + { + "epoch": 0.0976, + "grad_norm": 9.375, + "grad_norm_var": 0.2384765625, + "learning_rate": 0.0003, + "loss": 12.8269, + "loss/aux_loss": 0.04811736922711134, + "loss/crossentropy": 3.0413878917694093, + "loss/logits": 1.0016505420207977, + "step": 9760 + }, + { + "epoch": 0.0977, + "grad_norm": 9.6875, + "grad_norm_var": 0.24529622395833334, + "learning_rate": 0.0003, + "loss": 12.8884, + "loss/aux_loss": 0.048109458200633524, + "loss/crossentropy": 2.893119239807129, + "loss/logits": 1.0159206092357635, + "step": 9770 + }, + { + "epoch": 0.0978, + "grad_norm": 10.1875, + "grad_norm_var": 0.20045572916666668, + "learning_rate": 0.0003, + "loss": 12.8463, + "loss/aux_loss": 0.048116378486156464, + "loss/crossentropy": 3.002572274208069, + "loss/logits": 1.0260325849056244, + "step": 9780 + }, + { + "epoch": 0.0979, + "grad_norm": 9.8125, + "grad_norm_var": 0.1884765625, + "learning_rate": 0.0003, + "loss": 12.7471, + "loss/aux_loss": 0.04811564590781927, + "loss/crossentropy": 2.8663101851940156, + "loss/logits": 0.945113542675972, + "step": 9790 + }, + { + "epoch": 0.098, + "grad_norm": 9.9375, + "grad_norm_var": 0.36041666666666666, + "learning_rate": 0.0003, + "loss": 13.0079, + "loss/aux_loss": 0.0481245506554842, + "loss/crossentropy": 2.7454223036766052, + "loss/logits": 0.9564665943384171, + "step": 9800 + }, + { + "epoch": 0.0981, + "grad_norm": 9.8125, + "grad_norm_var": 0.4278645833333333, + "learning_rate": 0.0003, + "loss": 12.8525, + "loss/aux_loss": 0.04811416696757078, + "loss/crossentropy": 2.8844858169555665, + "loss/logits": 0.9936564028263092, + "step": 9810 + }, + { + "epoch": 0.0982, + "grad_norm": 9.5, + "grad_norm_var": 0.221728515625, + "learning_rate": 0.0003, + "loss": 12.8522, + "loss/aux_loss": 0.048113705776631835, + "loss/crossentropy": 2.937456488609314, + "loss/logits": 0.9975145667791366, + "step": 9820 + }, + { + "epoch": 0.0983, + "grad_norm": 9.1875, + "grad_norm_var": 0.23318684895833333, + "learning_rate": 0.0003, + "loss": 12.948, + "loss/aux_loss": 0.04812261760234833, + "loss/crossentropy": 2.863471287488937, + "loss/logits": 0.9870826095342636, + "step": 9830 + }, + { + "epoch": 0.0984, + "grad_norm": 9.9375, + "grad_norm_var": 0.4388020833333333, + "learning_rate": 0.0003, + "loss": 12.785, + "loss/aux_loss": 0.04811761137098074, + "loss/crossentropy": 2.860468626022339, + "loss/logits": 0.9974869579076767, + "step": 9840 + }, + { + "epoch": 0.0985, + "grad_norm": 10.875, + "grad_norm_var": 0.39108072916666664, + "learning_rate": 0.0003, + "loss": 12.9577, + "loss/aux_loss": 0.04811706598848105, + "loss/crossentropy": 2.8506002187728883, + "loss/logits": 1.000709992647171, + "step": 9850 + }, + { + "epoch": 0.0986, + "grad_norm": 9.0, + "grad_norm_var": 0.254931640625, + "learning_rate": 0.0003, + "loss": 12.5998, + "loss/aux_loss": 0.04811438079923391, + "loss/crossentropy": 2.918227458000183, + "loss/logits": 0.9769401401281357, + "step": 9860 + }, + { + "epoch": 0.0987, + "grad_norm": 10.9375, + "grad_norm_var": 0.31295572916666664, + "learning_rate": 0.0003, + "loss": 12.979, + "loss/aux_loss": 0.04811355788260698, + "loss/crossentropy": 2.909677565097809, + "loss/logits": 1.0252159029245376, + "step": 9870 + }, + { + "epoch": 0.0988, + "grad_norm": 10.4375, + "grad_norm_var": 0.2921223958333333, + "learning_rate": 0.0003, + "loss": 12.8383, + "loss/aux_loss": 0.04811448734253645, + "loss/crossentropy": 2.835783588886261, + "loss/logits": 1.0406290709972381, + "step": 9880 + }, + { + "epoch": 0.0989, + "grad_norm": 10.1875, + "grad_norm_var": 1.6822265625, + "learning_rate": 0.0003, + "loss": 12.7556, + "loss/aux_loss": 0.04814105350524187, + "loss/crossentropy": 2.7648268580436706, + "loss/logits": 0.9558891981840134, + "step": 9890 + }, + { + "epoch": 0.099, + "grad_norm": 10.4375, + "grad_norm_var": 1.9072916666666666, + "learning_rate": 0.0003, + "loss": 12.7867, + "loss/aux_loss": 0.04811670910567045, + "loss/crossentropy": 2.68316650390625, + "loss/logits": 0.9622927576303482, + "step": 9900 + }, + { + "epoch": 0.0991, + "grad_norm": 10.125, + "grad_norm_var": 0.46608072916666665, + "learning_rate": 0.0003, + "loss": 12.8684, + "loss/aux_loss": 0.04812458418309688, + "loss/crossentropy": 2.880593103170395, + "loss/logits": 0.9721406042575836, + "step": 9910 + }, + { + "epoch": 0.0992, + "grad_norm": 10.5, + "grad_norm_var": 0.55546875, + "learning_rate": 0.0003, + "loss": 12.817, + "loss/aux_loss": 0.048123272694647314, + "loss/crossentropy": 2.6709973573684693, + "loss/logits": 0.9354108065366745, + "step": 9920 + }, + { + "epoch": 0.0993, + "grad_norm": 9.6875, + "grad_norm_var": 0.395166015625, + "learning_rate": 0.0003, + "loss": 12.8799, + "loss/aux_loss": 0.04812105931341648, + "loss/crossentropy": 2.934725469350815, + "loss/logits": 0.9813075840473175, + "step": 9930 + }, + { + "epoch": 0.0994, + "grad_norm": 9.625, + "grad_norm_var": 0.6903483072916666, + "learning_rate": 0.0003, + "loss": 12.903, + "loss/aux_loss": 0.048118251748383044, + "loss/crossentropy": 2.8453499555587767, + "loss/logits": 0.9796870052814484, + "step": 9940 + }, + { + "epoch": 0.0995, + "grad_norm": 14.625, + "grad_norm_var": 2.387223307291667, + "learning_rate": 0.0003, + "loss": 12.8252, + "loss/aux_loss": 0.048117080517113206, + "loss/crossentropy": 2.8250075817108153, + "loss/logits": 0.9736212283372879, + "step": 9950 + }, + { + "epoch": 0.0996, + "grad_norm": 9.875, + "grad_norm_var": 1.3822265625, + "learning_rate": 0.0003, + "loss": 12.6306, + "loss/aux_loss": 0.04811842925846577, + "loss/crossentropy": 2.854235601425171, + "loss/logits": 1.007426416873932, + "step": 9960 + }, + { + "epoch": 0.0997, + "grad_norm": 9.875, + "grad_norm_var": 0.3648274739583333, + "learning_rate": 0.0003, + "loss": 12.6579, + "loss/aux_loss": 0.04811596740037203, + "loss/crossentropy": 2.898962616920471, + "loss/logits": 0.9763563752174378, + "step": 9970 + }, + { + "epoch": 0.0998, + "grad_norm": 10.5, + "grad_norm_var": 0.172900390625, + "learning_rate": 0.0003, + "loss": 12.788, + "loss/aux_loss": 0.048104763589799406, + "loss/crossentropy": 2.9158723652362823, + "loss/logits": 1.0095852971076966, + "step": 9980 + }, + { + "epoch": 0.0999, + "grad_norm": 10.8125, + "grad_norm_var": 0.2770182291666667, + "learning_rate": 0.0003, + "loss": 12.7328, + "loss/aux_loss": 0.04811613652855158, + "loss/crossentropy": 2.781576532125473, + "loss/logits": 1.0038779705762864, + "step": 9990 + }, + { + "epoch": 0.1, + "grad_norm": 9.6875, + "grad_norm_var": 0.6130208333333333, + "learning_rate": 0.0003, + "loss": 12.8425, + "loss/aux_loss": 0.048115148395299914, + "loss/crossentropy": 2.7442554593086244, + "loss/logits": 0.9685165584087372, + "step": 10000 + }, + { + "epoch": 0.1001, + "grad_norm": 10.25, + "grad_norm_var": 1.0940104166666667, + "learning_rate": 0.0003, + "loss": 12.7596, + "loss/aux_loss": 0.04810796473175287, + "loss/crossentropy": 2.8970122635364532, + "loss/logits": 0.9651453495025635, + "step": 10010 + }, + { + "epoch": 0.1002, + "grad_norm": 10.875, + "grad_norm_var": 0.8113932291666667, + "learning_rate": 0.0003, + "loss": 13.0034, + "loss/aux_loss": 0.048116610012948514, + "loss/crossentropy": 2.872769057750702, + "loss/logits": 1.0002406895160676, + "step": 10020 + }, + { + "epoch": 0.1003, + "grad_norm": 12.1875, + "grad_norm_var": 0.6697265625, + "learning_rate": 0.0003, + "loss": 12.7285, + "loss/aux_loss": 0.04810873456299305, + "loss/crossentropy": 2.888649785518646, + "loss/logits": 0.9968151926994324, + "step": 10030 + }, + { + "epoch": 0.1004, + "grad_norm": 10.0625, + "grad_norm_var": 0.540625, + "learning_rate": 0.0003, + "loss": 12.8715, + "loss/aux_loss": 0.048114927113056184, + "loss/crossentropy": 2.9668263673782347, + "loss/logits": 1.0093841701745987, + "step": 10040 + }, + { + "epoch": 0.1005, + "grad_norm": 10.5625, + "grad_norm_var": 0.202587890625, + "learning_rate": 0.0003, + "loss": 12.871, + "loss/aux_loss": 0.048109718784689906, + "loss/crossentropy": 2.841026210784912, + "loss/logits": 0.9876527488231659, + "step": 10050 + }, + { + "epoch": 0.1006, + "grad_norm": 10.5625, + "grad_norm_var": 0.4266764322916667, + "learning_rate": 0.0003, + "loss": 12.7901, + "loss/aux_loss": 0.04810853134840727, + "loss/crossentropy": 2.692527735233307, + "loss/logits": 0.9895975649356842, + "step": 10060 + }, + { + "epoch": 0.1007, + "grad_norm": 9.875, + "grad_norm_var": 0.4041015625, + "learning_rate": 0.0003, + "loss": 12.4866, + "loss/aux_loss": 0.04811131805181503, + "loss/crossentropy": 2.908632504940033, + "loss/logits": 0.9596006900072098, + "step": 10070 + }, + { + "epoch": 0.1008, + "grad_norm": 10.125, + "grad_norm_var": 0.3046875, + "learning_rate": 0.0003, + "loss": 12.7782, + "loss/aux_loss": 0.04811954293400049, + "loss/crossentropy": 2.901764976978302, + "loss/logits": 1.0220121264457702, + "step": 10080 + }, + { + "epoch": 0.1009, + "grad_norm": 10.8125, + "grad_norm_var": 0.45358072916666664, + "learning_rate": 0.0003, + "loss": 12.8296, + "loss/aux_loss": 0.04812715277075767, + "loss/crossentropy": 2.7433866381645204, + "loss/logits": 0.9685066968202591, + "step": 10090 + }, + { + "epoch": 0.101, + "grad_norm": 11.6875, + "grad_norm_var": 0.4431640625, + "learning_rate": 0.0003, + "loss": 12.7263, + "loss/aux_loss": 0.04812118727713823, + "loss/crossentropy": 2.9564905166625977, + "loss/logits": 1.0407138913869858, + "step": 10100 + }, + { + "epoch": 0.1011, + "grad_norm": 11.125, + "grad_norm_var": 0.3551432291666667, + "learning_rate": 0.0003, + "loss": 12.8901, + "loss/aux_loss": 0.04811162799596787, + "loss/crossentropy": 2.962075352668762, + "loss/logits": 1.002569890022278, + "step": 10110 + }, + { + "epoch": 0.1012, + "grad_norm": 9.75, + "grad_norm_var": 0.42185872395833335, + "learning_rate": 0.0003, + "loss": 12.777, + "loss/aux_loss": 0.04811493325978518, + "loss/crossentropy": 2.9621083974838256, + "loss/logits": 1.0220870167016982, + "step": 10120 + }, + { + "epoch": 0.1013, + "grad_norm": 70.5, + "grad_norm_var": 225.11183268229166, + "learning_rate": 0.0003, + "loss": 12.8111, + "loss/aux_loss": 0.04811273105442524, + "loss/crossentropy": 2.8190457224845886, + "loss/logits": 0.9978448241949082, + "step": 10130 + }, + { + "epoch": 0.1014, + "grad_norm": 9.75, + "grad_norm_var": 225.50514322916666, + "learning_rate": 0.0003, + "loss": 12.8321, + "loss/aux_loss": 0.0481122450903058, + "loss/crossentropy": 2.7599350273609162, + "loss/logits": 0.9731186151504516, + "step": 10140 + }, + { + "epoch": 0.1015, + "grad_norm": 10.6875, + "grad_norm_var": 0.35271809895833334, + "learning_rate": 0.0003, + "loss": 12.7794, + "loss/aux_loss": 0.0481121052056551, + "loss/crossentropy": 2.918788194656372, + "loss/logits": 1.0331996023654937, + "step": 10150 + }, + { + "epoch": 0.1016, + "grad_norm": 10.0625, + "grad_norm_var": 0.2916666666666667, + "learning_rate": 0.0003, + "loss": 12.9847, + "loss/aux_loss": 0.04810354914516211, + "loss/crossentropy": 2.9620222568511965, + "loss/logits": 0.9895435065031052, + "step": 10160 + }, + { + "epoch": 0.1017, + "grad_norm": 10.1875, + "grad_norm_var": 61.03513997395833, + "learning_rate": 0.0003, + "loss": 12.905, + "loss/aux_loss": 0.048123286291956904, + "loss/crossentropy": 2.8044037401676176, + "loss/logits": 0.9406631171703339, + "step": 10170 + }, + { + "epoch": 0.1018, + "grad_norm": 10.5, + "grad_norm_var": 0.23118489583333332, + "learning_rate": 0.0003, + "loss": 12.838, + "loss/aux_loss": 0.04811715167015791, + "loss/crossentropy": 2.957458180189133, + "loss/logits": 0.9943399399518966, + "step": 10180 + }, + { + "epoch": 0.1019, + "grad_norm": 12.125, + "grad_norm_var": 8.0984375, + "learning_rate": 0.0003, + "loss": 12.8759, + "loss/aux_loss": 0.048108558543026446, + "loss/crossentropy": 2.867668330669403, + "loss/logits": 0.9879345417022705, + "step": 10190 + }, + { + "epoch": 0.102, + "grad_norm": 10.125, + "grad_norm_var": 7.892301432291666, + "learning_rate": 0.0003, + "loss": 12.9061, + "loss/aux_loss": 0.04811856150627136, + "loss/crossentropy": 2.870317333936691, + "loss/logits": 1.0022278010845185, + "step": 10200 + }, + { + "epoch": 0.1021, + "grad_norm": 10.125, + "grad_norm_var": 0.1166015625, + "learning_rate": 0.0003, + "loss": 12.7326, + "loss/aux_loss": 0.048116784729063514, + "loss/crossentropy": 2.678470027446747, + "loss/logits": 0.9406646758317947, + "step": 10210 + }, + { + "epoch": 0.1022, + "grad_norm": 11.0, + "grad_norm_var": 1.2061848958333334, + "learning_rate": 0.0003, + "loss": 12.7671, + "loss/aux_loss": 0.04811875224113464, + "loss/crossentropy": 2.9115442454814913, + "loss/logits": 0.9856185555458069, + "step": 10220 + }, + { + "epoch": 0.1023, + "grad_norm": 10.3125, + "grad_norm_var": 2.476676432291667, + "learning_rate": 0.0003, + "loss": 12.8463, + "loss/aux_loss": 0.048114926740527156, + "loss/crossentropy": 2.987521970272064, + "loss/logits": 0.9880728483200073, + "step": 10230 + }, + { + "epoch": 0.1024, + "grad_norm": 9.6875, + "grad_norm_var": 0.4239583333333333, + "learning_rate": 0.0003, + "loss": 12.9066, + "loss/aux_loss": 0.04811114761978388, + "loss/crossentropy": 2.957077658176422, + "loss/logits": 1.0159188747406005, + "step": 10240 + }, + { + "epoch": 0.1025, + "grad_norm": 10.25, + "grad_norm_var": 0.19348958333333333, + "learning_rate": 0.0003, + "loss": 12.8744, + "loss/aux_loss": 0.04811904225498438, + "loss/crossentropy": 2.9327427983283996, + "loss/logits": 1.014840191602707, + "step": 10250 + }, + { + "epoch": 0.1026, + "grad_norm": 11.25, + "grad_norm_var": 26.683854166666666, + "learning_rate": 0.0003, + "loss": 12.8864, + "loss/aux_loss": 0.04811773095279932, + "loss/crossentropy": 2.9283841848373413, + "loss/logits": 0.9840510159730911, + "step": 10260 + }, + { + "epoch": 0.1027, + "grad_norm": 9.6875, + "grad_norm_var": 5.591129557291667, + "learning_rate": 0.0003, + "loss": 12.7431, + "loss/aux_loss": 0.04809635002166033, + "loss/crossentropy": 2.9701969385147096, + "loss/logits": 1.00972381234169, + "step": 10270 + }, + { + "epoch": 0.1028, + "grad_norm": 9.875, + "grad_norm_var": 0.2659993489583333, + "learning_rate": 0.0003, + "loss": 12.6803, + "loss/aux_loss": 0.04810932390391827, + "loss/crossentropy": 2.9193489074707033, + "loss/logits": 0.9834885329008103, + "step": 10280 + }, + { + "epoch": 0.1029, + "grad_norm": 9.6875, + "grad_norm_var": 0.2775390625, + "learning_rate": 0.0003, + "loss": 12.8644, + "loss/aux_loss": 0.048101594857871535, + "loss/crossentropy": 2.9100057601928713, + "loss/logits": 0.99757040143013, + "step": 10290 + }, + { + "epoch": 0.103, + "grad_norm": 10.0625, + "grad_norm_var": 0.3106770833333333, + "learning_rate": 0.0003, + "loss": 12.6547, + "loss/aux_loss": 0.04812427274882793, + "loss/crossentropy": 2.7211422979831696, + "loss/logits": 0.9326405107975007, + "step": 10300 + }, + { + "epoch": 0.1031, + "grad_norm": 10.75, + "grad_norm_var": 0.27029622395833336, + "learning_rate": 0.0003, + "loss": 12.9066, + "loss/aux_loss": 0.04811542592942715, + "loss/crossentropy": 2.8410415768623354, + "loss/logits": 0.9540263682603836, + "step": 10310 + }, + { + "epoch": 0.1032, + "grad_norm": 10.0625, + "grad_norm_var": 0.152587890625, + "learning_rate": 0.0003, + "loss": 12.8612, + "loss/aux_loss": 0.048105557821691035, + "loss/crossentropy": 2.894010055065155, + "loss/logits": 0.9962664604187011, + "step": 10320 + }, + { + "epoch": 0.1033, + "grad_norm": 10.625, + "grad_norm_var": 0.5556640625, + "learning_rate": 0.0003, + "loss": 12.7817, + "loss/aux_loss": 0.04811650700867176, + "loss/crossentropy": 2.7931331276893614, + "loss/logits": 0.9934678196907043, + "step": 10330 + }, + { + "epoch": 0.1034, + "grad_norm": 10.1875, + "grad_norm_var": 0.115087890625, + "learning_rate": 0.0003, + "loss": 12.6721, + "loss/aux_loss": 0.04811523836106062, + "loss/crossentropy": 2.8752257347106935, + "loss/logits": 0.9926656931638718, + "step": 10340 + }, + { + "epoch": 0.1035, + "grad_norm": 10.3125, + "grad_norm_var": 0.208837890625, + "learning_rate": 0.0003, + "loss": 12.747, + "loss/aux_loss": 0.04811122994869947, + "loss/crossentropy": 2.863996922969818, + "loss/logits": 0.9837069183588028, + "step": 10350 + }, + { + "epoch": 0.1036, + "grad_norm": 9.8125, + "grad_norm_var": 0.19959309895833333, + "learning_rate": 0.0003, + "loss": 12.7221, + "loss/aux_loss": 0.048110642656683925, + "loss/crossentropy": 2.9583349347114565, + "loss/logits": 0.9609038531780243, + "step": 10360 + }, + { + "epoch": 0.1037, + "grad_norm": 11.625, + "grad_norm_var": 0.349072265625, + "learning_rate": 0.0003, + "loss": 12.8269, + "loss/aux_loss": 0.04811428822577, + "loss/crossentropy": 2.875106942653656, + "loss/logits": 0.9508922189474106, + "step": 10370 + }, + { + "epoch": 0.1038, + "grad_norm": 9.3125, + "grad_norm_var": 0.5851399739583333, + "learning_rate": 0.0003, + "loss": 12.7956, + "loss/aux_loss": 0.04812815226614475, + "loss/crossentropy": 2.9032375514507294, + "loss/logits": 0.9839789032936096, + "step": 10380 + }, + { + "epoch": 0.1039, + "grad_norm": 10.75, + "grad_norm_var": 25.269124348958332, + "learning_rate": 0.0003, + "loss": 12.8725, + "loss/aux_loss": 0.048113728314638136, + "loss/crossentropy": 2.811239331960678, + "loss/logits": 0.9782313734292984, + "step": 10390 + }, + { + "epoch": 0.104, + "grad_norm": 10.8125, + "grad_norm_var": 25.748811848958333, + "learning_rate": 0.0003, + "loss": 12.5987, + "loss/aux_loss": 0.04810786601155996, + "loss/crossentropy": 2.8436325669288633, + "loss/logits": 0.9547698825597764, + "step": 10400 + }, + { + "epoch": 0.1041, + "grad_norm": 10.5625, + "grad_norm_var": 0.17962239583333334, + "learning_rate": 0.0003, + "loss": 12.6758, + "loss/aux_loss": 0.04811085946857929, + "loss/crossentropy": 2.827778089046478, + "loss/logits": 0.9550431787967681, + "step": 10410 + }, + { + "epoch": 0.1042, + "grad_norm": 9.8125, + "grad_norm_var": 0.919384765625, + "learning_rate": 0.0003, + "loss": 12.8482, + "loss/aux_loss": 0.04811614342033863, + "loss/crossentropy": 3.0204949617385863, + "loss/logits": 0.9995712280273438, + "step": 10420 + }, + { + "epoch": 0.1043, + "grad_norm": 10.125, + "grad_norm_var": 0.5180826822916667, + "learning_rate": 0.0003, + "loss": 12.6728, + "loss/aux_loss": 0.04812094569206238, + "loss/crossentropy": 3.012200677394867, + "loss/logits": 0.9757115840911865, + "step": 10430 + }, + { + "epoch": 0.1044, + "grad_norm": 9.75, + "grad_norm_var": 0.284619140625, + "learning_rate": 0.0003, + "loss": 12.7465, + "loss/aux_loss": 0.0481198638677597, + "loss/crossentropy": 2.8174231171607973, + "loss/logits": 0.9774721026420593, + "step": 10440 + }, + { + "epoch": 0.1045, + "grad_norm": 10.3125, + "grad_norm_var": 0.229931640625, + "learning_rate": 0.0003, + "loss": 12.7017, + "loss/aux_loss": 0.04811002127826214, + "loss/crossentropy": 2.955516219139099, + "loss/logits": 0.9981975615024566, + "step": 10450 + }, + { + "epoch": 0.1046, + "grad_norm": 10.0625, + "grad_norm_var": 0.4632649739583333, + "learning_rate": 0.0003, + "loss": 12.8212, + "loss/aux_loss": 0.048105720058083536, + "loss/crossentropy": 2.8493134498596193, + "loss/logits": 1.0022914230823516, + "step": 10460 + }, + { + "epoch": 0.1047, + "grad_norm": 10.4375, + "grad_norm_var": 17.641145833333333, + "learning_rate": 0.0003, + "loss": 12.8144, + "loss/aux_loss": 0.04811700396239758, + "loss/crossentropy": 2.865918278694153, + "loss/logits": 1.011181029677391, + "step": 10470 + }, + { + "epoch": 0.1048, + "grad_norm": 10.5625, + "grad_norm_var": 1.2333333333333334, + "learning_rate": 0.0003, + "loss": 12.7668, + "loss/aux_loss": 0.048107659071683885, + "loss/crossentropy": 2.9243461012840273, + "loss/logits": 1.0033222287893295, + "step": 10480 + }, + { + "epoch": 0.1049, + "grad_norm": 10.1875, + "grad_norm_var": 0.43645833333333334, + "learning_rate": 0.0003, + "loss": 12.7835, + "loss/aux_loss": 0.04813041500747204, + "loss/crossentropy": 2.799642193317413, + "loss/logits": 0.9750822395086288, + "step": 10490 + }, + { + "epoch": 0.105, + "grad_norm": 10.5625, + "grad_norm_var": 0.3960774739583333, + "learning_rate": 0.0003, + "loss": 12.6303, + "loss/aux_loss": 0.048102636635303495, + "loss/crossentropy": 2.842100405693054, + "loss/logits": 0.9673471480607987, + "step": 10500 + }, + { + "epoch": 0.1051, + "grad_norm": 9.6875, + "grad_norm_var": 0.19264322916666668, + "learning_rate": 0.0003, + "loss": 12.64, + "loss/aux_loss": 0.04811730049550533, + "loss/crossentropy": 2.735097426176071, + "loss/logits": 0.953799894452095, + "step": 10510 + }, + { + "epoch": 0.1052, + "grad_norm": 9.5, + "grad_norm_var": 0.428125, + "learning_rate": 0.0003, + "loss": 12.767, + "loss/aux_loss": 0.04810708742588758, + "loss/crossentropy": 3.0010022163391112, + "loss/logits": 1.0266100823879243, + "step": 10520 + }, + { + "epoch": 0.1053, + "grad_norm": 10.0625, + "grad_norm_var": 0.37083333333333335, + "learning_rate": 0.0003, + "loss": 12.6364, + "loss/aux_loss": 0.04811326451599598, + "loss/crossentropy": 2.8574136972427366, + "loss/logits": 0.9649319559335708, + "step": 10530 + }, + { + "epoch": 0.1054, + "grad_norm": 9.75, + "grad_norm_var": 0.5776041666666667, + "learning_rate": 0.0003, + "loss": 12.8414, + "loss/aux_loss": 0.048117882758378985, + "loss/crossentropy": 2.8915890574455263, + "loss/logits": 1.019568595290184, + "step": 10540 + }, + { + "epoch": 0.1055, + "grad_norm": 10.125, + "grad_norm_var": 0.26027018229166665, + "learning_rate": 0.0003, + "loss": 12.6664, + "loss/aux_loss": 0.04811375327408314, + "loss/crossentropy": 2.851538288593292, + "loss/logits": 0.9997588336467743, + "step": 10550 + }, + { + "epoch": 0.1056, + "grad_norm": 10.0, + "grad_norm_var": 0.20358072916666667, + "learning_rate": 0.0003, + "loss": 12.8045, + "loss/aux_loss": 0.04811806846410036, + "loss/crossentropy": 2.9150373101234437, + "loss/logits": 0.9894289702177048, + "step": 10560 + }, + { + "epoch": 0.1057, + "grad_norm": 10.1875, + "grad_norm_var": 0.12083333333333333, + "learning_rate": 0.0003, + "loss": 12.7669, + "loss/aux_loss": 0.048116890527307984, + "loss/crossentropy": 2.8022518932819365, + "loss/logits": 0.9851718157529831, + "step": 10570 + }, + { + "epoch": 0.1058, + "grad_norm": 11.3125, + "grad_norm_var": 0.23697916666666666, + "learning_rate": 0.0003, + "loss": 12.9142, + "loss/aux_loss": 0.04811734985560179, + "loss/crossentropy": 2.915334862470627, + "loss/logits": 1.051106184720993, + "step": 10580 + }, + { + "epoch": 0.1059, + "grad_norm": 9.9375, + "grad_norm_var": 0.47068684895833335, + "learning_rate": 0.0003, + "loss": 12.7202, + "loss/aux_loss": 0.04810705240815878, + "loss/crossentropy": 2.683490252494812, + "loss/logits": 0.9757168561220169, + "step": 10590 + }, + { + "epoch": 0.106, + "grad_norm": 11.625, + "grad_norm_var": 14.690869140625, + "learning_rate": 0.0003, + "loss": 12.9961, + "loss/aux_loss": 0.04812338836491108, + "loss/crossentropy": 2.8796960532665254, + "loss/logits": 1.0100533604621886, + "step": 10600 + }, + { + "epoch": 0.1061, + "grad_norm": 10.5625, + "grad_norm_var": 0.6958333333333333, + "learning_rate": 0.0003, + "loss": 12.6616, + "loss/aux_loss": 0.04811020065099001, + "loss/crossentropy": 2.790817213058472, + "loss/logits": 0.9659003525972366, + "step": 10610 + }, + { + "epoch": 0.1062, + "grad_norm": 10.6875, + "grad_norm_var": 0.41171875, + "learning_rate": 0.0003, + "loss": 12.6359, + "loss/aux_loss": 0.04811752960085869, + "loss/crossentropy": 2.971816051006317, + "loss/logits": 0.9964886039495469, + "step": 10620 + }, + { + "epoch": 0.1063, + "grad_norm": 10.9375, + "grad_norm_var": 0.350244140625, + "learning_rate": 0.0003, + "loss": 12.842, + "loss/aux_loss": 0.04811201822012663, + "loss/crossentropy": 2.986808705329895, + "loss/logits": 0.9862239271402359, + "step": 10630 + }, + { + "epoch": 0.1064, + "grad_norm": 10.5, + "grad_norm_var": 0.33318684895833334, + "learning_rate": 0.0003, + "loss": 12.6619, + "loss/aux_loss": 0.048111039027571675, + "loss/crossentropy": 2.8363620817661284, + "loss/logits": 0.9908882945775985, + "step": 10640 + }, + { + "epoch": 0.1065, + "grad_norm": 10.5625, + "grad_norm_var": 0.436962890625, + "learning_rate": 0.0003, + "loss": 12.5936, + "loss/aux_loss": 0.04811495747417212, + "loss/crossentropy": 2.7697442412376403, + "loss/logits": 0.9236278921365738, + "step": 10650 + }, + { + "epoch": 0.1066, + "grad_norm": 10.1875, + "grad_norm_var": 0.24451497395833333, + "learning_rate": 0.0003, + "loss": 12.8339, + "loss/aux_loss": 0.04810470268130303, + "loss/crossentropy": 2.695315259695053, + "loss/logits": 0.9890154510736465, + "step": 10660 + }, + { + "epoch": 0.1067, + "grad_norm": 10.5, + "grad_norm_var": 0.371337890625, + "learning_rate": 0.0003, + "loss": 12.6645, + "loss/aux_loss": 0.04811508543789387, + "loss/crossentropy": 2.8011455297470094, + "loss/logits": 0.9550657361745835, + "step": 10670 + }, + { + "epoch": 0.1068, + "grad_norm": 10.25, + "grad_norm_var": 1.6645833333333333, + "learning_rate": 0.0003, + "loss": 12.6319, + "loss/aux_loss": 0.048115532658994195, + "loss/crossentropy": 2.857553493976593, + "loss/logits": 0.9820433109998703, + "step": 10680 + }, + { + "epoch": 0.1069, + "grad_norm": 10.8125, + "grad_norm_var": 1.7983723958333333, + "learning_rate": 0.0003, + "loss": 12.742, + "loss/aux_loss": 0.04811100009828806, + "loss/crossentropy": 2.743243044614792, + "loss/logits": 0.9736525624990463, + "step": 10690 + }, + { + "epoch": 0.107, + "grad_norm": 9.625, + "grad_norm_var": 0.21183268229166666, + "learning_rate": 0.0003, + "loss": 12.5406, + "loss/aux_loss": 0.048111391440033915, + "loss/crossentropy": 2.8312166213989256, + "loss/logits": 0.9848694235086441, + "step": 10700 + }, + { + "epoch": 0.1071, + "grad_norm": 11.0, + "grad_norm_var": 0.32198893229166664, + "learning_rate": 0.0003, + "loss": 12.6608, + "loss/aux_loss": 0.04810557756572962, + "loss/crossentropy": 2.7473709881305695, + "loss/logits": 0.9552412897348403, + "step": 10710 + }, + { + "epoch": 0.1072, + "grad_norm": 10.0625, + "grad_norm_var": 0.30271809895833335, + "learning_rate": 0.0003, + "loss": 12.5961, + "loss/aux_loss": 0.048110843263566495, + "loss/crossentropy": 2.7141247391700745, + "loss/logits": 0.9736550092697144, + "step": 10720 + }, + { + "epoch": 0.1073, + "grad_norm": 9.3125, + "grad_norm_var": 0.1681640625, + "learning_rate": 0.0003, + "loss": 12.5023, + "loss/aux_loss": 0.048104040697216986, + "loss/crossentropy": 2.6945619106292726, + "loss/logits": 0.9484582245349884, + "step": 10730 + }, + { + "epoch": 0.1074, + "grad_norm": 10.5, + "grad_norm_var": 0.28177083333333336, + "learning_rate": 0.0003, + "loss": 12.6278, + "loss/aux_loss": 0.04810405727475882, + "loss/crossentropy": 2.8070200264453886, + "loss/logits": 0.9321490287780761, + "step": 10740 + }, + { + "epoch": 0.1075, + "grad_norm": 9.5, + "grad_norm_var": 0.199853515625, + "learning_rate": 0.0003, + "loss": 12.7025, + "loss/aux_loss": 0.04810698907822371, + "loss/crossentropy": 2.9952612042427065, + "loss/logits": 0.9933030098676682, + "step": 10750 + }, + { + "epoch": 0.1076, + "grad_norm": 9.875, + "grad_norm_var": 0.6831868489583334, + "learning_rate": 0.0003, + "loss": 12.6858, + "loss/aux_loss": 0.048116383515298365, + "loss/crossentropy": 2.80760772228241, + "loss/logits": 0.9752837151288987, + "step": 10760 + }, + { + "epoch": 0.1077, + "grad_norm": 10.9375, + "grad_norm_var": 0.7341145833333333, + "learning_rate": 0.0003, + "loss": 12.6131, + "loss/aux_loss": 0.04810668155550957, + "loss/crossentropy": 2.941310775279999, + "loss/logits": 0.9818355232477188, + "step": 10770 + }, + { + "epoch": 0.1078, + "grad_norm": 9.375, + "grad_norm_var": 0.39264322916666666, + "learning_rate": 0.0003, + "loss": 12.5573, + "loss/aux_loss": 0.048113299161195756, + "loss/crossentropy": 2.860517716407776, + "loss/logits": 0.9614722609519959, + "step": 10780 + }, + { + "epoch": 0.1079, + "grad_norm": 9.5625, + "grad_norm_var": 0.18787434895833333, + "learning_rate": 0.0003, + "loss": 12.6124, + "loss/aux_loss": 0.04811013750731945, + "loss/crossentropy": 2.760462909936905, + "loss/logits": 0.9715398788452149, + "step": 10790 + }, + { + "epoch": 0.108, + "grad_norm": 10.75, + "grad_norm_var": 0.36248372395833334, + "learning_rate": 0.0003, + "loss": 12.6471, + "loss/aux_loss": 0.04810702111572027, + "loss/crossentropy": 2.9887078046798705, + "loss/logits": 0.9805259108543396, + "step": 10800 + }, + { + "epoch": 0.1081, + "grad_norm": 10.6875, + "grad_norm_var": 0.2743326822916667, + "learning_rate": 0.0003, + "loss": 12.653, + "loss/aux_loss": 0.048103836551308635, + "loss/crossentropy": 2.8864383697509766, + "loss/logits": 0.9688400447368621, + "step": 10810 + }, + { + "epoch": 0.1082, + "grad_norm": 10.5625, + "grad_norm_var": 0.5113932291666666, + "learning_rate": 0.0003, + "loss": 12.7579, + "loss/aux_loss": 0.04811310451477766, + "loss/crossentropy": 2.869203519821167, + "loss/logits": 0.9728466540575027, + "step": 10820 + }, + { + "epoch": 0.1083, + "grad_norm": 10.125, + "grad_norm_var": 0.35703125, + "learning_rate": 0.0003, + "loss": 12.7857, + "loss/aux_loss": 0.04811171405017376, + "loss/crossentropy": 2.8055492877960204, + "loss/logits": 0.9609241902828216, + "step": 10830 + }, + { + "epoch": 0.1084, + "grad_norm": 10.1875, + "grad_norm_var": 0.5317708333333333, + "learning_rate": 0.0003, + "loss": 12.6272, + "loss/aux_loss": 0.04810267500579357, + "loss/crossentropy": 2.8873426795005797, + "loss/logits": 0.983044245839119, + "step": 10840 + }, + { + "epoch": 0.1085, + "grad_norm": 12.375, + "grad_norm_var": 1.7999837239583334, + "learning_rate": 0.0003, + "loss": 12.5848, + "loss/aux_loss": 0.0481255043298006, + "loss/crossentropy": 2.8434741854667664, + "loss/logits": 0.9674687087535858, + "step": 10850 + }, + { + "epoch": 0.1086, + "grad_norm": 10.875, + "grad_norm_var": 1.5106608072916667, + "learning_rate": 0.0003, + "loss": 12.7505, + "loss/aux_loss": 0.04811022691428661, + "loss/crossentropy": 2.7955089688301085, + "loss/logits": 0.9789073407649994, + "step": 10860 + }, + { + "epoch": 0.1087, + "grad_norm": 10.8125, + "grad_norm_var": 0.23956705729166666, + "learning_rate": 0.0003, + "loss": 12.7856, + "loss/aux_loss": 0.048125218600034714, + "loss/crossentropy": 2.7512724816799166, + "loss/logits": 0.9434742718935013, + "step": 10870 + }, + { + "epoch": 0.1088, + "grad_norm": 10.5, + "grad_norm_var": 0.4942708333333333, + "learning_rate": 0.0003, + "loss": 12.8403, + "loss/aux_loss": 0.048105237260460856, + "loss/crossentropy": 3.1708402156829836, + "loss/logits": 0.9876413464546203, + "step": 10880 + }, + { + "epoch": 0.1089, + "grad_norm": 11.5, + "grad_norm_var": 0.288525390625, + "learning_rate": 0.0003, + "loss": 12.4812, + "loss/aux_loss": 0.04810988549143076, + "loss/crossentropy": 2.802176779508591, + "loss/logits": 0.9748163193464279, + "step": 10890 + }, + { + "epoch": 0.109, + "grad_norm": 10.3125, + "grad_norm_var": 0.287744140625, + "learning_rate": 0.0003, + "loss": 12.5199, + "loss/aux_loss": 0.04810547549277544, + "loss/crossentropy": 2.9589218378067015, + "loss/logits": 0.9759902417659759, + "step": 10900 + }, + { + "epoch": 0.1091, + "grad_norm": 11.25, + "grad_norm_var": 0.5187337239583333, + "learning_rate": 0.0003, + "loss": 12.6455, + "loss/aux_loss": 0.048122298903763294, + "loss/crossentropy": 2.7537475407123564, + "loss/logits": 0.968425664305687, + "step": 10910 + }, + { + "epoch": 0.1092, + "grad_norm": 10.9375, + "grad_norm_var": 3.849853515625, + "learning_rate": 0.0003, + "loss": 12.6097, + "loss/aux_loss": 0.04810692425817251, + "loss/crossentropy": 2.87729851603508, + "loss/logits": 0.988306000828743, + "step": 10920 + }, + { + "epoch": 0.1093, + "grad_norm": 11.625, + "grad_norm_var": 3.958056640625, + "learning_rate": 0.0003, + "loss": 12.4243, + "loss/aux_loss": 0.04811464417725801, + "loss/crossentropy": 2.823906672000885, + "loss/logits": 0.9596776217222214, + "step": 10930 + }, + { + "epoch": 0.1094, + "grad_norm": 10.125, + "grad_norm_var": 0.39576822916666665, + "learning_rate": 0.0003, + "loss": 12.6005, + "loss/aux_loss": 0.04811041634529829, + "loss/crossentropy": 2.8116785049438477, + "loss/logits": 1.0132469624280929, + "step": 10940 + }, + { + "epoch": 0.1095, + "grad_norm": 11.8125, + "grad_norm_var": 0.43483072916666665, + "learning_rate": 0.0003, + "loss": 12.7462, + "loss/aux_loss": 0.04810987431555987, + "loss/crossentropy": 2.908396100997925, + "loss/logits": 1.008555829524994, + "step": 10950 + }, + { + "epoch": 0.1096, + "grad_norm": 10.1875, + "grad_norm_var": 0.51640625, + "learning_rate": 0.0003, + "loss": 12.6827, + "loss/aux_loss": 0.0481119841337204, + "loss/crossentropy": 2.865977716445923, + "loss/logits": 0.969117721915245, + "step": 10960 + }, + { + "epoch": 0.1097, + "grad_norm": 10.4375, + "grad_norm_var": 0.4945149739583333, + "learning_rate": 0.0003, + "loss": 12.5983, + "loss/aux_loss": 0.048109224624931814, + "loss/crossentropy": 2.9365819096565247, + "loss/logits": 0.9682926207780838, + "step": 10970 + }, + { + "epoch": 0.1098, + "grad_norm": 9.875, + "grad_norm_var": 0.481494140625, + "learning_rate": 0.0003, + "loss": 12.6494, + "loss/aux_loss": 0.048102812469005586, + "loss/crossentropy": 2.924280512332916, + "loss/logits": 1.0038185507059096, + "step": 10980 + }, + { + "epoch": 0.1099, + "grad_norm": 11.1875, + "grad_norm_var": 1.7367024739583334, + "learning_rate": 0.0003, + "loss": 12.9254, + "loss/aux_loss": 0.048106090165674686, + "loss/crossentropy": 2.9131445050239564, + "loss/logits": 0.9844058066606521, + "step": 10990 + }, + { + "epoch": 0.11, + "grad_norm": 10.1875, + "grad_norm_var": 0.44264322916666665, + "learning_rate": 0.0003, + "loss": 12.5432, + "loss/aux_loss": 0.04811704996973276, + "loss/crossentropy": 2.814534968137741, + "loss/logits": 0.9921759486198425, + "step": 11000 + }, + { + "epoch": 0.1101, + "grad_norm": 10.5, + "grad_norm_var": 0.370947265625, + "learning_rate": 0.0003, + "loss": 12.7211, + "loss/aux_loss": 0.048108032904565334, + "loss/crossentropy": 2.7905489981174467, + "loss/logits": 0.9878934472799301, + "step": 11010 + }, + { + "epoch": 0.1102, + "grad_norm": 11.8125, + "grad_norm_var": 9.125895182291666, + "learning_rate": 0.0003, + "loss": 12.6461, + "loss/aux_loss": 0.048117210157215595, + "loss/crossentropy": 2.8903677582740785, + "loss/logits": 0.9732258021831512, + "step": 11020 + }, + { + "epoch": 0.1103, + "grad_norm": 9.1875, + "grad_norm_var": 9.377197265625, + "learning_rate": 0.0003, + "loss": 12.6081, + "loss/aux_loss": 0.04811104368418455, + "loss/crossentropy": 2.719060683250427, + "loss/logits": 0.9554944217205048, + "step": 11030 + }, + { + "epoch": 0.1104, + "grad_norm": 10.9375, + "grad_norm_var": 0.5860514322916667, + "learning_rate": 0.0003, + "loss": 12.7441, + "loss/aux_loss": 0.04811680149286986, + "loss/crossentropy": 2.81855326294899, + "loss/logits": 0.976987361907959, + "step": 11040 + }, + { + "epoch": 0.1105, + "grad_norm": 9.625, + "grad_norm_var": 0.6400390625, + "learning_rate": 0.0003, + "loss": 12.6213, + "loss/aux_loss": 0.0481078302487731, + "loss/crossentropy": 2.97838671207428, + "loss/logits": 0.9554787337779999, + "step": 11050 + }, + { + "epoch": 0.1106, + "grad_norm": 9.8125, + "grad_norm_var": 0.20818684895833334, + "learning_rate": 0.0003, + "loss": 12.5692, + "loss/aux_loss": 0.04810439124703407, + "loss/crossentropy": 2.821098101139069, + "loss/logits": 0.9252155363559723, + "step": 11060 + }, + { + "epoch": 0.1107, + "grad_norm": 10.0625, + "grad_norm_var": 1.1117024739583334, + "learning_rate": 0.0003, + "loss": 12.6105, + "loss/aux_loss": 0.048108091577887535, + "loss/crossentropy": 2.782370573282242, + "loss/logits": 0.9402445912361145, + "step": 11070 + }, + { + "epoch": 0.1108, + "grad_norm": 10.875, + "grad_norm_var": 0.4554524739583333, + "learning_rate": 0.0003, + "loss": 12.7023, + "loss/aux_loss": 0.04811048619449139, + "loss/crossentropy": 2.903420704603195, + "loss/logits": 1.0490208446979523, + "step": 11080 + }, + { + "epoch": 0.1109, + "grad_norm": 10.1875, + "grad_norm_var": 0.378369140625, + "learning_rate": 0.0003, + "loss": 12.529, + "loss/aux_loss": 0.04811018593609333, + "loss/crossentropy": 2.9194815278053285, + "loss/logits": 1.0058273494243621, + "step": 11090 + }, + { + "epoch": 0.111, + "grad_norm": 11.625, + "grad_norm_var": 0.28587239583333335, + "learning_rate": 0.0003, + "loss": 12.6302, + "loss/aux_loss": 0.04811076112091541, + "loss/crossentropy": 2.787671518325806, + "loss/logits": 0.9877743035554886, + "step": 11100 + }, + { + "epoch": 0.1111, + "grad_norm": 10.0625, + "grad_norm_var": 0.23982747395833334, + "learning_rate": 0.0003, + "loss": 12.7033, + "loss/aux_loss": 0.04810548275709152, + "loss/crossentropy": 2.9209081172943114, + "loss/logits": 1.001340913772583, + "step": 11110 + }, + { + "epoch": 0.1112, + "grad_norm": 9.25, + "grad_norm_var": 0.25677083333333334, + "learning_rate": 0.0003, + "loss": 12.6145, + "loss/aux_loss": 0.04811436515301466, + "loss/crossentropy": 2.8942541658878325, + "loss/logits": 0.9485535502433777, + "step": 11120 + }, + { + "epoch": 0.1113, + "grad_norm": 9.875, + "grad_norm_var": 0.2587890625, + "learning_rate": 0.0003, + "loss": 12.5613, + "loss/aux_loss": 0.048104191198945045, + "loss/crossentropy": 2.803705060482025, + "loss/logits": 0.9647155731916428, + "step": 11130 + }, + { + "epoch": 0.1114, + "grad_norm": 10.1875, + "grad_norm_var": 0.25729166666666664, + "learning_rate": 0.0003, + "loss": 12.5791, + "loss/aux_loss": 0.04812238048762083, + "loss/crossentropy": 2.862819027900696, + "loss/logits": 0.9601949125528335, + "step": 11140 + }, + { + "epoch": 0.1115, + "grad_norm": 11.125, + "grad_norm_var": 0.23370768229166666, + "learning_rate": 0.0003, + "loss": 12.5839, + "loss/aux_loss": 0.04810948688536883, + "loss/crossentropy": 2.8555395185947416, + "loss/logits": 0.9564761400222779, + "step": 11150 + }, + { + "epoch": 0.1116, + "grad_norm": 10.3125, + "grad_norm_var": 0.2872233072916667, + "learning_rate": 0.0003, + "loss": 12.6588, + "loss/aux_loss": 0.0481093930080533, + "loss/crossentropy": 2.9155186653137206, + "loss/logits": 0.9658820390701294, + "step": 11160 + }, + { + "epoch": 0.1117, + "grad_norm": 11.1875, + "grad_norm_var": 0.20045572916666668, + "learning_rate": 0.0003, + "loss": 12.777, + "loss/aux_loss": 0.048106205835938456, + "loss/crossentropy": 2.8000992953777315, + "loss/logits": 0.9923090279102326, + "step": 11170 + }, + { + "epoch": 0.1118, + "grad_norm": 11.75, + "grad_norm_var": 0.3815104166666667, + "learning_rate": 0.0003, + "loss": 12.6665, + "loss/aux_loss": 0.048104862496256826, + "loss/crossentropy": 2.849539339542389, + "loss/logits": 0.973883080482483, + "step": 11180 + }, + { + "epoch": 0.1119, + "grad_norm": 10.625, + "grad_norm_var": 0.39739583333333334, + "learning_rate": 0.0003, + "loss": 12.4681, + "loss/aux_loss": 0.048110177554190156, + "loss/crossentropy": 2.8586711943149568, + "loss/logits": 1.001354029774666, + "step": 11190 + }, + { + "epoch": 0.112, + "grad_norm": 19.0, + "grad_norm_var": 5.034358723958333, + "learning_rate": 0.0003, + "loss": 12.638, + "loss/aux_loss": 0.04811324365437031, + "loss/crossentropy": 2.8593406438827516, + "loss/logits": 0.9571125656366348, + "step": 11200 + }, + { + "epoch": 0.1121, + "grad_norm": 10.4375, + "grad_norm_var": 4.9947265625, + "learning_rate": 0.0003, + "loss": 12.5001, + "loss/aux_loss": 0.04811281282454729, + "loss/crossentropy": 3.0845739006996156, + "loss/logits": 0.9708561331033707, + "step": 11210 + }, + { + "epoch": 0.1122, + "grad_norm": 10.5625, + "grad_norm_var": 0.21243489583333333, + "learning_rate": 0.0003, + "loss": 12.6438, + "loss/aux_loss": 0.04810171201825142, + "loss/crossentropy": 2.9789989948272706, + "loss/logits": 0.9724924474954605, + "step": 11220 + }, + { + "epoch": 0.1123, + "grad_norm": 10.625, + "grad_norm_var": 0.251416015625, + "learning_rate": 0.0003, + "loss": 12.8096, + "loss/aux_loss": 0.04811390731483698, + "loss/crossentropy": 2.897776019573212, + "loss/logits": 1.0156351894140243, + "step": 11230 + }, + { + "epoch": 0.1124, + "grad_norm": 10.6875, + "grad_norm_var": 1.221337890625, + "learning_rate": 0.0003, + "loss": 12.5675, + "loss/aux_loss": 0.04811619278043509, + "loss/crossentropy": 2.7954994082450866, + "loss/logits": 0.9756214946508408, + "step": 11240 + }, + { + "epoch": 0.1125, + "grad_norm": 10.5625, + "grad_norm_var": 0.25983072916666666, + "learning_rate": 0.0003, + "loss": 12.5928, + "loss/aux_loss": 0.04811410661786795, + "loss/crossentropy": 2.7457215189933777, + "loss/logits": 0.9541913717985153, + "step": 11250 + }, + { + "epoch": 0.1126, + "grad_norm": 10.5, + "grad_norm_var": 0.23084309895833333, + "learning_rate": 0.0003, + "loss": 12.634, + "loss/aux_loss": 0.0481100007891655, + "loss/crossentropy": 2.8804012298583985, + "loss/logits": 0.9697092175483704, + "step": 11260 + }, + { + "epoch": 0.1127, + "grad_norm": 12.4375, + "grad_norm_var": 0.7613118489583334, + "learning_rate": 0.0003, + "loss": 12.5108, + "loss/aux_loss": 0.04810825977474451, + "loss/crossentropy": 2.989805257320404, + "loss/logits": 1.0063245952129365, + "step": 11270 + }, + { + "epoch": 0.1128, + "grad_norm": 10.75, + "grad_norm_var": 5.937483723958334, + "learning_rate": 0.0003, + "loss": 12.7367, + "loss/aux_loss": 0.048129872791469096, + "loss/crossentropy": 2.910978400707245, + "loss/logits": 0.9753200441598893, + "step": 11280 + }, + { + "epoch": 0.1129, + "grad_norm": 10.25, + "grad_norm_var": 0.63359375, + "learning_rate": 0.0003, + "loss": 12.6325, + "loss/aux_loss": 0.048113250732421876, + "loss/crossentropy": 2.76353098154068, + "loss/logits": 0.966698682308197, + "step": 11290 + }, + { + "epoch": 0.113, + "grad_norm": 9.9375, + "grad_norm_var": 0.629931640625, + "learning_rate": 0.0003, + "loss": 12.5728, + "loss/aux_loss": 0.04810833781957626, + "loss/crossentropy": 2.8197692394256593, + "loss/logits": 0.9561370402574539, + "step": 11300 + }, + { + "epoch": 0.1131, + "grad_norm": 10.1875, + "grad_norm_var": 0.20364583333333333, + "learning_rate": 0.0003, + "loss": 12.8143, + "loss/aux_loss": 0.04810529686510563, + "loss/crossentropy": 2.8484480023384093, + "loss/logits": 0.9861988663673401, + "step": 11310 + }, + { + "epoch": 0.1132, + "grad_norm": 10.75, + "grad_norm_var": 0.278369140625, + "learning_rate": 0.0003, + "loss": 12.6227, + "loss/aux_loss": 0.04811426196247339, + "loss/crossentropy": 2.833006477355957, + "loss/logits": 0.9779454618692398, + "step": 11320 + }, + { + "epoch": 0.1133, + "grad_norm": 15.25, + "grad_norm_var": 1.6007649739583334, + "learning_rate": 0.0003, + "loss": 12.369, + "loss/aux_loss": 0.048116072081029415, + "loss/crossentropy": 2.728998416662216, + "loss/logits": 0.9195287644863128, + "step": 11330 + }, + { + "epoch": 0.1134, + "grad_norm": 10.3125, + "grad_norm_var": 1.529931640625, + "learning_rate": 0.0003, + "loss": 12.6997, + "loss/aux_loss": 0.04811266250908375, + "loss/crossentropy": 2.7394905209541323, + "loss/logits": 0.9578628540039062, + "step": 11340 + }, + { + "epoch": 0.1135, + "grad_norm": 10.125, + "grad_norm_var": 0.7983723958333333, + "learning_rate": 0.0003, + "loss": 12.4926, + "loss/aux_loss": 0.04810461904853582, + "loss/crossentropy": 2.8682806730270385, + "loss/logits": 1.0078667521476745, + "step": 11350 + }, + { + "epoch": 0.1136, + "grad_norm": 9.625, + "grad_norm_var": 0.842431640625, + "learning_rate": 0.0003, + "loss": 12.688, + "loss/aux_loss": 0.0481068329885602, + "loss/crossentropy": 2.8801401615142823, + "loss/logits": 0.9915682733058929, + "step": 11360 + }, + { + "epoch": 0.1137, + "grad_norm": 10.3125, + "grad_norm_var": 0.3640625, + "learning_rate": 0.0003, + "loss": 12.449, + "loss/aux_loss": 0.048105799593031406, + "loss/crossentropy": 2.9282567858695985, + "loss/logits": 0.9672238737344742, + "step": 11370 + }, + { + "epoch": 0.1138, + "grad_norm": 10.5625, + "grad_norm_var": 0.28274739583333336, + "learning_rate": 0.0003, + "loss": 12.5677, + "loss/aux_loss": 0.04810627568513155, + "loss/crossentropy": 2.8078087627887727, + "loss/logits": 0.9780084967613221, + "step": 11380 + }, + { + "epoch": 0.1139, + "grad_norm": 10.875, + "grad_norm_var": 3.143684895833333, + "learning_rate": 0.0003, + "loss": 12.5506, + "loss/aux_loss": 0.048108652047812936, + "loss/crossentropy": 2.987882399559021, + "loss/logits": 0.972921484708786, + "step": 11390 + }, + { + "epoch": 0.114, + "grad_norm": 10.1875, + "grad_norm_var": 2.8686848958333333, + "learning_rate": 0.0003, + "loss": 12.6501, + "loss/aux_loss": 0.04810910746455192, + "loss/crossentropy": 3.1420127391815185, + "loss/logits": 1.0203659981489182, + "step": 11400 + }, + { + "epoch": 0.1141, + "grad_norm": 11.25, + "grad_norm_var": 0.37265625, + "learning_rate": 0.0003, + "loss": 12.6042, + "loss/aux_loss": 0.04811469428241253, + "loss/crossentropy": 2.939081585407257, + "loss/logits": 0.9754122138023377, + "step": 11410 + }, + { + "epoch": 0.1142, + "grad_norm": 10.9375, + "grad_norm_var": 0.5501139322916667, + "learning_rate": 0.0003, + "loss": 12.4749, + "loss/aux_loss": 0.048111149854958055, + "loss/crossentropy": 2.8218182921409607, + "loss/logits": 0.9850091069936753, + "step": 11420 + }, + { + "epoch": 0.1143, + "grad_norm": 9.625, + "grad_norm_var": 0.5001139322916667, + "learning_rate": 0.0003, + "loss": 12.7164, + "loss/aux_loss": 0.04811033252626658, + "loss/crossentropy": 2.908424949645996, + "loss/logits": 0.9455067425966263, + "step": 11430 + }, + { + "epoch": 0.1144, + "grad_norm": 10.625, + "grad_norm_var": 0.8135416666666667, + "learning_rate": 0.0003, + "loss": 12.617, + "loss/aux_loss": 0.04811215177178383, + "loss/crossentropy": 2.83238645195961, + "loss/logits": 0.9909904628992081, + "step": 11440 + }, + { + "epoch": 0.1145, + "grad_norm": 10.625, + "grad_norm_var": 75.9525390625, + "learning_rate": 0.0003, + "loss": 12.631, + "loss/aux_loss": 0.04813397005200386, + "loss/crossentropy": 2.8597318053245546, + "loss/logits": 0.9930762708187103, + "step": 11450 + }, + { + "epoch": 0.1146, + "grad_norm": 9.0625, + "grad_norm_var": 0.27076822916666665, + "learning_rate": 0.0003, + "loss": 12.7103, + "loss/aux_loss": 0.04811841379851103, + "loss/crossentropy": 2.7436058163642882, + "loss/logits": 0.9644762337207794, + "step": 11460 + }, + { + "epoch": 0.1147, + "grad_norm": 9.9375, + "grad_norm_var": 0.5945149739583333, + "learning_rate": 0.0003, + "loss": 12.7142, + "loss/aux_loss": 0.04810911137610674, + "loss/crossentropy": 2.85939040184021, + "loss/logits": 0.9520360499620437, + "step": 11470 + }, + { + "epoch": 0.1148, + "grad_norm": 10.5, + "grad_norm_var": 0.4886555989583333, + "learning_rate": 0.0003, + "loss": 12.5397, + "loss/aux_loss": 0.04810511395335197, + "loss/crossentropy": 2.976986992359161, + "loss/logits": 0.9621847212314606, + "step": 11480 + }, + { + "epoch": 0.1149, + "grad_norm": 10.25, + "grad_norm_var": 0.429150390625, + "learning_rate": 0.0003, + "loss": 12.3119, + "loss/aux_loss": 0.048113946430385114, + "loss/crossentropy": 2.851515471935272, + "loss/logits": 0.9701724350452423, + "step": 11490 + }, + { + "epoch": 0.115, + "grad_norm": 10.875, + "grad_norm_var": 0.6905598958333333, + "learning_rate": 0.0003, + "loss": 12.5609, + "loss/aux_loss": 0.048105195350945, + "loss/crossentropy": 2.8894742131233215, + "loss/logits": 0.9788044720888138, + "step": 11500 + }, + { + "epoch": 0.1151, + "grad_norm": 10.25, + "grad_norm_var": 0.508837890625, + "learning_rate": 0.0003, + "loss": 12.6204, + "loss/aux_loss": 0.04811161942780018, + "loss/crossentropy": 2.7808387517929076, + "loss/logits": 0.9664526283740997, + "step": 11510 + }, + { + "epoch": 0.1152, + "grad_norm": 10.6875, + "grad_norm_var": 0.2964680989583333, + "learning_rate": 0.0003, + "loss": 12.5533, + "loss/aux_loss": 0.048106780648231505, + "loss/crossentropy": 3.0173611760139467, + "loss/logits": 0.9817897409200669, + "step": 11520 + }, + { + "epoch": 0.1153, + "grad_norm": 12.375, + "grad_norm_var": 0.4494140625, + "learning_rate": 0.0003, + "loss": 12.7684, + "loss/aux_loss": 0.04811464920639992, + "loss/crossentropy": 2.9131483495235444, + "loss/logits": 0.9585329800844192, + "step": 11530 + }, + { + "epoch": 0.1154, + "grad_norm": 10.75, + "grad_norm_var": 0.5989420572916667, + "learning_rate": 0.0003, + "loss": 12.5183, + "loss/aux_loss": 0.04811756443232298, + "loss/crossentropy": 2.868135905265808, + "loss/logits": 0.9498421907424927, + "step": 11540 + }, + { + "epoch": 0.1155, + "grad_norm": 10.4375, + "grad_norm_var": 0.6488932291666667, + "learning_rate": 0.0003, + "loss": 12.5642, + "loss/aux_loss": 0.0480995262041688, + "loss/crossentropy": 2.7917768478393556, + "loss/logits": 0.9568525284528733, + "step": 11550 + }, + { + "epoch": 0.1156, + "grad_norm": 10.8125, + "grad_norm_var": 0.334228515625, + "learning_rate": 0.0003, + "loss": 12.4884, + "loss/aux_loss": 0.04810049049556255, + "loss/crossentropy": 2.9180258393287657, + "loss/logits": 0.98627108335495, + "step": 11560 + }, + { + "epoch": 0.1157, + "grad_norm": 10.9375, + "grad_norm_var": 0.5395833333333333, + "learning_rate": 0.0003, + "loss": 12.627, + "loss/aux_loss": 0.0481100594624877, + "loss/crossentropy": 2.81765558719635, + "loss/logits": 0.9568387240171432, + "step": 11570 + }, + { + "epoch": 0.1158, + "grad_norm": 9.6875, + "grad_norm_var": 0.17472330729166666, + "learning_rate": 0.0003, + "loss": 12.7317, + "loss/aux_loss": 0.04811111558228731, + "loss/crossentropy": 2.8285086691379546, + "loss/logits": 0.9868262588977814, + "step": 11580 + }, + { + "epoch": 0.1159, + "grad_norm": 10.5625, + "grad_norm_var": 0.332666015625, + "learning_rate": 0.0003, + "loss": 12.3661, + "loss/aux_loss": 0.04810287747532129, + "loss/crossentropy": 2.7959317326545716, + "loss/logits": 0.9438671588897705, + "step": 11590 + }, + { + "epoch": 0.116, + "grad_norm": 11.0, + "grad_norm_var": 0.46013997395833334, + "learning_rate": 0.0003, + "loss": 12.5853, + "loss/aux_loss": 0.04810447972267866, + "loss/crossentropy": 2.96794798374176, + "loss/logits": 0.9699487566947937, + "step": 11600 + }, + { + "epoch": 0.1161, + "grad_norm": 9.9375, + "grad_norm_var": 0.2872233072916667, + "learning_rate": 0.0003, + "loss": 12.5827, + "loss/aux_loss": 0.04810726400464773, + "loss/crossentropy": 2.7267133831977843, + "loss/logits": 0.9387221932411194, + "step": 11610 + }, + { + "epoch": 0.1162, + "grad_norm": 10.6875, + "grad_norm_var": 0.2567057291666667, + "learning_rate": 0.0003, + "loss": 12.621, + "loss/aux_loss": 0.04810353182256222, + "loss/crossentropy": 2.8360018610954283, + "loss/logits": 0.9253288894891739, + "step": 11620 + }, + { + "epoch": 0.1163, + "grad_norm": 11.25, + "grad_norm_var": 0.2911295572916667, + "learning_rate": 0.0003, + "loss": 12.5106, + "loss/aux_loss": 0.048119408451020716, + "loss/crossentropy": 2.794688510894775, + "loss/logits": 0.9578901708126069, + "step": 11630 + }, + { + "epoch": 0.1164, + "grad_norm": 11.375, + "grad_norm_var": 0.43722330729166664, + "learning_rate": 0.0003, + "loss": 12.6836, + "loss/aux_loss": 0.04810473546385765, + "loss/crossentropy": 2.8710934042930605, + "loss/logits": 0.9788650065660477, + "step": 11640 + }, + { + "epoch": 0.1165, + "grad_norm": 10.0625, + "grad_norm_var": 0.3859375, + "learning_rate": 0.0003, + "loss": 12.6164, + "loss/aux_loss": 0.04809908457100391, + "loss/crossentropy": 2.807088649272919, + "loss/logits": 0.9633297890424728, + "step": 11650 + }, + { + "epoch": 0.1166, + "grad_norm": 9.6875, + "grad_norm_var": 0.3400390625, + "learning_rate": 0.0003, + "loss": 12.648, + "loss/aux_loss": 0.04809819832444191, + "loss/crossentropy": 2.9749920845031737, + "loss/logits": 1.0201595097780227, + "step": 11660 + }, + { + "epoch": 0.1167, + "grad_norm": 11.875, + "grad_norm_var": 2.988785807291667, + "learning_rate": 0.0003, + "loss": 12.4478, + "loss/aux_loss": 0.04812034796923399, + "loss/crossentropy": 2.62253178358078, + "loss/logits": 0.9186932921409607, + "step": 11670 + }, + { + "epoch": 0.1168, + "grad_norm": 11.9375, + "grad_norm_var": 12.179622395833333, + "learning_rate": 0.0003, + "loss": 12.4319, + "loss/aux_loss": 0.04812979437410832, + "loss/crossentropy": 2.857901084423065, + "loss/logits": 0.9803258359432221, + "step": 11680 + }, + { + "epoch": 0.1169, + "grad_norm": 11.1875, + "grad_norm_var": 12.487353515625, + "learning_rate": 0.0003, + "loss": 12.792, + "loss/aux_loss": 0.04811025280505419, + "loss/crossentropy": 2.872191935777664, + "loss/logits": 0.9423937231302262, + "step": 11690 + }, + { + "epoch": 0.117, + "grad_norm": 10.5625, + "grad_norm_var": 0.14036458333333332, + "learning_rate": 0.0003, + "loss": 12.5762, + "loss/aux_loss": 0.04811386782675982, + "loss/crossentropy": 2.741291904449463, + "loss/logits": 0.9453139632940293, + "step": 11700 + }, + { + "epoch": 0.1171, + "grad_norm": 10.75, + "grad_norm_var": 0.21795247395833334, + "learning_rate": 0.0003, + "loss": 12.3318, + "loss/aux_loss": 0.04810644257813692, + "loss/crossentropy": 2.9782674610614777, + "loss/logits": 0.9474372059106827, + "step": 11710 + }, + { + "epoch": 0.1172, + "grad_norm": 9.9375, + "grad_norm_var": 0.3042805989583333, + "learning_rate": 0.0003, + "loss": 12.6985, + "loss/aux_loss": 0.04810058567672968, + "loss/crossentropy": 2.759519326686859, + "loss/logits": 1.003102535009384, + "step": 11720 + }, + { + "epoch": 0.1173, + "grad_norm": 10.5625, + "grad_norm_var": 2.947379557291667, + "learning_rate": 0.0003, + "loss": 12.5665, + "loss/aux_loss": 0.04811586532741785, + "loss/crossentropy": 2.885226249694824, + "loss/logits": 0.9729419648647308, + "step": 11730 + }, + { + "epoch": 0.1174, + "grad_norm": 10.9375, + "grad_norm_var": 2.6150390625, + "learning_rate": 0.0003, + "loss": 12.5392, + "loss/aux_loss": 0.04810945596545935, + "loss/crossentropy": 2.859631586074829, + "loss/logits": 0.9514502733945847, + "step": 11740 + }, + { + "epoch": 0.1175, + "grad_norm": 11.0625, + "grad_norm_var": 0.47701822916666664, + "learning_rate": 0.0003, + "loss": 12.6717, + "loss/aux_loss": 0.048104429990053175, + "loss/crossentropy": 2.885056400299072, + "loss/logits": 0.9942733883857727, + "step": 11750 + }, + { + "epoch": 0.1176, + "grad_norm": 10.5, + "grad_norm_var": 0.6020182291666667, + "learning_rate": 0.0003, + "loss": 12.6648, + "loss/aux_loss": 0.04809593297541141, + "loss/crossentropy": 2.9696100473403932, + "loss/logits": 0.9882340937852859, + "step": 11760 + }, + { + "epoch": 0.1177, + "grad_norm": 11.125, + "grad_norm_var": 0.21451822916666666, + "learning_rate": 0.0003, + "loss": 12.4929, + "loss/aux_loss": 0.04809958972036839, + "loss/crossentropy": 2.844915008544922, + "loss/logits": 0.983967337012291, + "step": 11770 + }, + { + "epoch": 0.1178, + "grad_norm": 11.0625, + "grad_norm_var": 0.245556640625, + "learning_rate": 0.0003, + "loss": 12.4835, + "loss/aux_loss": 0.048105498775839806, + "loss/crossentropy": 2.9103447675704954, + "loss/logits": 0.9672578752040863, + "step": 11780 + }, + { + "epoch": 0.1179, + "grad_norm": 10.3125, + "grad_norm_var": 0.106884765625, + "learning_rate": 0.0003, + "loss": 12.599, + "loss/aux_loss": 0.04810230545699597, + "loss/crossentropy": 2.8057246923446657, + "loss/logits": 0.9675930976867676, + "step": 11790 + }, + { + "epoch": 0.118, + "grad_norm": 11.25, + "grad_norm_var": 0.14348958333333334, + "learning_rate": 0.0003, + "loss": 12.4241, + "loss/aux_loss": 0.04811300784349441, + "loss/crossentropy": 2.794022238254547, + "loss/logits": 0.9394135266542435, + "step": 11800 + }, + { + "epoch": 0.1181, + "grad_norm": 10.1875, + "grad_norm_var": 0.31197916666666664, + "learning_rate": 0.0003, + "loss": 12.5517, + "loss/aux_loss": 0.04811709113419056, + "loss/crossentropy": 2.727192759513855, + "loss/logits": 0.9235861957073211, + "step": 11810 + }, + { + "epoch": 0.1182, + "grad_norm": 10.75, + "grad_norm_var": 0.4572265625, + "learning_rate": 0.0003, + "loss": 12.4864, + "loss/aux_loss": 0.048109428584575654, + "loss/crossentropy": 2.7515438914299013, + "loss/logits": 0.9708627730607986, + "step": 11820 + }, + { + "epoch": 0.1183, + "grad_norm": 10.3125, + "grad_norm_var": 0.426025390625, + "learning_rate": 0.0003, + "loss": 12.4548, + "loss/aux_loss": 0.04810903538018465, + "loss/crossentropy": 2.889864444732666, + "loss/logits": 0.9854389071464539, + "step": 11830 + }, + { + "epoch": 0.1184, + "grad_norm": 10.1875, + "grad_norm_var": 0.20703125, + "learning_rate": 0.0003, + "loss": 12.6243, + "loss/aux_loss": 0.04810080174356699, + "loss/crossentropy": 2.85439276099205, + "loss/logits": 0.9542811691761017, + "step": 11840 + }, + { + "epoch": 0.1185, + "grad_norm": 10.5625, + "grad_norm_var": 0.21979166666666666, + "learning_rate": 0.0003, + "loss": 12.6767, + "loss/aux_loss": 0.048108363337814805, + "loss/crossentropy": 2.8335661768913267, + "loss/logits": 0.9943309754133225, + "step": 11850 + }, + { + "epoch": 0.1186, + "grad_norm": 10.3125, + "grad_norm_var": 34.180843098958334, + "learning_rate": 0.0003, + "loss": 12.4189, + "loss/aux_loss": 0.048121347464621066, + "loss/crossentropy": 2.8601845264434815, + "loss/logits": 0.9276136964559555, + "step": 11860 + }, + { + "epoch": 0.1187, + "grad_norm": 10.5625, + "grad_norm_var": 0.24295247395833333, + "learning_rate": 0.0003, + "loss": 12.5005, + "loss/aux_loss": 0.04811071082949638, + "loss/crossentropy": 2.7520765900611877, + "loss/logits": 0.9491381376981736, + "step": 11870 + }, + { + "epoch": 0.1188, + "grad_norm": 10.0, + "grad_norm_var": 0.1697265625, + "learning_rate": 0.0003, + "loss": 12.5049, + "loss/aux_loss": 0.048104492016136646, + "loss/crossentropy": 3.0041671991348267, + "loss/logits": 1.0042832434177398, + "step": 11880 + }, + { + "epoch": 0.1189, + "grad_norm": 10.5625, + "grad_norm_var": 0.12849934895833334, + "learning_rate": 0.0003, + "loss": 12.5701, + "loss/aux_loss": 0.048117601312696934, + "loss/crossentropy": 2.745959347486496, + "loss/logits": 0.9276633858680725, + "step": 11890 + }, + { + "epoch": 0.119, + "grad_norm": 10.125, + "grad_norm_var": 0.20514322916666666, + "learning_rate": 0.0003, + "loss": 12.4645, + "loss/aux_loss": 0.04810411240905523, + "loss/crossentropy": 2.842986249923706, + "loss/logits": 0.9896512359380722, + "step": 11900 + }, + { + "epoch": 0.1191, + "grad_norm": 11.3125, + "grad_norm_var": 0.43463541666666666, + "learning_rate": 0.0003, + "loss": 12.5719, + "loss/aux_loss": 0.04811199139803648, + "loss/crossentropy": 2.7681937336921694, + "loss/logits": 0.9506595671176911, + "step": 11910 + }, + { + "epoch": 0.1192, + "grad_norm": 11.375, + "grad_norm_var": 0.27263997395833334, + "learning_rate": 0.0003, + "loss": 12.6075, + "loss/aux_loss": 0.04810190089046955, + "loss/crossentropy": 2.9162669658660887, + "loss/logits": 0.9811139643192291, + "step": 11920 + }, + { + "epoch": 0.1193, + "grad_norm": 10.625, + "grad_norm_var": 0.19724934895833332, + "learning_rate": 0.0003, + "loss": 12.5182, + "loss/aux_loss": 0.04810644220560789, + "loss/crossentropy": 2.889602208137512, + "loss/logits": 0.9657979607582092, + "step": 11930 + }, + { + "epoch": 0.1194, + "grad_norm": 10.4375, + "grad_norm_var": 0.27317708333333335, + "learning_rate": 0.0003, + "loss": 12.4569, + "loss/aux_loss": 0.04809871483594179, + "loss/crossentropy": 3.0264495491981505, + "loss/logits": 0.9761357963085174, + "step": 11940 + }, + { + "epoch": 0.1195, + "grad_norm": 10.375, + "grad_norm_var": 0.12862955729166667, + "learning_rate": 0.0003, + "loss": 12.4498, + "loss/aux_loss": 0.04811019022017717, + "loss/crossentropy": 2.784598481655121, + "loss/logits": 0.9536922335624695, + "step": 11950 + }, + { + "epoch": 0.1196, + "grad_norm": 10.875, + "grad_norm_var": 24.016780598958334, + "learning_rate": 0.0003, + "loss": 12.4946, + "loss/aux_loss": 0.04811384323984384, + "loss/crossentropy": 2.9115478515625, + "loss/logits": 0.9744400382041931, + "step": 11960 + }, + { + "epoch": 0.1197, + "grad_norm": 11.3125, + "grad_norm_var": 0.674072265625, + "learning_rate": 0.0003, + "loss": 12.5965, + "loss/aux_loss": 0.04810780603438616, + "loss/crossentropy": 2.894571363925934, + "loss/logits": 0.9742325752973556, + "step": 11970 + }, + { + "epoch": 0.1198, + "grad_norm": 11.6875, + "grad_norm_var": 0.30911458333333336, + "learning_rate": 0.0003, + "loss": 12.6002, + "loss/aux_loss": 0.04811006467789412, + "loss/crossentropy": 3.06647070646286, + "loss/logits": 0.9646219074726105, + "step": 11980 + }, + { + "epoch": 0.1199, + "grad_norm": 10.1875, + "grad_norm_var": 0.217041015625, + "learning_rate": 0.0003, + "loss": 12.3748, + "loss/aux_loss": 0.04810543842613697, + "loss/crossentropy": 2.722061502933502, + "loss/logits": 0.9802715986967087, + "step": 11990 + }, + { + "epoch": 0.12, + "grad_norm": 10.0625, + "grad_norm_var": 0.42649739583333335, + "learning_rate": 0.0003, + "loss": 12.5784, + "loss/aux_loss": 0.04810118656605482, + "loss/crossentropy": 2.83687162399292, + "loss/logits": 0.9512553691864014, + "step": 12000 + }, + { + "epoch": 0.1201, + "grad_norm": 11.125, + "grad_norm_var": 0.24733072916666668, + "learning_rate": 0.0003, + "loss": 12.3364, + "loss/aux_loss": 0.04810278117656708, + "loss/crossentropy": 2.744813871383667, + "loss/logits": 0.9523531854152679, + "step": 12010 + }, + { + "epoch": 0.1202, + "grad_norm": 10.75, + "grad_norm_var": 0.23214518229166667, + "learning_rate": 0.0003, + "loss": 12.6001, + "loss/aux_loss": 0.04810234196484089, + "loss/crossentropy": 2.830919635295868, + "loss/logits": 0.9435950011014939, + "step": 12020 + }, + { + "epoch": 0.1203, + "grad_norm": 11.1875, + "grad_norm_var": 0.23201497395833334, + "learning_rate": 0.0003, + "loss": 12.5726, + "loss/aux_loss": 0.04810579065233469, + "loss/crossentropy": 2.802901232242584, + "loss/logits": 0.971744042634964, + "step": 12030 + }, + { + "epoch": 0.1204, + "grad_norm": 11.1875, + "grad_norm_var": 0.30206705729166666, + "learning_rate": 0.0003, + "loss": 12.6394, + "loss/aux_loss": 0.04810426253825426, + "loss/crossentropy": 2.839026927947998, + "loss/logits": 0.971143838763237, + "step": 12040 + }, + { + "epoch": 0.1205, + "grad_norm": 10.8125, + "grad_norm_var": 0.19895833333333332, + "learning_rate": 0.0003, + "loss": 12.5642, + "loss/aux_loss": 0.04810550380498171, + "loss/crossentropy": 2.8231468319892885, + "loss/logits": 0.9654949724674224, + "step": 12050 + }, + { + "epoch": 0.1206, + "grad_norm": 10.6875, + "grad_norm_var": 0.17420247395833333, + "learning_rate": 0.0003, + "loss": 12.3904, + "loss/aux_loss": 0.048108019307255744, + "loss/crossentropy": 3.012854266166687, + "loss/logits": 0.9883525311946869, + "step": 12060 + }, + { + "epoch": 0.1207, + "grad_norm": 11.125, + "grad_norm_var": 0.268603515625, + "learning_rate": 0.0003, + "loss": 12.5321, + "loss/aux_loss": 0.04810563083738088, + "loss/crossentropy": 2.70223063826561, + "loss/logits": 0.9730505347251892, + "step": 12070 + }, + { + "epoch": 0.1208, + "grad_norm": 10.875, + "grad_norm_var": 0.271728515625, + "learning_rate": 0.0003, + "loss": 12.5535, + "loss/aux_loss": 0.04810192938894033, + "loss/crossentropy": 3.0259074330329896, + "loss/logits": 0.9854034870862961, + "step": 12080 + }, + { + "epoch": 0.1209, + "grad_norm": 10.375, + "grad_norm_var": 0.17161458333333332, + "learning_rate": 0.0003, + "loss": 12.765, + "loss/aux_loss": 0.04810161255300045, + "loss/crossentropy": 2.875988984107971, + "loss/logits": 0.9918344229459762, + "step": 12090 + }, + { + "epoch": 0.121, + "grad_norm": 11.375, + "grad_norm_var": 0.19635416666666666, + "learning_rate": 0.0003, + "loss": 12.623, + "loss/aux_loss": 0.048103974759578706, + "loss/crossentropy": 2.742973780632019, + "loss/logits": 0.9306074976921082, + "step": 12100 + }, + { + "epoch": 0.1211, + "grad_norm": 10.5625, + "grad_norm_var": 0.22024739583333333, + "learning_rate": 0.0003, + "loss": 12.5789, + "loss/aux_loss": 0.04811310023069382, + "loss/crossentropy": 2.705821967124939, + "loss/logits": 0.9371457666158676, + "step": 12110 + }, + { + "epoch": 0.1212, + "grad_norm": 11.4375, + "grad_norm_var": 0.9910807291666667, + "learning_rate": 0.0003, + "loss": 12.353, + "loss/aux_loss": 0.04810843821614981, + "loss/crossentropy": 2.6658570945262907, + "loss/logits": 0.9095493495464325, + "step": 12120 + }, + { + "epoch": 0.1213, + "grad_norm": 10.5625, + "grad_norm_var": 1.1050618489583333, + "learning_rate": 0.0003, + "loss": 12.6082, + "loss/aux_loss": 0.04810453653335571, + "loss/crossentropy": 2.8400238871574404, + "loss/logits": 0.9993251740932465, + "step": 12130 + }, + { + "epoch": 0.1214, + "grad_norm": 10.9375, + "grad_norm_var": 0.3003743489583333, + "learning_rate": 0.0003, + "loss": 12.293, + "loss/aux_loss": 0.04811606556177139, + "loss/crossentropy": 2.8960613369941712, + "loss/logits": 0.9764822989702224, + "step": 12140 + }, + { + "epoch": 0.1215, + "grad_norm": 10.25, + "grad_norm_var": 0.29244791666666664, + "learning_rate": 0.0003, + "loss": 12.4496, + "loss/aux_loss": 0.048100278712809086, + "loss/crossentropy": 3.0011345863342287, + "loss/logits": 0.9704394817352295, + "step": 12150 + }, + { + "epoch": 0.1216, + "grad_norm": 11.3125, + "grad_norm_var": 0.4044270833333333, + "learning_rate": 0.0003, + "loss": 12.5732, + "loss/aux_loss": 0.0481085266917944, + "loss/crossentropy": 2.7692679166793823, + "loss/logits": 0.9632198810577393, + "step": 12160 + }, + { + "epoch": 0.1217, + "grad_norm": 11.0, + "grad_norm_var": 0.24609375, + "learning_rate": 0.0003, + "loss": 12.4976, + "loss/aux_loss": 0.04810815379023552, + "loss/crossentropy": 2.8866684079170226, + "loss/logits": 0.9551214545965194, + "step": 12170 + }, + { + "epoch": 0.1218, + "grad_norm": 11.1875, + "grad_norm_var": 0.23318684895833333, + "learning_rate": 0.0003, + "loss": 12.5185, + "loss/aux_loss": 0.048101365193724634, + "loss/crossentropy": 2.8655909061431886, + "loss/logits": 0.9533731818199158, + "step": 12180 + }, + { + "epoch": 0.1219, + "grad_norm": 11.0, + "grad_norm_var": 0.8066243489583333, + "learning_rate": 0.0003, + "loss": 12.6911, + "loss/aux_loss": 0.04811571817845106, + "loss/crossentropy": 2.8677718937397003, + "loss/logits": 0.976726308465004, + "step": 12190 + }, + { + "epoch": 0.122, + "grad_norm": 10.4375, + "grad_norm_var": 1.0106770833333334, + "learning_rate": 0.0003, + "loss": 12.5852, + "loss/aux_loss": 0.048111325688660146, + "loss/crossentropy": 2.756421709060669, + "loss/logits": 0.9976501137018203, + "step": 12200 + }, + { + "epoch": 0.1221, + "grad_norm": 10.3125, + "grad_norm_var": 1.3247233072916667, + "learning_rate": 0.0003, + "loss": 12.5394, + "loss/aux_loss": 0.04810780212283135, + "loss/crossentropy": 2.9484314799308775, + "loss/logits": 0.9888930469751358, + "step": 12210 + }, + { + "epoch": 0.1222, + "grad_norm": 11.25, + "grad_norm_var": 1.724853515625, + "learning_rate": 0.0003, + "loss": 12.658, + "loss/aux_loss": 0.048103698343038556, + "loss/crossentropy": 2.8530756711959837, + "loss/logits": 0.961388236284256, + "step": 12220 + }, + { + "epoch": 0.1223, + "grad_norm": 10.375, + "grad_norm_var": 0.7403645833333333, + "learning_rate": 0.0003, + "loss": 12.6152, + "loss/aux_loss": 0.048111156560480595, + "loss/crossentropy": 2.7180242002010346, + "loss/logits": 0.9586433321237564, + "step": 12230 + }, + { + "epoch": 0.1224, + "grad_norm": 10.1875, + "grad_norm_var": 0.7048014322916667, + "learning_rate": 0.0003, + "loss": 12.4559, + "loss/aux_loss": 0.048110079020261765, + "loss/crossentropy": 2.9755612432956697, + "loss/logits": 0.9319843083620072, + "step": 12240 + }, + { + "epoch": 0.1225, + "grad_norm": 10.8125, + "grad_norm_var": 0.697119140625, + "learning_rate": 0.0003, + "loss": 12.4664, + "loss/aux_loss": 0.048104499280452725, + "loss/crossentropy": 2.828293478488922, + "loss/logits": 0.9228764444589614, + "step": 12250 + }, + { + "epoch": 0.1226, + "grad_norm": 10.9375, + "grad_norm_var": 0.234228515625, + "learning_rate": 0.0003, + "loss": 12.6363, + "loss/aux_loss": 0.048111573606729505, + "loss/crossentropy": 2.9271127223968505, + "loss/logits": 0.9774176150560379, + "step": 12260 + }, + { + "epoch": 0.1227, + "grad_norm": 12.75, + "grad_norm_var": 8.608968098958334, + "learning_rate": 0.0003, + "loss": 12.3807, + "loss/aux_loss": 0.048101313598454, + "loss/crossentropy": 2.63518745303154, + "loss/logits": 0.9886480629444122, + "step": 12270 + }, + { + "epoch": 0.1228, + "grad_norm": 10.3125, + "grad_norm_var": 9.355712890625, + "learning_rate": 0.0003, + "loss": 12.6403, + "loss/aux_loss": 0.04810824524611235, + "loss/crossentropy": 2.950111997127533, + "loss/logits": 0.9644519031047821, + "step": 12280 + }, + { + "epoch": 0.1229, + "grad_norm": 11.0, + "grad_norm_var": 1.6855305989583333, + "learning_rate": 0.0003, + "loss": 12.6324, + "loss/aux_loss": 0.048108363337814805, + "loss/crossentropy": 3.013728940486908, + "loss/logits": 0.999145370721817, + "step": 12290 + }, + { + "epoch": 0.123, + "grad_norm": 10.5, + "grad_norm_var": 0.28904622395833335, + "learning_rate": 0.0003, + "loss": 12.5458, + "loss/aux_loss": 0.048100477643311025, + "loss/crossentropy": 2.722964417934418, + "loss/logits": 0.9335471302270889, + "step": 12300 + }, + { + "epoch": 0.1231, + "grad_norm": 11.875, + "grad_norm_var": 0.453125, + "learning_rate": 0.0003, + "loss": 12.4757, + "loss/aux_loss": 0.04810597654432058, + "loss/crossentropy": 2.820869207382202, + "loss/logits": 0.9774489820003509, + "step": 12310 + }, + { + "epoch": 0.1232, + "grad_norm": 10.1875, + "grad_norm_var": 0.27316080729166664, + "learning_rate": 0.0003, + "loss": 12.4025, + "loss/aux_loss": 0.04810553044080734, + "loss/crossentropy": 2.7451001048088073, + "loss/logits": 0.9624879866838455, + "step": 12320 + }, + { + "epoch": 0.1233, + "grad_norm": 10.6875, + "grad_norm_var": 0.25857747395833336, + "learning_rate": 0.0003, + "loss": 12.4356, + "loss/aux_loss": 0.04811267796903849, + "loss/crossentropy": 2.917902183532715, + "loss/logits": 0.9623291105031967, + "step": 12330 + }, + { + "epoch": 0.1234, + "grad_norm": 10.8125, + "grad_norm_var": 0.24842122395833333, + "learning_rate": 0.0003, + "loss": 12.5192, + "loss/aux_loss": 0.04810788352042437, + "loss/crossentropy": 2.9027243733406065, + "loss/logits": 0.9865126490592957, + "step": 12340 + }, + { + "epoch": 0.1235, + "grad_norm": 10.875, + "grad_norm_var": 0.14217122395833334, + "learning_rate": 0.0003, + "loss": 12.6416, + "loss/aux_loss": 0.04810071587562561, + "loss/crossentropy": 2.8123831510543824, + "loss/logits": 0.9990533202886581, + "step": 12350 + }, + { + "epoch": 0.1236, + "grad_norm": 10.9375, + "grad_norm_var": 0.07185872395833333, + "learning_rate": 0.0003, + "loss": 12.417, + "loss/aux_loss": 0.04809997137635946, + "loss/crossentropy": 2.8776489377021788, + "loss/logits": 0.9396691709756851, + "step": 12360 + }, + { + "epoch": 0.1237, + "grad_norm": 11.1875, + "grad_norm_var": 0.17771809895833332, + "learning_rate": 0.0003, + "loss": 12.4562, + "loss/aux_loss": 0.048106643930077554, + "loss/crossentropy": 2.726925420761108, + "loss/logits": 0.9575481981039047, + "step": 12370 + }, + { + "epoch": 0.1238, + "grad_norm": 10.625, + "grad_norm_var": 0.34427083333333336, + "learning_rate": 0.0003, + "loss": 12.5861, + "loss/aux_loss": 0.0481028001755476, + "loss/crossentropy": 2.8643892288208006, + "loss/logits": 0.9604303538799286, + "step": 12380 + }, + { + "epoch": 0.1239, + "grad_norm": 11.1875, + "grad_norm_var": 0.17838541666666666, + "learning_rate": 0.0003, + "loss": 12.5781, + "loss/aux_loss": 0.04810880180448294, + "loss/crossentropy": 2.9660362005233765, + "loss/logits": 0.9596373349428177, + "step": 12390 + }, + { + "epoch": 0.124, + "grad_norm": 10.625, + "grad_norm_var": 0.1931640625, + "learning_rate": 0.0003, + "loss": 12.6396, + "loss/aux_loss": 0.04811811447143555, + "loss/crossentropy": 2.980528914928436, + "loss/logits": 0.9617955178022385, + "step": 12400 + }, + { + "epoch": 0.1241, + "grad_norm": 11.625, + "grad_norm_var": 1.4231770833333333, + "learning_rate": 0.0003, + "loss": 12.6891, + "loss/aux_loss": 0.048112993128597736, + "loss/crossentropy": 2.9703574776649475, + "loss/logits": 0.973251935839653, + "step": 12410 + }, + { + "epoch": 0.1242, + "grad_norm": 10.375, + "grad_norm_var": 1.4962890625, + "learning_rate": 0.0003, + "loss": 12.3924, + "loss/aux_loss": 0.04811142534017563, + "loss/crossentropy": 2.6934870958328245, + "loss/logits": 0.9628842860460282, + "step": 12420 + }, + { + "epoch": 0.1243, + "grad_norm": 31.75, + "grad_norm_var": 27.332747395833334, + "learning_rate": 0.0003, + "loss": 12.5895, + "loss/aux_loss": 0.04810582157224417, + "loss/crossentropy": 2.749896514415741, + "loss/logits": 0.9580157309770584, + "step": 12430 + }, + { + "epoch": 0.1244, + "grad_norm": 10.375, + "grad_norm_var": 27.559879557291666, + "learning_rate": 0.0003, + "loss": 12.5005, + "loss/aux_loss": 0.04812322128564119, + "loss/crossentropy": 2.8022406458854676, + "loss/logits": 0.9580208510160446, + "step": 12440 + }, + { + "epoch": 0.1245, + "grad_norm": 10.5625, + "grad_norm_var": 0.3863118489583333, + "learning_rate": 0.0003, + "loss": 12.5714, + "loss/aux_loss": 0.048095401376485825, + "loss/crossentropy": 2.8494678735733032, + "loss/logits": 0.9524266660213471, + "step": 12450 + }, + { + "epoch": 0.1246, + "grad_norm": 10.0, + "grad_norm_var": 0.20271809895833334, + "learning_rate": 0.0003, + "loss": 12.5446, + "loss/aux_loss": 0.04811746347695589, + "loss/crossentropy": 2.8464840769767763, + "loss/logits": 0.9794892787933349, + "step": 12460 + }, + { + "epoch": 0.1247, + "grad_norm": 10.6875, + "grad_norm_var": 0.3634765625, + "learning_rate": 0.0003, + "loss": 12.5093, + "loss/aux_loss": 0.04810458458960056, + "loss/crossentropy": 2.8821428060531615, + "loss/logits": 0.9548739582300186, + "step": 12470 + }, + { + "epoch": 0.1248, + "grad_norm": 10.25, + "grad_norm_var": 158.55701497395833, + "learning_rate": 0.0003, + "loss": 12.5456, + "loss/aux_loss": 0.04812399763613939, + "loss/crossentropy": 2.8613539934158325, + "loss/logits": 0.9762499183416367, + "step": 12480 + }, + { + "epoch": 0.1249, + "grad_norm": 10.8125, + "grad_norm_var": 0.5799479166666667, + "learning_rate": 0.0003, + "loss": 12.5057, + "loss/aux_loss": 0.04810742326080799, + "loss/crossentropy": 2.865919351577759, + "loss/logits": 0.9773925930261612, + "step": 12490 + }, + { + "epoch": 0.125, + "grad_norm": 10.375, + "grad_norm_var": 0.40623372395833335, + "learning_rate": 0.0003, + "loss": 12.4343, + "loss/aux_loss": 0.04811244308948517, + "loss/crossentropy": 2.7066974461078646, + "loss/logits": 0.9389273285865783, + "step": 12500 + }, + { + "epoch": 0.1251, + "grad_norm": 11.3125, + "grad_norm_var": 0.38670247395833335, + "learning_rate": 0.0003, + "loss": 12.4011, + "loss/aux_loss": 0.04809202216565609, + "loss/crossentropy": 2.9026967763900755, + "loss/logits": 0.969332093000412, + "step": 12510 + }, + { + "epoch": 0.1252, + "grad_norm": 10.875, + "grad_norm_var": 0.6098958333333333, + "learning_rate": 0.0003, + "loss": 12.4336, + "loss/aux_loss": 0.04811373949050903, + "loss/crossentropy": 2.665140724182129, + "loss/logits": 0.9239953130483627, + "step": 12520 + }, + { + "epoch": 0.1253, + "grad_norm": 11.1875, + "grad_norm_var": 0.7331868489583333, + "learning_rate": 0.0003, + "loss": 12.6341, + "loss/aux_loss": 0.04810454789549112, + "loss/crossentropy": 2.8536665797233582, + "loss/logits": 0.9737724870443344, + "step": 12530 + }, + { + "epoch": 0.1254, + "grad_norm": 10.75, + "grad_norm_var": 0.253125, + "learning_rate": 0.0003, + "loss": 12.2721, + "loss/aux_loss": 0.04812637399882078, + "loss/crossentropy": 2.623065769672394, + "loss/logits": 0.9491947621107102, + "step": 12540 + }, + { + "epoch": 0.1255, + "grad_norm": 10.3125, + "grad_norm_var": 0.291259765625, + "learning_rate": 0.0003, + "loss": 12.346, + "loss/aux_loss": 0.04810612387955189, + "loss/crossentropy": 2.8949776351451875, + "loss/logits": 0.9749656409025192, + "step": 12550 + }, + { + "epoch": 0.1256, + "grad_norm": 9.875, + "grad_norm_var": 0.20358072916666667, + "learning_rate": 0.0003, + "loss": 12.39, + "loss/aux_loss": 0.048107765056192874, + "loss/crossentropy": 2.9021278619766235, + "loss/logits": 0.9786544352769851, + "step": 12560 + }, + { + "epoch": 0.1257, + "grad_norm": 10.4375, + "grad_norm_var": 0.23723958333333334, + "learning_rate": 0.0003, + "loss": 12.377, + "loss/aux_loss": 0.04810867067426443, + "loss/crossentropy": 2.6891712307929994, + "loss/logits": 0.9230633974075317, + "step": 12570 + }, + { + "epoch": 0.1258, + "grad_norm": 10.5, + "grad_norm_var": 0.12057291666666667, + "learning_rate": 0.0003, + "loss": 12.5812, + "loss/aux_loss": 0.04811552707105875, + "loss/crossentropy": 2.969293546676636, + "loss/logits": 0.9738602817058564, + "step": 12580 + }, + { + "epoch": 0.1259, + "grad_norm": 10.1875, + "grad_norm_var": 0.17498372395833334, + "learning_rate": 0.0003, + "loss": 12.4027, + "loss/aux_loss": 0.04809841345995665, + "loss/crossentropy": 2.835331308841705, + "loss/logits": 0.9679557770490647, + "step": 12590 + }, + { + "epoch": 0.126, + "grad_norm": 10.0625, + "grad_norm_var": 0.237744140625, + "learning_rate": 0.0003, + "loss": 12.5829, + "loss/aux_loss": 0.048117955774068834, + "loss/crossentropy": 2.8491066575050352, + "loss/logits": 0.910678106546402, + "step": 12600 + }, + { + "epoch": 0.1261, + "grad_norm": 11.0625, + "grad_norm_var": 0.4869791666666667, + "learning_rate": 0.0003, + "loss": 12.5276, + "loss/aux_loss": 0.04809832703322172, + "loss/crossentropy": 2.8928737163543703, + "loss/logits": 0.9363324135541916, + "step": 12610 + }, + { + "epoch": 0.1262, + "grad_norm": 10.8125, + "grad_norm_var": 0.4989420572916667, + "learning_rate": 0.0003, + "loss": 12.4322, + "loss/aux_loss": 0.04810363110154867, + "loss/crossentropy": 2.8710333466529847, + "loss/logits": 0.9906549990177155, + "step": 12620 + }, + { + "epoch": 0.1263, + "grad_norm": 11.25, + "grad_norm_var": 0.16053059895833333, + "learning_rate": 0.0003, + "loss": 12.4911, + "loss/aux_loss": 0.04810148868709803, + "loss/crossentropy": 2.8756853461265566, + "loss/logits": 0.9440797507762909, + "step": 12630 + }, + { + "epoch": 0.1264, + "grad_norm": 18.375, + "grad_norm_var": 7.615625, + "learning_rate": 0.0003, + "loss": 12.4571, + "loss/aux_loss": 0.04810570180416107, + "loss/crossentropy": 2.6758979201316833, + "loss/logits": 0.9320230633020401, + "step": 12640 + }, + { + "epoch": 0.1265, + "grad_norm": 10.3125, + "grad_norm_var": 4.074983723958334, + "learning_rate": 0.0003, + "loss": 12.4666, + "loss/aux_loss": 0.048111373744905, + "loss/crossentropy": 2.842112112045288, + "loss/logits": 0.9325708895921707, + "step": 12650 + }, + { + "epoch": 0.1266, + "grad_norm": 10.0, + "grad_norm_var": 1.2192708333333333, + "learning_rate": 0.0003, + "loss": 12.2817, + "loss/aux_loss": 0.04809804186224938, + "loss/crossentropy": 2.9903613328933716, + "loss/logits": 0.9794372290372848, + "step": 12660 + }, + { + "epoch": 0.1267, + "grad_norm": 10.375, + "grad_norm_var": 0.468212890625, + "learning_rate": 0.0003, + "loss": 12.3768, + "loss/aux_loss": 0.048095152527093885, + "loss/crossentropy": 2.74501034617424, + "loss/logits": 0.9036791056394577, + "step": 12670 + }, + { + "epoch": 0.1268, + "grad_norm": 10.1875, + "grad_norm_var": 0.32578125, + "learning_rate": 0.0003, + "loss": 12.654, + "loss/aux_loss": 0.04810344278812408, + "loss/crossentropy": 2.962386405467987, + "loss/logits": 0.9665306150913239, + "step": 12680 + }, + { + "epoch": 0.1269, + "grad_norm": 11.75, + "grad_norm_var": 0.36183268229166665, + "learning_rate": 0.0003, + "loss": 12.4222, + "loss/aux_loss": 0.04809885267168283, + "loss/crossentropy": 2.909253853559494, + "loss/logits": 0.9747259318828583, + "step": 12690 + }, + { + "epoch": 0.127, + "grad_norm": 10.9375, + "grad_norm_var": 0.2843098958333333, + "learning_rate": 0.0003, + "loss": 12.5564, + "loss/aux_loss": 0.048109317757189275, + "loss/crossentropy": 2.8867732286453247, + "loss/logits": 0.9507931470870972, + "step": 12700 + }, + { + "epoch": 0.1271, + "grad_norm": 12.8125, + "grad_norm_var": 0.38409830729166666, + "learning_rate": 0.0003, + "loss": 12.4902, + "loss/aux_loss": 0.04810472708195448, + "loss/crossentropy": 2.6750588059425353, + "loss/logits": 0.9369807064533233, + "step": 12710 + }, + { + "epoch": 0.1272, + "grad_norm": 11.375, + "grad_norm_var": 0.54609375, + "learning_rate": 0.0003, + "loss": 12.48, + "loss/aux_loss": 0.048105572909116746, + "loss/crossentropy": 2.9057799935340882, + "loss/logits": 0.9927403450012207, + "step": 12720 + }, + { + "epoch": 0.1273, + "grad_norm": 10.6875, + "grad_norm_var": 0.3148274739583333, + "learning_rate": 0.0003, + "loss": 12.5244, + "loss/aux_loss": 0.04810297396034002, + "loss/crossentropy": 2.9415181994438173, + "loss/logits": 0.9982303559780121, + "step": 12730 + }, + { + "epoch": 0.1274, + "grad_norm": 10.5625, + "grad_norm_var": 0.1697265625, + "learning_rate": 0.0003, + "loss": 12.403, + "loss/aux_loss": 0.04809920433908701, + "loss/crossentropy": 2.903778100013733, + "loss/logits": 0.970294651389122, + "step": 12740 + }, + { + "epoch": 0.1275, + "grad_norm": 10.8125, + "grad_norm_var": 0.15050455729166667, + "learning_rate": 0.0003, + "loss": 12.455, + "loss/aux_loss": 0.048093832843005654, + "loss/crossentropy": 2.9087541341781615, + "loss/logits": 0.9725099325180053, + "step": 12750 + }, + { + "epoch": 0.1276, + "grad_norm": 10.5625, + "grad_norm_var": 0.13274739583333334, + "learning_rate": 0.0003, + "loss": 12.5525, + "loss/aux_loss": 0.04810660276561975, + "loss/crossentropy": 2.792685878276825, + "loss/logits": 0.9733223885297775, + "step": 12760 + }, + { + "epoch": 0.1277, + "grad_norm": 11.375, + "grad_norm_var": 0.18019205729166668, + "learning_rate": 0.0003, + "loss": 12.559, + "loss/aux_loss": 0.04810384083539247, + "loss/crossentropy": 2.8665752828121187, + "loss/logits": 0.9338672608137131, + "step": 12770 + }, + { + "epoch": 0.1278, + "grad_norm": 10.1875, + "grad_norm_var": 0.3889973958333333, + "learning_rate": 0.0003, + "loss": 12.4119, + "loss/aux_loss": 0.048106889240443707, + "loss/crossentropy": 2.8227752327919005, + "loss/logits": 0.9450346022844315, + "step": 12780 + }, + { + "epoch": 0.1279, + "grad_norm": 11.125, + "grad_norm_var": 0.29791666666666666, + "learning_rate": 0.0003, + "loss": 12.6173, + "loss/aux_loss": 0.048102630861103536, + "loss/crossentropy": 2.9129024147987366, + "loss/logits": 0.9713657557964325, + "step": 12790 + }, + { + "epoch": 0.128, + "grad_norm": 10.9375, + "grad_norm_var": 0.18274739583333333, + "learning_rate": 0.0003, + "loss": 12.326, + "loss/aux_loss": 0.04809645600616932, + "loss/crossentropy": 2.8920622408390044, + "loss/logits": 0.954812154173851, + "step": 12800 + }, + { + "epoch": 0.1281, + "grad_norm": 10.9375, + "grad_norm_var": 0.17420247395833333, + "learning_rate": 0.0003, + "loss": 12.2345, + "loss/aux_loss": 0.04810621030628681, + "loss/crossentropy": 2.7076722204685213, + "loss/logits": 0.9177807062864304, + "step": 12810 + }, + { + "epoch": 0.1282, + "grad_norm": 10.0, + "grad_norm_var": 0.6026041666666667, + "learning_rate": 0.0003, + "loss": 12.4786, + "loss/aux_loss": 0.048109995760023595, + "loss/crossentropy": 2.8327449679374697, + "loss/logits": 0.9546353191137313, + "step": 12820 + }, + { + "epoch": 0.1283, + "grad_norm": 11.9375, + "grad_norm_var": 0.8191243489583333, + "learning_rate": 0.0003, + "loss": 12.5182, + "loss/aux_loss": 0.048099367320537566, + "loss/crossentropy": 2.9235777378082277, + "loss/logits": 0.9615249812602997, + "step": 12830 + }, + { + "epoch": 0.1284, + "grad_norm": 11.25, + "grad_norm_var": 3.563134765625, + "learning_rate": 0.0003, + "loss": 12.3886, + "loss/aux_loss": 0.04810474757105112, + "loss/crossentropy": 2.7467468440532685, + "loss/logits": 0.9554690361022949, + "step": 12840 + }, + { + "epoch": 0.1285, + "grad_norm": 11.625, + "grad_norm_var": 0.36666666666666664, + "learning_rate": 0.0003, + "loss": 12.4959, + "loss/aux_loss": 0.04810317847877741, + "loss/crossentropy": 2.9639437079429625, + "loss/logits": 0.9639540314674377, + "step": 12850 + }, + { + "epoch": 0.1286, + "grad_norm": 11.6875, + "grad_norm_var": 0.5353515625, + "learning_rate": 0.0003, + "loss": 12.4018, + "loss/aux_loss": 0.04811002798378468, + "loss/crossentropy": 2.7028500497341157, + "loss/logits": 0.9559302479028702, + "step": 12860 + }, + { + "epoch": 0.1287, + "grad_norm": 11.5, + "grad_norm_var": 53.47472330729167, + "learning_rate": 0.0003, + "loss": 12.5657, + "loss/aux_loss": 0.04811037741601467, + "loss/crossentropy": 2.924159586429596, + "loss/logits": 0.9749170869588852, + "step": 12870 + }, + { + "epoch": 0.1288, + "grad_norm": 10.4375, + "grad_norm_var": 53.731103515625, + "learning_rate": 0.0003, + "loss": 12.3255, + "loss/aux_loss": 0.048108428902924064, + "loss/crossentropy": 2.763288676738739, + "loss/logits": 0.9064864754676819, + "step": 12880 + }, + { + "epoch": 0.1289, + "grad_norm": 10.3125, + "grad_norm_var": 0.22604166666666667, + "learning_rate": 0.0003, + "loss": 12.2636, + "loss/aux_loss": 0.048106398433446884, + "loss/crossentropy": 2.785652810335159, + "loss/logits": 0.9184371441602707, + "step": 12890 + }, + { + "epoch": 0.129, + "grad_norm": 10.625, + "grad_norm_var": 0.30514322916666664, + "learning_rate": 0.0003, + "loss": 12.3059, + "loss/aux_loss": 0.048100420646369456, + "loss/crossentropy": 2.790884238481522, + "loss/logits": 0.9849524915218353, + "step": 12900 + }, + { + "epoch": 0.1291, + "grad_norm": 11.0625, + "grad_norm_var": 0.179150390625, + "learning_rate": 0.0003, + "loss": 12.4044, + "loss/aux_loss": 0.048105467297136786, + "loss/crossentropy": 2.791968286037445, + "loss/logits": 0.9593799233436584, + "step": 12910 + }, + { + "epoch": 0.1292, + "grad_norm": 10.25, + "grad_norm_var": 0.3337890625, + "learning_rate": 0.0003, + "loss": 12.4358, + "loss/aux_loss": 0.04811060018837452, + "loss/crossentropy": 2.9321176767349244, + "loss/logits": 0.9812934130430222, + "step": 12920 + }, + { + "epoch": 0.1293, + "grad_norm": 11.0625, + "grad_norm_var": 0.22784830729166666, + "learning_rate": 0.0003, + "loss": 12.4753, + "loss/aux_loss": 0.04810346253216267, + "loss/crossentropy": 2.8504304766654966, + "loss/logits": 0.9624378353357315, + "step": 12930 + }, + { + "epoch": 0.1294, + "grad_norm": 18.75, + "grad_norm_var": 4.043229166666666, + "learning_rate": 0.0003, + "loss": 12.5174, + "loss/aux_loss": 0.048096288181841376, + "loss/crossentropy": 2.861344063282013, + "loss/logits": 0.9707422107458115, + "step": 12940 + }, + { + "epoch": 0.1295, + "grad_norm": 11.1875, + "grad_norm_var": 3.975455729166667, + "learning_rate": 0.0003, + "loss": 12.413, + "loss/aux_loss": 0.048110200092196465, + "loss/crossentropy": 2.8129209518432616, + "loss/logits": 0.9287580490112305, + "step": 12950 + }, + { + "epoch": 0.1296, + "grad_norm": 11.0, + "grad_norm_var": 0.25338541666666664, + "learning_rate": 0.0003, + "loss": 12.3644, + "loss/aux_loss": 0.04810931608080864, + "loss/crossentropy": 2.802262395620346, + "loss/logits": 0.9195797771215439, + "step": 12960 + }, + { + "epoch": 0.1297, + "grad_norm": 11.1875, + "grad_norm_var": 0.5309895833333333, + "learning_rate": 0.0003, + "loss": 12.3496, + "loss/aux_loss": 0.048108757846057414, + "loss/crossentropy": 3.0164557695388794, + "loss/logits": 0.9846212476491928, + "step": 12970 + }, + { + "epoch": 0.1298, + "grad_norm": 10.875, + "grad_norm_var": 0.4212890625, + "learning_rate": 0.0003, + "loss": 12.382, + "loss/aux_loss": 0.04810013268142939, + "loss/crossentropy": 2.9002213299274446, + "loss/logits": 0.9503946632146836, + "step": 12980 + }, + { + "epoch": 0.1299, + "grad_norm": 10.375, + "grad_norm_var": 0.15167643229166666, + "learning_rate": 0.0003, + "loss": 12.3921, + "loss/aux_loss": 0.048111158050596715, + "loss/crossentropy": 2.7677676558494566, + "loss/logits": 0.921833261847496, + "step": 12990 + }, + { + "epoch": 0.13, + "grad_norm": 10.75, + "grad_norm_var": 0.388525390625, + "learning_rate": 0.0003, + "loss": 12.3999, + "loss/aux_loss": 0.048099796287715435, + "loss/crossentropy": 2.906938135623932, + "loss/logits": 0.9610762029886246, + "step": 13000 + }, + { + "epoch": 0.1301, + "grad_norm": 10.0625, + "grad_norm_var": 0.36139322916666666, + "learning_rate": 0.0003, + "loss": 12.6066, + "loss/aux_loss": 0.04810947496443987, + "loss/crossentropy": 2.892643666267395, + "loss/logits": 0.9925059139728546, + "step": 13010 + }, + { + "epoch": 0.1302, + "grad_norm": 10.875, + "grad_norm_var": 0.160400390625, + "learning_rate": 0.0003, + "loss": 12.2514, + "loss/aux_loss": 0.048100730404257774, + "loss/crossentropy": 2.7110345482826235, + "loss/logits": 0.931193083524704, + "step": 13020 + }, + { + "epoch": 0.1303, + "grad_norm": 11.6875, + "grad_norm_var": 0.546337890625, + "learning_rate": 0.0003, + "loss": 12.7064, + "loss/aux_loss": 0.04810579176992178, + "loss/crossentropy": 2.92630649805069, + "loss/logits": 0.9686931163072586, + "step": 13030 + }, + { + "epoch": 0.1304, + "grad_norm": 10.5, + "grad_norm_var": 0.7660807291666667, + "learning_rate": 0.0003, + "loss": 12.1971, + "loss/aux_loss": 0.04810760095715523, + "loss/crossentropy": 2.778294336795807, + "loss/logits": 0.9471270084381104, + "step": 13040 + }, + { + "epoch": 0.1305, + "grad_norm": 10.1875, + "grad_norm_var": 0.5150390625, + "learning_rate": 0.0003, + "loss": 12.5498, + "loss/aux_loss": 0.048095237277448175, + "loss/crossentropy": 2.8965494871139525, + "loss/logits": 0.9763620316982269, + "step": 13050 + }, + { + "epoch": 0.1306, + "grad_norm": 10.8125, + "grad_norm_var": 0.3341145833333333, + "learning_rate": 0.0003, + "loss": 12.268, + "loss/aux_loss": 0.04810608047991991, + "loss/crossentropy": 2.704496759176254, + "loss/logits": 0.9139129340648651, + "step": 13060 + }, + { + "epoch": 0.1307, + "grad_norm": 10.5625, + "grad_norm_var": 0.2833333333333333, + "learning_rate": 0.0003, + "loss": 12.3673, + "loss/aux_loss": 0.04811034444719553, + "loss/crossentropy": 2.8740296959877014, + "loss/logits": 0.9611405491828918, + "step": 13070 + }, + { + "epoch": 0.1308, + "grad_norm": 10.8125, + "grad_norm_var": 0.19138997395833332, + "learning_rate": 0.0003, + "loss": 12.4817, + "loss/aux_loss": 0.048108231462538245, + "loss/crossentropy": 2.746237635612488, + "loss/logits": 0.9631420075893402, + "step": 13080 + }, + { + "epoch": 0.1309, + "grad_norm": 10.9375, + "grad_norm_var": 0.26484375, + "learning_rate": 0.0003, + "loss": 12.4288, + "loss/aux_loss": 0.048110059648752215, + "loss/crossentropy": 2.974073600769043, + "loss/logits": 0.964441043138504, + "step": 13090 + }, + { + "epoch": 0.131, + "grad_norm": 11.75, + "grad_norm_var": 0.20130208333333333, + "learning_rate": 0.0003, + "loss": 12.4907, + "loss/aux_loss": 0.04810066521167755, + "loss/crossentropy": 2.760193109512329, + "loss/logits": 0.9369175344705581, + "step": 13100 + }, + { + "epoch": 0.1311, + "grad_norm": 11.125, + "grad_norm_var": 1.0639973958333333, + "learning_rate": 0.0003, + "loss": 12.464, + "loss/aux_loss": 0.04809979852288961, + "loss/crossentropy": 2.846820616722107, + "loss/logits": 0.9538910329341889, + "step": 13110 + }, + { + "epoch": 0.1312, + "grad_norm": 10.5625, + "grad_norm_var": 0.8995930989583333, + "learning_rate": 0.0003, + "loss": 12.4394, + "loss/aux_loss": 0.048106100782752036, + "loss/crossentropy": 2.9314664363861085, + "loss/logits": 0.94806087911129, + "step": 13120 + }, + { + "epoch": 0.1313, + "grad_norm": 11.375, + "grad_norm_var": 0.4202962239583333, + "learning_rate": 0.0003, + "loss": 12.3422, + "loss/aux_loss": 0.0481000566855073, + "loss/crossentropy": 2.8791940450668334, + "loss/logits": 0.9509881615638733, + "step": 13130 + }, + { + "epoch": 0.1314, + "grad_norm": 10.125, + "grad_norm_var": 0.27615559895833336, + "learning_rate": 0.0003, + "loss": 12.4757, + "loss/aux_loss": 0.048098215088248256, + "loss/crossentropy": 2.9675530552864076, + "loss/logits": 0.9818042993545533, + "step": 13140 + }, + { + "epoch": 0.1315, + "grad_norm": 10.625, + "grad_norm_var": 0.2618326822916667, + "learning_rate": 0.0003, + "loss": 12.336, + "loss/aux_loss": 0.04810485653579235, + "loss/crossentropy": 2.804446077346802, + "loss/logits": 0.9385320395231247, + "step": 13150 + }, + { + "epoch": 0.1316, + "grad_norm": 11.6875, + "grad_norm_var": 0.24368489583333333, + "learning_rate": 0.0003, + "loss": 12.4802, + "loss/aux_loss": 0.04809897020459175, + "loss/crossentropy": 2.8940049529075624, + "loss/logits": 0.9573934972286224, + "step": 13160 + }, + { + "epoch": 0.1317, + "grad_norm": 10.625, + "grad_norm_var": 0.30271809895833335, + "learning_rate": 0.0003, + "loss": 12.3512, + "loss/aux_loss": 0.04811223279684782, + "loss/crossentropy": 2.7365167438983917, + "loss/logits": 0.9326226800680161, + "step": 13170 + }, + { + "epoch": 0.1318, + "grad_norm": 10.625, + "grad_norm_var": 0.19765625, + "learning_rate": 0.0003, + "loss": 12.4848, + "loss/aux_loss": 0.048102138377726075, + "loss/crossentropy": 2.780118942260742, + "loss/logits": 0.9540079593658447, + "step": 13180 + }, + { + "epoch": 0.1319, + "grad_norm": 10.6875, + "grad_norm_var": 0.9747233072916667, + "learning_rate": 0.0003, + "loss": 12.3066, + "loss/aux_loss": 0.0481060640886426, + "loss/crossentropy": 2.848835837841034, + "loss/logits": 0.9211630582809448, + "step": 13190 + }, + { + "epoch": 0.132, + "grad_norm": 10.6875, + "grad_norm_var": 0.92421875, + "learning_rate": 0.0003, + "loss": 12.4287, + "loss/aux_loss": 0.048100389540195465, + "loss/crossentropy": 2.8563956737518312, + "loss/logits": 0.9615561842918396, + "step": 13200 + }, + { + "epoch": 0.1321, + "grad_norm": 10.8125, + "grad_norm_var": 0.39296875, + "learning_rate": 0.0003, + "loss": 12.4611, + "loss/aux_loss": 0.04809982106089592, + "loss/crossentropy": 2.99262011051178, + "loss/logits": 0.9975022733211517, + "step": 13210 + }, + { + "epoch": 0.1322, + "grad_norm": 10.0, + "grad_norm_var": 0.5723795572916667, + "learning_rate": 0.0003, + "loss": 12.3749, + "loss/aux_loss": 0.048114350996911526, + "loss/crossentropy": 2.715546762943268, + "loss/logits": 0.9087715715169906, + "step": 13220 + }, + { + "epoch": 0.1323, + "grad_norm": 11.4375, + "grad_norm_var": 0.8630045572916667, + "learning_rate": 0.0003, + "loss": 12.3739, + "loss/aux_loss": 0.048113764822483064, + "loss/crossentropy": 2.8105762124061586, + "loss/logits": 0.9538555532693863, + "step": 13230 + }, + { + "epoch": 0.1324, + "grad_norm": 11.9375, + "grad_norm_var": 19.576025390625, + "learning_rate": 0.0003, + "loss": 12.2992, + "loss/aux_loss": 0.04810525067150593, + "loss/crossentropy": 2.91482892036438, + "loss/logits": 0.9708419471979142, + "step": 13240 + }, + { + "epoch": 0.1325, + "grad_norm": 11.375, + "grad_norm_var": 0.3712076822916667, + "learning_rate": 0.0003, + "loss": 12.35, + "loss/aux_loss": 0.04810534752905369, + "loss/crossentropy": 2.8350456237792967, + "loss/logits": 0.9504047840833664, + "step": 13250 + }, + { + "epoch": 0.1326, + "grad_norm": 10.75, + "grad_norm_var": 0.229931640625, + "learning_rate": 0.0003, + "loss": 12.5435, + "loss/aux_loss": 0.048105095699429515, + "loss/crossentropy": 2.9701404571533203, + "loss/logits": 0.9582396388053894, + "step": 13260 + }, + { + "epoch": 0.1327, + "grad_norm": 10.6875, + "grad_norm_var": 0.201416015625, + "learning_rate": 0.0003, + "loss": 12.3695, + "loss/aux_loss": 0.04809423070400953, + "loss/crossentropy": 2.981611502170563, + "loss/logits": 0.9848940640687942, + "step": 13270 + }, + { + "epoch": 0.1328, + "grad_norm": 10.8125, + "grad_norm_var": 0.204931640625, + "learning_rate": 0.0003, + "loss": 12.2578, + "loss/aux_loss": 0.04810588490217924, + "loss/crossentropy": 2.786838227510452, + "loss/logits": 0.9377759993076324, + "step": 13280 + }, + { + "epoch": 0.1329, + "grad_norm": 10.3125, + "grad_norm_var": 0.354541015625, + "learning_rate": 0.0003, + "loss": 12.4597, + "loss/aux_loss": 0.04810259565711021, + "loss/crossentropy": 2.8435731649398805, + "loss/logits": 0.9304347574710846, + "step": 13290 + }, + { + "epoch": 0.133, + "grad_norm": 10.625, + "grad_norm_var": 0.2581868489583333, + "learning_rate": 0.0003, + "loss": 12.3977, + "loss/aux_loss": 0.0481047386303544, + "loss/crossentropy": 2.893440508842468, + "loss/logits": 0.9740776270627975, + "step": 13300 + }, + { + "epoch": 0.1331, + "grad_norm": 10.3125, + "grad_norm_var": 1.2949055989583333, + "learning_rate": 0.0003, + "loss": 12.3455, + "loss/aux_loss": 0.04810352213680744, + "loss/crossentropy": 2.6359397768974304, + "loss/logits": 0.9171884417533874, + "step": 13310 + }, + { + "epoch": 0.1332, + "grad_norm": 12.1875, + "grad_norm_var": 1.2700358072916667, + "learning_rate": 0.0003, + "loss": 12.3154, + "loss/aux_loss": 0.0481058057397604, + "loss/crossentropy": 2.766212022304535, + "loss/logits": 0.9359346807003022, + "step": 13320 + }, + { + "epoch": 0.1333, + "grad_norm": 10.0, + "grad_norm_var": 0.6723307291666667, + "learning_rate": 0.0003, + "loss": 12.4084, + "loss/aux_loss": 0.048116312362253666, + "loss/crossentropy": 2.84612637758255, + "loss/logits": 0.939299488067627, + "step": 13330 + }, + { + "epoch": 0.1334, + "grad_norm": 10.8125, + "grad_norm_var": 0.555322265625, + "learning_rate": 0.0003, + "loss": 12.4866, + "loss/aux_loss": 0.04810191094875336, + "loss/crossentropy": 2.958892011642456, + "loss/logits": 0.9613621711730957, + "step": 13340 + }, + { + "epoch": 0.1335, + "grad_norm": 10.875, + "grad_norm_var": 0.30149739583333335, + "learning_rate": 0.0003, + "loss": 12.4951, + "loss/aux_loss": 0.04811299704015255, + "loss/crossentropy": 2.949324941635132, + "loss/logits": 0.9427460253238678, + "step": 13350 + }, + { + "epoch": 0.1336, + "grad_norm": 10.0, + "grad_norm_var": 0.17420247395833333, + "learning_rate": 0.0003, + "loss": 12.3633, + "loss/aux_loss": 0.04811058808118105, + "loss/crossentropy": 2.9030325174331666, + "loss/logits": 0.9233207911252975, + "step": 13360 + }, + { + "epoch": 0.1337, + "grad_norm": 12.625, + "grad_norm_var": 0.3277180989583333, + "learning_rate": 0.0003, + "loss": 12.1642, + "loss/aux_loss": 0.04810114298015833, + "loss/crossentropy": 2.709307849407196, + "loss/logits": 0.9239124625921249, + "step": 13370 + }, + { + "epoch": 0.1338, + "grad_norm": 10.625, + "grad_norm_var": 0.30514322916666664, + "learning_rate": 0.0003, + "loss": 12.4912, + "loss/aux_loss": 0.04811024907976389, + "loss/crossentropy": 2.8618651926517487, + "loss/logits": 0.9447506815195084, + "step": 13380 + }, + { + "epoch": 0.1339, + "grad_norm": 11.6875, + "grad_norm_var": 0.19166666666666668, + "learning_rate": 0.0003, + "loss": 12.4706, + "loss/aux_loss": 0.04810334574431181, + "loss/crossentropy": 2.9332224130630493, + "loss/logits": 0.9420458465814591, + "step": 13390 + }, + { + "epoch": 0.134, + "grad_norm": 10.6875, + "grad_norm_var": 0.23748372395833334, + "learning_rate": 0.0003, + "loss": 12.2177, + "loss/aux_loss": 0.04811100345104933, + "loss/crossentropy": 2.896355766057968, + "loss/logits": 0.9644664227962494, + "step": 13400 + }, + { + "epoch": 0.1341, + "grad_norm": 11.0, + "grad_norm_var": 0.44021809895833336, + "learning_rate": 0.0003, + "loss": 12.4903, + "loss/aux_loss": 0.04810348581522703, + "loss/crossentropy": 2.7759104132652284, + "loss/logits": 0.9495417177677155, + "step": 13410 + }, + { + "epoch": 0.1342, + "grad_norm": 10.0, + "grad_norm_var": 0.492822265625, + "learning_rate": 0.0003, + "loss": 12.4067, + "loss/aux_loss": 0.04810683950781822, + "loss/crossentropy": 2.804324197769165, + "loss/logits": 0.9235481023788452, + "step": 13420 + }, + { + "epoch": 0.1343, + "grad_norm": 10.75, + "grad_norm_var": 1.1260416666666666, + "learning_rate": 0.0003, + "loss": 12.2612, + "loss/aux_loss": 0.04810345564037562, + "loss/crossentropy": 2.7631209015846254, + "loss/logits": 0.966147831082344, + "step": 13430 + }, + { + "epoch": 0.1344, + "grad_norm": 11.3125, + "grad_norm_var": 0.8817545572916666, + "learning_rate": 0.0003, + "loss": 12.3751, + "loss/aux_loss": 0.048112759739160536, + "loss/crossentropy": 2.9659682273864747, + "loss/logits": 0.9447232961654664, + "step": 13440 + }, + { + "epoch": 0.1345, + "grad_norm": 11.9375, + "grad_norm_var": 0.4984212239583333, + "learning_rate": 0.0003, + "loss": 12.3162, + "loss/aux_loss": 0.048107668198645114, + "loss/crossentropy": 2.83985230922699, + "loss/logits": 0.9504481822252273, + "step": 13450 + }, + { + "epoch": 0.1346, + "grad_norm": 10.625, + "grad_norm_var": 0.3667805989583333, + "learning_rate": 0.0003, + "loss": 12.1897, + "loss/aux_loss": 0.04810948856174946, + "loss/crossentropy": 2.7549479007720947, + "loss/logits": 0.9273734211921691, + "step": 13460 + }, + { + "epoch": 0.1347, + "grad_norm": 12.375, + "grad_norm_var": 0.31764322916666665, + "learning_rate": 0.0003, + "loss": 12.2581, + "loss/aux_loss": 0.0481115635484457, + "loss/crossentropy": 2.7301569998264315, + "loss/logits": 0.9274598181247711, + "step": 13470 + }, + { + "epoch": 0.1348, + "grad_norm": 11.625, + "grad_norm_var": 0.32146809895833334, + "learning_rate": 0.0003, + "loss": 12.2671, + "loss/aux_loss": 0.04809574782848358, + "loss/crossentropy": 2.7924930095672607, + "loss/logits": 0.9410725235939026, + "step": 13480 + }, + { + "epoch": 0.1349, + "grad_norm": 12.375, + "grad_norm_var": 0.30520833333333336, + "learning_rate": 0.0003, + "loss": 12.4427, + "loss/aux_loss": 0.048100709170103076, + "loss/crossentropy": 2.8529832124710084, + "loss/logits": 0.964218020439148, + "step": 13490 + }, + { + "epoch": 0.135, + "grad_norm": 11.6875, + "grad_norm_var": 0.48019205729166664, + "learning_rate": 0.0003, + "loss": 12.5486, + "loss/aux_loss": 0.04810782596468925, + "loss/crossentropy": 2.767691594362259, + "loss/logits": 0.9534753412008286, + "step": 13500 + }, + { + "epoch": 0.1351, + "grad_norm": 11.0625, + "grad_norm_var": 0.3384765625, + "learning_rate": 0.0003, + "loss": 12.4671, + "loss/aux_loss": 0.048108152486383914, + "loss/crossentropy": 2.840370202064514, + "loss/logits": 0.9720666646957398, + "step": 13510 + }, + { + "epoch": 0.1352, + "grad_norm": 10.6875, + "grad_norm_var": 0.40232747395833335, + "learning_rate": 0.0003, + "loss": 12.3131, + "loss/aux_loss": 0.04810761827975511, + "loss/crossentropy": 2.7722171783447265, + "loss/logits": 0.919560182094574, + "step": 13520 + }, + { + "epoch": 0.1353, + "grad_norm": 11.125, + "grad_norm_var": 0.27265625, + "learning_rate": 0.0003, + "loss": 12.3471, + "loss/aux_loss": 0.048098478280007836, + "loss/crossentropy": 2.7604997634887694, + "loss/logits": 0.9506667792797089, + "step": 13530 + }, + { + "epoch": 0.1354, + "grad_norm": 10.5625, + "grad_norm_var": 0.23385416666666667, + "learning_rate": 0.0003, + "loss": 12.5057, + "loss/aux_loss": 0.048096814006567, + "loss/crossentropy": 2.93693727850914, + "loss/logits": 1.006801837682724, + "step": 13540 + }, + { + "epoch": 0.1355, + "grad_norm": 10.5, + "grad_norm_var": 0.23357747395833334, + "learning_rate": 0.0003, + "loss": 12.4151, + "loss/aux_loss": 0.04810099713504314, + "loss/crossentropy": 2.9465499818325043, + "loss/logits": 0.9544957995414733, + "step": 13550 + }, + { + "epoch": 0.1356, + "grad_norm": 11.1875, + "grad_norm_var": 0.29322916666666665, + "learning_rate": 0.0003, + "loss": 12.4663, + "loss/aux_loss": 0.04810344949364662, + "loss/crossentropy": 2.783466875553131, + "loss/logits": 0.982630443572998, + "step": 13560 + }, + { + "epoch": 0.1357, + "grad_norm": 12.0, + "grad_norm_var": 0.36451822916666665, + "learning_rate": 0.0003, + "loss": 12.3713, + "loss/aux_loss": 0.04810951203107834, + "loss/crossentropy": 2.66209716796875, + "loss/logits": 0.9058698862791061, + "step": 13570 + }, + { + "epoch": 0.1358, + "grad_norm": 12.125, + "grad_norm_var": 0.5755208333333334, + "learning_rate": 0.0003, + "loss": 12.4161, + "loss/aux_loss": 0.04810614287853241, + "loss/crossentropy": 2.793833488225937, + "loss/logits": 0.908539018034935, + "step": 13580 + }, + { + "epoch": 0.1359, + "grad_norm": 10.6875, + "grad_norm_var": 0.36495768229166664, + "learning_rate": 0.0003, + "loss": 12.4122, + "loss/aux_loss": 0.048112927563488485, + "loss/crossentropy": 2.7475938618183138, + "loss/logits": 0.9510286509990692, + "step": 13590 + }, + { + "epoch": 0.136, + "grad_norm": 11.375, + "grad_norm_var": 0.23639322916666666, + "learning_rate": 0.0003, + "loss": 12.2558, + "loss/aux_loss": 0.048097974807024005, + "loss/crossentropy": 2.8546653985977173, + "loss/logits": 0.9764155447483063, + "step": 13600 + }, + { + "epoch": 0.1361, + "grad_norm": 10.25, + "grad_norm_var": 0.2652180989583333, + "learning_rate": 0.0003, + "loss": 12.3507, + "loss/aux_loss": 0.048107730224728584, + "loss/crossentropy": 2.7831094443798063, + "loss/logits": 0.933989730477333, + "step": 13610 + }, + { + "epoch": 0.1362, + "grad_norm": 10.8125, + "grad_norm_var": 0.27076822916666665, + "learning_rate": 0.0003, + "loss": 12.4364, + "loss/aux_loss": 0.04810614828020334, + "loss/crossentropy": 2.8577419936656954, + "loss/logits": 0.9404568552970887, + "step": 13620 + }, + { + "epoch": 0.1363, + "grad_norm": 10.875, + "grad_norm_var": 0.12902018229166667, + "learning_rate": 0.0003, + "loss": 12.2498, + "loss/aux_loss": 0.04811039827764034, + "loss/crossentropy": 2.732570058107376, + "loss/logits": 0.9277025848627091, + "step": 13630 + }, + { + "epoch": 0.1364, + "grad_norm": 11.9375, + "grad_norm_var": 0.34055989583333335, + "learning_rate": 0.0003, + "loss": 12.2871, + "loss/aux_loss": 0.04810627643018961, + "loss/crossentropy": 2.8617196679115295, + "loss/logits": 0.9472320884466171, + "step": 13640 + }, + { + "epoch": 0.1365, + "grad_norm": 11.1875, + "grad_norm_var": 0.4354166666666667, + "learning_rate": 0.0003, + "loss": 12.2645, + "loss/aux_loss": 0.048111869394779204, + "loss/crossentropy": 2.8335381925106047, + "loss/logits": 0.9731518387794494, + "step": 13650 + }, + { + "epoch": 0.1366, + "grad_norm": 10.9375, + "grad_norm_var": 0.267822265625, + "learning_rate": 0.0003, + "loss": 12.1778, + "loss/aux_loss": 0.048117116838693616, + "loss/crossentropy": 2.772057569026947, + "loss/logits": 0.9199839055538177, + "step": 13660 + }, + { + "epoch": 0.1367, + "grad_norm": 10.8125, + "grad_norm_var": 0.24348958333333334, + "learning_rate": 0.0003, + "loss": 12.2065, + "loss/aux_loss": 0.04810203909873963, + "loss/crossentropy": 2.8068443894386292, + "loss/logits": 0.9232182204723358, + "step": 13670 + }, + { + "epoch": 0.1368, + "grad_norm": 10.875, + "grad_norm_var": 0.48639322916666666, + "learning_rate": 0.0003, + "loss": 12.304, + "loss/aux_loss": 0.048113486543297765, + "loss/crossentropy": 2.8098415970802306, + "loss/logits": 0.9973312526941299, + "step": 13680 + }, + { + "epoch": 0.1369, + "grad_norm": 11.5625, + "grad_norm_var": 0.769775390625, + "learning_rate": 0.0003, + "loss": 12.3701, + "loss/aux_loss": 0.048099438101053237, + "loss/crossentropy": 2.9046237051486967, + "loss/logits": 0.9579191863536834, + "step": 13690 + }, + { + "epoch": 0.137, + "grad_norm": 10.6875, + "grad_norm_var": 0.4046875, + "learning_rate": 0.0003, + "loss": 12.563, + "loss/aux_loss": 0.04810395799577236, + "loss/crossentropy": 2.983587795495987, + "loss/logits": 0.9559501677751541, + "step": 13700 + }, + { + "epoch": 0.1371, + "grad_norm": 10.25, + "grad_norm_var": 0.6278483072916666, + "learning_rate": 0.0003, + "loss": 12.2734, + "loss/aux_loss": 0.04810773227363825, + "loss/crossentropy": 2.8191973209381103, + "loss/logits": 0.92610003054142, + "step": 13710 + }, + { + "epoch": 0.1372, + "grad_norm": 11.6875, + "grad_norm_var": 0.46066080729166664, + "learning_rate": 0.0003, + "loss": 12.3531, + "loss/aux_loss": 0.04809810016304254, + "loss/crossentropy": 2.871203887462616, + "loss/logits": 0.9389143049716949, + "step": 13720 + }, + { + "epoch": 0.1373, + "grad_norm": 11.125, + "grad_norm_var": 0.2565104166666667, + "learning_rate": 0.0003, + "loss": 12.2326, + "loss/aux_loss": 0.04810184333473444, + "loss/crossentropy": 2.794565808773041, + "loss/logits": 0.9445665180683136, + "step": 13730 + }, + { + "epoch": 0.1374, + "grad_norm": 11.9375, + "grad_norm_var": 0.22604166666666667, + "learning_rate": 0.0003, + "loss": 12.3533, + "loss/aux_loss": 0.048102909699082375, + "loss/crossentropy": 2.8416384637355803, + "loss/logits": 0.9654471457004548, + "step": 13740 + }, + { + "epoch": 0.1375, + "grad_norm": 10.1875, + "grad_norm_var": 0.3582682291666667, + "learning_rate": 0.0003, + "loss": 12.327, + "loss/aux_loss": 0.04810390882194042, + "loss/crossentropy": 2.8315653204917908, + "loss/logits": 0.9377313375473022, + "step": 13750 + }, + { + "epoch": 0.1376, + "grad_norm": 10.3125, + "grad_norm_var": 0.36769205729166665, + "learning_rate": 0.0003, + "loss": 12.4527, + "loss/aux_loss": 0.04810698907822371, + "loss/crossentropy": 2.719471883773804, + "loss/logits": 0.9382916927337647, + "step": 13760 + }, + { + "epoch": 0.1377, + "grad_norm": 11.125, + "grad_norm_var": 0.21555989583333332, + "learning_rate": 0.0003, + "loss": 12.2269, + "loss/aux_loss": 0.048097971081733706, + "loss/crossentropy": 2.861399304866791, + "loss/logits": 0.9493412613868714, + "step": 13770 + }, + { + "epoch": 0.1378, + "grad_norm": 11.75, + "grad_norm_var": 0.6538899739583334, + "learning_rate": 0.0003, + "loss": 12.3158, + "loss/aux_loss": 0.048107971996068956, + "loss/crossentropy": 2.840020203590393, + "loss/logits": 0.9154278337955475, + "step": 13780 + }, + { + "epoch": 0.1379, + "grad_norm": 10.6875, + "grad_norm_var": 0.37862955729166664, + "learning_rate": 0.0003, + "loss": 12.3169, + "loss/aux_loss": 0.04810533430427313, + "loss/crossentropy": 2.7927276849746705, + "loss/logits": 0.9583002328872681, + "step": 13790 + }, + { + "epoch": 0.138, + "grad_norm": 11.0, + "grad_norm_var": 0.22161458333333334, + "learning_rate": 0.0003, + "loss": 12.3317, + "loss/aux_loss": 0.04810004401952028, + "loss/crossentropy": 2.894696593284607, + "loss/logits": 0.9791848719120025, + "step": 13800 + }, + { + "epoch": 0.1381, + "grad_norm": 11.3125, + "grad_norm_var": 0.30201822916666665, + "learning_rate": 0.0003, + "loss": 12.3781, + "loss/aux_loss": 0.04811547808349133, + "loss/crossentropy": 2.8908798456192017, + "loss/logits": 0.894439697265625, + "step": 13810 + }, + { + "epoch": 0.1382, + "grad_norm": 12.0, + "grad_norm_var": 0.24230143229166667, + "learning_rate": 0.0003, + "loss": 12.3648, + "loss/aux_loss": 0.048103314451873304, + "loss/crossentropy": 2.6465035498142244, + "loss/logits": 0.9376543581485748, + "step": 13820 + }, + { + "epoch": 0.1383, + "grad_norm": 11.375, + "grad_norm_var": 0.21555989583333332, + "learning_rate": 0.0003, + "loss": 12.3217, + "loss/aux_loss": 0.048104156740009785, + "loss/crossentropy": 2.957222414016724, + "loss/logits": 0.9343441456556321, + "step": 13830 + }, + { + "epoch": 0.1384, + "grad_norm": 10.5, + "grad_norm_var": 0.33697916666666666, + "learning_rate": 0.0003, + "loss": 12.1056, + "loss/aux_loss": 0.04809317253530025, + "loss/crossentropy": 2.6575527429580688, + "loss/logits": 0.9211295455694198, + "step": 13840 + }, + { + "epoch": 0.1385, + "grad_norm": 12.25, + "grad_norm_var": 0.41139322916666665, + "learning_rate": 0.0003, + "loss": 12.3165, + "loss/aux_loss": 0.04810005649924278, + "loss/crossentropy": 2.8109684944152833, + "loss/logits": 0.9039525598287582, + "step": 13850 + }, + { + "epoch": 0.1386, + "grad_norm": 11.125, + "grad_norm_var": 0.30462239583333334, + "learning_rate": 0.0003, + "loss": 12.4473, + "loss/aux_loss": 0.048096515238285065, + "loss/crossentropy": 2.7612143099308013, + "loss/logits": 0.9181933552026749, + "step": 13860 + }, + { + "epoch": 0.1387, + "grad_norm": 10.4375, + "grad_norm_var": 0.15623372395833332, + "learning_rate": 0.0003, + "loss": 12.3176, + "loss/aux_loss": 0.04810582865029574, + "loss/crossentropy": 2.8384499669075014, + "loss/logits": 0.9379000872373581, + "step": 13870 + }, + { + "epoch": 0.1388, + "grad_norm": 11.0625, + "grad_norm_var": 0.33839518229166665, + "learning_rate": 0.0003, + "loss": 12.2429, + "loss/aux_loss": 0.04810143150389194, + "loss/crossentropy": 2.819534254074097, + "loss/logits": 0.9464493721723557, + "step": 13880 + }, + { + "epoch": 0.1389, + "grad_norm": 10.625, + "grad_norm_var": 0.3815104166666667, + "learning_rate": 0.0003, + "loss": 12.5049, + "loss/aux_loss": 0.04809562023729086, + "loss/crossentropy": 2.858844381570816, + "loss/logits": 0.9540527880191803, + "step": 13890 + }, + { + "epoch": 0.139, + "grad_norm": 11.5, + "grad_norm_var": 0.4141764322916667, + "learning_rate": 0.0003, + "loss": 12.3641, + "loss/aux_loss": 0.04810257162898779, + "loss/crossentropy": 2.8486937463283537, + "loss/logits": 0.962219363451004, + "step": 13900 + }, + { + "epoch": 0.1391, + "grad_norm": 10.75, + "grad_norm_var": 0.2528483072916667, + "learning_rate": 0.0003, + "loss": 12.3461, + "loss/aux_loss": 0.048107230477035044, + "loss/crossentropy": 2.6285312592983248, + "loss/logits": 0.9440800845623016, + "step": 13910 + }, + { + "epoch": 0.1392, + "grad_norm": 11.5, + "grad_norm_var": 0.6181640625, + "learning_rate": 0.0003, + "loss": 12.2632, + "loss/aux_loss": 0.048101754114031794, + "loss/crossentropy": 2.7042616248130797, + "loss/logits": 0.911108523607254, + "step": 13920 + }, + { + "epoch": 0.1393, + "grad_norm": 10.5, + "grad_norm_var": 0.391650390625, + "learning_rate": 0.0003, + "loss": 12.4164, + "loss/aux_loss": 0.048104698024690154, + "loss/crossentropy": 2.760655826330185, + "loss/logits": 0.9386287301778793, + "step": 13930 + }, + { + "epoch": 0.1394, + "grad_norm": 11.375, + "grad_norm_var": 0.17526041666666667, + "learning_rate": 0.0003, + "loss": 12.3058, + "loss/aux_loss": 0.04810354206711054, + "loss/crossentropy": 2.8063110530376436, + "loss/logits": 0.9448209255933762, + "step": 13940 + }, + { + "epoch": 0.1395, + "grad_norm": 10.6875, + "grad_norm_var": 0.106494140625, + "learning_rate": 0.0003, + "loss": 12.3086, + "loss/aux_loss": 0.048095726780593394, + "loss/crossentropy": 2.791933035850525, + "loss/logits": 0.9196500927209854, + "step": 13950 + }, + { + "epoch": 0.1396, + "grad_norm": 11.125, + "grad_norm_var": 0.4676432291666667, + "learning_rate": 0.0003, + "loss": 12.4126, + "loss/aux_loss": 0.04809935782104731, + "loss/crossentropy": 2.8098475694656373, + "loss/logits": 0.9401834368705749, + "step": 13960 + }, + { + "epoch": 0.1397, + "grad_norm": 11.75, + "grad_norm_var": 0.23430989583333334, + "learning_rate": 0.0003, + "loss": 11.9704, + "loss/aux_loss": 0.04809695854783058, + "loss/crossentropy": 2.8572974622249605, + "loss/logits": 0.9450656235218048, + "step": 13970 + }, + { + "epoch": 0.1398, + "grad_norm": 11.5, + "grad_norm_var": 0.5817708333333333, + "learning_rate": 0.0003, + "loss": 12.2933, + "loss/aux_loss": 0.04811172261834144, + "loss/crossentropy": 2.7184654772281647, + "loss/logits": 1.0148230105638505, + "step": 13980 + }, + { + "epoch": 0.1399, + "grad_norm": 10.3125, + "grad_norm_var": 0.6473307291666667, + "learning_rate": 0.0003, + "loss": 12.3804, + "loss/aux_loss": 0.048092580400407314, + "loss/crossentropy": 2.8070730805397033, + "loss/logits": 0.9323766380548477, + "step": 13990 + }, + { + "epoch": 0.14, + "grad_norm": 12.4375, + "grad_norm_var": 7.025260416666667, + "learning_rate": 0.0003, + "loss": 12.5511, + "loss/aux_loss": 0.048104867339134216, + "loss/crossentropy": 2.888067865371704, + "loss/logits": 0.9707460403442383, + "step": 14000 + }, + { + "epoch": 0.1401, + "grad_norm": 10.375, + "grad_norm_var": 6.564957682291666, + "learning_rate": 0.0003, + "loss": 12.2582, + "loss/aux_loss": 0.04810222536325455, + "loss/crossentropy": 2.671162748336792, + "loss/logits": 0.9309888124465943, + "step": 14010 + }, + { + "epoch": 0.1402, + "grad_norm": 12.25, + "grad_norm_var": 0.7313639322916666, + "learning_rate": 0.0003, + "loss": 12.3619, + "loss/aux_loss": 0.04810118060559034, + "loss/crossentropy": 2.8578037440776827, + "loss/logits": 0.9234833359718323, + "step": 14020 + }, + { + "epoch": 0.1403, + "grad_norm": 9.875, + "grad_norm_var": 0.919775390625, + "learning_rate": 0.0003, + "loss": 12.2416, + "loss/aux_loss": 0.04809358064085245, + "loss/crossentropy": 2.9598345518112184, + "loss/logits": 0.9572826653718949, + "step": 14030 + }, + { + "epoch": 0.1404, + "grad_norm": 10.9375, + "grad_norm_var": 0.23748372395833334, + "learning_rate": 0.0003, + "loss": 12.2382, + "loss/aux_loss": 0.04810269232839346, + "loss/crossentropy": 2.7237884759902955, + "loss/logits": 0.9275262981653214, + "step": 14040 + }, + { + "epoch": 0.1405, + "grad_norm": 13.3125, + "grad_norm_var": 0.4171875, + "learning_rate": 0.0003, + "loss": 12.6322, + "loss/aux_loss": 0.04810795094817877, + "loss/crossentropy": 2.8176922678947447, + "loss/logits": 0.9676473349332809, + "step": 14050 + }, + { + "epoch": 0.1406, + "grad_norm": 11.0, + "grad_norm_var": 0.517431640625, + "learning_rate": 0.0003, + "loss": 12.3324, + "loss/aux_loss": 0.04809824340045452, + "loss/crossentropy": 2.830560302734375, + "loss/logits": 0.9444521218538284, + "step": 14060 + }, + { + "epoch": 0.1407, + "grad_norm": 11.1875, + "grad_norm_var": 0.38448893229166664, + "learning_rate": 0.0003, + "loss": 12.2942, + "loss/aux_loss": 0.048107242211699486, + "loss/crossentropy": 2.8280949234962462, + "loss/logits": 0.9159689843654633, + "step": 14070 + }, + { + "epoch": 0.1408, + "grad_norm": 10.8125, + "grad_norm_var": 21.600244140625, + "learning_rate": 0.0003, + "loss": 12.1401, + "loss/aux_loss": 0.048114721104502677, + "loss/crossentropy": 2.805542439222336, + "loss/logits": 0.9222520262002945, + "step": 14080 + }, + { + "epoch": 0.1409, + "grad_norm": 10.875, + "grad_norm_var": 0.7968587239583333, + "learning_rate": 0.0003, + "loss": 12.3523, + "loss/aux_loss": 0.048097500950098036, + "loss/crossentropy": 2.820968973636627, + "loss/logits": 0.9572250634431839, + "step": 14090 + }, + { + "epoch": 0.141, + "grad_norm": 10.375, + "grad_norm_var": 0.4398274739583333, + "learning_rate": 0.0003, + "loss": 12.3109, + "loss/aux_loss": 0.04811274372041226, + "loss/crossentropy": 2.873396396636963, + "loss/logits": 0.9583924978971481, + "step": 14100 + }, + { + "epoch": 0.1411, + "grad_norm": 11.0, + "grad_norm_var": 0.26886393229166666, + "learning_rate": 0.0003, + "loss": 12.4389, + "loss/aux_loss": 0.04809671007096768, + "loss/crossentropy": 2.81766881942749, + "loss/logits": 0.9768635481595993, + "step": 14110 + }, + { + "epoch": 0.1412, + "grad_norm": 11.8125, + "grad_norm_var": 0.1962890625, + "learning_rate": 0.0003, + "loss": 12.124, + "loss/aux_loss": 0.048110161907970905, + "loss/crossentropy": 2.7933003425598146, + "loss/logits": 0.9018822848796845, + "step": 14120 + }, + { + "epoch": 0.1413, + "grad_norm": 11.8125, + "grad_norm_var": 0.478369140625, + "learning_rate": 0.0003, + "loss": 12.3726, + "loss/aux_loss": 0.04809680469334125, + "loss/crossentropy": 2.961318391561508, + "loss/logits": 0.9694911539554596, + "step": 14130 + }, + { + "epoch": 0.1414, + "grad_norm": 11.0625, + "grad_norm_var": 0.30388997395833334, + "learning_rate": 0.0003, + "loss": 12.3803, + "loss/aux_loss": 0.048111779242753984, + "loss/crossentropy": 2.7930466175079345, + "loss/logits": 0.9387133151292801, + "step": 14140 + }, + { + "epoch": 0.1415, + "grad_norm": 11.0625, + "grad_norm_var": 0.327587890625, + "learning_rate": 0.0003, + "loss": 12.3241, + "loss/aux_loss": 0.048099853470921515, + "loss/crossentropy": 2.7507693111896514, + "loss/logits": 0.923522162437439, + "step": 14150 + }, + { + "epoch": 0.1416, + "grad_norm": 9.8125, + "grad_norm_var": 0.2556640625, + "learning_rate": 0.0003, + "loss": 12.1408, + "loss/aux_loss": 0.04810427725315094, + "loss/crossentropy": 2.591277301311493, + "loss/logits": 0.9115919172763824, + "step": 14160 + }, + { + "epoch": 0.1417, + "grad_norm": 10.75, + "grad_norm_var": 0.335009765625, + "learning_rate": 0.0003, + "loss": 12.3768, + "loss/aux_loss": 0.04809947330504656, + "loss/crossentropy": 2.9588594675064086, + "loss/logits": 0.9485181331634521, + "step": 14170 + }, + { + "epoch": 0.1418, + "grad_norm": 10.75, + "grad_norm_var": 0.44698893229166664, + "learning_rate": 0.0003, + "loss": 12.4181, + "loss/aux_loss": 0.04810207560658455, + "loss/crossentropy": 2.9570438385009767, + "loss/logits": 0.939887073636055, + "step": 14180 + }, + { + "epoch": 0.1419, + "grad_norm": 11.8125, + "grad_norm_var": 0.3916015625, + "learning_rate": 0.0003, + "loss": 12.401, + "loss/aux_loss": 0.0481014484539628, + "loss/crossentropy": 2.912750172615051, + "loss/logits": 0.9410306662321091, + "step": 14190 + }, + { + "epoch": 0.142, + "grad_norm": 11.6875, + "grad_norm_var": 0.2747395833333333, + "learning_rate": 0.0003, + "loss": 12.3249, + "loss/aux_loss": 0.04810392800718546, + "loss/crossentropy": 2.840937912464142, + "loss/logits": 0.9358460456132889, + "step": 14200 + }, + { + "epoch": 0.1421, + "grad_norm": 10.875, + "grad_norm_var": 0.3424479166666667, + "learning_rate": 0.0003, + "loss": 12.1907, + "loss/aux_loss": 0.048108947835862635, + "loss/crossentropy": 2.7028416991233826, + "loss/logits": 0.9135666370391846, + "step": 14210 + }, + { + "epoch": 0.1422, + "grad_norm": 10.6875, + "grad_norm_var": 0.391650390625, + "learning_rate": 0.0003, + "loss": 12.2893, + "loss/aux_loss": 0.04810764603316784, + "loss/crossentropy": 2.7547565340995788, + "loss/logits": 0.9308681100606918, + "step": 14220 + }, + { + "epoch": 0.1423, + "grad_norm": 11.625, + "grad_norm_var": 0.134228515625, + "learning_rate": 0.0003, + "loss": 12.4046, + "loss/aux_loss": 0.0481036901473999, + "loss/crossentropy": 2.8777437806129456, + "loss/logits": 0.9518471479415893, + "step": 14230 + }, + { + "epoch": 0.1424, + "grad_norm": 11.9375, + "grad_norm_var": 0.343603515625, + "learning_rate": 0.0003, + "loss": 12.2732, + "loss/aux_loss": 0.04810591135174036, + "loss/crossentropy": 2.758294236660004, + "loss/logits": 0.9424175173044205, + "step": 14240 + }, + { + "epoch": 0.1425, + "grad_norm": 11.0, + "grad_norm_var": 0.365087890625, + "learning_rate": 0.0003, + "loss": 12.2751, + "loss/aux_loss": 0.04810457993298769, + "loss/crossentropy": 2.7782435297966, + "loss/logits": 0.9354342728853225, + "step": 14250 + }, + { + "epoch": 0.1426, + "grad_norm": 11.1875, + "grad_norm_var": 0.2275390625, + "learning_rate": 0.0003, + "loss": 12.3644, + "loss/aux_loss": 0.04809907414019108, + "loss/crossentropy": 2.6942915558815, + "loss/logits": 0.9781391233205795, + "step": 14260 + }, + { + "epoch": 0.1427, + "grad_norm": 11.25, + "grad_norm_var": 0.15818684895833332, + "learning_rate": 0.0003, + "loss": 12.2785, + "loss/aux_loss": 0.04809475652873516, + "loss/crossentropy": 2.8013816356658934, + "loss/logits": 0.9540988564491272, + "step": 14270 + }, + { + "epoch": 0.1428, + "grad_norm": 10.8125, + "grad_norm_var": 0.174072265625, + "learning_rate": 0.0003, + "loss": 12.274, + "loss/aux_loss": 0.048094392754137516, + "loss/crossentropy": 2.7784948647022247, + "loss/logits": 0.9767741382122039, + "step": 14280 + }, + { + "epoch": 0.1429, + "grad_norm": 12.1875, + "grad_norm_var": 0.49412434895833335, + "learning_rate": 0.0003, + "loss": 12.0625, + "loss/aux_loss": 0.04810249712318182, + "loss/crossentropy": 2.7680072247982026, + "loss/logits": 0.9276201993227005, + "step": 14290 + }, + { + "epoch": 0.143, + "grad_norm": 10.8125, + "grad_norm_var": 0.6501139322916667, + "learning_rate": 0.0003, + "loss": 12.377, + "loss/aux_loss": 0.04810411389917135, + "loss/crossentropy": 2.857174110412598, + "loss/logits": 0.9727380841970443, + "step": 14300 + }, + { + "epoch": 0.1431, + "grad_norm": 10.25, + "grad_norm_var": 0.37890625, + "learning_rate": 0.0003, + "loss": 12.3747, + "loss/aux_loss": 0.04810033030807972, + "loss/crossentropy": 2.8648359537124635, + "loss/logits": 0.9480609089136124, + "step": 14310 + }, + { + "epoch": 0.1432, + "grad_norm": 11.125, + "grad_norm_var": 0.42630208333333336, + "learning_rate": 0.0003, + "loss": 12.3662, + "loss/aux_loss": 0.04809709247201681, + "loss/crossentropy": 2.898632252216339, + "loss/logits": 0.9568489253520965, + "step": 14320 + }, + { + "epoch": 0.1433, + "grad_norm": 12.375, + "grad_norm_var": 0.4400390625, + "learning_rate": 0.0003, + "loss": 12.0264, + "loss/aux_loss": 0.048100917227566244, + "loss/crossentropy": 2.740042132139206, + "loss/logits": 0.9168058276176453, + "step": 14330 + }, + { + "epoch": 0.1434, + "grad_norm": 11.5, + "grad_norm_var": 0.25909830729166666, + "learning_rate": 0.0003, + "loss": 12.2735, + "loss/aux_loss": 0.048106574639678004, + "loss/crossentropy": 2.599212634563446, + "loss/logits": 0.9046968847513199, + "step": 14340 + }, + { + "epoch": 0.1435, + "grad_norm": 11.375, + "grad_norm_var": 0.18430989583333332, + "learning_rate": 0.0003, + "loss": 12.3895, + "loss/aux_loss": 0.0481049045920372, + "loss/crossentropy": 2.817890876531601, + "loss/logits": 0.9239292711019516, + "step": 14350 + }, + { + "epoch": 0.1436, + "grad_norm": 11.8125, + "grad_norm_var": 0.17701822916666668, + "learning_rate": 0.0003, + "loss": 12.1993, + "loss/aux_loss": 0.048096088133752345, + "loss/crossentropy": 2.841284441947937, + "loss/logits": 0.9078822374343872, + "step": 14360 + }, + { + "epoch": 0.1437, + "grad_norm": 12.1875, + "grad_norm_var": 0.334228515625, + "learning_rate": 0.0003, + "loss": 12.2995, + "loss/aux_loss": 0.04810677636414766, + "loss/crossentropy": 2.8144130051136016, + "loss/logits": 0.9448065549135208, + "step": 14370 + }, + { + "epoch": 0.1438, + "grad_norm": 10.75, + "grad_norm_var": 0.499462890625, + "learning_rate": 0.0003, + "loss": 12.3904, + "loss/aux_loss": 0.048093562759459016, + "loss/crossentropy": 2.7329901337623594, + "loss/logits": 0.9095306128263474, + "step": 14380 + }, + { + "epoch": 0.1439, + "grad_norm": 11.0, + "grad_norm_var": 0.3337890625, + "learning_rate": 0.0003, + "loss": 12.3279, + "loss/aux_loss": 0.048103582486510275, + "loss/crossentropy": 2.7921825289726256, + "loss/logits": 0.8989864021539689, + "step": 14390 + }, + { + "epoch": 0.144, + "grad_norm": 10.3125, + "grad_norm_var": 0.3791015625, + "learning_rate": 0.0003, + "loss": 12.2123, + "loss/aux_loss": 0.04809750877320766, + "loss/crossentropy": 2.757075470685959, + "loss/logits": 0.9241562187671661, + "step": 14400 + }, + { + "epoch": 0.1441, + "grad_norm": 11.5625, + "grad_norm_var": 0.3485514322916667, + "learning_rate": 0.0003, + "loss": 12.2931, + "loss/aux_loss": 0.04810139331966638, + "loss/crossentropy": 2.7922261476516725, + "loss/logits": 0.9176064521074295, + "step": 14410 + }, + { + "epoch": 0.1442, + "grad_norm": 11.875, + "grad_norm_var": 0.3316243489583333, + "learning_rate": 0.0003, + "loss": 12.254, + "loss/aux_loss": 0.04809432700276375, + "loss/crossentropy": 2.699055606126785, + "loss/logits": 0.922983717918396, + "step": 14420 + }, + { + "epoch": 0.1443, + "grad_norm": 13.25, + "grad_norm_var": 0.5127604166666667, + "learning_rate": 0.0003, + "loss": 12.3069, + "loss/aux_loss": 0.04811172112822533, + "loss/crossentropy": 2.6716023087501526, + "loss/logits": 0.916047015786171, + "step": 14430 + }, + { + "epoch": 0.1444, + "grad_norm": 12.0, + "grad_norm_var": 0.5277180989583333, + "learning_rate": 0.0003, + "loss": 12.063, + "loss/aux_loss": 0.048108107224106786, + "loss/crossentropy": 2.722955423593521, + "loss/logits": 0.9298226207494735, + "step": 14440 + }, + { + "epoch": 0.1445, + "grad_norm": 12.0, + "grad_norm_var": 0.6900390625, + "learning_rate": 0.0003, + "loss": 12.4038, + "loss/aux_loss": 0.04810595251619816, + "loss/crossentropy": 2.758984863758087, + "loss/logits": 0.9555283427238465, + "step": 14450 + }, + { + "epoch": 0.1446, + "grad_norm": 10.5, + "grad_norm_var": 1.40625, + "learning_rate": 0.0003, + "loss": 12.2967, + "loss/aux_loss": 0.04809891190379858, + "loss/crossentropy": 2.9647063076496125, + "loss/logits": 0.9323074251413346, + "step": 14460 + }, + { + "epoch": 0.1447, + "grad_norm": 11.0625, + "grad_norm_var": 0.2816243489583333, + "learning_rate": 0.0003, + "loss": 12.1597, + "loss/aux_loss": 0.048092216812074186, + "loss/crossentropy": 2.825933372974396, + "loss/logits": 0.9327166765928269, + "step": 14470 + }, + { + "epoch": 0.1448, + "grad_norm": 11.625, + "grad_norm_var": 0.21927083333333333, + "learning_rate": 0.0003, + "loss": 12.1911, + "loss/aux_loss": 0.048105007782578466, + "loss/crossentropy": 2.8361350774765013, + "loss/logits": 0.9484387129545212, + "step": 14480 + }, + { + "epoch": 0.1449, + "grad_norm": 12.125, + "grad_norm_var": 0.44895833333333335, + "learning_rate": 0.0003, + "loss": 12.2172, + "loss/aux_loss": 0.04810567460954189, + "loss/crossentropy": 3.038946294784546, + "loss/logits": 0.93484668135643, + "step": 14490 + }, + { + "epoch": 0.145, + "grad_norm": 12.4375, + "grad_norm_var": 0.190625, + "learning_rate": 0.0003, + "loss": 12.0355, + "loss/aux_loss": 0.04809667877852917, + "loss/crossentropy": 2.7166079699993135, + "loss/logits": 0.9272844552993774, + "step": 14500 + }, + { + "epoch": 0.1451, + "grad_norm": 10.8125, + "grad_norm_var": 0.3563639322916667, + "learning_rate": 0.0003, + "loss": 12.1147, + "loss/aux_loss": 0.04810612741857767, + "loss/crossentropy": 2.8721679210662843, + "loss/logits": 0.9418263047933578, + "step": 14510 + }, + { + "epoch": 0.1452, + "grad_norm": 13.75, + "grad_norm_var": 0.8372395833333334, + "learning_rate": 0.0003, + "loss": 12.294, + "loss/aux_loss": 0.0481014546006918, + "loss/crossentropy": 2.8245011150836943, + "loss/logits": 0.9494610846042633, + "step": 14520 + }, + { + "epoch": 0.1453, + "grad_norm": 12.375, + "grad_norm_var": 0.6234375, + "learning_rate": 0.0003, + "loss": 12.4112, + "loss/aux_loss": 0.048096208833158016, + "loss/crossentropy": 2.841428017616272, + "loss/logits": 0.9481588363647461, + "step": 14530 + }, + { + "epoch": 0.1454, + "grad_norm": 11.625, + "grad_norm_var": 0.2752604166666667, + "learning_rate": 0.0003, + "loss": 12.3017, + "loss/aux_loss": 0.048103177733719346, + "loss/crossentropy": 2.8085617065429687, + "loss/logits": 0.9153319448232651, + "step": 14540 + }, + { + "epoch": 0.1455, + "grad_norm": 10.3125, + "grad_norm_var": 0.7103515625, + "learning_rate": 0.0003, + "loss": 12.0693, + "loss/aux_loss": 0.04810036141425371, + "loss/crossentropy": 2.780474007129669, + "loss/logits": 0.9415223807096481, + "step": 14550 + }, + { + "epoch": 0.1456, + "grad_norm": 13.25, + "grad_norm_var": 0.590087890625, + "learning_rate": 0.0003, + "loss": 12.2025, + "loss/aux_loss": 0.04810147602111101, + "loss/crossentropy": 2.99658949971199, + "loss/logits": 0.9620972305536271, + "step": 14560 + }, + { + "epoch": 0.1457, + "grad_norm": 11.5625, + "grad_norm_var": 0.5958170572916667, + "learning_rate": 0.0003, + "loss": 12.2627, + "loss/aux_loss": 0.04810364861041307, + "loss/crossentropy": 2.804142338037491, + "loss/logits": 0.9306607961654663, + "step": 14570 + }, + { + "epoch": 0.1458, + "grad_norm": 11.5625, + "grad_norm_var": 0.718212890625, + "learning_rate": 0.0003, + "loss": 12.2289, + "loss/aux_loss": 0.048095517046749595, + "loss/crossentropy": 2.887796187400818, + "loss/logits": 0.9498396337032318, + "step": 14580 + }, + { + "epoch": 0.1459, + "grad_norm": 10.4375, + "grad_norm_var": 224.5869140625, + "learning_rate": 0.0003, + "loss": 12.2851, + "loss/aux_loss": 0.04810778181999922, + "loss/crossentropy": 2.8193862557411196, + "loss/logits": 0.9181474059820175, + "step": 14590 + }, + { + "epoch": 0.146, + "grad_norm": 10.625, + "grad_norm_var": 0.9703125, + "learning_rate": 0.0003, + "loss": 12.3751, + "loss/aux_loss": 0.048111128620803356, + "loss/crossentropy": 2.8784336388111114, + "loss/logits": 0.9728086590766907, + "step": 14600 + }, + { + "epoch": 0.1461, + "grad_norm": 11.25, + "grad_norm_var": 1.16015625, + "learning_rate": 0.0003, + "loss": 12.307, + "loss/aux_loss": 0.0481025354936719, + "loss/crossentropy": 2.7547997891902924, + "loss/logits": 0.9490894585847854, + "step": 14610 + }, + { + "epoch": 0.1462, + "grad_norm": 10.5, + "grad_norm_var": 0.7546875, + "learning_rate": 0.0003, + "loss": 12.2052, + "loss/aux_loss": 0.04810136090964079, + "loss/crossentropy": 2.8345041155815123, + "loss/logits": 0.932789009809494, + "step": 14620 + }, + { + "epoch": 0.1463, + "grad_norm": 10.6875, + "grad_norm_var": 0.5806640625, + "learning_rate": 0.0003, + "loss": 12.3757, + "loss/aux_loss": 0.048109573498368266, + "loss/crossentropy": 2.812575626373291, + "loss/logits": 0.909963321685791, + "step": 14630 + }, + { + "epoch": 0.1464, + "grad_norm": 10.5625, + "grad_norm_var": 0.7407389322916667, + "learning_rate": 0.0003, + "loss": 12.3286, + "loss/aux_loss": 0.04808946587145328, + "loss/crossentropy": 3.0018001079559324, + "loss/logits": 0.9122880339622498, + "step": 14640 + }, + { + "epoch": 0.1465, + "grad_norm": 11.5625, + "grad_norm_var": 0.2999348958333333, + "learning_rate": 0.0003, + "loss": 12.2271, + "loss/aux_loss": 0.04810205716639757, + "loss/crossentropy": 2.813987505435944, + "loss/logits": 0.9646568685770035, + "step": 14650 + }, + { + "epoch": 0.1466, + "grad_norm": 11.375, + "grad_norm_var": 0.30130208333333336, + "learning_rate": 0.0003, + "loss": 12.3144, + "loss/aux_loss": 0.04810758735984564, + "loss/crossentropy": 2.8299679458141327, + "loss/logits": 0.986625736951828, + "step": 14660 + }, + { + "epoch": 0.1467, + "grad_norm": 11.0625, + "grad_norm_var": 0.448291015625, + "learning_rate": 0.0003, + "loss": 12.1161, + "loss/aux_loss": 0.04809447340667248, + "loss/crossentropy": 2.72513552904129, + "loss/logits": 0.9294513940811158, + "step": 14670 + }, + { + "epoch": 0.1468, + "grad_norm": 12.1875, + "grad_norm_var": 0.6794270833333333, + "learning_rate": 0.0003, + "loss": 12.3315, + "loss/aux_loss": 0.048106583207845686, + "loss/crossentropy": 2.876382863521576, + "loss/logits": 0.9856162458658219, + "step": 14680 + }, + { + "epoch": 0.1469, + "grad_norm": 11.625, + "grad_norm_var": 0.43483072916666665, + "learning_rate": 0.0003, + "loss": 12.2266, + "loss/aux_loss": 0.04810109194368124, + "loss/crossentropy": 2.7526570439338682, + "loss/logits": 0.9561353415250778, + "step": 14690 + }, + { + "epoch": 0.147, + "grad_norm": 10.5, + "grad_norm_var": 0.3259765625, + "learning_rate": 0.0003, + "loss": 12.2576, + "loss/aux_loss": 0.04809536933898926, + "loss/crossentropy": 2.841619998216629, + "loss/logits": 0.9037140548229218, + "step": 14700 + }, + { + "epoch": 0.1471, + "grad_norm": 10.6875, + "grad_norm_var": 0.17081705729166666, + "learning_rate": 0.0003, + "loss": 12.1134, + "loss/aux_loss": 0.04811270516365766, + "loss/crossentropy": 2.8084808826446532, + "loss/logits": 0.8835839122533798, + "step": 14710 + }, + { + "epoch": 0.1472, + "grad_norm": 10.6875, + "grad_norm_var": 0.15703125, + "learning_rate": 0.0003, + "loss": 12.362, + "loss/aux_loss": 0.04809713140130043, + "loss/crossentropy": 2.7635149002075194, + "loss/logits": 0.9131356865167618, + "step": 14720 + }, + { + "epoch": 0.1473, + "grad_norm": 37.75, + "grad_norm_var": 43.606754557291666, + "learning_rate": 0.0003, + "loss": 12.3082, + "loss/aux_loss": 0.04809991512447596, + "loss/crossentropy": 2.6443506985902787, + "loss/logits": 0.9391460686922073, + "step": 14730 + }, + { + "epoch": 0.1474, + "grad_norm": 10.75, + "grad_norm_var": 43.39609375, + "learning_rate": 0.0003, + "loss": 12.1862, + "loss/aux_loss": 0.048105531558394435, + "loss/crossentropy": 2.745135086774826, + "loss/logits": 0.9022040009498596, + "step": 14740 + }, + { + "epoch": 0.1475, + "grad_norm": 11.125, + "grad_norm_var": 0.849072265625, + "learning_rate": 0.0003, + "loss": 12.2467, + "loss/aux_loss": 0.04809885751456022, + "loss/crossentropy": 2.8118128538131715, + "loss/logits": 0.9468438357114792, + "step": 14750 + }, + { + "epoch": 0.1476, + "grad_norm": 10.75, + "grad_norm_var": 0.8421223958333334, + "learning_rate": 0.0003, + "loss": 12.2405, + "loss/aux_loss": 0.04809672348201275, + "loss/crossentropy": 2.9226966857910157, + "loss/logits": 0.9427454113960266, + "step": 14760 + }, + { + "epoch": 0.1477, + "grad_norm": 10.75, + "grad_norm_var": 0.402978515625, + "learning_rate": 0.0003, + "loss": 12.0438, + "loss/aux_loss": 0.04809822179377079, + "loss/crossentropy": 2.7584859311580656, + "loss/logits": 0.9179711043834686, + "step": 14770 + }, + { + "epoch": 0.1478, + "grad_norm": 10.75, + "grad_norm_var": 0.32941080729166666, + "learning_rate": 0.0003, + "loss": 12.2503, + "loss/aux_loss": 0.04809885416179895, + "loss/crossentropy": 2.9167349338531494, + "loss/logits": 0.9420515596866608, + "step": 14780 + }, + { + "epoch": 0.1479, + "grad_norm": 11.625, + "grad_norm_var": 8.384098307291667, + "learning_rate": 0.0003, + "loss": 12.296, + "loss/aux_loss": 0.04811673872172832, + "loss/crossentropy": 2.6683114945888518, + "loss/logits": 0.9089529395103455, + "step": 14790 + }, + { + "epoch": 0.148, + "grad_norm": 11.0625, + "grad_norm_var": 8.616145833333333, + "learning_rate": 0.0003, + "loss": 12.4053, + "loss/aux_loss": 0.048098386451601985, + "loss/crossentropy": 2.920657384395599, + "loss/logits": 0.954785504937172, + "step": 14800 + }, + { + "epoch": 0.1481, + "grad_norm": 11.5625, + "grad_norm_var": 0.6645182291666667, + "learning_rate": 0.0003, + "loss": 12.32, + "loss/aux_loss": 0.048104028962552545, + "loss/crossentropy": 2.8948807239532472, + "loss/logits": 0.9660200357437134, + "step": 14810 + }, + { + "epoch": 0.1482, + "grad_norm": 10.8125, + "grad_norm_var": 0.3228515625, + "learning_rate": 0.0003, + "loss": 12.2445, + "loss/aux_loss": 0.04809078220278025, + "loss/crossentropy": 2.6194146156311033, + "loss/logits": 0.9160060435533524, + "step": 14820 + }, + { + "epoch": 0.1483, + "grad_norm": 11.5625, + "grad_norm_var": 0.3328125, + "learning_rate": 0.0003, + "loss": 12.2885, + "loss/aux_loss": 0.048103746958076954, + "loss/crossentropy": 2.754777866601944, + "loss/logits": 0.949258816242218, + "step": 14830 + }, + { + "epoch": 0.1484, + "grad_norm": 11.8125, + "grad_norm_var": 19.371077473958334, + "learning_rate": 0.0003, + "loss": 12.3542, + "loss/aux_loss": 0.0480975853279233, + "loss/crossentropy": 2.8865331768989564, + "loss/logits": 0.9503841429948807, + "step": 14840 + }, + { + "epoch": 0.1485, + "grad_norm": 11.5625, + "grad_norm_var": 19.502978515625, + "learning_rate": 0.0003, + "loss": 12.2103, + "loss/aux_loss": 0.04809964876621962, + "loss/crossentropy": 2.8561151921749115, + "loss/logits": 0.9401407986879349, + "step": 14850 + }, + { + "epoch": 0.1486, + "grad_norm": 11.3125, + "grad_norm_var": 0.692431640625, + "learning_rate": 0.0003, + "loss": 12.2275, + "loss/aux_loss": 0.04810830354690552, + "loss/crossentropy": 2.8200284421443937, + "loss/logits": 0.9532903909683228, + "step": 14860 + }, + { + "epoch": 0.1487, + "grad_norm": 12.4375, + "grad_norm_var": 20.823421223958334, + "learning_rate": 0.0003, + "loss": 12.2273, + "loss/aux_loss": 0.04810541290789842, + "loss/crossentropy": 2.6964865624904633, + "loss/logits": 0.9282492130994797, + "step": 14870 + }, + { + "epoch": 0.1488, + "grad_norm": 12.4375, + "grad_norm_var": 20.870247395833335, + "learning_rate": 0.0003, + "loss": 12.3184, + "loss/aux_loss": 0.04810648560523987, + "loss/crossentropy": 2.8158532321453094, + "loss/logits": 0.9431109875440598, + "step": 14880 + }, + { + "epoch": 0.1489, + "grad_norm": 10.5, + "grad_norm_var": 0.601025390625, + "learning_rate": 0.0003, + "loss": 12.2071, + "loss/aux_loss": 0.048101908154785634, + "loss/crossentropy": 2.803946614265442, + "loss/logits": 0.9242048561573029, + "step": 14890 + }, + { + "epoch": 0.149, + "grad_norm": 10.6875, + "grad_norm_var": 0.7249348958333334, + "learning_rate": 0.0003, + "loss": 12.0886, + "loss/aux_loss": 0.04810064677149058, + "loss/crossentropy": 2.778617113828659, + "loss/logits": 0.9423558235168457, + "step": 14900 + }, + { + "epoch": 0.1491, + "grad_norm": 11.75, + "grad_norm_var": 0.5096354166666667, + "learning_rate": 0.0003, + "loss": 12.1908, + "loss/aux_loss": 0.0481045451015234, + "loss/crossentropy": 2.854165458679199, + "loss/logits": 0.9379553228616715, + "step": 14910 + }, + { + "epoch": 0.1492, + "grad_norm": 11.6875, + "grad_norm_var": 0.46848958333333335, + "learning_rate": 0.0003, + "loss": 12.4479, + "loss/aux_loss": 0.04809947554022074, + "loss/crossentropy": 2.8008215487003327, + "loss/logits": 0.9356680005788803, + "step": 14920 + }, + { + "epoch": 0.1493, + "grad_norm": 11.6875, + "grad_norm_var": 0.13587239583333333, + "learning_rate": 0.0003, + "loss": 12.0882, + "loss/aux_loss": 0.04809720925986767, + "loss/crossentropy": 2.7476901531219484, + "loss/logits": 0.9420498043298722, + "step": 14930 + }, + { + "epoch": 0.1494, + "grad_norm": 10.9375, + "grad_norm_var": 0.208447265625, + "learning_rate": 0.0003, + "loss": 12.2413, + "loss/aux_loss": 0.0480950940400362, + "loss/crossentropy": 2.864052379131317, + "loss/logits": 0.9272116690874099, + "step": 14940 + }, + { + "epoch": 0.1495, + "grad_norm": 11.375, + "grad_norm_var": 0.311181640625, + "learning_rate": 0.0003, + "loss": 12.2616, + "loss/aux_loss": 0.04809942021965981, + "loss/crossentropy": 2.9187353610992433, + "loss/logits": 0.9510285437107087, + "step": 14950 + }, + { + "epoch": 0.1496, + "grad_norm": 11.9375, + "grad_norm_var": 47.15206705729167, + "learning_rate": 0.0003, + "loss": 12.3457, + "loss/aux_loss": 0.04811178985983133, + "loss/crossentropy": 2.7071347713470457, + "loss/logits": 0.9264847010374069, + "step": 14960 + }, + { + "epoch": 0.1497, + "grad_norm": 10.875, + "grad_norm_var": 0.9809895833333333, + "learning_rate": 0.0003, + "loss": 12.0873, + "loss/aux_loss": 0.048096916265785696, + "loss/crossentropy": 2.8192372739315035, + "loss/logits": 0.961195969581604, + "step": 14970 + }, + { + "epoch": 0.1498, + "grad_norm": 11.0, + "grad_norm_var": 0.7156087239583333, + "learning_rate": 0.0003, + "loss": 12.113, + "loss/aux_loss": 0.04811111818999052, + "loss/crossentropy": 2.698056328296661, + "loss/logits": 0.9076811641454696, + "step": 14980 + }, + { + "epoch": 0.1499, + "grad_norm": 12.0, + "grad_norm_var": 269.00714518229165, + "learning_rate": 0.0003, + "loss": 12.3343, + "loss/aux_loss": 0.048099159449338916, + "loss/crossentropy": 2.7637811422348024, + "loss/logits": 0.968485102057457, + "step": 14990 + }, + { + "epoch": 0.15, + "grad_norm": 11.375, + "grad_norm_var": 269.80128580729166, + "learning_rate": 0.0003, + "loss": 12.4506, + "loss/aux_loss": 0.04810250028967857, + "loss/crossentropy": 2.8541912317276, + "loss/logits": 0.987116688489914, + "step": 15000 + }, + { + "epoch": 0.1501, + "grad_norm": 10.75, + "grad_norm_var": 0.6130208333333333, + "learning_rate": 0.0003, + "loss": 12.2255, + "loss/aux_loss": 0.048107155971229075, + "loss/crossentropy": 2.7818135201931, + "loss/logits": 0.9167533338069915, + "step": 15010 + }, + { + "epoch": 0.1502, + "grad_norm": 11.3125, + "grad_norm_var": 0.13170572916666667, + "learning_rate": 0.0003, + "loss": 12.356, + "loss/aux_loss": 0.04810172915458679, + "loss/crossentropy": 2.940346974134445, + "loss/logits": 0.952643695473671, + "step": 15020 + }, + { + "epoch": 0.1503, + "grad_norm": 12.125, + "grad_norm_var": 0.5635416666666667, + "learning_rate": 0.0003, + "loss": 12.2117, + "loss/aux_loss": 0.04809871483594179, + "loss/crossentropy": 2.7255462646484374, + "loss/logits": 0.9134910553693771, + "step": 15030 + }, + { + "epoch": 0.1504, + "grad_norm": 12.0, + "grad_norm_var": 0.5828125, + "learning_rate": 0.0003, + "loss": 12.197, + "loss/aux_loss": 0.04809676483273506, + "loss/crossentropy": 2.9399500370025633, + "loss/logits": 0.9335614711046218, + "step": 15040 + }, + { + "epoch": 0.1505, + "grad_norm": 11.75, + "grad_norm_var": 0.296337890625, + "learning_rate": 0.0003, + "loss": 12.2855, + "loss/aux_loss": 0.048100493475794794, + "loss/crossentropy": 2.83375204205513, + "loss/logits": 0.9076710551977157, + "step": 15050 + }, + { + "epoch": 0.1506, + "grad_norm": 10.125, + "grad_norm_var": 1.1322916666666667, + "learning_rate": 0.0003, + "loss": 12.1571, + "loss/aux_loss": 0.0481040021404624, + "loss/crossentropy": 2.9388111233711243, + "loss/logits": 0.9607951223850251, + "step": 15060 + }, + { + "epoch": 0.1507, + "grad_norm": 11.875, + "grad_norm_var": 0.3541666666666667, + "learning_rate": 0.0003, + "loss": 12.4012, + "loss/aux_loss": 0.04810583982616663, + "loss/crossentropy": 2.875928044319153, + "loss/logits": 0.9627457737922669, + "step": 15070 + }, + { + "epoch": 0.1508, + "grad_norm": 10.6875, + "grad_norm_var": 0.32786458333333335, + "learning_rate": 0.0003, + "loss": 12.2652, + "loss/aux_loss": 0.04809319153428078, + "loss/crossentropy": 2.9540405869483948, + "loss/logits": 0.9330450028181076, + "step": 15080 + }, + { + "epoch": 0.1509, + "grad_norm": 11.625, + "grad_norm_var": 0.32760416666666664, + "learning_rate": 0.0003, + "loss": 12.1051, + "loss/aux_loss": 0.048104870691895486, + "loss/crossentropy": 2.8114062428474424, + "loss/logits": 0.9364354491233826, + "step": 15090 + }, + { + "epoch": 0.151, + "grad_norm": 12.0, + "grad_norm_var": 36.4619140625, + "learning_rate": 0.0003, + "loss": 12.2164, + "loss/aux_loss": 0.04811019506305456, + "loss/crossentropy": 2.859804928302765, + "loss/logits": 0.9465476185083389, + "step": 15100 + }, + { + "epoch": 0.1511, + "grad_norm": 10.9375, + "grad_norm_var": 35.42526041666667, + "learning_rate": 0.0003, + "loss": 12.3094, + "loss/aux_loss": 0.04809868466109037, + "loss/crossentropy": 2.8438979148864747, + "loss/logits": 0.8984570145606995, + "step": 15110 + }, + { + "epoch": 0.1512, + "grad_norm": 12.3125, + "grad_norm_var": 1.280712890625, + "learning_rate": 0.0003, + "loss": 12.2106, + "loss/aux_loss": 0.048102298937737945, + "loss/crossentropy": 2.8705235600471495, + "loss/logits": 0.9498428493738175, + "step": 15120 + }, + { + "epoch": 0.1513, + "grad_norm": 10.9375, + "grad_norm_var": 1.2218098958333334, + "learning_rate": 0.0003, + "loss": 12.208, + "loss/aux_loss": 0.04809999018907547, + "loss/crossentropy": 2.90715229511261, + "loss/logits": 0.9575346529483795, + "step": 15130 + }, + { + "epoch": 0.1514, + "grad_norm": 13.125, + "grad_norm_var": 0.46145833333333336, + "learning_rate": 0.0003, + "loss": 12.2519, + "loss/aux_loss": 0.04809638075530529, + "loss/crossentropy": 2.7828579187393188, + "loss/logits": 0.9503386884927749, + "step": 15140 + }, + { + "epoch": 0.1515, + "grad_norm": 11.5625, + "grad_norm_var": 0.3907389322916667, + "learning_rate": 0.0003, + "loss": 12.2184, + "loss/aux_loss": 0.04810054805129767, + "loss/crossentropy": 2.871291196346283, + "loss/logits": 0.9457465648651123, + "step": 15150 + }, + { + "epoch": 0.1516, + "grad_norm": 11.6875, + "grad_norm_var": 0.328369140625, + "learning_rate": 0.0003, + "loss": 12.1969, + "loss/aux_loss": 0.04810226745903492, + "loss/crossentropy": 2.773683416843414, + "loss/logits": 0.9155610114336014, + "step": 15160 + }, + { + "epoch": 0.1517, + "grad_norm": 10.875, + "grad_norm_var": 0.39036458333333335, + "learning_rate": 0.0003, + "loss": 12.1309, + "loss/aux_loss": 0.04810235556215048, + "loss/crossentropy": 2.908278775215149, + "loss/logits": 0.9158676236867904, + "step": 15170 + }, + { + "epoch": 0.1518, + "grad_norm": 10.875, + "grad_norm_var": 0.24264322916666667, + "learning_rate": 0.0003, + "loss": 12.1781, + "loss/aux_loss": 0.04809516854584217, + "loss/crossentropy": 2.737054407596588, + "loss/logits": 0.9324061542749404, + "step": 15180 + }, + { + "epoch": 0.1519, + "grad_norm": 10.6875, + "grad_norm_var": 24.646354166666665, + "learning_rate": 0.0003, + "loss": 12.1106, + "loss/aux_loss": 0.048110068589448926, + "loss/crossentropy": 2.820358157157898, + "loss/logits": 0.937472653388977, + "step": 15190 + }, + { + "epoch": 0.152, + "grad_norm": 12.1875, + "grad_norm_var": 1.719384765625, + "learning_rate": 0.0003, + "loss": 12.1341, + "loss/aux_loss": 0.04811493624001741, + "loss/crossentropy": 2.8123875498771667, + "loss/logits": 0.915363097190857, + "step": 15200 + }, + { + "epoch": 0.1521, + "grad_norm": 11.0625, + "grad_norm_var": 1.6275390625, + "learning_rate": 0.0003, + "loss": 12.0856, + "loss/aux_loss": 0.048097194731235506, + "loss/crossentropy": 2.9831456780433654, + "loss/logits": 0.9460885792970657, + "step": 15210 + }, + { + "epoch": 0.1522, + "grad_norm": 10.25, + "grad_norm_var": 0.3004557291666667, + "learning_rate": 0.0003, + "loss": 11.8513, + "loss/aux_loss": 0.04810490664094687, + "loss/crossentropy": 2.7776617228984835, + "loss/logits": 0.9056734681129456, + "step": 15220 + }, + { + "epoch": 0.1523, + "grad_norm": 11.25, + "grad_norm_var": 0.22263997395833332, + "learning_rate": 0.0003, + "loss": 12.0605, + "loss/aux_loss": 0.04809698183089495, + "loss/crossentropy": 2.8233635425567627, + "loss/logits": 0.9389057904481888, + "step": 15230 + }, + { + "epoch": 0.1524, + "grad_norm": 11.6875, + "grad_norm_var": 0.4488118489583333, + "learning_rate": 0.0003, + "loss": 12.1783, + "loss/aux_loss": 0.048117165453732014, + "loss/crossentropy": 2.66759774684906, + "loss/logits": 0.9018658816814422, + "step": 15240 + }, + { + "epoch": 0.1525, + "grad_norm": 11.75, + "grad_norm_var": 0.46295572916666666, + "learning_rate": 0.0003, + "loss": 12.1848, + "loss/aux_loss": 0.048101647198200224, + "loss/crossentropy": 2.803697109222412, + "loss/logits": 0.9530074447393417, + "step": 15250 + }, + { + "epoch": 0.1526, + "grad_norm": 10.9375, + "grad_norm_var": 0.25930989583333336, + "learning_rate": 0.0003, + "loss": 12.1231, + "loss/aux_loss": 0.04810661189258099, + "loss/crossentropy": 2.847998285293579, + "loss/logits": 0.9225472122430801, + "step": 15260 + }, + { + "epoch": 0.1527, + "grad_norm": 11.75, + "grad_norm_var": 0.14837239583333334, + "learning_rate": 0.0003, + "loss": 12.2114, + "loss/aux_loss": 0.048101527616381645, + "loss/crossentropy": 2.8020704984664917, + "loss/logits": 0.9532386660575867, + "step": 15270 + }, + { + "epoch": 0.1528, + "grad_norm": 11.625, + "grad_norm_var": 0.20701497395833332, + "learning_rate": 0.0003, + "loss": 11.9737, + "loss/aux_loss": 0.0480971185490489, + "loss/crossentropy": 2.984708344936371, + "loss/logits": 0.9079653114080429, + "step": 15280 + }, + { + "epoch": 0.1529, + "grad_norm": 11.1875, + "grad_norm_var": 0.22057291666666667, + "learning_rate": 0.0003, + "loss": 12.3004, + "loss/aux_loss": 0.0480941278859973, + "loss/crossentropy": 2.877718913555145, + "loss/logits": 0.9893636494874954, + "step": 15290 + }, + { + "epoch": 0.153, + "grad_norm": 11.0, + "grad_norm_var": 0.5619791666666667, + "learning_rate": 0.0003, + "loss": 12.3488, + "loss/aux_loss": 0.04810192976146936, + "loss/crossentropy": 2.6575785517692565, + "loss/logits": 0.9273690760135651, + "step": 15300 + }, + { + "epoch": 0.1531, + "grad_norm": 12.1875, + "grad_norm_var": 0.675, + "learning_rate": 0.0003, + "loss": 12.3104, + "loss/aux_loss": 0.048097644187510016, + "loss/crossentropy": 2.748287373781204, + "loss/logits": 0.948896062374115, + "step": 15310 + }, + { + "epoch": 0.1532, + "grad_norm": 11.125, + "grad_norm_var": 0.31953125, + "learning_rate": 0.0003, + "loss": 12.2604, + "loss/aux_loss": 0.04810179900377989, + "loss/crossentropy": 2.7924574255943297, + "loss/logits": 0.9476647943258285, + "step": 15320 + }, + { + "epoch": 0.1533, + "grad_norm": 12.0625, + "grad_norm_var": 0.2618326822916667, + "learning_rate": 0.0003, + "loss": 12.1864, + "loss/aux_loss": 0.04809467382729053, + "loss/crossentropy": 2.8050376057624815, + "loss/logits": 0.9532152026891708, + "step": 15330 + }, + { + "epoch": 0.1534, + "grad_norm": 11.625, + "grad_norm_var": 0.29140625, + "learning_rate": 0.0003, + "loss": 12.347, + "loss/aux_loss": 0.04810280818492174, + "loss/crossentropy": 2.8835896611213685, + "loss/logits": 0.9571986377239228, + "step": 15340 + }, + { + "epoch": 0.1535, + "grad_norm": 11.1875, + "grad_norm_var": 0.18333333333333332, + "learning_rate": 0.0003, + "loss": 12.2084, + "loss/aux_loss": 0.04809752386063337, + "loss/crossentropy": 2.749102717638016, + "loss/logits": 0.9065598905086517, + "step": 15350 + }, + { + "epoch": 0.1536, + "grad_norm": 11.4375, + "grad_norm_var": 0.21979166666666666, + "learning_rate": 0.0003, + "loss": 12.0297, + "loss/aux_loss": 0.04811021964997053, + "loss/crossentropy": 2.9079548954963683, + "loss/logits": 0.9038378298282623, + "step": 15360 + }, + { + "epoch": 0.1537, + "grad_norm": 11.625, + "grad_norm_var": 0.38743489583333335, + "learning_rate": 0.0003, + "loss": 12.2798, + "loss/aux_loss": 0.04810038134455681, + "loss/crossentropy": 2.696992439031601, + "loss/logits": 0.9243462920188904, + "step": 15370 + }, + { + "epoch": 0.1538, + "grad_norm": 10.6875, + "grad_norm_var": 0.419775390625, + "learning_rate": 0.0003, + "loss": 12.3808, + "loss/aux_loss": 0.048097037523984906, + "loss/crossentropy": 2.955533170700073, + "loss/logits": 0.962461119890213, + "step": 15380 + }, + { + "epoch": 0.1539, + "grad_norm": 11.625, + "grad_norm_var": 0.4515462239583333, + "learning_rate": 0.0003, + "loss": 12.255, + "loss/aux_loss": 0.04808609709143639, + "loss/crossentropy": 2.8893189787864686, + "loss/logits": 0.9806782245635987, + "step": 15390 + }, + { + "epoch": 0.154, + "grad_norm": 13.125, + "grad_norm_var": 0.5421875, + "learning_rate": 0.0003, + "loss": 12.2583, + "loss/aux_loss": 0.048098246194422246, + "loss/crossentropy": 3.016023313999176, + "loss/logits": 0.9697432667016983, + "step": 15400 + }, + { + "epoch": 0.1541, + "grad_norm": 11.1875, + "grad_norm_var": 0.5311848958333333, + "learning_rate": 0.0003, + "loss": 12.2399, + "loss/aux_loss": 0.04809315577149391, + "loss/crossentropy": 2.803581511974335, + "loss/logits": 0.9420624375343323, + "step": 15410 + }, + { + "epoch": 0.1542, + "grad_norm": 12.1875, + "grad_norm_var": 0.371728515625, + "learning_rate": 0.0003, + "loss": 12.2796, + "loss/aux_loss": 0.04809564612805843, + "loss/crossentropy": 2.9065362393856047, + "loss/logits": 0.9607432782649994, + "step": 15420 + }, + { + "epoch": 0.1543, + "grad_norm": 11.8125, + "grad_norm_var": 0.4650390625, + "learning_rate": 0.0003, + "loss": 12.2096, + "loss/aux_loss": 0.04809183832257986, + "loss/crossentropy": 2.902686321735382, + "loss/logits": 0.9689311563968659, + "step": 15430 + }, + { + "epoch": 0.1544, + "grad_norm": 15.0, + "grad_norm_var": 3.1499837239583335, + "learning_rate": 0.0003, + "loss": 12.1702, + "loss/aux_loss": 0.04809550140053034, + "loss/crossentropy": 2.842085200548172, + "loss/logits": 0.9201117724180221, + "step": 15440 + }, + { + "epoch": 0.1545, + "grad_norm": 12.0, + "grad_norm_var": 3.273811848958333, + "learning_rate": 0.0003, + "loss": 12.4202, + "loss/aux_loss": 0.048114397749304774, + "loss/crossentropy": 2.8335028886795044, + "loss/logits": 0.9513376891613007, + "step": 15450 + }, + { + "epoch": 0.1546, + "grad_norm": 11.0625, + "grad_norm_var": 0.345166015625, + "learning_rate": 0.0003, + "loss": 12.2035, + "loss/aux_loss": 0.04809577390551567, + "loss/crossentropy": 2.7710729837417603, + "loss/logits": 0.9169834047555924, + "step": 15460 + }, + { + "epoch": 0.1547, + "grad_norm": 10.5625, + "grad_norm_var": 0.6999837239583333, + "learning_rate": 0.0003, + "loss": 12.0555, + "loss/aux_loss": 0.04811387322843075, + "loss/crossentropy": 2.7559852480888365, + "loss/logits": 0.8992862313985824, + "step": 15470 + }, + { + "epoch": 0.1548, + "grad_norm": 11.0625, + "grad_norm_var": 0.3203125, + "learning_rate": 0.0003, + "loss": 12.2393, + "loss/aux_loss": 0.048102208971977235, + "loss/crossentropy": 2.8739076018333436, + "loss/logits": 0.8997802734375, + "step": 15480 + }, + { + "epoch": 0.1549, + "grad_norm": 12.3125, + "grad_norm_var": 0.24777018229166667, + "learning_rate": 0.0003, + "loss": 12.3324, + "loss/aux_loss": 0.04809423796832561, + "loss/crossentropy": 2.89512904882431, + "loss/logits": 0.9646374642848968, + "step": 15490 + }, + { + "epoch": 0.155, + "grad_norm": 11.6875, + "grad_norm_var": 10.139436848958333, + "learning_rate": 0.0003, + "loss": 12.278, + "loss/aux_loss": 0.04810593910515308, + "loss/crossentropy": 2.809836542606354, + "loss/logits": 0.9548726409673691, + "step": 15500 + }, + { + "epoch": 0.1551, + "grad_norm": 12.5625, + "grad_norm_var": 0.4507649739583333, + "learning_rate": 0.0003, + "loss": 12.1395, + "loss/aux_loss": 0.048092353343963626, + "loss/crossentropy": 2.638697361946106, + "loss/logits": 0.9311948031187057, + "step": 15510 + }, + { + "epoch": 0.1552, + "grad_norm": 11.625, + "grad_norm_var": 0.28359375, + "learning_rate": 0.0003, + "loss": 12.2342, + "loss/aux_loss": 0.048095655255019666, + "loss/crossentropy": 2.988735723495483, + "loss/logits": 0.9588959008455277, + "step": 15520 + }, + { + "epoch": 0.1553, + "grad_norm": 13.375, + "grad_norm_var": 150.09296875, + "learning_rate": 0.0003, + "loss": 12.2716, + "loss/aux_loss": 0.04810060281306505, + "loss/crossentropy": 2.6777639269828795, + "loss/logits": 0.9254604041576385, + "step": 15530 + }, + { + "epoch": 0.1554, + "grad_norm": 11.375, + "grad_norm_var": 0.410791015625, + "learning_rate": 0.0003, + "loss": 12.256, + "loss/aux_loss": 0.04809675142168999, + "loss/crossentropy": 2.9493250966072084, + "loss/logits": 0.9647281706333161, + "step": 15540 + }, + { + "epoch": 0.1555, + "grad_norm": 10.125, + "grad_norm_var": 0.28951822916666664, + "learning_rate": 0.0003, + "loss": 12.3685, + "loss/aux_loss": 0.048103061877191065, + "loss/crossentropy": 2.8565509915351868, + "loss/logits": 0.9633009701967239, + "step": 15550 + }, + { + "epoch": 0.1556, + "grad_norm": 10.875, + "grad_norm_var": 0.7700520833333333, + "learning_rate": 0.0003, + "loss": 12.2548, + "loss/aux_loss": 0.04810278750956058, + "loss/crossentropy": 2.889665186405182, + "loss/logits": 0.9235396683216095, + "step": 15560 + }, + { + "epoch": 0.1557, + "grad_norm": 11.5, + "grad_norm_var": 0.8440104166666667, + "learning_rate": 0.0003, + "loss": 12.3396, + "loss/aux_loss": 0.04810402244329452, + "loss/crossentropy": 2.737299156188965, + "loss/logits": 0.9336203277111054, + "step": 15570 + }, + { + "epoch": 0.1558, + "grad_norm": 11.125, + "grad_norm_var": 0.8192708333333333, + "learning_rate": 0.0003, + "loss": 12.1569, + "loss/aux_loss": 0.048098998703062536, + "loss/crossentropy": 2.78268221616745, + "loss/logits": 0.9026233315467834, + "step": 15580 + }, + { + "epoch": 0.1559, + "grad_norm": 11.6875, + "grad_norm_var": 0.13162434895833333, + "learning_rate": 0.0003, + "loss": 12.2533, + "loss/aux_loss": 0.048094463720917704, + "loss/crossentropy": 2.901452112197876, + "loss/logits": 0.9481937050819397, + "step": 15590 + }, + { + "epoch": 0.156, + "grad_norm": 10.5625, + "grad_norm_var": 0.5333170572916667, + "learning_rate": 0.0003, + "loss": 12.2767, + "loss/aux_loss": 0.048098971135914326, + "loss/crossentropy": 2.8950270414352417, + "loss/logits": 0.9641896247863769, + "step": 15600 + }, + { + "epoch": 0.1561, + "grad_norm": 11.625, + "grad_norm_var": 0.34178059895833335, + "learning_rate": 0.0003, + "loss": 11.981, + "loss/aux_loss": 0.04809776470065117, + "loss/crossentropy": 2.802956283092499, + "loss/logits": 0.920897588133812, + "step": 15610 + }, + { + "epoch": 0.1562, + "grad_norm": 10.4375, + "grad_norm_var": 0.1875, + "learning_rate": 0.0003, + "loss": 12.2279, + "loss/aux_loss": 0.04809428732842207, + "loss/crossentropy": 2.8816702008247375, + "loss/logits": 0.9599309653043747, + "step": 15620 + }, + { + "epoch": 0.1563, + "grad_norm": 11.5, + "grad_norm_var": 0.2686848958333333, + "learning_rate": 0.0003, + "loss": 12.1109, + "loss/aux_loss": 0.04809434395283461, + "loss/crossentropy": 2.7398535430431368, + "loss/logits": 0.8981555104255676, + "step": 15630 + }, + { + "epoch": 0.1564, + "grad_norm": 10.875, + "grad_norm_var": 0.31451822916666666, + "learning_rate": 0.0003, + "loss": 12.081, + "loss/aux_loss": 0.04809578433632851, + "loss/crossentropy": 2.823803460597992, + "loss/logits": 0.937409034371376, + "step": 15640 + }, + { + "epoch": 0.1565, + "grad_norm": 12.0625, + "grad_norm_var": 0.3103515625, + "learning_rate": 0.0003, + "loss": 12.0933, + "loss/aux_loss": 0.04810278974473477, + "loss/crossentropy": 2.9505991697311402, + "loss/logits": 0.9344629585742951, + "step": 15650 + }, + { + "epoch": 0.1566, + "grad_norm": 11.25, + "grad_norm_var": 0.3744140625, + "learning_rate": 0.0003, + "loss": 12.1932, + "loss/aux_loss": 0.04810452219098806, + "loss/crossentropy": 2.92813218832016, + "loss/logits": 0.9079012930393219, + "step": 15660 + }, + { + "epoch": 0.1567, + "grad_norm": 11.125, + "grad_norm_var": 0.23697916666666666, + "learning_rate": 0.0003, + "loss": 12.1024, + "loss/aux_loss": 0.04809817671775818, + "loss/crossentropy": 2.5720150113105773, + "loss/logits": 0.9325621664524079, + "step": 15670 + }, + { + "epoch": 0.1568, + "grad_norm": 12.375, + "grad_norm_var": 0.33515625, + "learning_rate": 0.0003, + "loss": 12.0574, + "loss/aux_loss": 0.04810387324541807, + "loss/crossentropy": 2.8633701324462892, + "loss/logits": 0.9684804528951645, + "step": 15680 + }, + { + "epoch": 0.1569, + "grad_norm": 11.1875, + "grad_norm_var": 0.33006184895833335, + "learning_rate": 0.0003, + "loss": 12.3274, + "loss/aux_loss": 0.04809674210846424, + "loss/crossentropy": 2.86969450712204, + "loss/logits": 0.9554843038320542, + "step": 15690 + }, + { + "epoch": 0.157, + "grad_norm": 10.125, + "grad_norm_var": 0.33839518229166665, + "learning_rate": 0.0003, + "loss": 12.1372, + "loss/aux_loss": 0.0481055686250329, + "loss/crossentropy": 2.640603184700012, + "loss/logits": 0.8951573967933655, + "step": 15700 + }, + { + "epoch": 0.1571, + "grad_norm": 12.5, + "grad_norm_var": 0.44244791666666666, + "learning_rate": 0.0003, + "loss": 12.2587, + "loss/aux_loss": 0.04809990283101797, + "loss/crossentropy": 2.864989972114563, + "loss/logits": 0.9769071489572525, + "step": 15710 + }, + { + "epoch": 0.1572, + "grad_norm": 11.4375, + "grad_norm_var": 0.28515625, + "learning_rate": 0.0003, + "loss": 12.0975, + "loss/aux_loss": 0.0480915404856205, + "loss/crossentropy": 2.7617894768714906, + "loss/logits": 0.9289013177156449, + "step": 15720 + }, + { + "epoch": 0.1573, + "grad_norm": 11.3125, + "grad_norm_var": 0.19816080729166666, + "learning_rate": 0.0003, + "loss": 12.2621, + "loss/aux_loss": 0.0480972645804286, + "loss/crossentropy": 2.8848094820976256, + "loss/logits": 0.9512290894985199, + "step": 15730 + }, + { + "epoch": 0.1574, + "grad_norm": 12.0, + "grad_norm_var": 0.5363118489583333, + "learning_rate": 0.0003, + "loss": 12.2609, + "loss/aux_loss": 0.04810263868421316, + "loss/crossentropy": 2.94705730676651, + "loss/logits": 0.9353223860263824, + "step": 15740 + }, + { + "epoch": 0.1575, + "grad_norm": 10.875, + "grad_norm_var": 0.45364583333333336, + "learning_rate": 0.0003, + "loss": 12.0892, + "loss/aux_loss": 0.048098064586520196, + "loss/crossentropy": 2.923324429988861, + "loss/logits": 0.9398457109928131, + "step": 15750 + }, + { + "epoch": 0.1576, + "grad_norm": 11.375, + "grad_norm_var": 0.22135416666666666, + "learning_rate": 0.0003, + "loss": 12.0815, + "loss/aux_loss": 0.048105406761169436, + "loss/crossentropy": 2.720612233877182, + "loss/logits": 0.8936576157808304, + "step": 15760 + }, + { + "epoch": 0.1577, + "grad_norm": 11.4375, + "grad_norm_var": 0.3462890625, + "learning_rate": 0.0003, + "loss": 12.2061, + "loss/aux_loss": 0.04810209292918444, + "loss/crossentropy": 2.7888991832733154, + "loss/logits": 0.9135987132787704, + "step": 15770 + }, + { + "epoch": 0.1578, + "grad_norm": 11.875, + "grad_norm_var": 0.219775390625, + "learning_rate": 0.0003, + "loss": 12.2186, + "loss/aux_loss": 0.048100620880723, + "loss/crossentropy": 2.94092253446579, + "loss/logits": 0.9436818659305573, + "step": 15780 + }, + { + "epoch": 0.1579, + "grad_norm": 12.6875, + "grad_norm_var": 0.28566080729166665, + "learning_rate": 0.0003, + "loss": 12.2812, + "loss/aux_loss": 0.04810180887579918, + "loss/crossentropy": 2.6856593787670135, + "loss/logits": 0.890146228671074, + "step": 15790 + }, + { + "epoch": 0.158, + "grad_norm": 10.75, + "grad_norm_var": 0.4161458333333333, + "learning_rate": 0.0003, + "loss": 12.1282, + "loss/aux_loss": 0.04810675587505102, + "loss/crossentropy": 2.824471127986908, + "loss/logits": 0.8990681618452072, + "step": 15800 + }, + { + "epoch": 0.1581, + "grad_norm": 11.25, + "grad_norm_var": 0.27706705729166664, + "learning_rate": 0.0003, + "loss": 12.2103, + "loss/aux_loss": 0.04810557030141353, + "loss/crossentropy": 2.780168378353119, + "loss/logits": 0.9444521903991699, + "step": 15810 + }, + { + "epoch": 0.1582, + "grad_norm": 11.5625, + "grad_norm_var": 0.5145670572916666, + "learning_rate": 0.0003, + "loss": 12.1687, + "loss/aux_loss": 0.04809657074511051, + "loss/crossentropy": 2.8092296421527863, + "loss/logits": 0.9068554252386093, + "step": 15820 + }, + { + "epoch": 0.1583, + "grad_norm": 11.25, + "grad_norm_var": 0.6791015625, + "learning_rate": 0.0003, + "loss": 11.9178, + "loss/aux_loss": 0.04810602068901062, + "loss/crossentropy": 2.8012136101722716, + "loss/logits": 0.8847994655370712, + "step": 15830 + }, + { + "epoch": 0.1584, + "grad_norm": 10.875, + "grad_norm_var": 0.4574055989583333, + "learning_rate": 0.0003, + "loss": 12.1666, + "loss/aux_loss": 0.04810144230723381, + "loss/crossentropy": 2.838590919971466, + "loss/logits": 0.9268758982419968, + "step": 15840 + }, + { + "epoch": 0.1585, + "grad_norm": 12.1875, + "grad_norm_var": 18.598893229166666, + "learning_rate": 0.0003, + "loss": 12.1954, + "loss/aux_loss": 0.04810195360332727, + "loss/crossentropy": 2.9161871790885927, + "loss/logits": 0.9759931951761246, + "step": 15850 + }, + { + "epoch": 0.1586, + "grad_norm": 12.25, + "grad_norm_var": 0.5593587239583333, + "learning_rate": 0.0003, + "loss": 12.1365, + "loss/aux_loss": 0.04810574501752853, + "loss/crossentropy": 2.8291844010353087, + "loss/logits": 0.9489723861217498, + "step": 15860 + }, + { + "epoch": 0.1587, + "grad_norm": 11.3125, + "grad_norm_var": 0.30050455729166664, + "learning_rate": 0.0003, + "loss": 12.0479, + "loss/aux_loss": 0.04810310564935207, + "loss/crossentropy": 2.7364535570144652, + "loss/logits": 0.9301041215658188, + "step": 15870 + }, + { + "epoch": 0.1588, + "grad_norm": 11.6875, + "grad_norm_var": 0.43865559895833334, + "learning_rate": 0.0003, + "loss": 12.1076, + "loss/aux_loss": 0.04809244927018881, + "loss/crossentropy": 2.7394683599472045, + "loss/logits": 0.9101533353328705, + "step": 15880 + }, + { + "epoch": 0.1589, + "grad_norm": 11.5625, + "grad_norm_var": 0.44453125, + "learning_rate": 0.0003, + "loss": 12.134, + "loss/aux_loss": 0.04809813145548105, + "loss/crossentropy": 2.774601572751999, + "loss/logits": 0.9330229997634888, + "step": 15890 + }, + { + "epoch": 0.159, + "grad_norm": 10.4375, + "grad_norm_var": 1.927978515625, + "learning_rate": 0.0003, + "loss": 12.1558, + "loss/aux_loss": 0.048109999299049376, + "loss/crossentropy": 2.6731720924377442, + "loss/logits": 0.8847457319498062, + "step": 15900 + }, + { + "epoch": 0.1591, + "grad_norm": 11.875, + "grad_norm_var": 1.7113932291666667, + "learning_rate": 0.0003, + "loss": 12.1998, + "loss/aux_loss": 0.04809703305363655, + "loss/crossentropy": 2.887303102016449, + "loss/logits": 0.9333222597837448, + "step": 15910 + }, + { + "epoch": 0.1592, + "grad_norm": 11.4375, + "grad_norm_var": 0.3822265625, + "learning_rate": 0.0003, + "loss": 12.2916, + "loss/aux_loss": 0.04809732548892498, + "loss/crossentropy": 2.969210720062256, + "loss/logits": 0.9471912950277328, + "step": 15920 + }, + { + "epoch": 0.1593, + "grad_norm": 10.4375, + "grad_norm_var": 0.4900390625, + "learning_rate": 0.0003, + "loss": 12.078, + "loss/aux_loss": 0.0480979910120368, + "loss/crossentropy": 2.7797336280345917, + "loss/logits": 0.8907982796430588, + "step": 15930 + }, + { + "epoch": 0.1594, + "grad_norm": 10.4375, + "grad_norm_var": 0.3692708333333333, + "learning_rate": 0.0003, + "loss": 12.0424, + "loss/aux_loss": 0.048096719570457934, + "loss/crossentropy": 2.7327013194561003, + "loss/logits": 0.898812472820282, + "step": 15940 + }, + { + "epoch": 0.1595, + "grad_norm": 15.3125, + "grad_norm_var": 124.77394205729166, + "learning_rate": 0.0003, + "loss": 12.2458, + "loss/aux_loss": 0.04809980187565088, + "loss/crossentropy": 2.7992530286312105, + "loss/logits": 0.9126897126436233, + "step": 15950 + }, + { + "epoch": 0.1596, + "grad_norm": 11.0, + "grad_norm_var": 125.29425455729167, + "learning_rate": 0.0003, + "loss": 12.3109, + "loss/aux_loss": 0.04810700826346874, + "loss/crossentropy": 2.7022558569908144, + "loss/logits": 0.965540987253189, + "step": 15960 + }, + { + "epoch": 0.1597, + "grad_norm": 11.0625, + "grad_norm_var": 0.098291015625, + "learning_rate": 0.0003, + "loss": 11.9925, + "loss/aux_loss": 0.04811773002147675, + "loss/crossentropy": 2.837008905410767, + "loss/logits": 0.9231843024492263, + "step": 15970 + }, + { + "epoch": 0.1598, + "grad_norm": 11.0625, + "grad_norm_var": 0.2630208333333333, + "learning_rate": 0.0003, + "loss": 12.165, + "loss/aux_loss": 0.04810309894382954, + "loss/crossentropy": 2.728328824043274, + "loss/logits": 0.9210437297821045, + "step": 15980 + }, + { + "epoch": 0.1599, + "grad_norm": 11.0, + "grad_norm_var": 0.19295247395833334, + "learning_rate": 0.0003, + "loss": 12.4097, + "loss/aux_loss": 0.048101219907402994, + "loss/crossentropy": 2.9919423699378966, + "loss/logits": 0.926442277431488, + "step": 15990 + }, + { + "epoch": 0.16, + "grad_norm": 10.875, + "grad_norm_var": 0.126025390625, + "learning_rate": 0.0003, + "loss": 12.2974, + "loss/aux_loss": 0.04809750020503998, + "loss/crossentropy": 2.9499990582466125, + "loss/logits": 0.987061333656311, + "step": 16000 + }, + { + "epoch": 0.1601, + "grad_norm": 12.4375, + "grad_norm_var": 0.20911458333333333, + "learning_rate": 0.0003, + "loss": 12.2875, + "loss/aux_loss": 0.04809744451195001, + "loss/crossentropy": 2.8682973265647886, + "loss/logits": 0.9318195432424545, + "step": 16010 + }, + { + "epoch": 0.1602, + "grad_norm": 11.5625, + "grad_norm_var": 0.2109375, + "learning_rate": 0.0003, + "loss": 12.1403, + "loss/aux_loss": 0.0480994550511241, + "loss/crossentropy": 2.770287108421326, + "loss/logits": 0.8830248892307282, + "step": 16020 + }, + { + "epoch": 0.1603, + "grad_norm": 11.0625, + "grad_norm_var": 0.4239420572916667, + "learning_rate": 0.0003, + "loss": 12.1359, + "loss/aux_loss": 0.04810148365795612, + "loss/crossentropy": 2.974120169878006, + "loss/logits": 0.9349057674407959, + "step": 16030 + }, + { + "epoch": 0.1604, + "grad_norm": 10.8125, + "grad_norm_var": 0.47734375, + "learning_rate": 0.0003, + "loss": 12.0298, + "loss/aux_loss": 0.048095306381583214, + "loss/crossentropy": 2.8043901443481447, + "loss/logits": 0.9003081053495408, + "step": 16040 + }, + { + "epoch": 0.1605, + "grad_norm": 11.6875, + "grad_norm_var": 0.278759765625, + "learning_rate": 0.0003, + "loss": 12.2773, + "loss/aux_loss": 0.04809126928448677, + "loss/crossentropy": 2.9143420457839966, + "loss/logits": 0.9333951026201248, + "step": 16050 + }, + { + "epoch": 0.1606, + "grad_norm": 10.9375, + "grad_norm_var": 0.4009765625, + "learning_rate": 0.0003, + "loss": 12.0947, + "loss/aux_loss": 0.048095472529530524, + "loss/crossentropy": 2.952311336994171, + "loss/logits": 0.9581076145172119, + "step": 16060 + }, + { + "epoch": 0.1607, + "grad_norm": 11.125, + "grad_norm_var": 0.2712890625, + "learning_rate": 0.0003, + "loss": 12.2128, + "loss/aux_loss": 0.048100481182336806, + "loss/crossentropy": 2.8516422152519225, + "loss/logits": 0.9667297631502152, + "step": 16070 + }, + { + "epoch": 0.1608, + "grad_norm": 11.8125, + "grad_norm_var": 0.1984375, + "learning_rate": 0.0003, + "loss": 12.2553, + "loss/aux_loss": 0.04809715617448092, + "loss/crossentropy": 2.77035049200058, + "loss/logits": 0.9237784296274185, + "step": 16080 + }, + { + "epoch": 0.1609, + "grad_norm": 11.3125, + "grad_norm_var": 0.3114420572916667, + "learning_rate": 0.0003, + "loss": 12.0383, + "loss/aux_loss": 0.04809458721429109, + "loss/crossentropy": 2.816128599643707, + "loss/logits": 0.9450860530138016, + "step": 16090 + }, + { + "epoch": 0.161, + "grad_norm": 11.4375, + "grad_norm_var": 0.33839518229166665, + "learning_rate": 0.0003, + "loss": 12.1705, + "loss/aux_loss": 0.048097101412713526, + "loss/crossentropy": 2.6644342601299287, + "loss/logits": 0.9285436570644379, + "step": 16100 + }, + { + "epoch": 0.1611, + "grad_norm": 11.9375, + "grad_norm_var": 0.247900390625, + "learning_rate": 0.0003, + "loss": 12.3188, + "loss/aux_loss": 0.0481021337211132, + "loss/crossentropy": 2.8370134472846984, + "loss/logits": 0.9432329386472702, + "step": 16110 + }, + { + "epoch": 0.1612, + "grad_norm": 11.375, + "grad_norm_var": 0.17928059895833334, + "learning_rate": 0.0003, + "loss": 12.2744, + "loss/aux_loss": 0.04810014273971319, + "loss/crossentropy": 2.829314595460892, + "loss/logits": 0.9058397889137269, + "step": 16120 + }, + { + "epoch": 0.1613, + "grad_norm": 11.625, + "grad_norm_var": 0.3004557291666667, + "learning_rate": 0.0003, + "loss": 12.1699, + "loss/aux_loss": 0.048092078790068625, + "loss/crossentropy": 2.7174839854240416, + "loss/logits": 0.9196837037801743, + "step": 16130 + }, + { + "epoch": 0.1614, + "grad_norm": 10.625, + "grad_norm_var": 0.5082682291666667, + "learning_rate": 0.0003, + "loss": 12.1341, + "loss/aux_loss": 0.04810208380222321, + "loss/crossentropy": 2.823376166820526, + "loss/logits": 0.9344120264053345, + "step": 16140 + }, + { + "epoch": 0.1615, + "grad_norm": 10.75, + "grad_norm_var": 0.36847330729166666, + "learning_rate": 0.0003, + "loss": 12.1209, + "loss/aux_loss": 0.04809723366051912, + "loss/crossentropy": 2.7950705885887146, + "loss/logits": 0.9158975452184677, + "step": 16150 + }, + { + "epoch": 0.1616, + "grad_norm": 11.25, + "grad_norm_var": 0.3140625, + "learning_rate": 0.0003, + "loss": 12.1399, + "loss/aux_loss": 0.04809668511152267, + "loss/crossentropy": 2.8807433605194093, + "loss/logits": 0.9192746669054032, + "step": 16160 + }, + { + "epoch": 0.1617, + "grad_norm": 13.375, + "grad_norm_var": 0.4009765625, + "learning_rate": 0.0003, + "loss": 12.2518, + "loss/aux_loss": 0.04810331519693136, + "loss/crossentropy": 2.7547273516654966, + "loss/logits": 0.9467386364936828, + "step": 16170 + }, + { + "epoch": 0.1618, + "grad_norm": 11.0625, + "grad_norm_var": 0.496875, + "learning_rate": 0.0003, + "loss": 12.2401, + "loss/aux_loss": 0.04808684252202511, + "loss/crossentropy": 2.847959554195404, + "loss/logits": 0.9240302503108978, + "step": 16180 + }, + { + "epoch": 0.1619, + "grad_norm": 11.0625, + "grad_norm_var": 0.312353515625, + "learning_rate": 0.0003, + "loss": 12.2344, + "loss/aux_loss": 0.048099389672279357, + "loss/crossentropy": 2.845101058483124, + "loss/logits": 0.942423290014267, + "step": 16190 + }, + { + "epoch": 0.162, + "grad_norm": 10.6875, + "grad_norm_var": 0.22029622395833334, + "learning_rate": 0.0003, + "loss": 11.9915, + "loss/aux_loss": 0.04809580724686384, + "loss/crossentropy": 2.7099331617355347, + "loss/logits": 0.9017234027385712, + "step": 16200 + }, + { + "epoch": 0.1621, + "grad_norm": 10.9375, + "grad_norm_var": 0.5179524739583333, + "learning_rate": 0.0003, + "loss": 11.9946, + "loss/aux_loss": 0.048097424954175946, + "loss/crossentropy": 2.884207457304001, + "loss/logits": 0.8981190234422683, + "step": 16210 + }, + { + "epoch": 0.1622, + "grad_norm": 11.875, + "grad_norm_var": 0.238134765625, + "learning_rate": 0.0003, + "loss": 12.2356, + "loss/aux_loss": 0.04810638912022114, + "loss/crossentropy": 2.747766560316086, + "loss/logits": 0.9162409037351609, + "step": 16220 + }, + { + "epoch": 0.1623, + "grad_norm": 10.5, + "grad_norm_var": 0.13644205729166667, + "learning_rate": 0.0003, + "loss": 11.9741, + "loss/aux_loss": 0.04809376634657383, + "loss/crossentropy": 2.8902024030685425, + "loss/logits": 0.936252373456955, + "step": 16230 + }, + { + "epoch": 0.1624, + "grad_norm": 11.8125, + "grad_norm_var": 0.366259765625, + "learning_rate": 0.0003, + "loss": 12.3348, + "loss/aux_loss": 0.048098774440586564, + "loss/crossentropy": 2.803633749485016, + "loss/logits": 0.973576670885086, + "step": 16240 + }, + { + "epoch": 0.1625, + "grad_norm": 12.1875, + "grad_norm_var": 0.278369140625, + "learning_rate": 0.0003, + "loss": 12.0583, + "loss/aux_loss": 0.04810395650565624, + "loss/crossentropy": 2.8404315412044525, + "loss/logits": 0.9931963056325912, + "step": 16250 + }, + { + "epoch": 0.1626, + "grad_norm": 12.375, + "grad_norm_var": 0.3738932291666667, + "learning_rate": 0.0003, + "loss": 12.1916, + "loss/aux_loss": 0.048100400157272814, + "loss/crossentropy": 2.768079376220703, + "loss/logits": 0.9406552851200104, + "step": 16260 + }, + { + "epoch": 0.1627, + "grad_norm": 11.3125, + "grad_norm_var": 0.28899739583333334, + "learning_rate": 0.0003, + "loss": 12.0582, + "loss/aux_loss": 0.04809861071407795, + "loss/crossentropy": 2.713436472415924, + "loss/logits": 0.9488321393728256, + "step": 16270 + }, + { + "epoch": 0.1628, + "grad_norm": 11.375, + "grad_norm_var": 0.380322265625, + "learning_rate": 0.0003, + "loss": 12.0068, + "loss/aux_loss": 0.04809640850871801, + "loss/crossentropy": 2.780947434902191, + "loss/logits": 0.9337312400341033, + "step": 16280 + }, + { + "epoch": 0.1629, + "grad_norm": 11.875, + "grad_norm_var": 0.439306640625, + "learning_rate": 0.0003, + "loss": 12.1232, + "loss/aux_loss": 0.04809816125780344, + "loss/crossentropy": 2.8880489349365233, + "loss/logits": 0.929016700387001, + "step": 16290 + }, + { + "epoch": 0.163, + "grad_norm": 11.6875, + "grad_norm_var": 0.3094889322916667, + "learning_rate": 0.0003, + "loss": 11.988, + "loss/aux_loss": 0.048106462322175504, + "loss/crossentropy": 2.9447537541389464, + "loss/logits": 0.9289344936609268, + "step": 16300 + }, + { + "epoch": 0.1631, + "grad_norm": 11.5625, + "grad_norm_var": 19.153059895833334, + "learning_rate": 0.0003, + "loss": 12.2841, + "loss/aux_loss": 0.04810431189835072, + "loss/crossentropy": 2.7848674178123476, + "loss/logits": 0.9472992300987244, + "step": 16310 + }, + { + "epoch": 0.1632, + "grad_norm": 12.0, + "grad_norm_var": 17.702604166666667, + "learning_rate": 0.0003, + "loss": 12.2275, + "loss/aux_loss": 0.04810850899666548, + "loss/crossentropy": 2.7362507581710815, + "loss/logits": 0.9090913355350494, + "step": 16320 + }, + { + "epoch": 0.1633, + "grad_norm": 10.5, + "grad_norm_var": 1.0449055989583333, + "learning_rate": 0.0003, + "loss": 12.1755, + "loss/aux_loss": 0.04809928461909294, + "loss/crossentropy": 2.870719301700592, + "loss/logits": 0.953993484377861, + "step": 16330 + }, + { + "epoch": 0.1634, + "grad_norm": 11.9375, + "grad_norm_var": 0.270166015625, + "learning_rate": 0.0003, + "loss": 12.1613, + "loss/aux_loss": 0.04809772912412882, + "loss/crossentropy": 2.730927813053131, + "loss/logits": 0.9100559711456299, + "step": 16340 + }, + { + "epoch": 0.1635, + "grad_norm": 12.625, + "grad_norm_var": 0.6403483072916667, + "learning_rate": 0.0003, + "loss": 11.9033, + "loss/aux_loss": 0.04809962585568428, + "loss/crossentropy": 2.6463473558425905, + "loss/logits": 0.8951964765787125, + "step": 16350 + }, + { + "epoch": 0.1636, + "grad_norm": 11.0, + "grad_norm_var": 0.9075520833333334, + "learning_rate": 0.0003, + "loss": 12.2224, + "loss/aux_loss": 0.04809305313974619, + "loss/crossentropy": 2.7235575318336487, + "loss/logits": 0.9235330730676651, + "step": 16360 + }, + { + "epoch": 0.1637, + "grad_norm": 11.125, + "grad_norm_var": 0.47433268229166664, + "learning_rate": 0.0003, + "loss": 12.0492, + "loss/aux_loss": 0.04809541571885347, + "loss/crossentropy": 2.8656546056270598, + "loss/logits": 0.9521847158670426, + "step": 16370 + }, + { + "epoch": 0.1638, + "grad_norm": 11.625, + "grad_norm_var": 0.34256184895833336, + "learning_rate": 0.0003, + "loss": 12.1038, + "loss/aux_loss": 0.048089108802378176, + "loss/crossentropy": 2.8255446553230286, + "loss/logits": 0.8946599334478378, + "step": 16380 + }, + { + "epoch": 0.1639, + "grad_norm": 11.75, + "grad_norm_var": 0.49581705729166664, + "learning_rate": 0.0003, + "loss": 12.1369, + "loss/aux_loss": 0.04810045957565308, + "loss/crossentropy": 2.6379716813564302, + "loss/logits": 0.9489578425884246, + "step": 16390 + }, + { + "epoch": 0.164, + "grad_norm": 10.4375, + "grad_norm_var": 0.5274576822916667, + "learning_rate": 0.0003, + "loss": 12.1495, + "loss/aux_loss": 0.0480981033295393, + "loss/crossentropy": 2.691847151517868, + "loss/logits": 0.9181078314781189, + "step": 16400 + }, + { + "epoch": 0.1641, + "grad_norm": 11.25, + "grad_norm_var": 6.025, + "learning_rate": 0.0003, + "loss": 12.1718, + "loss/aux_loss": 0.048096432350575924, + "loss/crossentropy": 2.829243075847626, + "loss/logits": 0.9195181250572204, + "step": 16410 + }, + { + "epoch": 0.1642, + "grad_norm": 11.25, + "grad_norm_var": 2.2655598958333334, + "learning_rate": 0.0003, + "loss": 12.1507, + "loss/aux_loss": 0.04810443092137575, + "loss/crossentropy": 2.794591999053955, + "loss/logits": 0.9034171938896179, + "step": 16420 + }, + { + "epoch": 0.1643, + "grad_norm": 12.3125, + "grad_norm_var": 0.42604166666666665, + "learning_rate": 0.0003, + "loss": 12.0589, + "loss/aux_loss": 0.0480903310701251, + "loss/crossentropy": 2.806131112575531, + "loss/logits": 0.9333689689636231, + "step": 16430 + }, + { + "epoch": 0.1644, + "grad_norm": 12.875, + "grad_norm_var": 0.47291666666666665, + "learning_rate": 0.0003, + "loss": 12.1076, + "loss/aux_loss": 0.04809165094047785, + "loss/crossentropy": 3.009689784049988, + "loss/logits": 0.9455327719449997, + "step": 16440 + }, + { + "epoch": 0.1645, + "grad_norm": 12.3125, + "grad_norm_var": 0.6640462239583333, + "learning_rate": 0.0003, + "loss": 12.1808, + "loss/aux_loss": 0.04810215122997761, + "loss/crossentropy": 2.7933058738708496, + "loss/logits": 0.8973431855440139, + "step": 16450 + }, + { + "epoch": 0.1646, + "grad_norm": 12.5625, + "grad_norm_var": 0.501025390625, + "learning_rate": 0.0003, + "loss": 11.9219, + "loss/aux_loss": 0.04809769950807095, + "loss/crossentropy": 2.6854580640792847, + "loss/logits": 0.9056837558746338, + "step": 16460 + }, + { + "epoch": 0.1647, + "grad_norm": 12.5, + "grad_norm_var": 0.2234375, + "learning_rate": 0.0003, + "loss": 12.1113, + "loss/aux_loss": 0.048105937987565996, + "loss/crossentropy": 2.7549439489841463, + "loss/logits": 0.917845630645752, + "step": 16470 + }, + { + "epoch": 0.1648, + "grad_norm": 11.75, + "grad_norm_var": 0.2669270833333333, + "learning_rate": 0.0003, + "loss": 12.1535, + "loss/aux_loss": 0.048096487298607826, + "loss/crossentropy": 2.927453136444092, + "loss/logits": 0.9296642661094665, + "step": 16480 + }, + { + "epoch": 0.1649, + "grad_norm": 11.4375, + "grad_norm_var": 0.35442708333333334, + "learning_rate": 0.0003, + "loss": 12.0322, + "loss/aux_loss": 0.048098070360720155, + "loss/crossentropy": 2.93691543340683, + "loss/logits": 0.9586718380451202, + "step": 16490 + }, + { + "epoch": 0.165, + "grad_norm": 12.0625, + "grad_norm_var": 0.5493326822916667, + "learning_rate": 0.0003, + "loss": 12.214, + "loss/aux_loss": 0.04809797964990139, + "loss/crossentropy": 2.7319608986377717, + "loss/logits": 0.9313073545694351, + "step": 16500 + }, + { + "epoch": 0.1651, + "grad_norm": 11.0625, + "grad_norm_var": 17.5791015625, + "learning_rate": 0.0003, + "loss": 12.113, + "loss/aux_loss": 0.048102827928960326, + "loss/crossentropy": 2.8502477288246153, + "loss/logits": 0.9453782886266708, + "step": 16510 + }, + { + "epoch": 0.1652, + "grad_norm": 10.875, + "grad_norm_var": 0.16378580729166667, + "learning_rate": 0.0003, + "loss": 12.0053, + "loss/aux_loss": 0.04810033868998289, + "loss/crossentropy": 2.5949636459350587, + "loss/logits": 0.8820204049348831, + "step": 16520 + }, + { + "epoch": 0.1653, + "grad_norm": 11.125, + "grad_norm_var": 0.48943684895833334, + "learning_rate": 0.0003, + "loss": 12.1139, + "loss/aux_loss": 0.04809820037335157, + "loss/crossentropy": 2.789997029304504, + "loss/logits": 0.9239853471517563, + "step": 16530 + }, + { + "epoch": 0.1654, + "grad_norm": 12.8125, + "grad_norm_var": 0.37180989583333335, + "learning_rate": 0.0003, + "loss": 12.3409, + "loss/aux_loss": 0.04809273220598698, + "loss/crossentropy": 2.9401179909706117, + "loss/logits": 0.9572012543678283, + "step": 16540 + }, + { + "epoch": 0.1655, + "grad_norm": 11.375, + "grad_norm_var": 0.5218587239583333, + "learning_rate": 0.0003, + "loss": 12.111, + "loss/aux_loss": 0.04809644967317581, + "loss/crossentropy": 2.735247939825058, + "loss/logits": 0.9140194296836853, + "step": 16550 + }, + { + "epoch": 0.1656, + "grad_norm": 13.4375, + "grad_norm_var": 0.6286295572916667, + "learning_rate": 0.0003, + "loss": 12.1088, + "loss/aux_loss": 0.048106629587709906, + "loss/crossentropy": 2.5925142049789427, + "loss/logits": 0.8779049098491669, + "step": 16560 + }, + { + "epoch": 0.1657, + "grad_norm": 11.3125, + "grad_norm_var": 0.7202473958333333, + "learning_rate": 0.0003, + "loss": 12.0626, + "loss/aux_loss": 0.048098241165280345, + "loss/crossentropy": 2.78861083984375, + "loss/logits": 0.9169972121715546, + "step": 16570 + }, + { + "epoch": 0.1658, + "grad_norm": 12.1875, + "grad_norm_var": 0.7563639322916667, + "learning_rate": 0.0003, + "loss": 12.0808, + "loss/aux_loss": 0.04810166098177433, + "loss/crossentropy": 2.8476951360702514, + "loss/logits": 0.9255498439073563, + "step": 16580 + }, + { + "epoch": 0.1659, + "grad_norm": 11.1875, + "grad_norm_var": 0.8036458333333333, + "learning_rate": 0.0003, + "loss": 11.9856, + "loss/aux_loss": 0.048094157315790656, + "loss/crossentropy": 2.7307428240776064, + "loss/logits": 0.9049693077802659, + "step": 16590 + }, + { + "epoch": 0.166, + "grad_norm": 11.4375, + "grad_norm_var": 0.491650390625, + "learning_rate": 0.0003, + "loss": 12.0904, + "loss/aux_loss": 0.048098260350525376, + "loss/crossentropy": 2.7222547829151154, + "loss/logits": 0.9318049371242523, + "step": 16600 + }, + { + "epoch": 0.1661, + "grad_norm": 12.9375, + "grad_norm_var": 0.5280598958333333, + "learning_rate": 0.0003, + "loss": 12.0952, + "loss/aux_loss": 0.04809851739555597, + "loss/crossentropy": 2.6276703774929047, + "loss/logits": 0.8886691600084304, + "step": 16610 + }, + { + "epoch": 0.1662, + "grad_norm": 11.9375, + "grad_norm_var": 0.40260416666666665, + "learning_rate": 0.0003, + "loss": 12.1522, + "loss/aux_loss": 0.048096096701920034, + "loss/crossentropy": 2.756567734479904, + "loss/logits": 0.9005006104707718, + "step": 16620 + }, + { + "epoch": 0.1663, + "grad_norm": 12.25, + "grad_norm_var": 0.397119140625, + "learning_rate": 0.0003, + "loss": 12.0698, + "loss/aux_loss": 0.04808939378708601, + "loss/crossentropy": 2.8644691705703735, + "loss/logits": 0.9494952738285065, + "step": 16630 + }, + { + "epoch": 0.1664, + "grad_norm": 10.875, + "grad_norm_var": 0.304150390625, + "learning_rate": 0.0003, + "loss": 11.9855, + "loss/aux_loss": 0.04809617009013891, + "loss/crossentropy": 2.8036171019077303, + "loss/logits": 0.9411976546049118, + "step": 16640 + }, + { + "epoch": 0.1665, + "grad_norm": 11.8125, + "grad_norm_var": 1.0863932291666667, + "learning_rate": 0.0003, + "loss": 12.1059, + "loss/aux_loss": 0.04810191765427589, + "loss/crossentropy": 2.8542271971702577, + "loss/logits": 0.941945058107376, + "step": 16650 + }, + { + "epoch": 0.1666, + "grad_norm": 11.6875, + "grad_norm_var": 0.356103515625, + "learning_rate": 0.0003, + "loss": 12.1667, + "loss/aux_loss": 0.048105718195438386, + "loss/crossentropy": 2.759011608362198, + "loss/logits": 0.911252424120903, + "step": 16660 + }, + { + "epoch": 0.1667, + "grad_norm": 10.5, + "grad_norm_var": 0.34140625, + "learning_rate": 0.0003, + "loss": 12.019, + "loss/aux_loss": 0.048089655488729476, + "loss/crossentropy": 2.7977048456668854, + "loss/logits": 0.9163706332445145, + "step": 16670 + }, + { + "epoch": 0.1668, + "grad_norm": 11.8125, + "grad_norm_var": 0.4, + "learning_rate": 0.0003, + "loss": 12.0449, + "loss/aux_loss": 0.04810653738677502, + "loss/crossentropy": 2.8017389357089995, + "loss/logits": 0.9295397102832794, + "step": 16680 + }, + { + "epoch": 0.1669, + "grad_norm": 10.5625, + "grad_norm_var": 0.34420572916666664, + "learning_rate": 0.0003, + "loss": 12.1962, + "loss/aux_loss": 0.04809100721031427, + "loss/crossentropy": 2.8505070567131043, + "loss/logits": 0.9185640811920166, + "step": 16690 + }, + { + "epoch": 0.167, + "grad_norm": 12.8125, + "grad_norm_var": 1.0841145833333334, + "learning_rate": 0.0003, + "loss": 12.0777, + "loss/aux_loss": 0.048104220815002915, + "loss/crossentropy": 2.6532647252082824, + "loss/logits": 0.9005499392747879, + "step": 16700 + }, + { + "epoch": 0.1671, + "grad_norm": 11.25, + "grad_norm_var": 1.1744140625, + "learning_rate": 0.0003, + "loss": 11.9404, + "loss/aux_loss": 0.04809899311512709, + "loss/crossentropy": 2.7995954275131227, + "loss/logits": 0.906044989824295, + "step": 16710 + }, + { + "epoch": 0.1672, + "grad_norm": 11.625, + "grad_norm_var": 0.7273274739583333, + "learning_rate": 0.0003, + "loss": 11.93, + "loss/aux_loss": 0.04809800013899803, + "loss/crossentropy": 2.867034387588501, + "loss/logits": 0.908442784845829, + "step": 16720 + }, + { + "epoch": 0.1673, + "grad_norm": 12.5, + "grad_norm_var": 0.760400390625, + "learning_rate": 0.0003, + "loss": 11.9833, + "loss/aux_loss": 0.048098081909120086, + "loss/crossentropy": 2.7534588992595674, + "loss/logits": 0.9382378399372101, + "step": 16730 + }, + { + "epoch": 0.1674, + "grad_norm": 12.0, + "grad_norm_var": 0.29791666666666666, + "learning_rate": 0.0003, + "loss": 12.1122, + "loss/aux_loss": 0.04809783697128296, + "loss/crossentropy": 2.8023226737976072, + "loss/logits": 0.9453730881214142, + "step": 16740 + }, + { + "epoch": 0.1675, + "grad_norm": 11.3125, + "grad_norm_var": 0.24816080729166667, + "learning_rate": 0.0003, + "loss": 12.0329, + "loss/aux_loss": 0.04809421058744192, + "loss/crossentropy": 2.7011972665786743, + "loss/logits": 0.9254505336284637, + "step": 16750 + }, + { + "epoch": 0.1676, + "grad_norm": 11.0625, + "grad_norm_var": 0.3059895833333333, + "learning_rate": 0.0003, + "loss": 12.1991, + "loss/aux_loss": 0.048098991997539996, + "loss/crossentropy": 2.923702526092529, + "loss/logits": 0.9767153590917588, + "step": 16760 + }, + { + "epoch": 0.1677, + "grad_norm": 11.0, + "grad_norm_var": 0.4400390625, + "learning_rate": 0.0003, + "loss": 12.2559, + "loss/aux_loss": 0.048096513748168944, + "loss/crossentropy": 2.9745861649513246, + "loss/logits": 0.9630038678646088, + "step": 16770 + }, + { + "epoch": 0.1678, + "grad_norm": 10.625, + "grad_norm_var": 0.424072265625, + "learning_rate": 0.0003, + "loss": 12.0234, + "loss/aux_loss": 0.048099739477038383, + "loss/crossentropy": 2.6956757068634034, + "loss/logits": 0.9454267978668213, + "step": 16780 + }, + { + "epoch": 0.1679, + "grad_norm": 12.0, + "grad_norm_var": 0.2275390625, + "learning_rate": 0.0003, + "loss": 12.1579, + "loss/aux_loss": 0.04809470549225807, + "loss/crossentropy": 2.8803565382957457, + "loss/logits": 0.9466471463441849, + "step": 16790 + }, + { + "epoch": 0.168, + "grad_norm": 11.0625, + "grad_norm_var": 0.3606770833333333, + "learning_rate": 0.0003, + "loss": 12.0702, + "loss/aux_loss": 0.048100711591541764, + "loss/crossentropy": 2.769081395864487, + "loss/logits": 0.9133290886878968, + "step": 16800 + }, + { + "epoch": 0.1681, + "grad_norm": 11.3125, + "grad_norm_var": 0.29542643229166665, + "learning_rate": 0.0003, + "loss": 12.026, + "loss/aux_loss": 0.04810358509421349, + "loss/crossentropy": 2.6967382431030273, + "loss/logits": 0.9303892910480499, + "step": 16810 + }, + { + "epoch": 0.1682, + "grad_norm": 12.0, + "grad_norm_var": 0.1791015625, + "learning_rate": 0.0003, + "loss": 12.2407, + "loss/aux_loss": 0.04810531884431839, + "loss/crossentropy": 2.7483027279376984, + "loss/logits": 0.9215909510850906, + "step": 16820 + }, + { + "epoch": 0.1683, + "grad_norm": 12.4375, + "grad_norm_var": 0.4901041666666667, + "learning_rate": 0.0003, + "loss": 12.0529, + "loss/aux_loss": 0.04809485897421837, + "loss/crossentropy": 2.6491159200668335, + "loss/logits": 0.8965664654970169, + "step": 16830 + }, + { + "epoch": 0.1684, + "grad_norm": 11.9375, + "grad_norm_var": 0.496728515625, + "learning_rate": 0.0003, + "loss": 12.0189, + "loss/aux_loss": 0.04810217395424843, + "loss/crossentropy": 2.8553712725639344, + "loss/logits": 0.9247318297624588, + "step": 16840 + }, + { + "epoch": 0.1685, + "grad_norm": 11.6875, + "grad_norm_var": 0.19192708333333333, + "learning_rate": 0.0003, + "loss": 12.2266, + "loss/aux_loss": 0.04810085538774729, + "loss/crossentropy": 2.841351580619812, + "loss/logits": 0.9316177189350128, + "step": 16850 + }, + { + "epoch": 0.1686, + "grad_norm": 12.1875, + "grad_norm_var": 0.28097330729166664, + "learning_rate": 0.0003, + "loss": 12.166, + "loss/aux_loss": 0.04810477644205093, + "loss/crossentropy": 2.816389191150665, + "loss/logits": 0.9328649133443833, + "step": 16860 + }, + { + "epoch": 0.1687, + "grad_norm": 11.25, + "grad_norm_var": 0.2598958333333333, + "learning_rate": 0.0003, + "loss": 12.0411, + "loss/aux_loss": 0.04809478260576725, + "loss/crossentropy": 2.9236293196678163, + "loss/logits": 0.9430270612239837, + "step": 16870 + }, + { + "epoch": 0.1688, + "grad_norm": 11.0, + "grad_norm_var": 0.2899576822916667, + "learning_rate": 0.0003, + "loss": 11.9808, + "loss/aux_loss": 0.0481045238673687, + "loss/crossentropy": 2.7350330710411073, + "loss/logits": 0.8710766971111298, + "step": 16880 + }, + { + "epoch": 0.1689, + "grad_norm": 11.3125, + "grad_norm_var": 0.5572265625, + "learning_rate": 0.0003, + "loss": 12.021, + "loss/aux_loss": 0.048098239861428735, + "loss/crossentropy": 2.7996289134025574, + "loss/logits": 0.8801421314477921, + "step": 16890 + }, + { + "epoch": 0.169, + "grad_norm": 11.4375, + "grad_norm_var": 0.31573893229166666, + "learning_rate": 0.0003, + "loss": 12.0939, + "loss/aux_loss": 0.048104897141456604, + "loss/crossentropy": 2.7211228966712953, + "loss/logits": 0.9303423374891281, + "step": 16900 + }, + { + "epoch": 0.1691, + "grad_norm": 12.5, + "grad_norm_var": 0.6618326822916667, + "learning_rate": 0.0003, + "loss": 12.0377, + "loss/aux_loss": 0.04810284618288278, + "loss/crossentropy": 2.7154513716697695, + "loss/logits": 0.9080936968326568, + "step": 16910 + }, + { + "epoch": 0.1692, + "grad_norm": 12.5625, + "grad_norm_var": 0.7940104166666667, + "learning_rate": 0.0003, + "loss": 12.0661, + "loss/aux_loss": 0.0480948593467474, + "loss/crossentropy": 2.756969064474106, + "loss/logits": 0.9487773150205612, + "step": 16920 + }, + { + "epoch": 0.1693, + "grad_norm": 10.5625, + "grad_norm_var": 0.402197265625, + "learning_rate": 0.0003, + "loss": 12.1444, + "loss/aux_loss": 0.04810402132570744, + "loss/crossentropy": 2.7484578788280487, + "loss/logits": 0.9173682719469071, + "step": 16930 + }, + { + "epoch": 0.1694, + "grad_norm": 12.125, + "grad_norm_var": 0.2416015625, + "learning_rate": 0.0003, + "loss": 12.0027, + "loss/aux_loss": 0.04809407070279122, + "loss/crossentropy": 2.719779831171036, + "loss/logits": 0.9158334016799927, + "step": 16940 + }, + { + "epoch": 0.1695, + "grad_norm": 12.3125, + "grad_norm_var": 0.28203125, + "learning_rate": 0.0003, + "loss": 12.0734, + "loss/aux_loss": 0.04810118954628706, + "loss/crossentropy": 2.806976354122162, + "loss/logits": 0.8893602877855301, + "step": 16950 + }, + { + "epoch": 0.1696, + "grad_norm": 11.25, + "grad_norm_var": 1.8876139322916667, + "learning_rate": 0.0003, + "loss": 12.0654, + "loss/aux_loss": 0.04809729289263487, + "loss/crossentropy": 2.8548884272575377, + "loss/logits": 0.9692226439714432, + "step": 16960 + }, + { + "epoch": 0.1697, + "grad_norm": 10.5625, + "grad_norm_var": 0.314697265625, + "learning_rate": 0.0003, + "loss": 12.1883, + "loss/aux_loss": 0.048106925748288634, + "loss/crossentropy": 2.9143474459648133, + "loss/logits": 0.9318220674991607, + "step": 16970 + }, + { + "epoch": 0.1698, + "grad_norm": 12.625, + "grad_norm_var": 1.1700358072916666, + "learning_rate": 0.0003, + "loss": 11.9753, + "loss/aux_loss": 0.04809020813554525, + "loss/crossentropy": 2.8300904273986816, + "loss/logits": 0.9694455862045288, + "step": 16980 + }, + { + "epoch": 0.1699, + "grad_norm": 11.4375, + "grad_norm_var": 0.403125, + "learning_rate": 0.0003, + "loss": 12.2506, + "loss/aux_loss": 0.048101211339235304, + "loss/crossentropy": 2.794898247718811, + "loss/logits": 0.9496973544359207, + "step": 16990 + }, + { + "epoch": 0.17, + "grad_norm": 12.0625, + "grad_norm_var": 0.43333333333333335, + "learning_rate": 0.0003, + "loss": 12.101, + "loss/aux_loss": 0.04809295553714037, + "loss/crossentropy": 3.0219761967658996, + "loss/logits": 0.9177909851074219, + "step": 17000 + }, + { + "epoch": 0.1701, + "grad_norm": 15.75, + "grad_norm_var": 96.87667643229166, + "learning_rate": 0.0003, + "loss": 12.0937, + "loss/aux_loss": 0.0481051966547966, + "loss/crossentropy": 2.9502204298973083, + "loss/logits": 0.9211991935968399, + "step": 17010 + }, + { + "epoch": 0.1702, + "grad_norm": 11.75, + "grad_norm_var": 2.8739420572916665, + "learning_rate": 0.0003, + "loss": 12.0823, + "loss/aux_loss": 0.04810426589101553, + "loss/crossentropy": 2.6327461183071135, + "loss/logits": 0.923973485827446, + "step": 17020 + }, + { + "epoch": 0.1703, + "grad_norm": 11.375, + "grad_norm_var": 1.0531087239583334, + "learning_rate": 0.0003, + "loss": 12.0928, + "loss/aux_loss": 0.04810360558331013, + "loss/crossentropy": 3.002879500389099, + "loss/logits": 0.9532357782125473, + "step": 17030 + }, + { + "epoch": 0.1704, + "grad_norm": 11.1875, + "grad_norm_var": 0.63046875, + "learning_rate": 0.0003, + "loss": 12.0248, + "loss/aux_loss": 0.0480900751426816, + "loss/crossentropy": 2.7537549138069153, + "loss/logits": 0.9276573568582535, + "step": 17040 + }, + { + "epoch": 0.1705, + "grad_norm": 13.375, + "grad_norm_var": 1.6624837239583334, + "learning_rate": 0.0003, + "loss": 12.0054, + "loss/aux_loss": 0.04810062348842621, + "loss/crossentropy": 2.884317523241043, + "loss/logits": 0.9326794624328614, + "step": 17050 + }, + { + "epoch": 0.1706, + "grad_norm": 11.5, + "grad_norm_var": 1.736181640625, + "learning_rate": 0.0003, + "loss": 11.8953, + "loss/aux_loss": 0.04811934363096952, + "loss/crossentropy": 2.6659162402153016, + "loss/logits": 0.8868398576974869, + "step": 17060 + }, + { + "epoch": 0.1707, + "grad_norm": 11.875, + "grad_norm_var": 0.5770833333333333, + "learning_rate": 0.0003, + "loss": 12.0149, + "loss/aux_loss": 0.04809251334518194, + "loss/crossentropy": 2.749342954158783, + "loss/logits": 0.9031396269798279, + "step": 17070 + }, + { + "epoch": 0.1708, + "grad_norm": 11.875, + "grad_norm_var": 0.2526041666666667, + "learning_rate": 0.0003, + "loss": 12.1526, + "loss/aux_loss": 0.048104824125766756, + "loss/crossentropy": 2.856028115749359, + "loss/logits": 0.959146237373352, + "step": 17080 + }, + { + "epoch": 0.1709, + "grad_norm": 11.625, + "grad_norm_var": 0.1916015625, + "learning_rate": 0.0003, + "loss": 12.0983, + "loss/aux_loss": 0.04809093903750181, + "loss/crossentropy": 2.8145798802375794, + "loss/logits": 0.9006113916635513, + "step": 17090 + }, + { + "epoch": 0.171, + "grad_norm": 12.3125, + "grad_norm_var": 0.22337239583333332, + "learning_rate": 0.0003, + "loss": 12.1089, + "loss/aux_loss": 0.04809470176696777, + "loss/crossentropy": 2.878612220287323, + "loss/logits": 0.9135033786296844, + "step": 17100 + }, + { + "epoch": 0.1711, + "grad_norm": 11.375, + "grad_norm_var": 54.241129557291664, + "learning_rate": 0.0003, + "loss": 12.0472, + "loss/aux_loss": 0.048102441057562825, + "loss/crossentropy": 2.811024880409241, + "loss/logits": 0.8989885419607162, + "step": 17110 + }, + { + "epoch": 0.1712, + "grad_norm": 11.5, + "grad_norm_var": 0.40206705729166664, + "learning_rate": 0.0003, + "loss": 12.1958, + "loss/aux_loss": 0.048090110532939434, + "loss/crossentropy": 2.8124298572540285, + "loss/logits": 0.9289597928524017, + "step": 17120 + }, + { + "epoch": 0.1713, + "grad_norm": 11.625, + "grad_norm_var": 0.2353515625, + "learning_rate": 0.0003, + "loss": 11.8911, + "loss/aux_loss": 0.04809228479862213, + "loss/crossentropy": 2.876737803220749, + "loss/logits": 0.9436014890670776, + "step": 17130 + }, + { + "epoch": 0.1714, + "grad_norm": 11.375, + "grad_norm_var": 0.2886555989583333, + "learning_rate": 0.0003, + "loss": 11.9557, + "loss/aux_loss": 0.048099903389811516, + "loss/crossentropy": 2.8804137110710144, + "loss/logits": 0.938829579949379, + "step": 17140 + }, + { + "epoch": 0.1715, + "grad_norm": 10.6875, + "grad_norm_var": 0.260791015625, + "learning_rate": 0.0003, + "loss": 11.852, + "loss/aux_loss": 0.04809135273098945, + "loss/crossentropy": 2.7023903012275694, + "loss/logits": 0.8962929219007492, + "step": 17150 + }, + { + "epoch": 0.1716, + "grad_norm": 11.5625, + "grad_norm_var": 0.3447265625, + "learning_rate": 0.0003, + "loss": 12.158, + "loss/aux_loss": 0.048096888884902, + "loss/crossentropy": 2.827605813741684, + "loss/logits": 0.9441530287265778, + "step": 17160 + }, + { + "epoch": 0.1717, + "grad_norm": 13.5, + "grad_norm_var": 0.7931640625, + "learning_rate": 0.0003, + "loss": 12.1546, + "loss/aux_loss": 0.048093185387551786, + "loss/crossentropy": 2.814880883693695, + "loss/logits": 0.9108005404472351, + "step": 17170 + }, + { + "epoch": 0.1718, + "grad_norm": 11.625, + "grad_norm_var": 3.0380208333333334, + "learning_rate": 0.0003, + "loss": 11.962, + "loss/aux_loss": 0.04810141772031784, + "loss/crossentropy": 2.7444641530513763, + "loss/logits": 0.9487886667251587, + "step": 17180 + }, + { + "epoch": 0.1719, + "grad_norm": 11.6875, + "grad_norm_var": 2.831770833333333, + "learning_rate": 0.0003, + "loss": 12.1348, + "loss/aux_loss": 0.04809289593249559, + "loss/crossentropy": 2.7105092108249664, + "loss/logits": 0.9182222783565521, + "step": 17190 + }, + { + "epoch": 0.172, + "grad_norm": 11.375, + "grad_norm_var": 0.474853515625, + "learning_rate": 0.0003, + "loss": 12.1075, + "loss/aux_loss": 0.048097463138401506, + "loss/crossentropy": 2.8113415241241455, + "loss/logits": 0.9427078306674957, + "step": 17200 + }, + { + "epoch": 0.1721, + "grad_norm": 12.375, + "grad_norm_var": 0.2786458333333333, + "learning_rate": 0.0003, + "loss": 12.14, + "loss/aux_loss": 0.04809903036803007, + "loss/crossentropy": 2.9176873922348023, + "loss/logits": 0.9191664904356003, + "step": 17210 + }, + { + "epoch": 0.1722, + "grad_norm": 10.3125, + "grad_norm_var": 0.5504557291666666, + "learning_rate": 0.0003, + "loss": 12.0308, + "loss/aux_loss": 0.04809430036693811, + "loss/crossentropy": 2.64280886054039, + "loss/logits": 0.8799058675765992, + "step": 17220 + }, + { + "epoch": 0.1723, + "grad_norm": 11.3125, + "grad_norm_var": 0.4051432291666667, + "learning_rate": 0.0003, + "loss": 12.1222, + "loss/aux_loss": 0.048100571148097515, + "loss/crossentropy": 2.9199374198913572, + "loss/logits": 0.9405399680137634, + "step": 17230 + }, + { + "epoch": 0.1724, + "grad_norm": 11.1875, + "grad_norm_var": 0.29464518229166664, + "learning_rate": 0.0003, + "loss": 12.2574, + "loss/aux_loss": 0.04809443484991789, + "loss/crossentropy": 2.7939966559410094, + "loss/logits": 0.9183706283569336, + "step": 17240 + }, + { + "epoch": 0.1725, + "grad_norm": 12.0, + "grad_norm_var": 0.2562337239583333, + "learning_rate": 0.0003, + "loss": 12.0131, + "loss/aux_loss": 0.048107451759278774, + "loss/crossentropy": 2.779514318704605, + "loss/logits": 0.9068025201559067, + "step": 17250 + }, + { + "epoch": 0.1726, + "grad_norm": 11.8125, + "grad_norm_var": 0.39108072916666664, + "learning_rate": 0.0003, + "loss": 11.9615, + "loss/aux_loss": 0.04809543266892433, + "loss/crossentropy": 2.817984676361084, + "loss/logits": 0.9144764870405198, + "step": 17260 + }, + { + "epoch": 0.1727, + "grad_norm": 11.625, + "grad_norm_var": 2.011458333333333, + "learning_rate": 0.0003, + "loss": 11.9494, + "loss/aux_loss": 0.04809968285262585, + "loss/crossentropy": 2.9492964446544647, + "loss/logits": 0.9344431668519974, + "step": 17270 + }, + { + "epoch": 0.1728, + "grad_norm": 13.0, + "grad_norm_var": 2.0067545572916665, + "learning_rate": 0.0003, + "loss": 11.9271, + "loss/aux_loss": 0.04809718765318394, + "loss/crossentropy": 2.7608347654342653, + "loss/logits": 0.9011356472969055, + "step": 17280 + }, + { + "epoch": 0.1729, + "grad_norm": 11.8125, + "grad_norm_var": 0.7468098958333333, + "learning_rate": 0.0003, + "loss": 12.0838, + "loss/aux_loss": 0.048104557767510416, + "loss/crossentropy": 2.7879473209381103, + "loss/logits": 0.9154693454504013, + "step": 17290 + }, + { + "epoch": 0.173, + "grad_norm": 11.625, + "grad_norm_var": 0.749072265625, + "learning_rate": 0.0003, + "loss": 11.9908, + "loss/aux_loss": 0.048098857142031194, + "loss/crossentropy": 2.6744504272937775, + "loss/logits": 0.8712642341852188, + "step": 17300 + }, + { + "epoch": 0.1731, + "grad_norm": 12.0625, + "grad_norm_var": 0.38671875, + "learning_rate": 0.0003, + "loss": 12.1573, + "loss/aux_loss": 0.048095573857426646, + "loss/crossentropy": 3.0483207941055297, + "loss/logits": 0.93597452044487, + "step": 17310 + }, + { + "epoch": 0.1732, + "grad_norm": 12.5625, + "grad_norm_var": 0.15818684895833332, + "learning_rate": 0.0003, + "loss": 11.8331, + "loss/aux_loss": 0.04809464327991009, + "loss/crossentropy": 2.7563810288906097, + "loss/logits": 0.8930452913045883, + "step": 17320 + }, + { + "epoch": 0.1733, + "grad_norm": 11.875, + "grad_norm_var": 50.563395182291664, + "learning_rate": 0.0003, + "loss": 12.0362, + "loss/aux_loss": 0.048101813159883024, + "loss/crossentropy": 2.807816767692566, + "loss/logits": 0.9140335559844971, + "step": 17330 + }, + { + "epoch": 0.1734, + "grad_norm": 12.125, + "grad_norm_var": 50.9265625, + "learning_rate": 0.0003, + "loss": 11.9131, + "loss/aux_loss": 0.048089164309203625, + "loss/crossentropy": 2.7212966203689577, + "loss/logits": 0.9433120638132095, + "step": 17340 + }, + { + "epoch": 0.1735, + "grad_norm": 12.25, + "grad_norm_var": 0.15402018229166667, + "learning_rate": 0.0003, + "loss": 12.1065, + "loss/aux_loss": 0.04809843562543392, + "loss/crossentropy": 2.6257729053497316, + "loss/logits": 0.8846357733011245, + "step": 17350 + }, + { + "epoch": 0.1736, + "grad_norm": 12.0625, + "grad_norm_var": 0.445166015625, + "learning_rate": 0.0003, + "loss": 12.0886, + "loss/aux_loss": 0.04810376763343811, + "loss/crossentropy": 2.8265872836112975, + "loss/logits": 0.9571549206972122, + "step": 17360 + }, + { + "epoch": 0.1737, + "grad_norm": 11.0625, + "grad_norm_var": 0.4348958333333333, + "learning_rate": 0.0003, + "loss": 12.0218, + "loss/aux_loss": 0.04809571448713541, + "loss/crossentropy": 2.695615494251251, + "loss/logits": 0.9150578171014786, + "step": 17370 + }, + { + "epoch": 0.1738, + "grad_norm": 11.8125, + "grad_norm_var": 0.46139322916666664, + "learning_rate": 0.0003, + "loss": 12.2693, + "loss/aux_loss": 0.04809955190867186, + "loss/crossentropy": 2.6700818240642548, + "loss/logits": 0.9094936668872833, + "step": 17380 + }, + { + "epoch": 0.1739, + "grad_norm": 11.1875, + "grad_norm_var": 0.7393229166666667, + "learning_rate": 0.0003, + "loss": 12.0212, + "loss/aux_loss": 0.048098945058882236, + "loss/crossentropy": 2.7970273315906526, + "loss/logits": 0.8984217762947082, + "step": 17390 + }, + { + "epoch": 0.174, + "grad_norm": 12.8125, + "grad_norm_var": 1.09375, + "learning_rate": 0.0003, + "loss": 12.0838, + "loss/aux_loss": 0.048093376122415064, + "loss/crossentropy": 2.8114991784095764, + "loss/logits": 0.8884566456079483, + "step": 17400 + }, + { + "epoch": 0.1741, + "grad_norm": 12.375, + "grad_norm_var": 1.1936848958333333, + "learning_rate": 0.0003, + "loss": 12.0995, + "loss/aux_loss": 0.04810470137745142, + "loss/crossentropy": 2.7583046913146974, + "loss/logits": 0.9460157155990601, + "step": 17410 + }, + { + "epoch": 0.1742, + "grad_norm": 25.875, + "grad_norm_var": 12.618733723958334, + "learning_rate": 0.0003, + "loss": 12.1705, + "loss/aux_loss": 0.04809574950486421, + "loss/crossentropy": 2.821639972925186, + "loss/logits": 0.920597642660141, + "step": 17420 + }, + { + "epoch": 0.1743, + "grad_norm": 12.0, + "grad_norm_var": 13.774593098958333, + "learning_rate": 0.0003, + "loss": 12.1343, + "loss/aux_loss": 0.04809047318994999, + "loss/crossentropy": 2.9120493054389955, + "loss/logits": 0.9170797854661942, + "step": 17430 + }, + { + "epoch": 0.1744, + "grad_norm": 12.4375, + "grad_norm_var": 2.804541015625, + "learning_rate": 0.0003, + "loss": 12.0075, + "loss/aux_loss": 0.0480960488319397, + "loss/crossentropy": 2.7624635457992555, + "loss/logits": 0.8919235855340958, + "step": 17440 + }, + { + "epoch": 0.1745, + "grad_norm": 11.6875, + "grad_norm_var": 0.469775390625, + "learning_rate": 0.0003, + "loss": 12.1544, + "loss/aux_loss": 0.048093258403241634, + "loss/crossentropy": 2.8480118989944456, + "loss/logits": 0.9208219617605209, + "step": 17450 + }, + { + "epoch": 0.1746, + "grad_norm": 10.875, + "grad_norm_var": 0.167041015625, + "learning_rate": 0.0003, + "loss": 12.038, + "loss/aux_loss": 0.048100156150758265, + "loss/crossentropy": 2.81412872672081, + "loss/logits": 0.925744378566742, + "step": 17460 + }, + { + "epoch": 0.1747, + "grad_norm": 11.1875, + "grad_norm_var": 0.22317708333333333, + "learning_rate": 0.0003, + "loss": 12.0594, + "loss/aux_loss": 0.04808522202074528, + "loss/crossentropy": 2.782361996173859, + "loss/logits": 0.9385877996683121, + "step": 17470 + }, + { + "epoch": 0.1748, + "grad_norm": 12.3125, + "grad_norm_var": 23.0869140625, + "learning_rate": 0.0003, + "loss": 12.1986, + "loss/aux_loss": 0.04810808375477791, + "loss/crossentropy": 2.819118005037308, + "loss/logits": 0.9407922476530075, + "step": 17480 + }, + { + "epoch": 0.1749, + "grad_norm": 11.5625, + "grad_norm_var": 22.847119140625, + "learning_rate": 0.0003, + "loss": 11.9801, + "loss/aux_loss": 0.04809485077857971, + "loss/crossentropy": 2.8491112112998964, + "loss/logits": 0.9401687920093537, + "step": 17490 + }, + { + "epoch": 0.175, + "grad_norm": 12.6875, + "grad_norm_var": 0.262744140625, + "learning_rate": 0.0003, + "loss": 11.9604, + "loss/aux_loss": 0.04808875843882561, + "loss/crossentropy": 2.8413546562194822, + "loss/logits": 0.9534878820180893, + "step": 17500 + }, + { + "epoch": 0.1751, + "grad_norm": 11.4375, + "grad_norm_var": 0.3395182291666667, + "learning_rate": 0.0003, + "loss": 12.1045, + "loss/aux_loss": 0.048095555044710636, + "loss/crossentropy": 2.7935108840465546, + "loss/logits": 0.9017595887184143, + "step": 17510 + }, + { + "epoch": 0.1752, + "grad_norm": 11.0, + "grad_norm_var": 0.33274739583333335, + "learning_rate": 0.0003, + "loss": 12.0272, + "loss/aux_loss": 0.04809608049690724, + "loss/crossentropy": 2.933014285564423, + "loss/logits": 0.9154089689254761, + "step": 17520 + }, + { + "epoch": 0.1753, + "grad_norm": 11.8125, + "grad_norm_var": 0.2848307291666667, + "learning_rate": 0.0003, + "loss": 12.0961, + "loss/aux_loss": 0.04810178428888321, + "loss/crossentropy": 2.8541224718093874, + "loss/logits": 0.9548793703317642, + "step": 17530 + }, + { + "epoch": 0.1754, + "grad_norm": 10.6875, + "grad_norm_var": 0.43307291666666664, + "learning_rate": 0.0003, + "loss": 12.0888, + "loss/aux_loss": 0.04808586481958628, + "loss/crossentropy": 2.7221501886844637, + "loss/logits": 0.9254509091377259, + "step": 17540 + }, + { + "epoch": 0.1755, + "grad_norm": 10.9375, + "grad_norm_var": 0.595556640625, + "learning_rate": 0.0003, + "loss": 11.8596, + "loss/aux_loss": 0.04811476822942495, + "loss/crossentropy": 2.8995654344558717, + "loss/logits": 0.9064432740211487, + "step": 17550 + }, + { + "epoch": 0.1756, + "grad_norm": 12.3125, + "grad_norm_var": 0.382666015625, + "learning_rate": 0.0003, + "loss": 11.7278, + "loss/aux_loss": 0.048095325380563735, + "loss/crossentropy": 2.797735607624054, + "loss/logits": 0.8821221351623535, + "step": 17560 + }, + { + "epoch": 0.1757, + "grad_norm": 11.125, + "grad_norm_var": 0.3997395833333333, + "learning_rate": 0.0003, + "loss": 11.8596, + "loss/aux_loss": 0.04810337759554386, + "loss/crossentropy": 2.7168959975242615, + "loss/logits": 0.8649186968803406, + "step": 17570 + }, + { + "epoch": 0.1758, + "grad_norm": 12.0, + "grad_norm_var": 0.5856608072916667, + "learning_rate": 0.0003, + "loss": 12.1123, + "loss/aux_loss": 0.0481048546731472, + "loss/crossentropy": 2.896924364566803, + "loss/logits": 0.9432176023721695, + "step": 17580 + }, + { + "epoch": 0.1759, + "grad_norm": 11.3125, + "grad_norm_var": 0.663916015625, + "learning_rate": 0.0003, + "loss": 12.1182, + "loss/aux_loss": 0.048101380653679374, + "loss/crossentropy": 2.7174128890037537, + "loss/logits": 0.8965833187103271, + "step": 17590 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "grad_norm_var": 33.6681640625, + "learning_rate": 0.0003, + "loss": 12.0515, + "loss/aux_loss": 0.04809325095266104, + "loss/crossentropy": 2.6860816717147826, + "loss/logits": 0.9306042581796646, + "step": 17600 + }, + { + "epoch": 0.1761, + "grad_norm": 11.0, + "grad_norm_var": 32.864583333333336, + "learning_rate": 0.0003, + "loss": 12.0491, + "loss/aux_loss": 0.04810206014662981, + "loss/crossentropy": 2.795276200771332, + "loss/logits": 0.9053617566823959, + "step": 17610 + }, + { + "epoch": 0.1762, + "grad_norm": 11.0625, + "grad_norm_var": 0.749462890625, + "learning_rate": 0.0003, + "loss": 11.947, + "loss/aux_loss": 0.04809454921633005, + "loss/crossentropy": 2.6671301662921905, + "loss/logits": 0.9019128113985062, + "step": 17620 + }, + { + "epoch": 0.1763, + "grad_norm": 12.75, + "grad_norm_var": 0.48318684895833336, + "learning_rate": 0.0003, + "loss": 12.1508, + "loss/aux_loss": 0.04808996580541134, + "loss/crossentropy": 2.8986705422401426, + "loss/logits": 0.9507706761360168, + "step": 17630 + }, + { + "epoch": 0.1764, + "grad_norm": 11.75, + "grad_norm_var": 1.9614420572916667, + "learning_rate": 0.0003, + "loss": 11.9558, + "loss/aux_loss": 0.04809357337653637, + "loss/crossentropy": 2.818922591209412, + "loss/logits": 0.890010553598404, + "step": 17640 + }, + { + "epoch": 0.1765, + "grad_norm": 11.25, + "grad_norm_var": 2.1322265625, + "learning_rate": 0.0003, + "loss": 11.9349, + "loss/aux_loss": 0.04809540584683418, + "loss/crossentropy": 2.8297097086906433, + "loss/logits": 0.9366719990968704, + "step": 17650 + }, + { + "epoch": 0.1766, + "grad_norm": 12.4375, + "grad_norm_var": 0.9899576822916667, + "learning_rate": 0.0003, + "loss": 12.1478, + "loss/aux_loss": 0.048089148849248885, + "loss/crossentropy": 2.843839108943939, + "loss/logits": 0.9119983077049255, + "step": 17660 + }, + { + "epoch": 0.1767, + "grad_norm": 12.0, + "grad_norm_var": 0.7913899739583333, + "learning_rate": 0.0003, + "loss": 12.1032, + "loss/aux_loss": 0.04810617808252573, + "loss/crossentropy": 2.8277599930763246, + "loss/logits": 0.9263883680105209, + "step": 17670 + }, + { + "epoch": 0.1768, + "grad_norm": 13.5625, + "grad_norm_var": 0.5104166666666666, + "learning_rate": 0.0003, + "loss": 12.0376, + "loss/aux_loss": 0.0480927873402834, + "loss/crossentropy": 2.8370666086673735, + "loss/logits": 0.9011499643325805, + "step": 17680 + }, + { + "epoch": 0.1769, + "grad_norm": 12.6875, + "grad_norm_var": 0.5216145833333333, + "learning_rate": 0.0003, + "loss": 11.9335, + "loss/aux_loss": 0.048093126900494096, + "loss/crossentropy": 2.7675021648406983, + "loss/logits": 0.8972540199756622, + "step": 17690 + }, + { + "epoch": 0.177, + "grad_norm": 11.75, + "grad_norm_var": 0.34739583333333335, + "learning_rate": 0.0003, + "loss": 11.8338, + "loss/aux_loss": 0.04809641428291798, + "loss/crossentropy": 2.691696697473526, + "loss/logits": 0.8934990376234054, + "step": 17700 + }, + { + "epoch": 0.1771, + "grad_norm": 12.125, + "grad_norm_var": 0.236572265625, + "learning_rate": 0.0003, + "loss": 11.9839, + "loss/aux_loss": 0.04809584002941847, + "loss/crossentropy": 2.9292188465595244, + "loss/logits": 0.9080984503030777, + "step": 17710 + }, + { + "epoch": 0.1772, + "grad_norm": 13.25, + "grad_norm_var": 0.27786458333333336, + "learning_rate": 0.0003, + "loss": 12.0181, + "loss/aux_loss": 0.048103841580450536, + "loss/crossentropy": 2.6175199866294863, + "loss/logits": 0.9136331707239151, + "step": 17720 + }, + { + "epoch": 0.1773, + "grad_norm": 12.75, + "grad_norm_var": 0.5515625, + "learning_rate": 0.0003, + "loss": 11.9502, + "loss/aux_loss": 0.04809474535286427, + "loss/crossentropy": 2.9119593143463134, + "loss/logits": 0.9304135531187058, + "step": 17730 + }, + { + "epoch": 0.1774, + "grad_norm": 11.5, + "grad_norm_var": 0.445947265625, + "learning_rate": 0.0003, + "loss": 12.0916, + "loss/aux_loss": 0.0480996023863554, + "loss/crossentropy": 2.8041651487350463, + "loss/logits": 0.9179874926805496, + "step": 17740 + }, + { + "epoch": 0.1775, + "grad_norm": 11.9375, + "grad_norm_var": 0.238916015625, + "learning_rate": 0.0003, + "loss": 11.9421, + "loss/aux_loss": 0.04809048194438219, + "loss/crossentropy": 2.9143458247184753, + "loss/logits": 0.9369097352027893, + "step": 17750 + }, + { + "epoch": 0.1776, + "grad_norm": 12.875, + "grad_norm_var": 0.49993489583333334, + "learning_rate": 0.0003, + "loss": 12.0297, + "loss/aux_loss": 0.04809221494942904, + "loss/crossentropy": 2.77188703417778, + "loss/logits": 0.8611804962158203, + "step": 17760 + }, + { + "epoch": 0.1777, + "grad_norm": 19.0, + "grad_norm_var": 3.628889973958333, + "learning_rate": 0.0003, + "loss": 12.1735, + "loss/aux_loss": 0.04809418804943562, + "loss/crossentropy": 2.866736590862274, + "loss/logits": 0.9347006261348725, + "step": 17770 + }, + { + "epoch": 0.1778, + "grad_norm": 11.125, + "grad_norm_var": 3.622509765625, + "learning_rate": 0.0003, + "loss": 11.8912, + "loss/aux_loss": 0.048098650947213176, + "loss/crossentropy": 2.8250136613845824, + "loss/logits": 0.945642602443695, + "step": 17780 + }, + { + "epoch": 0.1779, + "grad_norm": 11.125, + "grad_norm_var": 0.2462890625, + "learning_rate": 0.0003, + "loss": 12.0138, + "loss/aux_loss": 0.0481024345383048, + "loss/crossentropy": 2.750953811407089, + "loss/logits": 0.8871120661497116, + "step": 17790 + }, + { + "epoch": 0.178, + "grad_norm": 12.25, + "grad_norm_var": 109.72862955729167, + "learning_rate": 0.0003, + "loss": 11.8782, + "loss/aux_loss": 0.048107188753783704, + "loss/crossentropy": 2.9277958452701567, + "loss/logits": 0.9459708213806153, + "step": 17800 + }, + { + "epoch": 0.1781, + "grad_norm": 13.8125, + "grad_norm_var": 3.061393229166667, + "learning_rate": 0.0003, + "loss": 12.0301, + "loss/aux_loss": 0.048091620206832886, + "loss/crossentropy": 2.8570632517337797, + "loss/logits": 0.9262526482343674, + "step": 17810 + }, + { + "epoch": 0.1782, + "grad_norm": 10.5625, + "grad_norm_var": 3.207275390625, + "learning_rate": 0.0003, + "loss": 12.0702, + "loss/aux_loss": 0.0481030935421586, + "loss/crossentropy": 2.7938737750053404, + "loss/logits": 0.9401546657085419, + "step": 17820 + }, + { + "epoch": 0.1783, + "grad_norm": 11.875, + "grad_norm_var": 2.024853515625, + "learning_rate": 0.0003, + "loss": 11.9894, + "loss/aux_loss": 0.04810605850070715, + "loss/crossentropy": 2.67935990691185, + "loss/logits": 0.8830744028091431, + "step": 17830 + }, + { + "epoch": 0.1784, + "grad_norm": 12.8125, + "grad_norm_var": 4.722900390625, + "learning_rate": 0.0003, + "loss": 12.0205, + "loss/aux_loss": 0.04809609260410071, + "loss/crossentropy": 2.708746474981308, + "loss/logits": 0.9178021907806396, + "step": 17840 + }, + { + "epoch": 0.1785, + "grad_norm": 11.1875, + "grad_norm_var": 4.58671875, + "learning_rate": 0.0003, + "loss": 12.012, + "loss/aux_loss": 0.04809623472392559, + "loss/crossentropy": 2.7406187474727632, + "loss/logits": 0.9204235941171646, + "step": 17850 + }, + { + "epoch": 0.1786, + "grad_norm": 12.0625, + "grad_norm_var": 0.156103515625, + "learning_rate": 0.0003, + "loss": 12.0759, + "loss/aux_loss": 0.04808931238949299, + "loss/crossentropy": 2.822909486293793, + "loss/logits": 0.9199528455734253, + "step": 17860 + }, + { + "epoch": 0.1787, + "grad_norm": 11.75, + "grad_norm_var": 0.38331705729166665, + "learning_rate": 0.0003, + "loss": 12.0255, + "loss/aux_loss": 0.048098467849195005, + "loss/crossentropy": 2.9027469515800477, + "loss/logits": 0.9527244418859482, + "step": 17870 + }, + { + "epoch": 0.1788, + "grad_norm": 11.375, + "grad_norm_var": 0.349462890625, + "learning_rate": 0.0003, + "loss": 11.9763, + "loss/aux_loss": 0.048094440065324304, + "loss/crossentropy": 2.8290345549583433, + "loss/logits": 0.965818139910698, + "step": 17880 + }, + { + "epoch": 0.1789, + "grad_norm": 12.0625, + "grad_norm_var": 15.248681640625, + "learning_rate": 0.0003, + "loss": 11.9306, + "loss/aux_loss": 0.048097760416567326, + "loss/crossentropy": 2.7043901085853577, + "loss/logits": 0.8894819289445877, + "step": 17890 + }, + { + "epoch": 0.179, + "grad_norm": 12.25, + "grad_norm_var": 14.694791666666667, + "learning_rate": 0.0003, + "loss": 12.1708, + "loss/aux_loss": 0.04809524416923523, + "loss/crossentropy": 2.865119767189026, + "loss/logits": 0.9330274909734726, + "step": 17900 + }, + { + "epoch": 0.1791, + "grad_norm": 12.0625, + "grad_norm_var": 0.23487955729166668, + "learning_rate": 0.0003, + "loss": 11.9219, + "loss/aux_loss": 0.04810286946594715, + "loss/crossentropy": 2.8617121458053587, + "loss/logits": 0.9211963266134262, + "step": 17910 + }, + { + "epoch": 0.1792, + "grad_norm": 13.125, + "grad_norm_var": 0.25826822916666664, + "learning_rate": 0.0003, + "loss": 12.1026, + "loss/aux_loss": 0.048097353614866736, + "loss/crossentropy": 2.9374179244041443, + "loss/logits": 0.946164458990097, + "step": 17920 + }, + { + "epoch": 0.1793, + "grad_norm": 11.6875, + "grad_norm_var": 0.9634765625, + "learning_rate": 0.0003, + "loss": 11.9333, + "loss/aux_loss": 0.04809586051851511, + "loss/crossentropy": 2.804642015695572, + "loss/logits": 0.884665310382843, + "step": 17930 + }, + { + "epoch": 0.1794, + "grad_norm": 11.9375, + "grad_norm_var": 0.7452473958333333, + "learning_rate": 0.0003, + "loss": 12.1827, + "loss/aux_loss": 0.04808973409235477, + "loss/crossentropy": 2.782781344652176, + "loss/logits": 0.9129390954971314, + "step": 17940 + }, + { + "epoch": 0.1795, + "grad_norm": 12.0625, + "grad_norm_var": 0.424853515625, + "learning_rate": 0.0003, + "loss": 11.9763, + "loss/aux_loss": 0.04808917623013258, + "loss/crossentropy": 2.7897274017333986, + "loss/logits": 0.8996834605932236, + "step": 17950 + }, + { + "epoch": 0.1796, + "grad_norm": 11.625, + "grad_norm_var": 0.5186848958333333, + "learning_rate": 0.0003, + "loss": 12.0981, + "loss/aux_loss": 0.048086441680788995, + "loss/crossentropy": 2.7038078784942625, + "loss/logits": 0.8972001552581788, + "step": 17960 + }, + { + "epoch": 0.1797, + "grad_norm": 11.0625, + "grad_norm_var": 0.19088541666666667, + "learning_rate": 0.0003, + "loss": 12.2017, + "loss/aux_loss": 0.04809106402099132, + "loss/crossentropy": 2.738478219509125, + "loss/logits": 0.9567953556776047, + "step": 17970 + }, + { + "epoch": 0.1798, + "grad_norm": 11.25, + "grad_norm_var": 0.3150390625, + "learning_rate": 0.0003, + "loss": 12.2132, + "loss/aux_loss": 0.04809019956737757, + "loss/crossentropy": 2.894696664810181, + "loss/logits": 0.9616926342248917, + "step": 17980 + }, + { + "epoch": 0.1799, + "grad_norm": 11.9375, + "grad_norm_var": 0.292041015625, + "learning_rate": 0.0003, + "loss": 12.0304, + "loss/aux_loss": 0.048091168701648715, + "loss/crossentropy": 2.8822677552700045, + "loss/logits": 0.9124285817146301, + "step": 17990 + }, + { + "epoch": 0.18, + "grad_norm": 11.375, + "grad_norm_var": 0.322119140625, + "learning_rate": 0.0003, + "loss": 11.8724, + "loss/aux_loss": 0.04809627775102854, + "loss/crossentropy": 2.790885365009308, + "loss/logits": 0.9168848097324371, + "step": 18000 + }, + { + "epoch": 0.1801, + "grad_norm": 11.0, + "grad_norm_var": 0.28318684895833335, + "learning_rate": 0.0003, + "loss": 12.1473, + "loss/aux_loss": 0.048096579127013685, + "loss/crossentropy": 2.8502917110919954, + "loss/logits": 0.9435136646032334, + "step": 18010 + }, + { + "epoch": 0.1802, + "grad_norm": 11.0, + "grad_norm_var": 0.3889973958333333, + "learning_rate": 0.0003, + "loss": 11.9018, + "loss/aux_loss": 0.04810248874127865, + "loss/crossentropy": 2.7946541905403137, + "loss/logits": 0.9097151190042496, + "step": 18020 + }, + { + "epoch": 0.1803, + "grad_norm": 11.6875, + "grad_norm_var": 0.19581705729166668, + "learning_rate": 0.0003, + "loss": 11.9168, + "loss/aux_loss": 0.048086031526327136, + "loss/crossentropy": 2.8708603501319887, + "loss/logits": 0.9378985464572906, + "step": 18030 + }, + { + "epoch": 0.1804, + "grad_norm": 11.25, + "grad_norm_var": 0.4671223958333333, + "learning_rate": 0.0003, + "loss": 12.0196, + "loss/aux_loss": 0.04809171762317419, + "loss/crossentropy": 2.757344883680344, + "loss/logits": 0.9077586501836776, + "step": 18040 + }, + { + "epoch": 0.1805, + "grad_norm": 11.6875, + "grad_norm_var": 0.5227701822916667, + "learning_rate": 0.0003, + "loss": 12.1929, + "loss/aux_loss": 0.04808536898344755, + "loss/crossentropy": 2.9499477982521056, + "loss/logits": 0.938400462269783, + "step": 18050 + }, + { + "epoch": 0.1806, + "grad_norm": 11.125, + "grad_norm_var": 0.259619140625, + "learning_rate": 0.0003, + "loss": 11.8522, + "loss/aux_loss": 0.04810300972312689, + "loss/crossentropy": 2.7223174929618836, + "loss/logits": 0.9064554870128632, + "step": 18060 + }, + { + "epoch": 0.1807, + "grad_norm": 11.1875, + "grad_norm_var": 0.19680989583333333, + "learning_rate": 0.0003, + "loss": 12.0087, + "loss/aux_loss": 0.048086580634117124, + "loss/crossentropy": 2.81934916973114, + "loss/logits": 0.932407483458519, + "step": 18070 + }, + { + "epoch": 0.1808, + "grad_norm": 12.5625, + "grad_norm_var": 0.4161295572916667, + "learning_rate": 0.0003, + "loss": 11.9218, + "loss/aux_loss": 0.048097232170403, + "loss/crossentropy": 2.783638632297516, + "loss/logits": 0.9202796012163162, + "step": 18080 + }, + { + "epoch": 0.1809, + "grad_norm": 12.0, + "grad_norm_var": 4.4009765625, + "learning_rate": 0.0003, + "loss": 12.0502, + "loss/aux_loss": 0.04810140375047922, + "loss/crossentropy": 2.8965428352355955, + "loss/logits": 0.9300930172204971, + "step": 18090 + }, + { + "epoch": 0.181, + "grad_norm": 11.9375, + "grad_norm_var": 4.006770833333333, + "learning_rate": 0.0003, + "loss": 12.2106, + "loss/aux_loss": 0.04809464998543263, + "loss/crossentropy": 2.82760112285614, + "loss/logits": 0.9162612468004226, + "step": 18100 + }, + { + "epoch": 0.1811, + "grad_norm": 11.1875, + "grad_norm_var": 0.7079264322916666, + "learning_rate": 0.0003, + "loss": 11.9327, + "loss/aux_loss": 0.04809616301208734, + "loss/crossentropy": 2.9348750352859496, + "loss/logits": 0.908600127696991, + "step": 18110 + }, + { + "epoch": 0.1812, + "grad_norm": 11.3125, + "grad_norm_var": 0.5577473958333333, + "learning_rate": 0.0003, + "loss": 12.0074, + "loss/aux_loss": 0.0480873541906476, + "loss/crossentropy": 2.8367616474628448, + "loss/logits": 0.9282374233007431, + "step": 18120 + }, + { + "epoch": 0.1813, + "grad_norm": 12.125, + "grad_norm_var": 0.3374837239583333, + "learning_rate": 0.0003, + "loss": 12.0499, + "loss/aux_loss": 0.04810123294591904, + "loss/crossentropy": 2.811716413497925, + "loss/logits": 0.8937458395957947, + "step": 18130 + }, + { + "epoch": 0.1814, + "grad_norm": 11.9375, + "grad_norm_var": 0.340478515625, + "learning_rate": 0.0003, + "loss": 12.0357, + "loss/aux_loss": 0.04809012711048126, + "loss/crossentropy": 2.7347005784511564, + "loss/logits": 0.8987887173891067, + "step": 18140 + }, + { + "epoch": 0.1815, + "grad_norm": 10.9375, + "grad_norm_var": 0.4512858072916667, + "learning_rate": 0.0003, + "loss": 12.0446, + "loss/aux_loss": 0.048095178604125974, + "loss/crossentropy": 2.792975926399231, + "loss/logits": 0.936535793542862, + "step": 18150 + }, + { + "epoch": 0.1816, + "grad_norm": 11.9375, + "grad_norm_var": 0.1978515625, + "learning_rate": 0.0003, + "loss": 12.1014, + "loss/aux_loss": 0.048092770390212536, + "loss/crossentropy": 2.6440272629261017, + "loss/logits": 0.9145908206701279, + "step": 18160 + }, + { + "epoch": 0.1817, + "grad_norm": 11.0, + "grad_norm_var": 0.27858072916666665, + "learning_rate": 0.0003, + "loss": 11.801, + "loss/aux_loss": 0.04809875432401896, + "loss/crossentropy": 2.7408132016658784, + "loss/logits": 0.8750650644302368, + "step": 18170 + }, + { + "epoch": 0.1818, + "grad_norm": 11.4375, + "grad_norm_var": 1.0853515625, + "learning_rate": 0.0003, + "loss": 12.1041, + "loss/aux_loss": 0.04810262303799391, + "loss/crossentropy": 2.7091507375240327, + "loss/logits": 0.9323061019182205, + "step": 18180 + }, + { + "epoch": 0.1819, + "grad_norm": 11.8125, + "grad_norm_var": 1.352197265625, + "learning_rate": 0.0003, + "loss": 12.1093, + "loss/aux_loss": 0.04809154383838177, + "loss/crossentropy": 2.9365743041038512, + "loss/logits": 0.9312824219465256, + "step": 18190 + }, + { + "epoch": 0.182, + "grad_norm": 11.3125, + "grad_norm_var": 0.7645670572916666, + "learning_rate": 0.0003, + "loss": 11.9302, + "loss/aux_loss": 0.04810417983680963, + "loss/crossentropy": 2.696337890625, + "loss/logits": 0.9121669709682465, + "step": 18200 + }, + { + "epoch": 0.1821, + "grad_norm": 11.5, + "grad_norm_var": 0.37180989583333335, + "learning_rate": 0.0003, + "loss": 12.1331, + "loss/aux_loss": 0.04809307269752026, + "loss/crossentropy": 2.718644219636917, + "loss/logits": 0.9004943788051605, + "step": 18210 + }, + { + "epoch": 0.1822, + "grad_norm": 10.8125, + "grad_norm_var": 0.5102701822916667, + "learning_rate": 0.0003, + "loss": 11.953, + "loss/aux_loss": 0.04809835311025381, + "loss/crossentropy": 2.907946026325226, + "loss/logits": 0.9023657441139221, + "step": 18220 + }, + { + "epoch": 0.1823, + "grad_norm": 10.9375, + "grad_norm_var": 0.429541015625, + "learning_rate": 0.0003, + "loss": 11.7626, + "loss/aux_loss": 0.04809815175831318, + "loss/crossentropy": 2.7833638072013853, + "loss/logits": 0.9019864350557327, + "step": 18230 + }, + { + "epoch": 0.1824, + "grad_norm": 12.0, + "grad_norm_var": 0.9494791666666667, + "learning_rate": 0.0003, + "loss": 12.1544, + "loss/aux_loss": 0.04810196273028851, + "loss/crossentropy": 2.856829822063446, + "loss/logits": 0.8836144953966141, + "step": 18240 + }, + { + "epoch": 0.1825, + "grad_norm": 12.0, + "grad_norm_var": 0.853125, + "learning_rate": 0.0003, + "loss": 11.7924, + "loss/aux_loss": 0.04810160342603922, + "loss/crossentropy": 2.8368868112564085, + "loss/logits": 0.9327179700136184, + "step": 18250 + }, + { + "epoch": 0.1826, + "grad_norm": 11.5625, + "grad_norm_var": 0.25323893229166666, + "learning_rate": 0.0003, + "loss": 12.1253, + "loss/aux_loss": 0.04810277093201876, + "loss/crossentropy": 2.7802767038345335, + "loss/logits": 0.8940910458564758, + "step": 18260 + }, + { + "epoch": 0.1827, + "grad_norm": 12.0, + "grad_norm_var": 0.2759765625, + "learning_rate": 0.0003, + "loss": 11.9725, + "loss/aux_loss": 0.04809536635875702, + "loss/crossentropy": 2.845566821098328, + "loss/logits": 0.9107513338327408, + "step": 18270 + }, + { + "epoch": 0.1828, + "grad_norm": 11.375, + "grad_norm_var": 0.4071451822916667, + "learning_rate": 0.0003, + "loss": 11.985, + "loss/aux_loss": 0.04809177704155445, + "loss/crossentropy": 2.708817595243454, + "loss/logits": 0.8630902379751205, + "step": 18280 + }, + { + "epoch": 0.1829, + "grad_norm": 11.5, + "grad_norm_var": 0.27708333333333335, + "learning_rate": 0.0003, + "loss": 11.8368, + "loss/aux_loss": 0.04809412229806185, + "loss/crossentropy": 2.6290226101875307, + "loss/logits": 0.927997687458992, + "step": 18290 + }, + { + "epoch": 0.183, + "grad_norm": 11.0, + "grad_norm_var": 0.5557291666666667, + "learning_rate": 0.0003, + "loss": 12.079, + "loss/aux_loss": 0.04809760414063931, + "loss/crossentropy": 2.820905792713165, + "loss/logits": 0.9419155091047287, + "step": 18300 + }, + { + "epoch": 0.1831, + "grad_norm": 21.875, + "grad_norm_var": 389.9878743489583, + "learning_rate": 0.0003, + "loss": 12.2556, + "loss/aux_loss": 0.04809573795646429, + "loss/crossentropy": 2.8808292627334593, + "loss/logits": 0.9127176314592361, + "step": 18310 + }, + { + "epoch": 0.1832, + "grad_norm": 11.5625, + "grad_norm_var": 6.450374348958333, + "learning_rate": 0.0003, + "loss": 11.9252, + "loss/aux_loss": 0.04810270164161921, + "loss/crossentropy": 2.638647198677063, + "loss/logits": 0.8678022742271423, + "step": 18320 + }, + { + "epoch": 0.1833, + "grad_norm": 12.5, + "grad_norm_var": 0.24576822916666666, + "learning_rate": 0.0003, + "loss": 11.9287, + "loss/aux_loss": 0.048086739145219326, + "loss/crossentropy": 2.7931796431541445, + "loss/logits": 0.9423355519771576, + "step": 18330 + }, + { + "epoch": 0.1834, + "grad_norm": 11.5625, + "grad_norm_var": 0.28878580729166664, + "learning_rate": 0.0003, + "loss": 11.8476, + "loss/aux_loss": 0.04810136705636978, + "loss/crossentropy": 2.7460675835609436, + "loss/logits": 0.9129256516695022, + "step": 18340 + }, + { + "epoch": 0.1835, + "grad_norm": 11.625, + "grad_norm_var": 0.39264322916666666, + "learning_rate": 0.0003, + "loss": 12.1094, + "loss/aux_loss": 0.04808819629251957, + "loss/crossentropy": 2.9586320996284483, + "loss/logits": 0.926827785372734, + "step": 18350 + }, + { + "epoch": 0.1836, + "grad_norm": 12.25, + "grad_norm_var": 0.271728515625, + "learning_rate": 0.0003, + "loss": 12.0495, + "loss/aux_loss": 0.04808443989604712, + "loss/crossentropy": 2.8124279379844666, + "loss/logits": 0.929901072382927, + "step": 18360 + }, + { + "epoch": 0.1837, + "grad_norm": 11.5, + "grad_norm_var": 0.550244140625, + "learning_rate": 0.0003, + "loss": 12.1857, + "loss/aux_loss": 0.04809539690613747, + "loss/crossentropy": 2.9786995530128477, + "loss/logits": 0.9320331394672394, + "step": 18370 + }, + { + "epoch": 0.1838, + "grad_norm": 13.0625, + "grad_norm_var": 0.5140625, + "learning_rate": 0.0003, + "loss": 11.9399, + "loss/aux_loss": 0.048090987093746665, + "loss/crossentropy": 2.921971356868744, + "loss/logits": 0.9322270661592483, + "step": 18380 + }, + { + "epoch": 0.1839, + "grad_norm": 11.9375, + "grad_norm_var": 0.30416666666666664, + "learning_rate": 0.0003, + "loss": 12.0165, + "loss/aux_loss": 0.04809644818305969, + "loss/crossentropy": 2.7656728088855744, + "loss/logits": 0.8962883800268173, + "step": 18390 + }, + { + "epoch": 0.184, + "grad_norm": 11.25, + "grad_norm_var": 0.3473307291666667, + "learning_rate": 0.0003, + "loss": 11.9019, + "loss/aux_loss": 0.048087817057967185, + "loss/crossentropy": 2.9055333137512207, + "loss/logits": 0.903611746430397, + "step": 18400 + }, + { + "epoch": 0.1841, + "grad_norm": 11.125, + "grad_norm_var": 0.43697916666666664, + "learning_rate": 0.0003, + "loss": 11.9844, + "loss/aux_loss": 0.04810544457286596, + "loss/crossentropy": 2.788058453798294, + "loss/logits": 0.9051843047142029, + "step": 18410 + }, + { + "epoch": 0.1842, + "grad_norm": 11.4375, + "grad_norm_var": 0.15729166666666666, + "learning_rate": 0.0003, + "loss": 11.8739, + "loss/aux_loss": 0.0480987248942256, + "loss/crossentropy": 2.725960999727249, + "loss/logits": 0.9009216666221619, + "step": 18420 + }, + { + "epoch": 0.1843, + "grad_norm": 11.5, + "grad_norm_var": 0.0900390625, + "learning_rate": 0.0003, + "loss": 12.0252, + "loss/aux_loss": 0.04809308275580406, + "loss/crossentropy": 2.8324514091014863, + "loss/logits": 0.9101878136396409, + "step": 18430 + }, + { + "epoch": 0.1844, + "grad_norm": 12.1875, + "grad_norm_var": 0.235791015625, + "learning_rate": 0.0003, + "loss": 12.037, + "loss/aux_loss": 0.04810200035572052, + "loss/crossentropy": 2.7521001577377318, + "loss/logits": 0.8864558875560761, + "step": 18440 + }, + { + "epoch": 0.1845, + "grad_norm": 11.75, + "grad_norm_var": 0.21951497395833333, + "learning_rate": 0.0003, + "loss": 11.9625, + "loss/aux_loss": 0.04809632524847984, + "loss/crossentropy": 2.670381647348404, + "loss/logits": 0.9308747231960297, + "step": 18450 + }, + { + "epoch": 0.1846, + "grad_norm": 11.75, + "grad_norm_var": 7.765999348958333, + "learning_rate": 0.0003, + "loss": 11.9008, + "loss/aux_loss": 0.04809861965477467, + "loss/crossentropy": 2.729556679725647, + "loss/logits": 0.9098370641469955, + "step": 18460 + }, + { + "epoch": 0.1847, + "grad_norm": 11.1875, + "grad_norm_var": 0.390087890625, + "learning_rate": 0.0003, + "loss": 11.92, + "loss/aux_loss": 0.04810507521033287, + "loss/crossentropy": 2.72969531416893, + "loss/logits": 0.8956705331802368, + "step": 18470 + }, + { + "epoch": 0.1848, + "grad_norm": 11.375, + "grad_norm_var": 0.366650390625, + "learning_rate": 0.0003, + "loss": 12.1006, + "loss/aux_loss": 0.048082930594682695, + "loss/crossentropy": 2.878212571144104, + "loss/logits": 0.9402207374572754, + "step": 18480 + }, + { + "epoch": 0.1849, + "grad_norm": 12.375, + "grad_norm_var": 0.27316080729166664, + "learning_rate": 0.0003, + "loss": 11.9182, + "loss/aux_loss": 0.048104613274335864, + "loss/crossentropy": 2.8093533754348754, + "loss/logits": 0.9267432987689972, + "step": 18490 + }, + { + "epoch": 0.185, + "grad_norm": 11.3125, + "grad_norm_var": 0.45089518229166664, + "learning_rate": 0.0003, + "loss": 12.0427, + "loss/aux_loss": 0.04809886794537306, + "loss/crossentropy": 2.8468139350414274, + "loss/logits": 0.9497668504714966, + "step": 18500 + }, + { + "epoch": 0.1851, + "grad_norm": 11.875, + "grad_norm_var": 0.48409830729166664, + "learning_rate": 0.0003, + "loss": 12.2465, + "loss/aux_loss": 0.04809215907007456, + "loss/crossentropy": 2.7852579593658446, + "loss/logits": 0.9196423381567002, + "step": 18510 + }, + { + "epoch": 0.1852, + "grad_norm": 11.875, + "grad_norm_var": 0.42701822916666665, + "learning_rate": 0.0003, + "loss": 11.7951, + "loss/aux_loss": 0.04809624664485455, + "loss/crossentropy": 2.7887323558330537, + "loss/logits": 0.9103799790143967, + "step": 18520 + }, + { + "epoch": 0.1853, + "grad_norm": 11.6875, + "grad_norm_var": 0.4661458333333333, + "learning_rate": 0.0003, + "loss": 12.0625, + "loss/aux_loss": 0.04808788318186998, + "loss/crossentropy": 2.808869343996048, + "loss/logits": 0.9083440005779266, + "step": 18530 + }, + { + "epoch": 0.1854, + "grad_norm": 10.8125, + "grad_norm_var": 0.3947265625, + "learning_rate": 0.0003, + "loss": 11.8269, + "loss/aux_loss": 0.04810307510197163, + "loss/crossentropy": 2.657493585348129, + "loss/logits": 0.9473574429750442, + "step": 18540 + }, + { + "epoch": 0.1855, + "grad_norm": 11.75, + "grad_norm_var": 0.34036458333333336, + "learning_rate": 0.0003, + "loss": 11.9915, + "loss/aux_loss": 0.048092805035412314, + "loss/crossentropy": 2.809315764904022, + "loss/logits": 0.9260794132947922, + "step": 18550 + }, + { + "epoch": 0.1856, + "grad_norm": 12.0625, + "grad_norm_var": 0.19920247395833332, + "learning_rate": 0.0003, + "loss": 11.99, + "loss/aux_loss": 0.04810086619108915, + "loss/crossentropy": 2.834631139039993, + "loss/logits": 0.8880037814378738, + "step": 18560 + }, + { + "epoch": 0.1857, + "grad_norm": 11.5625, + "grad_norm_var": 0.27447916666666666, + "learning_rate": 0.0003, + "loss": 12.0215, + "loss/aux_loss": 0.04809237774461508, + "loss/crossentropy": 2.82566694021225, + "loss/logits": 0.9089670658111573, + "step": 18570 + }, + { + "epoch": 0.1858, + "grad_norm": 11.25, + "grad_norm_var": 0.4634765625, + "learning_rate": 0.0003, + "loss": 11.9482, + "loss/aux_loss": 0.048088868334889415, + "loss/crossentropy": 2.8229294657707213, + "loss/logits": 0.9378566771745682, + "step": 18580 + }, + { + "epoch": 0.1859, + "grad_norm": 12.5, + "grad_norm_var": 0.2718587239583333, + "learning_rate": 0.0003, + "loss": 11.8611, + "loss/aux_loss": 0.04808779731392861, + "loss/crossentropy": 2.7925811648368835, + "loss/logits": 0.9152165412902832, + "step": 18590 + }, + { + "epoch": 0.186, + "grad_norm": 12.875, + "grad_norm_var": 0.2869140625, + "learning_rate": 0.0003, + "loss": 11.9566, + "loss/aux_loss": 0.04808272887021303, + "loss/crossentropy": 2.84689114689827, + "loss/logits": 0.9415480852127075, + "step": 18600 + }, + { + "epoch": 0.1861, + "grad_norm": 12.625, + "grad_norm_var": 0.47389322916666665, + "learning_rate": 0.0003, + "loss": 12.017, + "loss/aux_loss": 0.0480954147875309, + "loss/crossentropy": 2.9054375171661375, + "loss/logits": 0.9320352107286454, + "step": 18610 + }, + { + "epoch": 0.1862, + "grad_norm": 12.8125, + "grad_norm_var": 0.4014973958333333, + "learning_rate": 0.0003, + "loss": 11.9752, + "loss/aux_loss": 0.04809523019939661, + "loss/crossentropy": 2.8620842158794404, + "loss/logits": 0.9115070551633835, + "step": 18620 + }, + { + "epoch": 0.1863, + "grad_norm": 11.4375, + "grad_norm_var": 0.3343587239583333, + "learning_rate": 0.0003, + "loss": 11.9462, + "loss/aux_loss": 0.04809815548360348, + "loss/crossentropy": 2.799818730354309, + "loss/logits": 0.9180498957633972, + "step": 18630 + }, + { + "epoch": 0.1864, + "grad_norm": 10.75, + "grad_norm_var": 0.3402180989583333, + "learning_rate": 0.0003, + "loss": 12.1313, + "loss/aux_loss": 0.048090537264943126, + "loss/crossentropy": 2.9112443208694456, + "loss/logits": 0.9628580302000046, + "step": 18640 + }, + { + "epoch": 0.1865, + "grad_norm": 12.0625, + "grad_norm_var": 0.3275390625, + "learning_rate": 0.0003, + "loss": 11.8097, + "loss/aux_loss": 0.04809784088283777, + "loss/crossentropy": 2.7898995995521547, + "loss/logits": 0.8758876919746399, + "step": 18650 + }, + { + "epoch": 0.1866, + "grad_norm": 11.25, + "grad_norm_var": 0.355712890625, + "learning_rate": 0.0003, + "loss": 11.9504, + "loss/aux_loss": 0.04809091240167618, + "loss/crossentropy": 2.883331334590912, + "loss/logits": 0.9137732326984406, + "step": 18660 + }, + { + "epoch": 0.1867, + "grad_norm": 10.875, + "grad_norm_var": 0.2604166666666667, + "learning_rate": 0.0003, + "loss": 11.9183, + "loss/aux_loss": 0.04810288343578577, + "loss/crossentropy": 2.827151381969452, + "loss/logits": 0.9118337035179138, + "step": 18670 + }, + { + "epoch": 0.1868, + "grad_norm": 12.5, + "grad_norm_var": 0.5574055989583333, + "learning_rate": 0.0003, + "loss": 11.7425, + "loss/aux_loss": 0.048099182173609735, + "loss/crossentropy": 2.751882565021515, + "loss/logits": 0.915845838189125, + "step": 18680 + }, + { + "epoch": 0.1869, + "grad_norm": 18.25, + "grad_norm_var": 2.970572916666667, + "learning_rate": 0.0003, + "loss": 11.9627, + "loss/aux_loss": 0.04809111282229424, + "loss/crossentropy": 2.8157127261161805, + "loss/logits": 0.8805839955806732, + "step": 18690 + }, + { + "epoch": 0.187, + "grad_norm": 11.75, + "grad_norm_var": 6.431103515625, + "learning_rate": 0.0003, + "loss": 12.0129, + "loss/aux_loss": 0.04809790011495352, + "loss/crossentropy": 2.9344887137413025, + "loss/logits": 0.9546463966369629, + "step": 18700 + }, + { + "epoch": 0.1871, + "grad_norm": 13.0, + "grad_norm_var": 4.466927083333333, + "learning_rate": 0.0003, + "loss": 11.8991, + "loss/aux_loss": 0.048103202134370804, + "loss/crossentropy": 2.6000198304653166, + "loss/logits": 0.8983616352081298, + "step": 18710 + }, + { + "epoch": 0.1872, + "grad_norm": 10.75, + "grad_norm_var": 1.0605305989583333, + "learning_rate": 0.0003, + "loss": 11.8265, + "loss/aux_loss": 0.048097947239875795, + "loss/crossentropy": 2.74048707485199, + "loss/logits": 0.9315300911664963, + "step": 18720 + }, + { + "epoch": 0.1873, + "grad_norm": 11.0625, + "grad_norm_var": 0.9038899739583334, + "learning_rate": 0.0003, + "loss": 11.8877, + "loss/aux_loss": 0.04809299129992724, + "loss/crossentropy": 2.6987858176231385, + "loss/logits": 0.8788728475570678, + "step": 18730 + }, + { + "epoch": 0.1874, + "grad_norm": 10.875, + "grad_norm_var": 1.0518229166666666, + "learning_rate": 0.0003, + "loss": 11.9914, + "loss/aux_loss": 0.04809512048959732, + "loss/crossentropy": 2.7466448664665224, + "loss/logits": 0.9056605339050293, + "step": 18740 + }, + { + "epoch": 0.1875, + "grad_norm": 11.375, + "grad_norm_var": 0.5421223958333333, + "learning_rate": 0.0003, + "loss": 12.0469, + "loss/aux_loss": 0.04808540716767311, + "loss/crossentropy": 2.8385793924331666, + "loss/logits": 0.9397071808576584, + "step": 18750 + }, + { + "epoch": 0.1876, + "grad_norm": 12.125, + "grad_norm_var": 0.2952962239583333, + "learning_rate": 0.0003, + "loss": 11.9745, + "loss/aux_loss": 0.04808508455753326, + "loss/crossentropy": 2.7859063267707826, + "loss/logits": 0.9249111205339432, + "step": 18760 + }, + { + "epoch": 0.1877, + "grad_norm": 11.8125, + "grad_norm_var": 0.253369140625, + "learning_rate": 0.0003, + "loss": 11.9344, + "loss/aux_loss": 0.048101603612303735, + "loss/crossentropy": 2.96168977022171, + "loss/logits": 0.9427939087152482, + "step": 18770 + }, + { + "epoch": 0.1878, + "grad_norm": 12.125, + "grad_norm_var": 0.23474934895833333, + "learning_rate": 0.0003, + "loss": 11.8409, + "loss/aux_loss": 0.04809587094932795, + "loss/crossentropy": 2.6832815647125243, + "loss/logits": 0.8893307328224183, + "step": 18780 + }, + { + "epoch": 0.1879, + "grad_norm": 12.9375, + "grad_norm_var": 0.46925455729166665, + "learning_rate": 0.0003, + "loss": 11.8433, + "loss/aux_loss": 0.048098374903202054, + "loss/crossentropy": 2.973353409767151, + "loss/logits": 0.9185240358114243, + "step": 18790 + }, + { + "epoch": 0.188, + "grad_norm": 11.3125, + "grad_norm_var": 0.385009765625, + "learning_rate": 0.0003, + "loss": 11.8279, + "loss/aux_loss": 0.04809072986245155, + "loss/crossentropy": 2.883065390586853, + "loss/logits": 0.8887928575277328, + "step": 18800 + }, + { + "epoch": 0.1881, + "grad_norm": 11.4375, + "grad_norm_var": 0.1421875, + "learning_rate": 0.0003, + "loss": 12.0192, + "loss/aux_loss": 0.04809522368013859, + "loss/crossentropy": 2.8153730273246764, + "loss/logits": 0.8990904957056045, + "step": 18810 + }, + { + "epoch": 0.1882, + "grad_norm": 12.125, + "grad_norm_var": 0.44524739583333334, + "learning_rate": 0.0003, + "loss": 12.0603, + "loss/aux_loss": 0.04809008222073317, + "loss/crossentropy": 2.917261230945587, + "loss/logits": 0.9583010584115982, + "step": 18820 + }, + { + "epoch": 0.1883, + "grad_norm": 12.125, + "grad_norm_var": 0.4032389322916667, + "learning_rate": 0.0003, + "loss": 12.0391, + "loss/aux_loss": 0.04809148814529181, + "loss/crossentropy": 2.7239452958106996, + "loss/logits": 0.9369824826717377, + "step": 18830 + }, + { + "epoch": 0.1884, + "grad_norm": 12.375, + "grad_norm_var": 0.9059895833333333, + "learning_rate": 0.0003, + "loss": 11.9066, + "loss/aux_loss": 0.04810024816542864, + "loss/crossentropy": 2.6165760159492493, + "loss/logits": 0.8547280013561249, + "step": 18840 + }, + { + "epoch": 0.1885, + "grad_norm": 17.25, + "grad_norm_var": 2.301497395833333, + "learning_rate": 0.0003, + "loss": 11.9055, + "loss/aux_loss": 0.048094492405653, + "loss/crossentropy": 3.0237093448638914, + "loss/logits": 0.9564484775066375, + "step": 18850 + }, + { + "epoch": 0.1886, + "grad_norm": 11.4375, + "grad_norm_var": 2.128059895833333, + "learning_rate": 0.0003, + "loss": 12.0438, + "loss/aux_loss": 0.04808449726551771, + "loss/crossentropy": 2.8197420120239256, + "loss/logits": 0.9342156380414963, + "step": 18860 + }, + { + "epoch": 0.1887, + "grad_norm": 11.1875, + "grad_norm_var": 0.26691080729166666, + "learning_rate": 0.0003, + "loss": 11.826, + "loss/aux_loss": 0.04809930399060249, + "loss/crossentropy": 2.7798034250736237, + "loss/logits": 0.8755748480558395, + "step": 18870 + }, + { + "epoch": 0.1888, + "grad_norm": 12.0625, + "grad_norm_var": 0.35545247395833335, + "learning_rate": 0.0003, + "loss": 11.6886, + "loss/aux_loss": 0.04809543527662754, + "loss/crossentropy": 2.8541658937931063, + "loss/logits": 0.892428070306778, + "step": 18880 + }, + { + "epoch": 0.1889, + "grad_norm": 11.9375, + "grad_norm_var": 0.32224934895833335, + "learning_rate": 0.0003, + "loss": 11.7692, + "loss/aux_loss": 0.04810644872486591, + "loss/crossentropy": 2.7232487499713898, + "loss/logits": 0.9081037282943726, + "step": 18890 + }, + { + "epoch": 0.189, + "grad_norm": 11.375, + "grad_norm_var": 0.2938639322916667, + "learning_rate": 0.0003, + "loss": 11.9456, + "loss/aux_loss": 0.04808319099247456, + "loss/crossentropy": 2.5981625437736513, + "loss/logits": 0.8822880685329437, + "step": 18900 + }, + { + "epoch": 0.1891, + "grad_norm": 11.8125, + "grad_norm_var": 0.36456705729166666, + "learning_rate": 0.0003, + "loss": 11.9909, + "loss/aux_loss": 0.048096727766096595, + "loss/crossentropy": 2.718340504169464, + "loss/logits": 0.8984918922185898, + "step": 18910 + }, + { + "epoch": 0.1892, + "grad_norm": 12.5, + "grad_norm_var": 0.4791015625, + "learning_rate": 0.0003, + "loss": 12.0497, + "loss/aux_loss": 0.048100071772933004, + "loss/crossentropy": 2.914843189716339, + "loss/logits": 0.9236764490604401, + "step": 18920 + }, + { + "epoch": 0.1893, + "grad_norm": 11.1875, + "grad_norm_var": 0.5247395833333334, + "learning_rate": 0.0003, + "loss": 11.8306, + "loss/aux_loss": 0.0480832876637578, + "loss/crossentropy": 2.8327117800712585, + "loss/logits": 0.9310741007328034, + "step": 18930 + }, + { + "epoch": 0.1894, + "grad_norm": 10.875, + "grad_norm_var": 0.3282389322916667, + "learning_rate": 0.0003, + "loss": 11.8084, + "loss/aux_loss": 0.048092326149344444, + "loss/crossentropy": 2.8622347712516785, + "loss/logits": 0.8887595921754837, + "step": 18940 + }, + { + "epoch": 0.1895, + "grad_norm": 12.3125, + "grad_norm_var": 0.4891764322916667, + "learning_rate": 0.0003, + "loss": 11.9539, + "loss/aux_loss": 0.048091747984290126, + "loss/crossentropy": 2.8625791549682615, + "loss/logits": 0.9395757526159286, + "step": 18950 + }, + { + "epoch": 0.1896, + "grad_norm": 12.0625, + "grad_norm_var": 0.24073893229166668, + "learning_rate": 0.0003, + "loss": 11.8872, + "loss/aux_loss": 0.04809561818838119, + "loss/crossentropy": 2.93541020154953, + "loss/logits": 0.9296731561422348, + "step": 18960 + }, + { + "epoch": 0.1897, + "grad_norm": 11.0625, + "grad_norm_var": 0.5176432291666667, + "learning_rate": 0.0003, + "loss": 11.926, + "loss/aux_loss": 0.04809174351394176, + "loss/crossentropy": 2.85881884098053, + "loss/logits": 0.9174812495708465, + "step": 18970 + }, + { + "epoch": 0.1898, + "grad_norm": 12.8125, + "grad_norm_var": 0.49375, + "learning_rate": 0.0003, + "loss": 11.9255, + "loss/aux_loss": 0.04810503609478474, + "loss/crossentropy": 2.7044818341732024, + "loss/logits": 0.9038681089878082, + "step": 18980 + }, + { + "epoch": 0.1899, + "grad_norm": 12.3125, + "grad_norm_var": 0.323291015625, + "learning_rate": 0.0003, + "loss": 11.934, + "loss/aux_loss": 0.04808723460882902, + "loss/crossentropy": 2.8215215682983397, + "loss/logits": 0.8740798741579056, + "step": 18990 + }, + { + "epoch": 0.19, + "grad_norm": 12.5625, + "grad_norm_var": 0.5848795572916666, + "learning_rate": 0.0003, + "loss": 11.9942, + "loss/aux_loss": 0.04809704348444939, + "loss/crossentropy": 2.9540371537208556, + "loss/logits": 0.9380867898464202, + "step": 19000 + }, + { + "epoch": 0.1901, + "grad_norm": 11.875, + "grad_norm_var": 35.141927083333336, + "learning_rate": 0.0003, + "loss": 12.117, + "loss/aux_loss": 0.04809831455349922, + "loss/crossentropy": 2.848419559001923, + "loss/logits": 0.9661454766988754, + "step": 19010 + }, + { + "epoch": 0.1902, + "grad_norm": 11.75, + "grad_norm_var": 1.568212890625, + "learning_rate": 0.0003, + "loss": 12.1233, + "loss/aux_loss": 0.048095112666487694, + "loss/crossentropy": 2.914617598056793, + "loss/logits": 0.940568807721138, + "step": 19020 + }, + { + "epoch": 0.1903, + "grad_norm": 12.75, + "grad_norm_var": 1.885791015625, + "learning_rate": 0.0003, + "loss": 12.0781, + "loss/aux_loss": 0.04809290152043104, + "loss/crossentropy": 2.9060685276985168, + "loss/logits": 0.9084505170583725, + "step": 19030 + }, + { + "epoch": 0.1904, + "grad_norm": 12.0625, + "grad_norm_var": 0.547509765625, + "learning_rate": 0.0003, + "loss": 11.8626, + "loss/aux_loss": 0.04809753466397524, + "loss/crossentropy": 2.8450966238975526, + "loss/logits": 0.9347045987844467, + "step": 19040 + }, + { + "epoch": 0.1905, + "grad_norm": 11.75, + "grad_norm_var": 0.421728515625, + "learning_rate": 0.0003, + "loss": 11.786, + "loss/aux_loss": 0.04809632711112499, + "loss/crossentropy": 2.6032280802726744, + "loss/logits": 0.8754363477230072, + "step": 19050 + }, + { + "epoch": 0.1906, + "grad_norm": 12.4375, + "grad_norm_var": 0.266650390625, + "learning_rate": 0.0003, + "loss": 11.9427, + "loss/aux_loss": 0.048089970275759696, + "loss/crossentropy": 2.8427577376365663, + "loss/logits": 0.9292992860078811, + "step": 19060 + }, + { + "epoch": 0.1907, + "grad_norm": 11.9375, + "grad_norm_var": 0.29099934895833335, + "learning_rate": 0.0003, + "loss": 12.0185, + "loss/aux_loss": 0.04809428974986076, + "loss/crossentropy": 2.9434940934181215, + "loss/logits": 0.8928971856832504, + "step": 19070 + }, + { + "epoch": 0.1908, + "grad_norm": 11.6875, + "grad_norm_var": 0.7889973958333333, + "learning_rate": 0.0003, + "loss": 11.742, + "loss/aux_loss": 0.048089478723704816, + "loss/crossentropy": 2.7458222687244414, + "loss/logits": 0.9186016976833343, + "step": 19080 + }, + { + "epoch": 0.1909, + "grad_norm": 11.0625, + "grad_norm_var": 1.5890462239583334, + "learning_rate": 0.0003, + "loss": 12.0539, + "loss/aux_loss": 0.048099744878709313, + "loss/crossentropy": 2.829107737541199, + "loss/logits": 0.9015246391296386, + "step": 19090 + }, + { + "epoch": 0.191, + "grad_norm": 11.375, + "grad_norm_var": 1.597900390625, + "learning_rate": 0.0003, + "loss": 11.939, + "loss/aux_loss": 0.04808116909116507, + "loss/crossentropy": 2.874987268447876, + "loss/logits": 0.9092469424009323, + "step": 19100 + }, + { + "epoch": 0.1911, + "grad_norm": 11.625, + "grad_norm_var": 0.36912434895833335, + "learning_rate": 0.0003, + "loss": 11.9227, + "loss/aux_loss": 0.04809539634734392, + "loss/crossentropy": 2.7079729199409486, + "loss/logits": 0.9122515827417373, + "step": 19110 + }, + { + "epoch": 0.1912, + "grad_norm": 11.1875, + "grad_norm_var": 0.3335774739583333, + "learning_rate": 0.0003, + "loss": 12.094, + "loss/aux_loss": 0.0480892339721322, + "loss/crossentropy": 2.6642766416072847, + "loss/logits": 0.9355534881353378, + "step": 19120 + }, + { + "epoch": 0.1913, + "grad_norm": 12.75, + "grad_norm_var": 0.28631184895833334, + "learning_rate": 0.0003, + "loss": 11.8506, + "loss/aux_loss": 0.04809222798794508, + "loss/crossentropy": 2.581315791606903, + "loss/logits": 0.8726950109004974, + "step": 19130 + }, + { + "epoch": 0.1914, + "grad_norm": 11.9375, + "grad_norm_var": 0.19993489583333332, + "learning_rate": 0.0003, + "loss": 11.9463, + "loss/aux_loss": 0.0480927174910903, + "loss/crossentropy": 2.8361901879310607, + "loss/logits": 0.93881676197052, + "step": 19140 + }, + { + "epoch": 0.1915, + "grad_norm": 12.5625, + "grad_norm_var": 0.3233723958333333, + "learning_rate": 0.0003, + "loss": 11.8851, + "loss/aux_loss": 0.04809240084141493, + "loss/crossentropy": 2.772093939781189, + "loss/logits": 0.895565664768219, + "step": 19150 + }, + { + "epoch": 0.1916, + "grad_norm": 12.8125, + "grad_norm_var": 0.42941080729166664, + "learning_rate": 0.0003, + "loss": 11.8773, + "loss/aux_loss": 0.04808843210339546, + "loss/crossentropy": 2.7380272090435027, + "loss/logits": 0.8849580556154251, + "step": 19160 + }, + { + "epoch": 0.1917, + "grad_norm": 11.375, + "grad_norm_var": 0.6075520833333333, + "learning_rate": 0.0003, + "loss": 12.0319, + "loss/aux_loss": 0.04809383936226368, + "loss/crossentropy": 2.679046392440796, + "loss/logits": 0.9247982114553451, + "step": 19170 + }, + { + "epoch": 0.1918, + "grad_norm": 12.5625, + "grad_norm_var": 0.51875, + "learning_rate": 0.0003, + "loss": 11.9646, + "loss/aux_loss": 0.0480990482494235, + "loss/crossentropy": 2.8308603882789614, + "loss/logits": 0.9139487504959106, + "step": 19180 + }, + { + "epoch": 0.1919, + "grad_norm": 12.1875, + "grad_norm_var": 0.8937337239583333, + "learning_rate": 0.0003, + "loss": 11.9498, + "loss/aux_loss": 0.04809563048183918, + "loss/crossentropy": 2.868353658914566, + "loss/logits": 0.9112594306468964, + "step": 19190 + }, + { + "epoch": 0.192, + "grad_norm": 11.75, + "grad_norm_var": 178.2056640625, + "learning_rate": 0.0003, + "loss": 11.9624, + "loss/aux_loss": 0.04809640198945999, + "loss/crossentropy": 2.875636076927185, + "loss/logits": 0.9370063930749893, + "step": 19200 + }, + { + "epoch": 0.1921, + "grad_norm": 11.625, + "grad_norm_var": 0.4247233072916667, + "learning_rate": 0.0003, + "loss": 11.9338, + "loss/aux_loss": 0.048091776110231875, + "loss/crossentropy": 2.885943067073822, + "loss/logits": 0.9205142021179199, + "step": 19210 + }, + { + "epoch": 0.1922, + "grad_norm": 11.3125, + "grad_norm_var": 0.292431640625, + "learning_rate": 0.0003, + "loss": 11.841, + "loss/aux_loss": 0.04809496812522411, + "loss/crossentropy": 2.5971029341220855, + "loss/logits": 0.8990915536880493, + "step": 19220 + }, + { + "epoch": 0.1923, + "grad_norm": 11.6875, + "grad_norm_var": 0.3155598958333333, + "learning_rate": 0.0003, + "loss": 11.9581, + "loss/aux_loss": 0.048088048957288264, + "loss/crossentropy": 2.8906711101531983, + "loss/logits": 0.9080457538366318, + "step": 19230 + }, + { + "epoch": 0.1924, + "grad_norm": 12.8125, + "grad_norm_var": 0.363134765625, + "learning_rate": 0.0003, + "loss": 11.9134, + "loss/aux_loss": 0.04809061922132969, + "loss/crossentropy": 2.7267106890678408, + "loss/logits": 0.8822133630514145, + "step": 19240 + }, + { + "epoch": 0.1925, + "grad_norm": 12.4375, + "grad_norm_var": 0.18854166666666666, + "learning_rate": 0.0003, + "loss": 11.9539, + "loss/aux_loss": 0.04809251707047224, + "loss/crossentropy": 2.834822082519531, + "loss/logits": 0.9051540076732636, + "step": 19250 + }, + { + "epoch": 0.1926, + "grad_norm": 12.1875, + "grad_norm_var": 0.3575520833333333, + "learning_rate": 0.0003, + "loss": 11.8532, + "loss/aux_loss": 0.04809150565415621, + "loss/crossentropy": 2.673792243003845, + "loss/logits": 0.8930087149143219, + "step": 19260 + }, + { + "epoch": 0.1927, + "grad_norm": 11.625, + "grad_norm_var": 0.376025390625, + "learning_rate": 0.0003, + "loss": 11.8667, + "loss/aux_loss": 0.04809306338429451, + "loss/crossentropy": 2.8607924938201905, + "loss/logits": 0.8900872558355332, + "step": 19270 + }, + { + "epoch": 0.1928, + "grad_norm": 12.125, + "grad_norm_var": 0.15714518229166666, + "learning_rate": 0.0003, + "loss": 11.9148, + "loss/aux_loss": 0.04807972889393568, + "loss/crossentropy": 2.803767132759094, + "loss/logits": 0.9348350763320923, + "step": 19280 + }, + { + "epoch": 0.1929, + "grad_norm": 11.125, + "grad_norm_var": 0.41067708333333336, + "learning_rate": 0.0003, + "loss": 11.8139, + "loss/aux_loss": 0.04809964876621962, + "loss/crossentropy": 2.7644663214683534, + "loss/logits": 0.9412413388490677, + "step": 19290 + }, + { + "epoch": 0.193, + "grad_norm": 11.9375, + "grad_norm_var": 15.3375, + "learning_rate": 0.0003, + "loss": 11.8668, + "loss/aux_loss": 0.04809567742049694, + "loss/crossentropy": 2.687431216239929, + "loss/logits": 0.8960238516330719, + "step": 19300 + }, + { + "epoch": 0.1931, + "grad_norm": 12.5, + "grad_norm_var": 14.444645182291667, + "learning_rate": 0.0003, + "loss": 12.0495, + "loss/aux_loss": 0.04808875881135464, + "loss/crossentropy": 2.8696415305137633, + "loss/logits": 0.9359003514051437, + "step": 19310 + }, + { + "epoch": 0.1932, + "grad_norm": 14.25, + "grad_norm_var": 0.6895182291666667, + "learning_rate": 0.0003, + "loss": 11.8149, + "loss/aux_loss": 0.048093396797776224, + "loss/crossentropy": 2.7438224017620088, + "loss/logits": 0.9250198155641556, + "step": 19320 + }, + { + "epoch": 0.1933, + "grad_norm": 11.5625, + "grad_norm_var": 1.2317057291666667, + "learning_rate": 0.0003, + "loss": 11.9555, + "loss/aux_loss": 0.0480903297662735, + "loss/crossentropy": 2.8425944447517395, + "loss/logits": 0.9191201657056809, + "step": 19330 + }, + { + "epoch": 0.1934, + "grad_norm": 13.375, + "grad_norm_var": 0.6511555989583333, + "learning_rate": 0.0003, + "loss": 11.8307, + "loss/aux_loss": 0.04809546768665314, + "loss/crossentropy": 2.794313246011734, + "loss/logits": 0.8963600903749466, + "step": 19340 + }, + { + "epoch": 0.1935, + "grad_norm": 11.6875, + "grad_norm_var": 4.347135416666666, + "learning_rate": 0.0003, + "loss": 11.9059, + "loss/aux_loss": 0.048097337037324904, + "loss/crossentropy": 2.8060832381248475, + "loss/logits": 0.923801937699318, + "step": 19350 + }, + { + "epoch": 0.1936, + "grad_norm": 11.25, + "grad_norm_var": 0.4930826822916667, + "learning_rate": 0.0003, + "loss": 12.0551, + "loss/aux_loss": 0.048092464171350005, + "loss/crossentropy": 2.737381660938263, + "loss/logits": 0.9238520950078964, + "step": 19360 + }, + { + "epoch": 0.1937, + "grad_norm": 12.0, + "grad_norm_var": 2.992431640625, + "learning_rate": 0.0003, + "loss": 11.9236, + "loss/aux_loss": 0.04809140842407942, + "loss/crossentropy": 2.7283401012420656, + "loss/logits": 0.9115884095430374, + "step": 19370 + }, + { + "epoch": 0.1938, + "grad_norm": 13.25, + "grad_norm_var": 3.064306640625, + "learning_rate": 0.0003, + "loss": 11.9373, + "loss/aux_loss": 0.04809054136276245, + "loss/crossentropy": 2.9607500314712523, + "loss/logits": 0.9213700443506241, + "step": 19380 + }, + { + "epoch": 0.1939, + "grad_norm": 12.5, + "grad_norm_var": 0.5308430989583334, + "learning_rate": 0.0003, + "loss": 12.0326, + "loss/aux_loss": 0.04808128047734499, + "loss/crossentropy": 2.966229736804962, + "loss/logits": 0.9744098156690597, + "step": 19390 + }, + { + "epoch": 0.194, + "grad_norm": 11.75, + "grad_norm_var": 8.039697265625, + "learning_rate": 0.0003, + "loss": 11.8697, + "loss/aux_loss": 0.04811720736324787, + "loss/crossentropy": 2.8290202260017394, + "loss/logits": 0.9148620575666427, + "step": 19400 + }, + { + "epoch": 0.1941, + "grad_norm": 12.0625, + "grad_norm_var": 0.25983072916666666, + "learning_rate": 0.0003, + "loss": 11.81, + "loss/aux_loss": 0.0480983579531312, + "loss/crossentropy": 2.618745720386505, + "loss/logits": 0.8423079371452331, + "step": 19410 + }, + { + "epoch": 0.1942, + "grad_norm": 11.875, + "grad_norm_var": 15.804671223958334, + "learning_rate": 0.0003, + "loss": 11.8575, + "loss/aux_loss": 0.048095330409705636, + "loss/crossentropy": 2.8630192160606383, + "loss/logits": 0.9032826870679855, + "step": 19420 + }, + { + "epoch": 0.1943, + "grad_norm": 12.3125, + "grad_norm_var": 0.23776041666666667, + "learning_rate": 0.0003, + "loss": 11.8413, + "loss/aux_loss": 0.04809598363935948, + "loss/crossentropy": 2.714806389808655, + "loss/logits": 0.8757014304399491, + "step": 19430 + }, + { + "epoch": 0.1944, + "grad_norm": 12.0, + "grad_norm_var": 0.3473307291666667, + "learning_rate": 0.0003, + "loss": 11.9041, + "loss/aux_loss": 0.04809326659888029, + "loss/crossentropy": 2.81378653049469, + "loss/logits": 0.9226094603538513, + "step": 19440 + }, + { + "epoch": 0.1945, + "grad_norm": 12.9375, + "grad_norm_var": 0.469384765625, + "learning_rate": 0.0003, + "loss": 12.0162, + "loss/aux_loss": 0.04810178130865097, + "loss/crossentropy": 2.7176017642021177, + "loss/logits": 0.8890448838472367, + "step": 19450 + }, + { + "epoch": 0.1946, + "grad_norm": 12.25, + "grad_norm_var": 0.7030598958333333, + "learning_rate": 0.0003, + "loss": 11.8237, + "loss/aux_loss": 0.048099988326430324, + "loss/crossentropy": 2.578825032711029, + "loss/logits": 0.8693708449602127, + "step": 19460 + }, + { + "epoch": 0.1947, + "grad_norm": 11.875, + "grad_norm_var": 0.24191080729166667, + "learning_rate": 0.0003, + "loss": 11.9591, + "loss/aux_loss": 0.04810124989598989, + "loss/crossentropy": 2.919858819246292, + "loss/logits": 0.9405999302864074, + "step": 19470 + }, + { + "epoch": 0.1948, + "grad_norm": 11.3125, + "grad_norm_var": 0.07786458333333333, + "learning_rate": 0.0003, + "loss": 11.876, + "loss/aux_loss": 0.04809066876769066, + "loss/crossentropy": 2.830302083492279, + "loss/logits": 0.9027499586343766, + "step": 19480 + }, + { + "epoch": 0.1949, + "grad_norm": 11.8125, + "grad_norm_var": 2.059830729166667, + "learning_rate": 0.0003, + "loss": 11.879, + "loss/aux_loss": 0.04809845667332411, + "loss/crossentropy": 2.846599793434143, + "loss/logits": 0.8904654294252395, + "step": 19490 + }, + { + "epoch": 0.195, + "grad_norm": 11.6875, + "grad_norm_var": 1.7852701822916666, + "learning_rate": 0.0003, + "loss": 11.9823, + "loss/aux_loss": 0.048098682425916195, + "loss/crossentropy": 2.6993426620960235, + "loss/logits": 0.9028889060020446, + "step": 19500 + }, + { + "epoch": 0.1951, + "grad_norm": 11.5, + "grad_norm_var": 0.4869140625, + "learning_rate": 0.0003, + "loss": 11.8731, + "loss/aux_loss": 0.04808852486312389, + "loss/crossentropy": 2.9450518250465394, + "loss/logits": 0.923612329363823, + "step": 19510 + }, + { + "epoch": 0.1952, + "grad_norm": 11.9375, + "grad_norm_var": 0.6424479166666667, + "learning_rate": 0.0003, + "loss": 11.9629, + "loss/aux_loss": 0.048087956570088866, + "loss/crossentropy": 2.7310150384902956, + "loss/logits": 0.8867404013872147, + "step": 19520 + }, + { + "epoch": 0.1953, + "grad_norm": 11.6875, + "grad_norm_var": 0.7098307291666667, + "learning_rate": 0.0003, + "loss": 11.9431, + "loss/aux_loss": 0.04809447377920151, + "loss/crossentropy": 2.7376580953598024, + "loss/logits": 0.9287895351648331, + "step": 19530 + }, + { + "epoch": 0.1954, + "grad_norm": 11.9375, + "grad_norm_var": 0.45078125, + "learning_rate": 0.0003, + "loss": 11.9501, + "loss/aux_loss": 0.04808611460030079, + "loss/crossentropy": 2.8004270434379577, + "loss/logits": 0.9067860126495362, + "step": 19540 + }, + { + "epoch": 0.1955, + "grad_norm": 11.6875, + "grad_norm_var": 0.2916015625, + "learning_rate": 0.0003, + "loss": 11.8895, + "loss/aux_loss": 0.048095055297017096, + "loss/crossentropy": 2.7705915451049803, + "loss/logits": 0.8861099511384964, + "step": 19550 + }, + { + "epoch": 0.1956, + "grad_norm": 11.3125, + "grad_norm_var": 0.3238118489583333, + "learning_rate": 0.0003, + "loss": 11.8624, + "loss/aux_loss": 0.048082451336085796, + "loss/crossentropy": 2.7829070925712585, + "loss/logits": 0.9209084331989288, + "step": 19560 + }, + { + "epoch": 0.1957, + "grad_norm": 11.6875, + "grad_norm_var": 48.423893229166666, + "learning_rate": 0.0003, + "loss": 11.8985, + "loss/aux_loss": 0.048102785088121894, + "loss/crossentropy": 2.795788884162903, + "loss/logits": 0.9219643086194992, + "step": 19570 + }, + { + "epoch": 0.1958, + "grad_norm": 12.0, + "grad_norm_var": 0.17649739583333332, + "learning_rate": 0.0003, + "loss": 11.8718, + "loss/aux_loss": 0.04808918442577124, + "loss/crossentropy": 2.6786233842372895, + "loss/logits": 0.8851831436157227, + "step": 19580 + }, + { + "epoch": 0.1959, + "grad_norm": 12.4375, + "grad_norm_var": 0.28958333333333336, + "learning_rate": 0.0003, + "loss": 11.9507, + "loss/aux_loss": 0.048088375851511954, + "loss/crossentropy": 2.7911401748657227, + "loss/logits": 0.9245690137147904, + "step": 19590 + }, + { + "epoch": 0.196, + "grad_norm": 12.0, + "grad_norm_var": 0.4019368489583333, + "learning_rate": 0.0003, + "loss": 12.0528, + "loss/aux_loss": 0.04808861836791038, + "loss/crossentropy": 2.89783319234848, + "loss/logits": 0.9419608056545258, + "step": 19600 + }, + { + "epoch": 0.1961, + "grad_norm": 12.25, + "grad_norm_var": 4.918733723958334, + "learning_rate": 0.0003, + "loss": 11.9651, + "loss/aux_loss": 0.048108036443591115, + "loss/crossentropy": 2.7094775795936585, + "loss/logits": 0.8831329464912414, + "step": 19610 + }, + { + "epoch": 0.1962, + "grad_norm": 13.75, + "grad_norm_var": 0.55234375, + "learning_rate": 0.0003, + "loss": 11.9267, + "loss/aux_loss": 0.048093420639634135, + "loss/crossentropy": 2.6996466517448425, + "loss/logits": 0.8816385596990586, + "step": 19620 + }, + { + "epoch": 0.1963, + "grad_norm": 11.875, + "grad_norm_var": 0.6469889322916667, + "learning_rate": 0.0003, + "loss": 12.072, + "loss/aux_loss": 0.0480880755931139, + "loss/crossentropy": 2.9747036695480347, + "loss/logits": 0.9532156825065613, + "step": 19630 + }, + { + "epoch": 0.1964, + "grad_norm": 11.3125, + "grad_norm_var": 0.18776041666666668, + "learning_rate": 0.0003, + "loss": 11.9037, + "loss/aux_loss": 0.04809441566467285, + "loss/crossentropy": 2.816385340690613, + "loss/logits": 0.8943806827068329, + "step": 19640 + }, + { + "epoch": 0.1965, + "grad_norm": 11.6875, + "grad_norm_var": 0.3033854166666667, + "learning_rate": 0.0003, + "loss": 11.8965, + "loss/aux_loss": 0.04809524808079004, + "loss/crossentropy": 2.786463499069214, + "loss/logits": 0.8947435468435287, + "step": 19650 + }, + { + "epoch": 0.1966, + "grad_norm": 11.5, + "grad_norm_var": 0.31443684895833335, + "learning_rate": 0.0003, + "loss": 11.9229, + "loss/aux_loss": 0.04809077382087708, + "loss/crossentropy": 2.635916793346405, + "loss/logits": 0.8936503291130066, + "step": 19660 + }, + { + "epoch": 0.1967, + "grad_norm": 11.9375, + "grad_norm_var": 74.60792643229166, + "learning_rate": 0.0003, + "loss": 11.9338, + "loss/aux_loss": 0.04811822287738323, + "loss/crossentropy": 2.851349139213562, + "loss/logits": 0.9552539438009262, + "step": 19670 + }, + { + "epoch": 0.1968, + "grad_norm": 13.3125, + "grad_norm_var": 539.7253743489583, + "learning_rate": 0.0003, + "loss": 11.8808, + "loss/aux_loss": 0.04809991996735334, + "loss/crossentropy": 2.686124062538147, + "loss/logits": 0.8603927254676819, + "step": 19680 + }, + { + "epoch": 0.1969, + "grad_norm": 11.125, + "grad_norm_var": 2.3004557291666665, + "learning_rate": 0.0003, + "loss": 12.0303, + "loss/aux_loss": 0.048102331534028056, + "loss/crossentropy": 2.856088125705719, + "loss/logits": 0.8935580879449845, + "step": 19690 + }, + { + "epoch": 0.197, + "grad_norm": 12.4375, + "grad_norm_var": 0.290625, + "learning_rate": 0.0003, + "loss": 11.6799, + "loss/aux_loss": 0.04809893500059843, + "loss/crossentropy": 2.4471123695373533, + "loss/logits": 0.8147686392068862, + "step": 19700 + }, + { + "epoch": 0.1971, + "grad_norm": 15.0, + "grad_norm_var": 36.69993489583333, + "learning_rate": 0.0003, + "loss": 11.9236, + "loss/aux_loss": 0.048108641244471076, + "loss/crossentropy": 2.7717926442623138, + "loss/logits": 0.9015519857406616, + "step": 19710 + }, + { + "epoch": 0.1972, + "grad_norm": 11.4375, + "grad_norm_var": 35.6734375, + "learning_rate": 0.0003, + "loss": 11.9069, + "loss/aux_loss": 0.04808454010635614, + "loss/crossentropy": 2.759740972518921, + "loss/logits": 0.916330274939537, + "step": 19720 + }, + { + "epoch": 0.1973, + "grad_norm": 11.8125, + "grad_norm_var": 0.5555826822916666, + "learning_rate": 0.0003, + "loss": 11.8078, + "loss/aux_loss": 0.04809134602546692, + "loss/crossentropy": 2.824735289812088, + "loss/logits": 0.915794974565506, + "step": 19730 + }, + { + "epoch": 0.1974, + "grad_norm": 11.9375, + "grad_norm_var": 0.36354166666666665, + "learning_rate": 0.0003, + "loss": 12.0126, + "loss/aux_loss": 0.04809140507131815, + "loss/crossentropy": 2.8288570284843444, + "loss/logits": 0.899117162823677, + "step": 19740 + }, + { + "epoch": 0.1975, + "grad_norm": 12.8125, + "grad_norm_var": 0.6856608072916667, + "learning_rate": 0.0003, + "loss": 12.0629, + "loss/aux_loss": 0.04809539392590523, + "loss/crossentropy": 2.808449399471283, + "loss/logits": 0.9411624908447266, + "step": 19750 + }, + { + "epoch": 0.1976, + "grad_norm": 12.0625, + "grad_norm_var": 0.8481608072916667, + "learning_rate": 0.0003, + "loss": 11.8754, + "loss/aux_loss": 0.04808392804116011, + "loss/crossentropy": 2.8559494376182557, + "loss/logits": 0.8965466380119324, + "step": 19760 + }, + { + "epoch": 0.1977, + "grad_norm": 12.25, + "grad_norm_var": 0.3729166666666667, + "learning_rate": 0.0003, + "loss": 11.7598, + "loss/aux_loss": 0.048097134567797184, + "loss/crossentropy": 2.6028787732124328, + "loss/logits": 0.883382824063301, + "step": 19770 + }, + { + "epoch": 0.1978, + "grad_norm": 12.4375, + "grad_norm_var": 0.375, + "learning_rate": 0.0003, + "loss": 12.0172, + "loss/aux_loss": 0.0480909226462245, + "loss/crossentropy": 2.745894658565521, + "loss/logits": 0.8921247333288193, + "step": 19780 + }, + { + "epoch": 0.1979, + "grad_norm": 11.875, + "grad_norm_var": 0.14869791666666668, + "learning_rate": 0.0003, + "loss": 11.9666, + "loss/aux_loss": 0.04809285439550877, + "loss/crossentropy": 2.7850330114364623, + "loss/logits": 0.912046593427658, + "step": 19790 + }, + { + "epoch": 0.198, + "grad_norm": 11.3125, + "grad_norm_var": 0.4832682291666667, + "learning_rate": 0.0003, + "loss": 11.9582, + "loss/aux_loss": 0.048090188205242156, + "loss/crossentropy": 2.5545909225940706, + "loss/logits": 0.8726184368133545, + "step": 19800 + }, + { + "epoch": 0.1981, + "grad_norm": 12.1875, + "grad_norm_var": 0.6088541666666667, + "learning_rate": 0.0003, + "loss": 11.8976, + "loss/aux_loss": 0.04808424971997738, + "loss/crossentropy": 2.764713633060455, + "loss/logits": 0.9010277688503265, + "step": 19810 + }, + { + "epoch": 0.1982, + "grad_norm": 11.75, + "grad_norm_var": 0.27005208333333336, + "learning_rate": 0.0003, + "loss": 11.933, + "loss/aux_loss": 0.048095231875777245, + "loss/crossentropy": 2.8057524442672728, + "loss/logits": 0.912197208404541, + "step": 19820 + }, + { + "epoch": 0.1983, + "grad_norm": 11.5625, + "grad_norm_var": 0.286962890625, + "learning_rate": 0.0003, + "loss": 11.8571, + "loss/aux_loss": 0.048092585243284705, + "loss/crossentropy": 2.7631799936294557, + "loss/logits": 0.9289673507213593, + "step": 19830 + }, + { + "epoch": 0.1984, + "grad_norm": 11.3125, + "grad_norm_var": 0.17291666666666666, + "learning_rate": 0.0003, + "loss": 11.7907, + "loss/aux_loss": 0.04810852501541376, + "loss/crossentropy": 2.6866043627262117, + "loss/logits": 0.905146250128746, + "step": 19840 + }, + { + "epoch": 0.1985, + "grad_norm": 11.5, + "grad_norm_var": 0.29816080729166666, + "learning_rate": 0.0003, + "loss": 11.8136, + "loss/aux_loss": 0.048087861575186255, + "loss/crossentropy": 2.9205057621002197, + "loss/logits": 0.8634889364242554, + "step": 19850 + }, + { + "epoch": 0.1986, + "grad_norm": 12.0, + "grad_norm_var": 0.5166015625, + "learning_rate": 0.0003, + "loss": 12.0486, + "loss/aux_loss": 0.0480953972786665, + "loss/crossentropy": 2.9846151471138, + "loss/logits": 0.929237163066864, + "step": 19860 + }, + { + "epoch": 0.1987, + "grad_norm": 13.0625, + "grad_norm_var": 0.6348307291666667, + "learning_rate": 0.0003, + "loss": 11.9695, + "loss/aux_loss": 0.0481043117120862, + "loss/crossentropy": 2.7358814120292663, + "loss/logits": 0.8989768654108048, + "step": 19870 + }, + { + "epoch": 0.1988, + "grad_norm": 13.5625, + "grad_norm_var": 4.075260416666667, + "learning_rate": 0.0003, + "loss": 12.0024, + "loss/aux_loss": 0.04810099173337221, + "loss/crossentropy": 2.8778868198394774, + "loss/logits": 0.945749819278717, + "step": 19880 + }, + { + "epoch": 0.1989, + "grad_norm": 12.5, + "grad_norm_var": 0.3583170572916667, + "learning_rate": 0.0003, + "loss": 11.8718, + "loss/aux_loss": 0.04808882363140583, + "loss/crossentropy": 2.848455381393433, + "loss/logits": 0.9049903869628906, + "step": 19890 + }, + { + "epoch": 0.199, + "grad_norm": 11.4375, + "grad_norm_var": 0.24680989583333332, + "learning_rate": 0.0003, + "loss": 11.8808, + "loss/aux_loss": 0.04809690471738577, + "loss/crossentropy": 2.8279551804065703, + "loss/logits": 0.9005701452493667, + "step": 19900 + }, + { + "epoch": 0.1991, + "grad_norm": 11.5, + "grad_norm_var": 0.353369140625, + "learning_rate": 0.0003, + "loss": 11.9659, + "loss/aux_loss": 0.04810180887579918, + "loss/crossentropy": 2.8082579135894776, + "loss/logits": 0.9061174720525742, + "step": 19910 + }, + { + "epoch": 0.1992, + "grad_norm": 13.25, + "grad_norm_var": 0.4071451822916667, + "learning_rate": 0.0003, + "loss": 11.7249, + "loss/aux_loss": 0.0480797978118062, + "loss/crossentropy": 2.827571380138397, + "loss/logits": 0.8759961783885956, + "step": 19920 + }, + { + "epoch": 0.1993, + "grad_norm": 11.875, + "grad_norm_var": 0.49192708333333335, + "learning_rate": 0.0003, + "loss": 12.0004, + "loss/aux_loss": 0.04808774013072252, + "loss/crossentropy": 2.8538602709770204, + "loss/logits": 0.9502677023410797, + "step": 19930 + }, + { + "epoch": 0.1994, + "grad_norm": 12.5, + "grad_norm_var": 0.41868489583333335, + "learning_rate": 0.0003, + "loss": 11.6786, + "loss/aux_loss": 0.04809605274349451, + "loss/crossentropy": 2.667774814367294, + "loss/logits": 0.8690688908100128, + "step": 19940 + }, + { + "epoch": 0.1995, + "grad_norm": 12.4375, + "grad_norm_var": 0.18214518229166668, + "learning_rate": 0.0003, + "loss": 11.7086, + "loss/aux_loss": 0.04809432104229927, + "loss/crossentropy": 2.7703699648380278, + "loss/logits": 0.8963902860879898, + "step": 19950 + }, + { + "epoch": 0.1996, + "grad_norm": 11.6875, + "grad_norm_var": 0.4400390625, + "learning_rate": 0.0003, + "loss": 11.8438, + "loss/aux_loss": 0.04808981604874134, + "loss/crossentropy": 2.852495664358139, + "loss/logits": 0.9264784097671509, + "step": 19960 + }, + { + "epoch": 0.1997, + "grad_norm": 12.5625, + "grad_norm_var": 0.5706868489583333, + "learning_rate": 0.0003, + "loss": 11.8371, + "loss/aux_loss": 0.04808941353112459, + "loss/crossentropy": 2.6668283939361572, + "loss/logits": 0.8973574638366699, + "step": 19970 + }, + { + "epoch": 0.1998, + "grad_norm": 11.9375, + "grad_norm_var": 0.645556640625, + "learning_rate": 0.0003, + "loss": 12.0197, + "loss/aux_loss": 0.04809361547231674, + "loss/crossentropy": 2.7544242978096007, + "loss/logits": 0.9390354514122009, + "step": 19980 + }, + { + "epoch": 0.1999, + "grad_norm": 13.5, + "grad_norm_var": 0.3184733072916667, + "learning_rate": 0.0003, + "loss": 11.998, + "loss/aux_loss": 0.04809475131332874, + "loss/crossentropy": 2.8074730575084685, + "loss/logits": 0.8892990052700043, + "step": 19990 + }, + { + "epoch": 0.2, + "grad_norm": 12.375, + "grad_norm_var": 0.36795247395833336, + "learning_rate": 0.0003, + "loss": 11.8505, + "loss/aux_loss": 0.04809391163289547, + "loss/crossentropy": 2.663703387975693, + "loss/logits": 0.8925030082464218, + "step": 20000 + }, + { + "epoch": 0.2001, + "grad_norm": 13.0625, + "grad_norm_var": 0.281494140625, + "learning_rate": 0.0003, + "loss": 11.9197, + "loss/aux_loss": 0.048094058968126775, + "loss/crossentropy": 2.886281728744507, + "loss/logits": 0.8830851048231125, + "step": 20010 + }, + { + "epoch": 0.2002, + "grad_norm": 11.75, + "grad_norm_var": 0.480322265625, + "learning_rate": 0.0003, + "loss": 11.8763, + "loss/aux_loss": 0.04809100497514009, + "loss/crossentropy": 2.7841232419013977, + "loss/logits": 0.8877080678939819, + "step": 20020 + }, + { + "epoch": 0.2003, + "grad_norm": 11.125, + "grad_norm_var": 3.560660807291667, + "learning_rate": 0.0003, + "loss": 11.629, + "loss/aux_loss": 0.048091770894825456, + "loss/crossentropy": 2.6575345516204836, + "loss/logits": 0.8857085227966308, + "step": 20030 + }, + { + "epoch": 0.2004, + "grad_norm": 12.8125, + "grad_norm_var": 3.2007649739583335, + "learning_rate": 0.0003, + "loss": 11.8182, + "loss/aux_loss": 0.0480912720784545, + "loss/crossentropy": 2.7816062927246095, + "loss/logits": 0.8824679642915726, + "step": 20040 + }, + { + "epoch": 0.2005, + "grad_norm": 11.375, + "grad_norm_var": 0.8089680989583333, + "learning_rate": 0.0003, + "loss": 11.8584, + "loss/aux_loss": 0.048090060241520405, + "loss/crossentropy": 2.7619201481342315, + "loss/logits": 0.8898406118154526, + "step": 20050 + }, + { + "epoch": 0.2006, + "grad_norm": 11.5625, + "grad_norm_var": 0.8502604166666666, + "learning_rate": 0.0003, + "loss": 11.7573, + "loss/aux_loss": 0.04808730930089951, + "loss/crossentropy": 2.752053952217102, + "loss/logits": 0.9270232617855072, + "step": 20060 + }, + { + "epoch": 0.2007, + "grad_norm": 12.3125, + "grad_norm_var": 0.26287434895833334, + "learning_rate": 0.0003, + "loss": 11.9228, + "loss/aux_loss": 0.04809353221207857, + "loss/crossentropy": 2.749705493450165, + "loss/logits": 0.9575551152229309, + "step": 20070 + }, + { + "epoch": 0.2008, + "grad_norm": 12.25, + "grad_norm_var": 0.28489583333333335, + "learning_rate": 0.0003, + "loss": 11.7852, + "loss/aux_loss": 0.04809745699167252, + "loss/crossentropy": 2.6136541962623596, + "loss/logits": 0.9018281251192093, + "step": 20080 + }, + { + "epoch": 0.2009, + "grad_norm": 11.375, + "grad_norm_var": 0.24264322916666667, + "learning_rate": 0.0003, + "loss": 11.6765, + "loss/aux_loss": 0.04809559304267168, + "loss/crossentropy": 2.757488173246384, + "loss/logits": 0.9004332274198532, + "step": 20090 + }, + { + "epoch": 0.201, + "grad_norm": 11.6875, + "grad_norm_var": 0.10154622395833333, + "learning_rate": 0.0003, + "loss": 11.942, + "loss/aux_loss": 0.04809119962155819, + "loss/crossentropy": 2.8438161253929137, + "loss/logits": 0.9208786696195602, + "step": 20100 + }, + { + "epoch": 0.2011, + "grad_norm": 10.8125, + "grad_norm_var": 1.2364583333333334, + "learning_rate": 0.0003, + "loss": 11.8541, + "loss/aux_loss": 0.048097537644207475, + "loss/crossentropy": 2.763727468252182, + "loss/logits": 0.9115354359149933, + "step": 20110 + }, + { + "epoch": 0.2012, + "grad_norm": 11.6875, + "grad_norm_var": 0.5036295572916667, + "learning_rate": 0.0003, + "loss": 11.8807, + "loss/aux_loss": 0.04808099064975977, + "loss/crossentropy": 2.8801899015903474, + "loss/logits": 0.9038815647363663, + "step": 20120 + }, + { + "epoch": 0.2013, + "grad_norm": 12.0625, + "grad_norm_var": 0.41848958333333336, + "learning_rate": 0.0003, + "loss": 11.9244, + "loss/aux_loss": 0.048102755844593045, + "loss/crossentropy": 2.9221291661262514, + "loss/logits": 0.9072444885969162, + "step": 20130 + }, + { + "epoch": 0.2014, + "grad_norm": 12.3125, + "grad_norm_var": 0.21223958333333334, + "learning_rate": 0.0003, + "loss": 11.7636, + "loss/aux_loss": 0.04809268806129694, + "loss/crossentropy": 2.69385387301445, + "loss/logits": 0.9101363003253937, + "step": 20140 + }, + { + "epoch": 0.2015, + "grad_norm": 11.75, + "grad_norm_var": 0.7202962239583334, + "learning_rate": 0.0003, + "loss": 11.82, + "loss/aux_loss": 0.04809230994433165, + "loss/crossentropy": 2.800033277273178, + "loss/logits": 0.9162040054798126, + "step": 20150 + }, + { + "epoch": 0.2016, + "grad_norm": 11.9375, + "grad_norm_var": 0.5473795572916667, + "learning_rate": 0.0003, + "loss": 11.9676, + "loss/aux_loss": 0.04809824600815773, + "loss/crossentropy": 2.828506714105606, + "loss/logits": 0.8795387417078018, + "step": 20160 + }, + { + "epoch": 0.2017, + "grad_norm": 11.75, + "grad_norm_var": 0.40305989583333335, + "learning_rate": 0.0003, + "loss": 11.9633, + "loss/aux_loss": 0.0480917414650321, + "loss/crossentropy": 2.8800631880760195, + "loss/logits": 0.875808122754097, + "step": 20170 + }, + { + "epoch": 0.2018, + "grad_norm": 12.4375, + "grad_norm_var": 0.460009765625, + "learning_rate": 0.0003, + "loss": 11.8485, + "loss/aux_loss": 0.04808654151856899, + "loss/crossentropy": 2.7784875988960267, + "loss/logits": 0.9081717103719711, + "step": 20180 + }, + { + "epoch": 0.2019, + "grad_norm": 12.4375, + "grad_norm_var": 0.23917643229166666, + "learning_rate": 0.0003, + "loss": 11.8831, + "loss/aux_loss": 0.04809090811759233, + "loss/crossentropy": 2.8120036482810975, + "loss/logits": 0.9035557597875595, + "step": 20190 + }, + { + "epoch": 0.202, + "grad_norm": 11.625, + "grad_norm_var": 0.2228515625, + "learning_rate": 0.0003, + "loss": 11.8836, + "loss/aux_loss": 0.048092659749090674, + "loss/crossentropy": 2.607446867227554, + "loss/logits": 0.9118954926729202, + "step": 20200 + }, + { + "epoch": 0.2021, + "grad_norm": 11.9375, + "grad_norm_var": 0.41139322916666665, + "learning_rate": 0.0003, + "loss": 11.9498, + "loss/aux_loss": 0.04809481520205736, + "loss/crossentropy": 2.781216490268707, + "loss/logits": 0.88041250705719, + "step": 20210 + }, + { + "epoch": 0.2022, + "grad_norm": 12.5625, + "grad_norm_var": 1.6801432291666667, + "learning_rate": 0.0003, + "loss": 12.0076, + "loss/aux_loss": 0.04808514565229416, + "loss/crossentropy": 2.691144472360611, + "loss/logits": 0.9039320826530457, + "step": 20220 + }, + { + "epoch": 0.2023, + "grad_norm": 12.875, + "grad_norm_var": 1.501416015625, + "learning_rate": 0.0003, + "loss": 11.8567, + "loss/aux_loss": 0.048104914277791976, + "loss/crossentropy": 2.6379098296165466, + "loss/logits": 0.8757845312356949, + "step": 20230 + }, + { + "epoch": 0.2024, + "grad_norm": 11.625, + "grad_norm_var": 165.79401041666668, + "learning_rate": 0.0003, + "loss": 11.967, + "loss/aux_loss": 0.0480836084112525, + "loss/crossentropy": 2.774720752239227, + "loss/logits": 0.8843321442604065, + "step": 20240 + }, + { + "epoch": 0.2025, + "grad_norm": 12.6875, + "grad_norm_var": 0.454931640625, + "learning_rate": 0.0003, + "loss": 11.9247, + "loss/aux_loss": 0.04810081459581852, + "loss/crossentropy": 2.810983347892761, + "loss/logits": 0.9296145677566529, + "step": 20250 + }, + { + "epoch": 0.2026, + "grad_norm": 12.0625, + "grad_norm_var": 0.36795247395833336, + "learning_rate": 0.0003, + "loss": 11.6745, + "loss/aux_loss": 0.04808822646737099, + "loss/crossentropy": 2.684907627105713, + "loss/logits": 0.8728846788406373, + "step": 20260 + }, + { + "epoch": 0.2027, + "grad_norm": 12.375, + "grad_norm_var": 0.21339518229166668, + "learning_rate": 0.0003, + "loss": 11.9664, + "loss/aux_loss": 0.048096506483852865, + "loss/crossentropy": 2.8471142053604126, + "loss/logits": 0.9133492529392242, + "step": 20270 + }, + { + "epoch": 0.2028, + "grad_norm": 11.4375, + "grad_norm_var": 0.7161295572916667, + "learning_rate": 0.0003, + "loss": 11.7893, + "loss/aux_loss": 0.04809205681085586, + "loss/crossentropy": 2.612596166133881, + "loss/logits": 0.870291605591774, + "step": 20280 + }, + { + "epoch": 0.2029, + "grad_norm": 12.8125, + "grad_norm_var": 0.8388020833333333, + "learning_rate": 0.0003, + "loss": 11.9102, + "loss/aux_loss": 0.04809516165405512, + "loss/crossentropy": 3.0022507131099703, + "loss/logits": 0.9417583554983139, + "step": 20290 + }, + { + "epoch": 0.203, + "grad_norm": 11.8125, + "grad_norm_var": 0.9346354166666667, + "learning_rate": 0.0003, + "loss": 11.919, + "loss/aux_loss": 0.048092111200094226, + "loss/crossentropy": 2.8318777084350586, + "loss/logits": 0.9274811953306198, + "step": 20300 + }, + { + "epoch": 0.2031, + "grad_norm": 11.4375, + "grad_norm_var": 0.48639322916666666, + "learning_rate": 0.0003, + "loss": 11.8993, + "loss/aux_loss": 0.04809704571962357, + "loss/crossentropy": 2.653505039215088, + "loss/logits": 0.8742677927017212, + "step": 20310 + }, + { + "epoch": 0.2032, + "grad_norm": 11.875, + "grad_norm_var": 0.3296875, + "learning_rate": 0.0003, + "loss": 11.8204, + "loss/aux_loss": 0.04809397198259831, + "loss/crossentropy": 2.8869922399520873, + "loss/logits": 0.9349960386753082, + "step": 20320 + }, + { + "epoch": 0.2033, + "grad_norm": 12.625, + "grad_norm_var": 0.4853515625, + "learning_rate": 0.0003, + "loss": 11.7685, + "loss/aux_loss": 0.04808988757431507, + "loss/crossentropy": 2.713202327489853, + "loss/logits": 0.9036098659038544, + "step": 20330 + }, + { + "epoch": 0.2034, + "grad_norm": 11.25, + "grad_norm_var": 0.31243489583333334, + "learning_rate": 0.0003, + "loss": 11.783, + "loss/aux_loss": 0.04809346310794353, + "loss/crossentropy": 2.7694801688194275, + "loss/logits": 0.9182763814926147, + "step": 20340 + }, + { + "epoch": 0.2035, + "grad_norm": 12.4375, + "grad_norm_var": 0.44308268229166664, + "learning_rate": 0.0003, + "loss": 11.8069, + "loss/aux_loss": 0.04808987472206354, + "loss/crossentropy": 2.7956194162368773, + "loss/logits": 0.8902797639369965, + "step": 20350 + }, + { + "epoch": 0.2036, + "grad_norm": 11.6875, + "grad_norm_var": 0.18567708333333333, + "learning_rate": 0.0003, + "loss": 11.8991, + "loss/aux_loss": 0.048092426359653474, + "loss/crossentropy": 2.7856826066970823, + "loss/logits": 0.8947367310523987, + "step": 20360 + }, + { + "epoch": 0.2037, + "grad_norm": 11.8125, + "grad_norm_var": 7.349739583333333, + "learning_rate": 0.0003, + "loss": 11.8399, + "loss/aux_loss": 0.04809611644595861, + "loss/crossentropy": 2.7929181456565857, + "loss/logits": 0.9144560486078263, + "step": 20370 + }, + { + "epoch": 0.2038, + "grad_norm": 13.625, + "grad_norm_var": 0.5590983072916667, + "learning_rate": 0.0003, + "loss": 11.8832, + "loss/aux_loss": 0.04809699356555939, + "loss/crossentropy": 2.8222780883312226, + "loss/logits": 0.8883333146572113, + "step": 20380 + }, + { + "epoch": 0.2039, + "grad_norm": 12.0625, + "grad_norm_var": 7.416650390625, + "learning_rate": 0.0003, + "loss": 11.7412, + "loss/aux_loss": 0.048093249835073945, + "loss/crossentropy": 2.8504996478557585, + "loss/logits": 0.9075500249862671, + "step": 20390 + }, + { + "epoch": 0.204, + "grad_norm": 12.0625, + "grad_norm_var": 0.23331705729166666, + "learning_rate": 0.0003, + "loss": 12.0127, + "loss/aux_loss": 0.04809419121593237, + "loss/crossentropy": 2.8577288150787354, + "loss/logits": 0.902344498038292, + "step": 20400 + }, + { + "epoch": 0.2041, + "grad_norm": 12.5, + "grad_norm_var": 77.52389322916666, + "learning_rate": 0.0003, + "loss": 11.7763, + "loss/aux_loss": 0.04809053186327219, + "loss/crossentropy": 2.835852700471878, + "loss/logits": 0.9226241886615754, + "step": 20410 + }, + { + "epoch": 0.2042, + "grad_norm": 11.4375, + "grad_norm_var": 0.4442057291666667, + "learning_rate": 0.0003, + "loss": 11.8972, + "loss/aux_loss": 0.04809423293918371, + "loss/crossentropy": 2.8161026298999787, + "loss/logits": 0.926646676659584, + "step": 20420 + }, + { + "epoch": 0.2043, + "grad_norm": 11.4375, + "grad_norm_var": 0.691650390625, + "learning_rate": 0.0003, + "loss": 11.8844, + "loss/aux_loss": 0.04809097535908222, + "loss/crossentropy": 2.763902723789215, + "loss/logits": 0.9052618652582168, + "step": 20430 + }, + { + "epoch": 0.2044, + "grad_norm": 12.6875, + "grad_norm_var": 0.6910807291666666, + "learning_rate": 0.0003, + "loss": 11.7705, + "loss/aux_loss": 0.04810637105256319, + "loss/crossentropy": 2.6885470867156984, + "loss/logits": 0.8763923466205596, + "step": 20440 + }, + { + "epoch": 0.2045, + "grad_norm": 13.8125, + "grad_norm_var": 56.29894205729167, + "learning_rate": 0.0003, + "loss": 11.9226, + "loss/aux_loss": 0.048100747354328635, + "loss/crossentropy": 2.8076935350894927, + "loss/logits": 0.894736310839653, + "step": 20450 + }, + { + "epoch": 0.2046, + "grad_norm": 12.8125, + "grad_norm_var": 54.62161458333333, + "learning_rate": 0.0003, + "loss": 11.8524, + "loss/aux_loss": 0.0480956481769681, + "loss/crossentropy": 2.8430003762245177, + "loss/logits": 0.9245178937911988, + "step": 20460 + }, + { + "epoch": 0.2047, + "grad_norm": 11.3125, + "grad_norm_var": 0.391650390625, + "learning_rate": 0.0003, + "loss": 11.7571, + "loss/aux_loss": 0.04809457026422024, + "loss/crossentropy": 2.6352721631526945, + "loss/logits": 0.8530379116535187, + "step": 20470 + }, + { + "epoch": 0.2048, + "grad_norm": 13.4375, + "grad_norm_var": 0.6627604166666666, + "learning_rate": 0.0003, + "loss": 11.9712, + "loss/aux_loss": 0.04810174349695444, + "loss/crossentropy": 2.8618993282318117, + "loss/logits": 0.8944301903247833, + "step": 20480 + }, + { + "epoch": 0.2049, + "grad_norm": 13.9375, + "grad_norm_var": 0.7, + "learning_rate": 0.0003, + "loss": 11.951, + "loss/aux_loss": 0.04809792432934046, + "loss/crossentropy": 2.7998989462852477, + "loss/logits": 0.8966112703084945, + "step": 20490 + }, + { + "epoch": 0.205, + "grad_norm": 13.625, + "grad_norm_var": 0.350244140625, + "learning_rate": 0.0003, + "loss": 12.0158, + "loss/aux_loss": 0.04808657988905907, + "loss/crossentropy": 2.919311285018921, + "loss/logits": 0.9131373822689056, + "step": 20500 + }, + { + "epoch": 0.2051, + "grad_norm": 11.625, + "grad_norm_var": 0.4176432291666667, + "learning_rate": 0.0003, + "loss": 11.8259, + "loss/aux_loss": 0.04808542001992464, + "loss/crossentropy": 2.8769485354423523, + "loss/logits": 0.9414841264486313, + "step": 20510 + }, + { + "epoch": 0.2052, + "grad_norm": 11.9375, + "grad_norm_var": 0.343994140625, + "learning_rate": 0.0003, + "loss": 11.9157, + "loss/aux_loss": 0.04809294939041138, + "loss/crossentropy": 2.6883323431015014, + "loss/logits": 0.8925897628068924, + "step": 20520 + }, + { + "epoch": 0.2053, + "grad_norm": 11.25, + "grad_norm_var": 0.4405598958333333, + "learning_rate": 0.0003, + "loss": 11.9364, + "loss/aux_loss": 0.04809175301343203, + "loss/crossentropy": 2.723712849617004, + "loss/logits": 0.8805695950984955, + "step": 20530 + }, + { + "epoch": 0.2054, + "grad_norm": 12.25, + "grad_norm_var": 0.5085774739583333, + "learning_rate": 0.0003, + "loss": 11.7972, + "loss/aux_loss": 0.04808581694960594, + "loss/crossentropy": 2.7969413816928865, + "loss/logits": 0.8847203850746155, + "step": 20540 + }, + { + "epoch": 0.2055, + "grad_norm": 11.6875, + "grad_norm_var": 0.1775390625, + "learning_rate": 0.0003, + "loss": 11.8719, + "loss/aux_loss": 0.04808731451630592, + "loss/crossentropy": 2.7113927245140075, + "loss/logits": 0.8873605281114578, + "step": 20550 + }, + { + "epoch": 0.2056, + "grad_norm": 12.6875, + "grad_norm_var": 0.39837239583333334, + "learning_rate": 0.0003, + "loss": 11.7171, + "loss/aux_loss": 0.048092583753168584, + "loss/crossentropy": 2.7325293242931368, + "loss/logits": 0.8983477979898453, + "step": 20560 + }, + { + "epoch": 0.2057, + "grad_norm": 12.25, + "grad_norm_var": 0.3421223958333333, + "learning_rate": 0.0003, + "loss": 11.791, + "loss/aux_loss": 0.048091687634587287, + "loss/crossentropy": 2.744081234931946, + "loss/logits": 0.9124333083629608, + "step": 20570 + }, + { + "epoch": 0.2058, + "grad_norm": 12.0625, + "grad_norm_var": 0.3817057291666667, + "learning_rate": 0.0003, + "loss": 11.8137, + "loss/aux_loss": 0.04810182899236679, + "loss/crossentropy": 2.82774156332016, + "loss/logits": 0.9164555937051773, + "step": 20580 + }, + { + "epoch": 0.2059, + "grad_norm": 12.9375, + "grad_norm_var": 2.7134765625, + "learning_rate": 0.0003, + "loss": 11.7681, + "loss/aux_loss": 0.04808175358921289, + "loss/crossentropy": 2.7752291679382326, + "loss/logits": 0.923569667339325, + "step": 20590 + }, + { + "epoch": 0.206, + "grad_norm": 11.875, + "grad_norm_var": 2.0637858072916666, + "learning_rate": 0.0003, + "loss": 11.8039, + "loss/aux_loss": 0.04809446018189192, + "loss/crossentropy": 2.7302875399589537, + "loss/logits": 0.9092204391956329, + "step": 20600 + }, + { + "epoch": 0.2061, + "grad_norm": 12.8125, + "grad_norm_var": 0.3763020833333333, + "learning_rate": 0.0003, + "loss": 11.7904, + "loss/aux_loss": 0.04808835070580244, + "loss/crossentropy": 2.9346749424934386, + "loss/logits": 0.8693037539720535, + "step": 20610 + }, + { + "epoch": 0.2062, + "grad_norm": 11.625, + "grad_norm_var": 0.2877604166666667, + "learning_rate": 0.0003, + "loss": 11.7153, + "loss/aux_loss": 0.04808915480971336, + "loss/crossentropy": 2.891802728176117, + "loss/logits": 0.872070437669754, + "step": 20620 + }, + { + "epoch": 0.2063, + "grad_norm": 12.25, + "grad_norm_var": 43.424739583333334, + "learning_rate": 0.0003, + "loss": 11.8111, + "loss/aux_loss": 0.0481000667437911, + "loss/crossentropy": 2.752538466453552, + "loss/logits": 0.910025691986084, + "step": 20630 + }, + { + "epoch": 0.2064, + "grad_norm": 12.1875, + "grad_norm_var": 0.3419270833333333, + "learning_rate": 0.0003, + "loss": 11.7758, + "loss/aux_loss": 0.04808867033571005, + "loss/crossentropy": 2.816843068599701, + "loss/logits": 0.8870363384485245, + "step": 20640 + }, + { + "epoch": 0.2065, + "grad_norm": 11.4375, + "grad_norm_var": 0.27537434895833335, + "learning_rate": 0.0003, + "loss": 11.9281, + "loss/aux_loss": 0.04809427950531244, + "loss/crossentropy": 2.798200511932373, + "loss/logits": 0.9375192672014236, + "step": 20650 + }, + { + "epoch": 0.2066, + "grad_norm": 12.375, + "grad_norm_var": 3.9953125, + "learning_rate": 0.0003, + "loss": 11.9466, + "loss/aux_loss": 0.048091378435492514, + "loss/crossentropy": 2.707651823759079, + "loss/logits": 0.8876151233911515, + "step": 20660 + }, + { + "epoch": 0.2067, + "grad_norm": 12.5, + "grad_norm_var": 1.7358723958333333, + "learning_rate": 0.0003, + "loss": 11.8981, + "loss/aux_loss": 0.048106171749532224, + "loss/crossentropy": 2.697846329212189, + "loss/logits": 0.9072348445653915, + "step": 20670 + }, + { + "epoch": 0.2068, + "grad_norm": 13.0, + "grad_norm_var": 1.8640625, + "learning_rate": 0.0003, + "loss": 11.7331, + "loss/aux_loss": 0.04809394646435976, + "loss/crossentropy": 2.8085982382297514, + "loss/logits": 0.8863144606351853, + "step": 20680 + }, + { + "epoch": 0.2069, + "grad_norm": 12.6875, + "grad_norm_var": 0.5259765625, + "learning_rate": 0.0003, + "loss": 11.6847, + "loss/aux_loss": 0.048090824671089646, + "loss/crossentropy": 2.688712340593338, + "loss/logits": 0.9194134473800659, + "step": 20690 + }, + { + "epoch": 0.207, + "grad_norm": 12.25, + "grad_norm_var": 0.43430989583333335, + "learning_rate": 0.0003, + "loss": 11.8249, + "loss/aux_loss": 0.048093540407717225, + "loss/crossentropy": 2.8308887600898744, + "loss/logits": 0.9012588620185852, + "step": 20700 + }, + { + "epoch": 0.2071, + "grad_norm": 12.625, + "grad_norm_var": 0.38483072916666666, + "learning_rate": 0.0003, + "loss": 11.8041, + "loss/aux_loss": 0.04809002298861742, + "loss/crossentropy": 2.7663665294647215, + "loss/logits": 0.8768691569566727, + "step": 20710 + }, + { + "epoch": 0.2072, + "grad_norm": 12.0625, + "grad_norm_var": 0.8957682291666667, + "learning_rate": 0.0003, + "loss": 11.7693, + "loss/aux_loss": 0.0480813292786479, + "loss/crossentropy": 2.7900067985057833, + "loss/logits": 0.9126892119646073, + "step": 20720 + }, + { + "epoch": 0.2073, + "grad_norm": 11.25, + "grad_norm_var": 0.735400390625, + "learning_rate": 0.0003, + "loss": 11.8439, + "loss/aux_loss": 0.04809948187321424, + "loss/crossentropy": 2.7565189003944397, + "loss/logits": 0.8766478002071381, + "step": 20730 + }, + { + "epoch": 0.2074, + "grad_norm": 12.125, + "grad_norm_var": 0.45558268229166665, + "learning_rate": 0.0003, + "loss": 11.8518, + "loss/aux_loss": 0.048083253763616086, + "loss/crossentropy": 2.7713675141334533, + "loss/logits": 0.8875041484832764, + "step": 20740 + }, + { + "epoch": 0.2075, + "grad_norm": 12.8125, + "grad_norm_var": 0.9202962239583333, + "learning_rate": 0.0003, + "loss": 11.9413, + "loss/aux_loss": 0.04809508752077818, + "loss/crossentropy": 2.937278914451599, + "loss/logits": 0.9425408929586411, + "step": 20750 + }, + { + "epoch": 0.2076, + "grad_norm": 11.5, + "grad_norm_var": 0.9744140625, + "learning_rate": 0.0003, + "loss": 11.9612, + "loss/aux_loss": 0.04808584563434124, + "loss/crossentropy": 2.837298500537872, + "loss/logits": 0.9269993782043457, + "step": 20760 + }, + { + "epoch": 0.2077, + "grad_norm": 12.0, + "grad_norm_var": 0.24420572916666666, + "learning_rate": 0.0003, + "loss": 11.8762, + "loss/aux_loss": 0.0480938971042633, + "loss/crossentropy": 2.7334739685058596, + "loss/logits": 0.8894063144922256, + "step": 20770 + }, + { + "epoch": 0.2078, + "grad_norm": 11.4375, + "grad_norm_var": 0.2652180989583333, + "learning_rate": 0.0003, + "loss": 11.8865, + "loss/aux_loss": 0.04807817693799734, + "loss/crossentropy": 2.7916306495666503, + "loss/logits": 0.9009652465581894, + "step": 20780 + }, + { + "epoch": 0.2079, + "grad_norm": 12.4375, + "grad_norm_var": 0.24921875, + "learning_rate": 0.0003, + "loss": 11.8028, + "loss/aux_loss": 0.0480882965028286, + "loss/crossentropy": 2.7257829308509827, + "loss/logits": 0.9154664635658264, + "step": 20790 + }, + { + "epoch": 0.208, + "grad_norm": 12.6875, + "grad_norm_var": 0.2400390625, + "learning_rate": 0.0003, + "loss": 12.0261, + "loss/aux_loss": 0.048092817142605784, + "loss/crossentropy": 2.7954021215438845, + "loss/logits": 0.9113053381443024, + "step": 20800 + }, + { + "epoch": 0.2081, + "grad_norm": 13.0, + "grad_norm_var": 0.2259765625, + "learning_rate": 0.0003, + "loss": 11.7759, + "loss/aux_loss": 0.04809394646435976, + "loss/crossentropy": 2.8197373390197753, + "loss/logits": 0.8813621670007705, + "step": 20810 + }, + { + "epoch": 0.2082, + "grad_norm": 19.5, + "grad_norm_var": 3.6874348958333334, + "learning_rate": 0.0003, + "loss": 11.6814, + "loss/aux_loss": 0.0480814166367054, + "loss/crossentropy": 2.817176288366318, + "loss/logits": 0.9225684970617294, + "step": 20820 + }, + { + "epoch": 0.2083, + "grad_norm": 13.0625, + "grad_norm_var": 4.002718098958334, + "learning_rate": 0.0003, + "loss": 12.0019, + "loss/aux_loss": 0.0480852359905839, + "loss/crossentropy": 2.668290489912033, + "loss/logits": 0.9053199380636215, + "step": 20830 + }, + { + "epoch": 0.2084, + "grad_norm": 11.875, + "grad_norm_var": 0.613134765625, + "learning_rate": 0.0003, + "loss": 11.8592, + "loss/aux_loss": 0.04809372667223215, + "loss/crossentropy": 2.7935187935829164, + "loss/logits": 0.8976625889539719, + "step": 20840 + }, + { + "epoch": 0.2085, + "grad_norm": 13.125, + "grad_norm_var": 0.438134765625, + "learning_rate": 0.0003, + "loss": 11.6427, + "loss/aux_loss": 0.04809672702103853, + "loss/crossentropy": 2.868895101547241, + "loss/logits": 0.8949025511741638, + "step": 20850 + }, + { + "epoch": 0.2086, + "grad_norm": 12.375, + "grad_norm_var": 0.442822265625, + "learning_rate": 0.0003, + "loss": 11.7802, + "loss/aux_loss": 0.04807591456919909, + "loss/crossentropy": 2.884802359342575, + "loss/logits": 0.88098503947258, + "step": 20860 + }, + { + "epoch": 0.2087, + "grad_norm": 11.875, + "grad_norm_var": 0.331103515625, + "learning_rate": 0.0003, + "loss": 11.8061, + "loss/aux_loss": 0.048096058703958985, + "loss/crossentropy": 2.86653151512146, + "loss/logits": 0.9177994340658188, + "step": 20870 + }, + { + "epoch": 0.2088, + "grad_norm": 11.5625, + "grad_norm_var": 0.1890625, + "learning_rate": 0.0003, + "loss": 11.6698, + "loss/aux_loss": 0.04808076079934835, + "loss/crossentropy": 2.669018977880478, + "loss/logits": 0.8996531933546066, + "step": 20880 + }, + { + "epoch": 0.2089, + "grad_norm": 11.375, + "grad_norm_var": 0.10584309895833334, + "learning_rate": 0.0003, + "loss": 11.7736, + "loss/aux_loss": 0.048097938485443595, + "loss/crossentropy": 2.814938074350357, + "loss/logits": 0.8803753167390823, + "step": 20890 + }, + { + "epoch": 0.209, + "grad_norm": 12.6875, + "grad_norm_var": 0.249462890625, + "learning_rate": 0.0003, + "loss": 11.8773, + "loss/aux_loss": 0.048087738640606406, + "loss/crossentropy": 2.8559614181518556, + "loss/logits": 0.924946254491806, + "step": 20900 + }, + { + "epoch": 0.2091, + "grad_norm": 11.8125, + "grad_norm_var": 0.45677083333333335, + "learning_rate": 0.0003, + "loss": 11.7792, + "loss/aux_loss": 0.04809671528637409, + "loss/crossentropy": 2.833332586288452, + "loss/logits": 0.9064707219600677, + "step": 20910 + }, + { + "epoch": 0.2092, + "grad_norm": 12.875, + "grad_norm_var": 0.45115559895833335, + "learning_rate": 0.0003, + "loss": 11.6766, + "loss/aux_loss": 0.048085590824484825, + "loss/crossentropy": 2.6808117508888243, + "loss/logits": 0.8879122287034988, + "step": 20920 + }, + { + "epoch": 0.2093, + "grad_norm": 13.0, + "grad_norm_var": 0.3841145833333333, + "learning_rate": 0.0003, + "loss": 11.926, + "loss/aux_loss": 0.048085760325193405, + "loss/crossentropy": 2.8332987904548643, + "loss/logits": 0.9374050021171569, + "step": 20930 + }, + { + "epoch": 0.2094, + "grad_norm": 12.375, + "grad_norm_var": 0.48125, + "learning_rate": 0.0003, + "loss": 11.7561, + "loss/aux_loss": 0.0480853458866477, + "loss/crossentropy": 2.8423936545848845, + "loss/logits": 0.9064114809036254, + "step": 20940 + }, + { + "epoch": 0.2095, + "grad_norm": 12.25, + "grad_norm_var": 0.6079264322916667, + "learning_rate": 0.0003, + "loss": 11.7373, + "loss/aux_loss": 0.048095690459012984, + "loss/crossentropy": 2.6432439744472505, + "loss/logits": 0.8873191922903061, + "step": 20950 + }, + { + "epoch": 0.2096, + "grad_norm": 12.0625, + "grad_norm_var": 0.54296875, + "learning_rate": 0.0003, + "loss": 11.7411, + "loss/aux_loss": 0.04809337351471186, + "loss/crossentropy": 2.7309992611408234, + "loss/logits": 0.8804535895586014, + "step": 20960 + }, + { + "epoch": 0.2097, + "grad_norm": 12.0, + "grad_norm_var": 0.32734375, + "learning_rate": 0.0003, + "loss": 11.8208, + "loss/aux_loss": 0.04809424672275782, + "loss/crossentropy": 2.733998316526413, + "loss/logits": 0.8988215506076813, + "step": 20970 + }, + { + "epoch": 0.2098, + "grad_norm": 11.9375, + "grad_norm_var": 0.21222330729166666, + "learning_rate": 0.0003, + "loss": 11.896, + "loss/aux_loss": 0.04808952175080776, + "loss/crossentropy": 2.5865807056427004, + "loss/logits": 0.854627400636673, + "step": 20980 + }, + { + "epoch": 0.2099, + "grad_norm": 12.375, + "grad_norm_var": 0.38645833333333335, + "learning_rate": 0.0003, + "loss": 11.8622, + "loss/aux_loss": 0.04808156508952379, + "loss/crossentropy": 2.851080930233002, + "loss/logits": 0.9109200239181519, + "step": 20990 + }, + { + "epoch": 0.21, + "grad_norm": 11.6875, + "grad_norm_var": 0.49661458333333336, + "learning_rate": 0.0003, + "loss": 11.7082, + "loss/aux_loss": 0.04808969963341951, + "loss/crossentropy": 2.84299578666687, + "loss/logits": 0.8802772104740143, + "step": 21000 + }, + { + "epoch": 0.2101, + "grad_norm": 12.375, + "grad_norm_var": 0.5040201822916667, + "learning_rate": 0.0003, + "loss": 11.8945, + "loss/aux_loss": 0.048087981343269345, + "loss/crossentropy": 2.6353746175765993, + "loss/logits": 0.8833418905735015, + "step": 21010 + }, + { + "epoch": 0.2102, + "grad_norm": 11.75, + "grad_norm_var": 0.4593098958333333, + "learning_rate": 0.0003, + "loss": 11.6485, + "loss/aux_loss": 0.048100402019917964, + "loss/crossentropy": 2.8276872038841248, + "loss/logits": 0.9351934552192688, + "step": 21020 + }, + { + "epoch": 0.2103, + "grad_norm": 12.6875, + "grad_norm_var": 0.4083170572916667, + "learning_rate": 0.0003, + "loss": 11.6644, + "loss/aux_loss": 0.048088280111551286, + "loss/crossentropy": 2.8238101243972777, + "loss/logits": 0.8919139176607132, + "step": 21030 + }, + { + "epoch": 0.2104, + "grad_norm": 11.375, + "grad_norm_var": 0.4735514322916667, + "learning_rate": 0.0003, + "loss": 11.6249, + "loss/aux_loss": 0.04809662196785212, + "loss/crossentropy": 2.7005931556224825, + "loss/logits": 0.8710683017969132, + "step": 21040 + }, + { + "epoch": 0.2105, + "grad_norm": 12.0, + "grad_norm_var": 0.3633951822916667, + "learning_rate": 0.0003, + "loss": 11.895, + "loss/aux_loss": 0.048086578585207464, + "loss/crossentropy": 2.9533318161964415, + "loss/logits": 0.9189774692058563, + "step": 21050 + }, + { + "epoch": 0.2106, + "grad_norm": 13.0, + "grad_norm_var": 0.34542643229166664, + "learning_rate": 0.0003, + "loss": 11.933, + "loss/aux_loss": 0.04809347465634346, + "loss/crossentropy": 2.7933754503726957, + "loss/logits": 0.8989807814359665, + "step": 21060 + }, + { + "epoch": 0.2107, + "grad_norm": 11.625, + "grad_norm_var": 0.3900390625, + "learning_rate": 0.0003, + "loss": 11.7229, + "loss/aux_loss": 0.048093566112220286, + "loss/crossentropy": 2.909538185596466, + "loss/logits": 0.9254848033189773, + "step": 21070 + }, + { + "epoch": 0.2108, + "grad_norm": 12.3125, + "grad_norm_var": 0.48483072916666664, + "learning_rate": 0.0003, + "loss": 11.8718, + "loss/aux_loss": 0.048095266707241534, + "loss/crossentropy": 2.77746034860611, + "loss/logits": 0.8918098568916321, + "step": 21080 + }, + { + "epoch": 0.2109, + "grad_norm": 12.3125, + "grad_norm_var": 0.5109375, + "learning_rate": 0.0003, + "loss": 11.8693, + "loss/aux_loss": 0.04808137100189924, + "loss/crossentropy": 2.8595972299575805, + "loss/logits": 0.9113215535879136, + "step": 21090 + }, + { + "epoch": 0.211, + "grad_norm": 12.1875, + "grad_norm_var": 0.226416015625, + "learning_rate": 0.0003, + "loss": 11.7439, + "loss/aux_loss": 0.04809752646833658, + "loss/crossentropy": 2.8955005407333374, + "loss/logits": 0.9018853276968002, + "step": 21100 + }, + { + "epoch": 0.2111, + "grad_norm": 13.3125, + "grad_norm_var": 0.44021809895833336, + "learning_rate": 0.0003, + "loss": 11.8998, + "loss/aux_loss": 0.04808686450123787, + "loss/crossentropy": 2.8281142473220826, + "loss/logits": 0.9361472398042678, + "step": 21110 + }, + { + "epoch": 0.2112, + "grad_norm": 11.8125, + "grad_norm_var": 0.7166015625, + "learning_rate": 0.0003, + "loss": 11.9078, + "loss/aux_loss": 0.04808351919054985, + "loss/crossentropy": 2.8331224858760833, + "loss/logits": 0.925226366519928, + "step": 21120 + }, + { + "epoch": 0.2113, + "grad_norm": 12.25, + "grad_norm_var": 0.20045572916666668, + "learning_rate": 0.0003, + "loss": 11.8609, + "loss/aux_loss": 0.048093733564019206, + "loss/crossentropy": 2.7209898710250853, + "loss/logits": 0.9094552010297775, + "step": 21130 + }, + { + "epoch": 0.2114, + "grad_norm": 13.1875, + "grad_norm_var": 0.3734212239583333, + "learning_rate": 0.0003, + "loss": 11.8317, + "loss/aux_loss": 0.04808988496661186, + "loss/crossentropy": 2.7308087766170503, + "loss/logits": 0.875358846783638, + "step": 21140 + }, + { + "epoch": 0.2115, + "grad_norm": 11.875, + "grad_norm_var": 1.264306640625, + "learning_rate": 0.0003, + "loss": 11.7752, + "loss/aux_loss": 0.04809015057981014, + "loss/crossentropy": 2.744631814956665, + "loss/logits": 0.890999186038971, + "step": 21150 + }, + { + "epoch": 0.2116, + "grad_norm": 11.3125, + "grad_norm_var": 1.3072916666666667, + "learning_rate": 0.0003, + "loss": 11.7284, + "loss/aux_loss": 0.04809484537690878, + "loss/crossentropy": 2.7210877299308778, + "loss/logits": 0.8934634417295456, + "step": 21160 + }, + { + "epoch": 0.2117, + "grad_norm": 13.75, + "grad_norm_var": 0.718603515625, + "learning_rate": 0.0003, + "loss": 11.6448, + "loss/aux_loss": 0.04808205626904964, + "loss/crossentropy": 2.6400113105773926, + "loss/logits": 0.8528676211833954, + "step": 21170 + }, + { + "epoch": 0.2118, + "grad_norm": 11.625, + "grad_norm_var": 0.5536295572916666, + "learning_rate": 0.0003, + "loss": 11.881, + "loss/aux_loss": 0.0481026129797101, + "loss/crossentropy": 2.7602346658706667, + "loss/logits": 0.8984622836112977, + "step": 21180 + }, + { + "epoch": 0.2119, + "grad_norm": 12.3125, + "grad_norm_var": 0.2978515625, + "learning_rate": 0.0003, + "loss": 11.9188, + "loss/aux_loss": 0.04807357918471098, + "loss/crossentropy": 2.9864767670631407, + "loss/logits": 0.9314229309558868, + "step": 21190 + }, + { + "epoch": 0.212, + "grad_norm": 12.0, + "grad_norm_var": 0.6145182291666667, + "learning_rate": 0.0003, + "loss": 12.0001, + "loss/aux_loss": 0.048103776201605795, + "loss/crossentropy": 2.816402053833008, + "loss/logits": 0.904208105802536, + "step": 21200 + }, + { + "epoch": 0.2121, + "grad_norm": 12.0625, + "grad_norm_var": 0.7231608072916667, + "learning_rate": 0.0003, + "loss": 11.7257, + "loss/aux_loss": 0.048085693083703515, + "loss/crossentropy": 2.708358186483383, + "loss/logits": 0.898724827170372, + "step": 21210 + }, + { + "epoch": 0.2122, + "grad_norm": 12.0, + "grad_norm_var": 15.506103515625, + "learning_rate": 0.0003, + "loss": 11.7972, + "loss/aux_loss": 0.04809206072241068, + "loss/crossentropy": 2.744321119785309, + "loss/logits": 0.9206572264432907, + "step": 21220 + }, + { + "epoch": 0.2123, + "grad_norm": 14.3125, + "grad_norm_var": 14.617041015625, + "learning_rate": 0.0003, + "loss": 11.7268, + "loss/aux_loss": 0.048096605204045774, + "loss/crossentropy": 2.72475118637085, + "loss/logits": 0.874206417798996, + "step": 21230 + }, + { + "epoch": 0.2124, + "grad_norm": 12.9375, + "grad_norm_var": 0.4019368489583333, + "learning_rate": 0.0003, + "loss": 11.8328, + "loss/aux_loss": 0.04808426145464182, + "loss/crossentropy": 2.815295088291168, + "loss/logits": 0.9246113210916519, + "step": 21240 + }, + { + "epoch": 0.2125, + "grad_norm": 11.8125, + "grad_norm_var": 0.375634765625, + "learning_rate": 0.0003, + "loss": 11.8002, + "loss/aux_loss": 0.048097210749983785, + "loss/crossentropy": 2.5762141942977905, + "loss/logits": 0.8894416421651841, + "step": 21250 + }, + { + "epoch": 0.2126, + "grad_norm": 13.0, + "grad_norm_var": 0.410400390625, + "learning_rate": 0.0003, + "loss": 11.8324, + "loss/aux_loss": 0.048088593408465385, + "loss/crossentropy": 2.783554768562317, + "loss/logits": 0.8944692641496659, + "step": 21260 + }, + { + "epoch": 0.2127, + "grad_norm": 12.375, + "grad_norm_var": 0.23956705729166666, + "learning_rate": 0.0003, + "loss": 11.9322, + "loss/aux_loss": 0.048084756731987, + "loss/crossentropy": 2.8908560514450072, + "loss/logits": 0.9285173654556275, + "step": 21270 + }, + { + "epoch": 0.2128, + "grad_norm": 12.4375, + "grad_norm_var": 0.3651041666666667, + "learning_rate": 0.0003, + "loss": 11.7035, + "loss/aux_loss": 0.04809834379702806, + "loss/crossentropy": 2.7666608333587646, + "loss/logits": 0.8596565514802933, + "step": 21280 + }, + { + "epoch": 0.2129, + "grad_norm": 13.125, + "grad_norm_var": 0.397900390625, + "learning_rate": 0.0003, + "loss": 11.8816, + "loss/aux_loss": 0.048090608604252336, + "loss/crossentropy": 2.7386081337928774, + "loss/logits": 0.8871663898229599, + "step": 21290 + }, + { + "epoch": 0.213, + "grad_norm": 11.8125, + "grad_norm_var": 0.5919108072916667, + "learning_rate": 0.0003, + "loss": 11.6848, + "loss/aux_loss": 0.04808788150548935, + "loss/crossentropy": 2.851125454902649, + "loss/logits": 0.9087550818920136, + "step": 21300 + }, + { + "epoch": 0.2131, + "grad_norm": 12.4375, + "grad_norm_var": 0.3337890625, + "learning_rate": 0.0003, + "loss": 11.6448, + "loss/aux_loss": 0.0480893436819315, + "loss/crossentropy": 3.0054169058799745, + "loss/logits": 0.9328852593898773, + "step": 21310 + }, + { + "epoch": 0.2132, + "grad_norm": 11.75, + "grad_norm_var": 0.529931640625, + "learning_rate": 0.0003, + "loss": 11.6956, + "loss/aux_loss": 0.048091666772961617, + "loss/crossentropy": 2.760949170589447, + "loss/logits": 0.8893462926149368, + "step": 21320 + }, + { + "epoch": 0.2133, + "grad_norm": 13.125, + "grad_norm_var": 0.3516764322916667, + "learning_rate": 0.0003, + "loss": 11.8081, + "loss/aux_loss": 0.0480879507958889, + "loss/crossentropy": 2.766853415966034, + "loss/logits": 0.8801511764526367, + "step": 21330 + }, + { + "epoch": 0.2134, + "grad_norm": 12.125, + "grad_norm_var": 0.45193684895833336, + "learning_rate": 0.0003, + "loss": 11.8681, + "loss/aux_loss": 0.04809288065880537, + "loss/crossentropy": 2.6554811358451844, + "loss/logits": 0.911386126279831, + "step": 21340 + }, + { + "epoch": 0.2135, + "grad_norm": 12.375, + "grad_norm_var": 0.4337076822916667, + "learning_rate": 0.0003, + "loss": 12.0507, + "loss/aux_loss": 0.04808918032795191, + "loss/crossentropy": 2.887453854084015, + "loss/logits": 0.9052879035472869, + "step": 21350 + }, + { + "epoch": 0.2136, + "grad_norm": 12.0, + "grad_norm_var": 0.5317708333333333, + "learning_rate": 0.0003, + "loss": 11.7237, + "loss/aux_loss": 0.04809961635619402, + "loss/crossentropy": 2.739542376995087, + "loss/logits": 0.9075916647911072, + "step": 21360 + }, + { + "epoch": 0.2137, + "grad_norm": 12.4375, + "grad_norm_var": 0.486181640625, + "learning_rate": 0.0003, + "loss": 11.827, + "loss/aux_loss": 0.048078613728284834, + "loss/crossentropy": 2.7362434446811674, + "loss/logits": 0.8810094386339188, + "step": 21370 + }, + { + "epoch": 0.2138, + "grad_norm": 12.5625, + "grad_norm_var": 1.4038899739583333, + "learning_rate": 0.0003, + "loss": 11.8722, + "loss/aux_loss": 0.048095672950148584, + "loss/crossentropy": 2.9124315857887266, + "loss/logits": 0.9393705606460572, + "step": 21380 + }, + { + "epoch": 0.2139, + "grad_norm": 13.5625, + "grad_norm_var": 0.784228515625, + "learning_rate": 0.0003, + "loss": 11.753, + "loss/aux_loss": 0.04809557497501373, + "loss/crossentropy": 2.7900996267795564, + "loss/logits": 0.9042523264884949, + "step": 21390 + }, + { + "epoch": 0.214, + "grad_norm": 12.75, + "grad_norm_var": 41.506184895833336, + "learning_rate": 0.0003, + "loss": 11.8063, + "loss/aux_loss": 0.048089190199971196, + "loss/crossentropy": 2.750228983163834, + "loss/logits": 0.889886274933815, + "step": 21400 + }, + { + "epoch": 0.2141, + "grad_norm": 11.875, + "grad_norm_var": 42.12511393229167, + "learning_rate": 0.0003, + "loss": 11.889, + "loss/aux_loss": 0.04809757433831692, + "loss/crossentropy": 2.798718500137329, + "loss/logits": 0.9059660851955413, + "step": 21410 + }, + { + "epoch": 0.2142, + "grad_norm": 12.625, + "grad_norm_var": 3.5251139322916667, + "learning_rate": 0.0003, + "loss": 11.7521, + "loss/aux_loss": 0.048101062327623366, + "loss/crossentropy": 2.8062780797481537, + "loss/logits": 0.9037539154291153, + "step": 21420 + }, + { + "epoch": 0.2143, + "grad_norm": 12.1875, + "grad_norm_var": 0.24088541666666666, + "learning_rate": 0.0003, + "loss": 11.6689, + "loss/aux_loss": 0.048091071844100955, + "loss/crossentropy": 2.7986648082733154, + "loss/logits": 0.8963402301073075, + "step": 21430 + }, + { + "epoch": 0.2144, + "grad_norm": 12.0, + "grad_norm_var": 0.26614583333333336, + "learning_rate": 0.0003, + "loss": 12.0498, + "loss/aux_loss": 0.04808345343917608, + "loss/crossentropy": 2.9033891916275025, + "loss/logits": 0.9560538738965988, + "step": 21440 + }, + { + "epoch": 0.2145, + "grad_norm": 12.6875, + "grad_norm_var": 0.37862955729166664, + "learning_rate": 0.0003, + "loss": 11.8566, + "loss/aux_loss": 0.04810182619839907, + "loss/crossentropy": 2.799178421497345, + "loss/logits": 0.9117594748735428, + "step": 21450 + }, + { + "epoch": 0.2146, + "grad_norm": 12.4375, + "grad_norm_var": 0.5555826822916666, + "learning_rate": 0.0003, + "loss": 11.9289, + "loss/aux_loss": 0.04809532649815083, + "loss/crossentropy": 2.7515031695365906, + "loss/logits": 0.9373772829771042, + "step": 21460 + }, + { + "epoch": 0.2147, + "grad_norm": 11.875, + "grad_norm_var": 0.7036458333333333, + "learning_rate": 0.0003, + "loss": 11.8032, + "loss/aux_loss": 0.048093835823237896, + "loss/crossentropy": 2.7646782994270325, + "loss/logits": 0.911266279220581, + "step": 21470 + }, + { + "epoch": 0.2148, + "grad_norm": 11.75, + "grad_norm_var": 0.17369791666666667, + "learning_rate": 0.0003, + "loss": 11.914, + "loss/aux_loss": 0.04809645172208547, + "loss/crossentropy": 2.7859348595142364, + "loss/logits": 0.875312551856041, + "step": 21480 + }, + { + "epoch": 0.2149, + "grad_norm": 12.5625, + "grad_norm_var": 2.250260416666667, + "learning_rate": 0.0003, + "loss": 11.8084, + "loss/aux_loss": 0.048092559166252616, + "loss/crossentropy": 2.886737060546875, + "loss/logits": 0.8639699459075928, + "step": 21490 + }, + { + "epoch": 0.215, + "grad_norm": 11.75, + "grad_norm_var": 2.191145833333333, + "learning_rate": 0.0003, + "loss": 11.7606, + "loss/aux_loss": 0.04809298049658537, + "loss/crossentropy": 2.9002821505069734, + "loss/logits": 0.9444745779037476, + "step": 21500 + }, + { + "epoch": 0.2151, + "grad_norm": 12.625, + "grad_norm_var": 0.410400390625, + "learning_rate": 0.0003, + "loss": 11.8001, + "loss/aux_loss": 0.048099953681230545, + "loss/crossentropy": 2.702463275194168, + "loss/logits": 0.9036620557308197, + "step": 21510 + }, + { + "epoch": 0.2152, + "grad_norm": 11.875, + "grad_norm_var": 0.33474934895833336, + "learning_rate": 0.0003, + "loss": 11.88, + "loss/aux_loss": 0.048086378164589404, + "loss/crossentropy": 2.7439981400966644, + "loss/logits": 0.9525706797838212, + "step": 21520 + }, + { + "epoch": 0.2153, + "grad_norm": 11.8125, + "grad_norm_var": 0.21770833333333334, + "learning_rate": 0.0003, + "loss": 11.7527, + "loss/aux_loss": 0.04810568634420633, + "loss/crossentropy": 2.6959027111530305, + "loss/logits": 0.8563485085964203, + "step": 21530 + }, + { + "epoch": 0.2154, + "grad_norm": 12.3125, + "grad_norm_var": 0.26925455729166664, + "learning_rate": 0.0003, + "loss": 11.8753, + "loss/aux_loss": 0.04807868674397468, + "loss/crossentropy": 2.769312971830368, + "loss/logits": 0.9261300444602967, + "step": 21540 + }, + { + "epoch": 0.2155, + "grad_norm": 12.5, + "grad_norm_var": 23.747770182291667, + "learning_rate": 0.0003, + "loss": 11.9234, + "loss/aux_loss": 0.04809074979275465, + "loss/crossentropy": 2.858646285533905, + "loss/logits": 0.9108448445796966, + "step": 21550 + }, + { + "epoch": 0.2156, + "grad_norm": 12.875, + "grad_norm_var": 23.503125, + "learning_rate": 0.0003, + "loss": 11.8421, + "loss/aux_loss": 0.04809603709727526, + "loss/crossentropy": 2.884668844938278, + "loss/logits": 0.9331677317619324, + "step": 21560 + }, + { + "epoch": 0.2157, + "grad_norm": 13.5, + "grad_norm_var": 1.2812337239583333, + "learning_rate": 0.0003, + "loss": 11.7742, + "loss/aux_loss": 0.04808317497372627, + "loss/crossentropy": 2.7802948713302613, + "loss/logits": 0.9503753989934921, + "step": 21570 + }, + { + "epoch": 0.2158, + "grad_norm": 20.625, + "grad_norm_var": 4.628059895833333, + "learning_rate": 0.0003, + "loss": 11.8258, + "loss/aux_loss": 0.04808113072067499, + "loss/crossentropy": 2.7815487384796143, + "loss/logits": 0.9218122154474259, + "step": 21580 + }, + { + "epoch": 0.2159, + "grad_norm": 14.9375, + "grad_norm_var": 4.69609375, + "learning_rate": 0.0003, + "loss": 11.7204, + "loss/aux_loss": 0.048096010275185105, + "loss/crossentropy": 2.831838434934616, + "loss/logits": 0.8932915806770325, + "step": 21590 + }, + { + "epoch": 0.216, + "grad_norm": 11.8125, + "grad_norm_var": 0.756103515625, + "learning_rate": 0.0003, + "loss": 11.7068, + "loss/aux_loss": 0.048090577125549316, + "loss/crossentropy": 2.818828046321869, + "loss/logits": 0.9055852591991425, + "step": 21600 + }, + { + "epoch": 0.2161, + "grad_norm": 12.75, + "grad_norm_var": 0.34889322916666665, + "learning_rate": 0.0003, + "loss": 11.8699, + "loss/aux_loss": 0.04809344317764044, + "loss/crossentropy": 2.725664830207825, + "loss/logits": 0.9231677383184433, + "step": 21610 + }, + { + "epoch": 0.2162, + "grad_norm": 13.625, + "grad_norm_var": 0.6227701822916667, + "learning_rate": 0.0003, + "loss": 11.8195, + "loss/aux_loss": 0.04809017200022936, + "loss/crossentropy": 2.7314105927944183, + "loss/logits": 0.9009778618812561, + "step": 21620 + }, + { + "epoch": 0.2163, + "grad_norm": 12.375, + "grad_norm_var": 3.8499348958333335, + "learning_rate": 0.0003, + "loss": 11.7735, + "loss/aux_loss": 0.04809233695268631, + "loss/crossentropy": 2.7671496987342836, + "loss/logits": 0.9043550729751587, + "step": 21630 + }, + { + "epoch": 0.2164, + "grad_norm": 13.125, + "grad_norm_var": 4.204166666666667, + "learning_rate": 0.0003, + "loss": 11.7381, + "loss/aux_loss": 0.04809225425124168, + "loss/crossentropy": 2.715823769569397, + "loss/logits": 0.8934505701065063, + "step": 21640 + }, + { + "epoch": 0.2165, + "grad_norm": 13.5625, + "grad_norm_var": 1.0377604166666667, + "learning_rate": 0.0003, + "loss": 11.8044, + "loss/aux_loss": 0.04808531980961561, + "loss/crossentropy": 2.7514367580413817, + "loss/logits": 0.9063582092523574, + "step": 21650 + }, + { + "epoch": 0.2166, + "grad_norm": 12.5, + "grad_norm_var": 0.4020182291666667, + "learning_rate": 0.0003, + "loss": 11.6973, + "loss/aux_loss": 0.04809113219380379, + "loss/crossentropy": 2.71242498755455, + "loss/logits": 0.9035886704921723, + "step": 21660 + }, + { + "epoch": 0.2167, + "grad_norm": 11.6875, + "grad_norm_var": 0.5839680989583333, + "learning_rate": 0.0003, + "loss": 11.6988, + "loss/aux_loss": 0.0480911111459136, + "loss/crossentropy": 2.934883952140808, + "loss/logits": 0.8832208603620529, + "step": 21670 + }, + { + "epoch": 0.2168, + "grad_norm": 12.25, + "grad_norm_var": 0.632666015625, + "learning_rate": 0.0003, + "loss": 11.7549, + "loss/aux_loss": 0.048089130967855456, + "loss/crossentropy": 2.677219772338867, + "loss/logits": 0.8748012632131577, + "step": 21680 + }, + { + "epoch": 0.2169, + "grad_norm": 11.75, + "grad_norm_var": 0.5957682291666667, + "learning_rate": 0.0003, + "loss": 11.874, + "loss/aux_loss": 0.0480881916359067, + "loss/crossentropy": 2.8454249918460848, + "loss/logits": 0.8973806709051132, + "step": 21690 + }, + { + "epoch": 0.217, + "grad_norm": 11.625, + "grad_norm_var": 0.42552083333333335, + "learning_rate": 0.0003, + "loss": 11.8774, + "loss/aux_loss": 0.04809354934841394, + "loss/crossentropy": 2.930919277667999, + "loss/logits": 0.9322040349245071, + "step": 21700 + }, + { + "epoch": 0.2171, + "grad_norm": 13.1875, + "grad_norm_var": 0.38229166666666664, + "learning_rate": 0.0003, + "loss": 11.8088, + "loss/aux_loss": 0.04809126667678356, + "loss/crossentropy": 2.709583592414856, + "loss/logits": 0.8984523087739944, + "step": 21710 + }, + { + "epoch": 0.2172, + "grad_norm": 12.4375, + "grad_norm_var": 0.3509765625, + "learning_rate": 0.0003, + "loss": 11.8755, + "loss/aux_loss": 0.04808931071311236, + "loss/crossentropy": 2.797287333011627, + "loss/logits": 0.8914159804582595, + "step": 21720 + }, + { + "epoch": 0.2173, + "grad_norm": 12.0, + "grad_norm_var": 0.3853515625, + "learning_rate": 0.0003, + "loss": 11.7927, + "loss/aux_loss": 0.0480969849973917, + "loss/crossentropy": 2.6972643613815306, + "loss/logits": 0.8642873585224151, + "step": 21730 + }, + { + "epoch": 0.2174, + "grad_norm": 12.375, + "grad_norm_var": 0.453369140625, + "learning_rate": 0.0003, + "loss": 11.7777, + "loss/aux_loss": 0.04809529315680265, + "loss/crossentropy": 2.7677155137062073, + "loss/logits": 0.8832725346088409, + "step": 21740 + }, + { + "epoch": 0.2175, + "grad_norm": 11.5, + "grad_norm_var": 0.34739583333333335, + "learning_rate": 0.0003, + "loss": 11.9139, + "loss/aux_loss": 0.048091381415724756, + "loss/crossentropy": 2.799149090051651, + "loss/logits": 0.887069022655487, + "step": 21750 + }, + { + "epoch": 0.2176, + "grad_norm": 12.4375, + "grad_norm_var": 0.17076822916666667, + "learning_rate": 0.0003, + "loss": 11.8353, + "loss/aux_loss": 0.04808845948427916, + "loss/crossentropy": 3.0753382325172423, + "loss/logits": 0.945702788233757, + "step": 21760 + }, + { + "epoch": 0.2177, + "grad_norm": 12.1875, + "grad_norm_var": 0.2087890625, + "learning_rate": 0.0003, + "loss": 11.6985, + "loss/aux_loss": 0.048092255368828773, + "loss/crossentropy": 2.6490719497203825, + "loss/logits": 0.8540039539337159, + "step": 21770 + }, + { + "epoch": 0.2178, + "grad_norm": 12.625, + "grad_norm_var": 0.19583333333333333, + "learning_rate": 0.0003, + "loss": 11.8542, + "loss/aux_loss": 0.048097673989832404, + "loss/crossentropy": 2.8406033515930176, + "loss/logits": 0.8934641659259797, + "step": 21780 + }, + { + "epoch": 0.2179, + "grad_norm": 14.25, + "grad_norm_var": 0.6061848958333333, + "learning_rate": 0.0003, + "loss": 11.8071, + "loss/aux_loss": 0.048086115159094334, + "loss/crossentropy": 2.8944154620170592, + "loss/logits": 0.9081297039985656, + "step": 21790 + }, + { + "epoch": 0.218, + "grad_norm": 13.0, + "grad_norm_var": 14.662955729166667, + "learning_rate": 0.0003, + "loss": 11.9785, + "loss/aux_loss": 0.04810140430927277, + "loss/crossentropy": 2.7707842707633974, + "loss/logits": 0.8933016210794449, + "step": 21800 + }, + { + "epoch": 0.2181, + "grad_norm": 12.25, + "grad_norm_var": 14.276676432291667, + "learning_rate": 0.0003, + "loss": 12.1024, + "loss/aux_loss": 0.04808119479566812, + "loss/crossentropy": 2.8384734869003294, + "loss/logits": 0.9208894163370133, + "step": 21810 + }, + { + "epoch": 0.2182, + "grad_norm": 12.625, + "grad_norm_var": 0.46243489583333336, + "learning_rate": 0.0003, + "loss": 11.9, + "loss/aux_loss": 0.048093420639634135, + "loss/crossentropy": 2.710639762878418, + "loss/logits": 0.894580963253975, + "step": 21820 + }, + { + "epoch": 0.2183, + "grad_norm": 12.5, + "grad_norm_var": 1.1384765625, + "learning_rate": 0.0003, + "loss": 11.6904, + "loss/aux_loss": 0.048085536994040015, + "loss/crossentropy": 2.621820467710495, + "loss/logits": 0.8434902101755142, + "step": 21830 + }, + { + "epoch": 0.2184, + "grad_norm": 14.875, + "grad_norm_var": 24.766910807291666, + "learning_rate": 0.0003, + "loss": 11.8784, + "loss/aux_loss": 0.04809453897178173, + "loss/crossentropy": 2.8327670872211455, + "loss/logits": 0.9073660403490067, + "step": 21840 + }, + { + "epoch": 0.2185, + "grad_norm": 13.875, + "grad_norm_var": 24.213134765625, + "learning_rate": 0.0003, + "loss": 11.6735, + "loss/aux_loss": 0.04809699393808842, + "loss/crossentropy": 2.730017304420471, + "loss/logits": 0.9322267979383468, + "step": 21850 + }, + { + "epoch": 0.2186, + "grad_norm": 12.4375, + "grad_norm_var": 1.146875, + "learning_rate": 0.0003, + "loss": 11.6376, + "loss/aux_loss": 0.04807912241667509, + "loss/crossentropy": 2.786021035909653, + "loss/logits": 0.8863259345293045, + "step": 21860 + }, + { + "epoch": 0.2187, + "grad_norm": 11.6875, + "grad_norm_var": 0.44114583333333335, + "learning_rate": 0.0003, + "loss": 11.588, + "loss/aux_loss": 0.04809343423694372, + "loss/crossentropy": 2.709645652770996, + "loss/logits": 0.8401134133338928, + "step": 21870 + }, + { + "epoch": 0.2188, + "grad_norm": 12.5, + "grad_norm_var": 0.619775390625, + "learning_rate": 0.0003, + "loss": 11.6378, + "loss/aux_loss": 0.048089167289435866, + "loss/crossentropy": 2.9162204384803774, + "loss/logits": 0.8603705197572709, + "step": 21880 + }, + { + "epoch": 0.2189, + "grad_norm": 12.125, + "grad_norm_var": 0.441259765625, + "learning_rate": 0.0003, + "loss": 11.8368, + "loss/aux_loss": 0.048093576729297635, + "loss/crossentropy": 2.6666926383972167, + "loss/logits": 0.909285506606102, + "step": 21890 + }, + { + "epoch": 0.219, + "grad_norm": 12.5625, + "grad_norm_var": 0.28880208333333335, + "learning_rate": 0.0003, + "loss": 11.8581, + "loss/aux_loss": 0.04808791261166334, + "loss/crossentropy": 2.8666730880737306, + "loss/logits": 0.9074492365121841, + "step": 21900 + }, + { + "epoch": 0.2191, + "grad_norm": 13.375, + "grad_norm_var": 0.5520670572916667, + "learning_rate": 0.0003, + "loss": 11.7345, + "loss/aux_loss": 0.04809127487242222, + "loss/crossentropy": 2.749277150630951, + "loss/logits": 0.9035682111978531, + "step": 21910 + }, + { + "epoch": 0.2192, + "grad_norm": 12.75, + "grad_norm_var": 0.36399739583333335, + "learning_rate": 0.0003, + "loss": 11.7659, + "loss/aux_loss": 0.048089764453470706, + "loss/crossentropy": 2.681344139575958, + "loss/logits": 0.900179210305214, + "step": 21920 + }, + { + "epoch": 0.2193, + "grad_norm": 11.8125, + "grad_norm_var": 1.0594889322916667, + "learning_rate": 0.0003, + "loss": 11.7378, + "loss/aux_loss": 0.0481033293530345, + "loss/crossentropy": 2.6201368153095244, + "loss/logits": 0.8585344612598419, + "step": 21930 + }, + { + "epoch": 0.2194, + "grad_norm": 13.375, + "grad_norm_var": 1.4016764322916666, + "learning_rate": 0.0003, + "loss": 11.9075, + "loss/aux_loss": 0.04807730689644814, + "loss/crossentropy": 2.8310318291187286, + "loss/logits": 0.9102666884660721, + "step": 21940 + }, + { + "epoch": 0.2195, + "grad_norm": 12.75, + "grad_norm_var": 0.6691243489583333, + "learning_rate": 0.0003, + "loss": 11.6649, + "loss/aux_loss": 0.048088141903281215, + "loss/crossentropy": 2.9216031610965727, + "loss/logits": 0.9007417112588882, + "step": 21950 + }, + { + "epoch": 0.2196, + "grad_norm": 12.0, + "grad_norm_var": 0.5438639322916666, + "learning_rate": 0.0003, + "loss": 11.7617, + "loss/aux_loss": 0.04808661881834268, + "loss/crossentropy": 2.839560979604721, + "loss/logits": 0.8860168516635895, + "step": 21960 + }, + { + "epoch": 0.2197, + "grad_norm": 11.8125, + "grad_norm_var": 3.3452473958333333, + "learning_rate": 0.0003, + "loss": 11.7196, + "loss/aux_loss": 0.048084485530853274, + "loss/crossentropy": 2.7600815176963804, + "loss/logits": 0.880821418762207, + "step": 21970 + }, + { + "epoch": 0.2198, + "grad_norm": 11.5, + "grad_norm_var": 0.5484375, + "learning_rate": 0.0003, + "loss": 11.8505, + "loss/aux_loss": 0.04809036664664745, + "loss/crossentropy": 2.9518965005874636, + "loss/logits": 0.8992276877164841, + "step": 21980 + }, + { + "epoch": 0.2199, + "grad_norm": 14.5625, + "grad_norm_var": 0.8442057291666667, + "learning_rate": 0.0003, + "loss": 11.7163, + "loss/aux_loss": 0.04808600507676601, + "loss/crossentropy": 2.7657691895961762, + "loss/logits": 0.9018583208322525, + "step": 21990 + }, + { + "epoch": 0.22, + "grad_norm": 12.1875, + "grad_norm_var": 0.9114583333333334, + "learning_rate": 0.0003, + "loss": 11.6793, + "loss/aux_loss": 0.048087524622678755, + "loss/crossentropy": 2.6612784922122956, + "loss/logits": 0.8721506536006928, + "step": 22000 + }, + { + "epoch": 0.2201, + "grad_norm": 12.375, + "grad_norm_var": 0.41848958333333336, + "learning_rate": 0.0003, + "loss": 11.9123, + "loss/aux_loss": 0.04809351172298193, + "loss/crossentropy": 2.81771005988121, + "loss/logits": 0.9572886168956757, + "step": 22010 + }, + { + "epoch": 0.2202, + "grad_norm": 12.6875, + "grad_norm_var": 6.704622395833334, + "learning_rate": 0.0003, + "loss": 11.8047, + "loss/aux_loss": 0.04809220097959042, + "loss/crossentropy": 2.817577600479126, + "loss/logits": 0.8661315441131592, + "step": 22020 + }, + { + "epoch": 0.2203, + "grad_norm": 13.125, + "grad_norm_var": 5.740364583333333, + "learning_rate": 0.0003, + "loss": 11.9214, + "loss/aux_loss": 0.04808165710419417, + "loss/crossentropy": 2.768628853559494, + "loss/logits": 0.8950851440429688, + "step": 22030 + }, + { + "epoch": 0.2204, + "grad_norm": 12.3125, + "grad_norm_var": 22.538785807291667, + "learning_rate": 0.0003, + "loss": 11.9163, + "loss/aux_loss": 0.04808369372040033, + "loss/crossentropy": 2.7819134533405303, + "loss/logits": 0.8844695091247559, + "step": 22040 + }, + { + "epoch": 0.2205, + "grad_norm": 13.4375, + "grad_norm_var": 0.431884765625, + "learning_rate": 0.0003, + "loss": 11.7862, + "loss/aux_loss": 0.048092160001397134, + "loss/crossentropy": 2.9033903241157533, + "loss/logits": 0.9348111391067505, + "step": 22050 + }, + { + "epoch": 0.2206, + "grad_norm": 11.875, + "grad_norm_var": 0.6830729166666667, + "learning_rate": 0.0003, + "loss": 11.7504, + "loss/aux_loss": 0.04808596298098564, + "loss/crossentropy": 2.782503831386566, + "loss/logits": 0.8899946212768555, + "step": 22060 + }, + { + "epoch": 0.2207, + "grad_norm": 12.5625, + "grad_norm_var": 0.4663899739583333, + "learning_rate": 0.0003, + "loss": 11.7566, + "loss/aux_loss": 0.048095478489995, + "loss/crossentropy": 2.7743342220783234, + "loss/logits": 0.8939844936132431, + "step": 22070 + }, + { + "epoch": 0.2208, + "grad_norm": 12.375, + "grad_norm_var": 0.262744140625, + "learning_rate": 0.0003, + "loss": 11.8283, + "loss/aux_loss": 0.04808841645717621, + "loss/crossentropy": 2.554595720767975, + "loss/logits": 0.8608134061098098, + "step": 22080 + }, + { + "epoch": 0.2209, + "grad_norm": 12.375, + "grad_norm_var": 0.2847493489583333, + "learning_rate": 0.0003, + "loss": 11.8984, + "loss/aux_loss": 0.048084456473588943, + "loss/crossentropy": 2.8617907404899596, + "loss/logits": 0.9006909459829331, + "step": 22090 + }, + { + "epoch": 0.221, + "grad_norm": 11.9375, + "grad_norm_var": 0.272509765625, + "learning_rate": 0.0003, + "loss": 11.6435, + "loss/aux_loss": 0.04809146206825972, + "loss/crossentropy": 2.5811066746711733, + "loss/logits": 0.8601150065660477, + "step": 22100 + }, + { + "epoch": 0.2211, + "grad_norm": 12.4375, + "grad_norm_var": 0.3798014322916667, + "learning_rate": 0.0003, + "loss": 11.8274, + "loss/aux_loss": 0.04808408990502357, + "loss/crossentropy": 2.7892317831516267, + "loss/logits": 0.905488446354866, + "step": 22110 + }, + { + "epoch": 0.2212, + "grad_norm": 12.5, + "grad_norm_var": 0.2619140625, + "learning_rate": 0.0003, + "loss": 11.6912, + "loss/aux_loss": 0.048095279932022096, + "loss/crossentropy": 2.7180880904197693, + "loss/logits": 0.8672394514083862, + "step": 22120 + }, + { + "epoch": 0.2213, + "grad_norm": 12.375, + "grad_norm_var": 0.29607747395833334, + "learning_rate": 0.0003, + "loss": 11.8284, + "loss/aux_loss": 0.048083136044442656, + "loss/crossentropy": 2.751019012928009, + "loss/logits": 0.8794708341360092, + "step": 22130 + }, + { + "epoch": 0.2214, + "grad_norm": 12.4375, + "grad_norm_var": 0.1525390625, + "learning_rate": 0.0003, + "loss": 11.8437, + "loss/aux_loss": 0.04808746688067913, + "loss/crossentropy": 2.961369812488556, + "loss/logits": 0.8975703865289688, + "step": 22140 + }, + { + "epoch": 0.2215, + "grad_norm": 12.5, + "grad_norm_var": 0.28045247395833334, + "learning_rate": 0.0003, + "loss": 11.7406, + "loss/aux_loss": 0.04809634368866682, + "loss/crossentropy": 2.7276877880096437, + "loss/logits": 0.8959174305200577, + "step": 22150 + }, + { + "epoch": 0.2216, + "grad_norm": 12.8125, + "grad_norm_var": 0.389697265625, + "learning_rate": 0.0003, + "loss": 11.7637, + "loss/aux_loss": 0.04808992594480514, + "loss/crossentropy": 2.621533715724945, + "loss/logits": 0.8763752758502961, + "step": 22160 + }, + { + "epoch": 0.2217, + "grad_norm": 13.5, + "grad_norm_var": 0.34347330729166664, + "learning_rate": 0.0003, + "loss": 11.6706, + "loss/aux_loss": 0.04809186160564423, + "loss/crossentropy": 2.7217795610427857, + "loss/logits": 0.8898161560297012, + "step": 22170 + }, + { + "epoch": 0.2218, + "grad_norm": 12.3125, + "grad_norm_var": 0.601025390625, + "learning_rate": 0.0003, + "loss": 11.6356, + "loss/aux_loss": 0.04809278640896082, + "loss/crossentropy": 2.7290789067745207, + "loss/logits": 0.8597593367099762, + "step": 22180 + }, + { + "epoch": 0.2219, + "grad_norm": 12.375, + "grad_norm_var": 0.43359375, + "learning_rate": 0.0003, + "loss": 11.7534, + "loss/aux_loss": 0.04809238947927952, + "loss/crossentropy": 2.707075160741806, + "loss/logits": 0.8652425140142441, + "step": 22190 + }, + { + "epoch": 0.222, + "grad_norm": 11.9375, + "grad_norm_var": 0.35885416666666664, + "learning_rate": 0.0003, + "loss": 11.6756, + "loss/aux_loss": 0.04808915685862303, + "loss/crossentropy": 2.8292889297008514, + "loss/logits": 0.8936943262815475, + "step": 22200 + }, + { + "epoch": 0.2221, + "grad_norm": 12.3125, + "grad_norm_var": 0.37180989583333335, + "learning_rate": 0.0003, + "loss": 11.825, + "loss/aux_loss": 0.048080139234662055, + "loss/crossentropy": 2.70860413312912, + "loss/logits": 0.8934529781341553, + "step": 22210 + }, + { + "epoch": 0.2222, + "grad_norm": 12.1875, + "grad_norm_var": 0.28566080729166665, + "learning_rate": 0.0003, + "loss": 11.7766, + "loss/aux_loss": 0.04809638597071171, + "loss/crossentropy": 2.704670661687851, + "loss/logits": 0.8709723800420761, + "step": 22220 + }, + { + "epoch": 0.2223, + "grad_norm": 11.0625, + "grad_norm_var": 0.307666015625, + "learning_rate": 0.0003, + "loss": 11.6679, + "loss/aux_loss": 0.04809023775160313, + "loss/crossentropy": 2.6749909996986387, + "loss/logits": 0.9050649791955948, + "step": 22230 + }, + { + "epoch": 0.2224, + "grad_norm": 12.625, + "grad_norm_var": 0.3921875, + "learning_rate": 0.0003, + "loss": 11.7514, + "loss/aux_loss": 0.04809599500149488, + "loss/crossentropy": 2.760683298110962, + "loss/logits": 0.8826134830713273, + "step": 22240 + }, + { + "epoch": 0.2225, + "grad_norm": 13.3125, + "grad_norm_var": 1.2234375, + "learning_rate": 0.0003, + "loss": 11.9613, + "loss/aux_loss": 0.048089191876351835, + "loss/crossentropy": 2.767944025993347, + "loss/logits": 0.8672012895345688, + "step": 22250 + }, + { + "epoch": 0.2226, + "grad_norm": 13.4375, + "grad_norm_var": 0.4613118489583333, + "learning_rate": 0.0003, + "loss": 11.7787, + "loss/aux_loss": 0.0480911660939455, + "loss/crossentropy": 2.735838997364044, + "loss/logits": 0.8864156484603882, + "step": 22260 + }, + { + "epoch": 0.2227, + "grad_norm": 11.9375, + "grad_norm_var": 0.26223958333333336, + "learning_rate": 0.0003, + "loss": 11.7865, + "loss/aux_loss": 0.04808713924139738, + "loss/crossentropy": 2.83599910736084, + "loss/logits": 0.9017521053552627, + "step": 22270 + }, + { + "epoch": 0.2228, + "grad_norm": 10.9375, + "grad_norm_var": 0.37473958333333335, + "learning_rate": 0.0003, + "loss": 11.7516, + "loss/aux_loss": 0.0480857228860259, + "loss/crossentropy": 2.830791783332825, + "loss/logits": 0.9194071799516678, + "step": 22280 + }, + { + "epoch": 0.2229, + "grad_norm": 13.5, + "grad_norm_var": 0.6660807291666667, + "learning_rate": 0.0003, + "loss": 11.8099, + "loss/aux_loss": 0.04808124005794525, + "loss/crossentropy": 2.8348045706748963, + "loss/logits": 0.903641340136528, + "step": 22290 + }, + { + "epoch": 0.223, + "grad_norm": 13.1875, + "grad_norm_var": 1.6406087239583333, + "learning_rate": 0.0003, + "loss": 11.8544, + "loss/aux_loss": 0.04809259995818138, + "loss/crossentropy": 2.897055411338806, + "loss/logits": 0.9287648230791092, + "step": 22300 + }, + { + "epoch": 0.2231, + "grad_norm": 12.9375, + "grad_norm_var": 0.3203125, + "learning_rate": 0.0003, + "loss": 11.7389, + "loss/aux_loss": 0.048087797872722146, + "loss/crossentropy": 2.8019288659095762, + "loss/logits": 0.8725453674793243, + "step": 22310 + }, + { + "epoch": 0.2232, + "grad_norm": 12.1875, + "grad_norm_var": 0.9932291666666667, + "learning_rate": 0.0003, + "loss": 11.8846, + "loss/aux_loss": 0.048088356666266915, + "loss/crossentropy": 2.6834902286529543, + "loss/logits": 0.8782364130020142, + "step": 22320 + }, + { + "epoch": 0.2233, + "grad_norm": 12.4375, + "grad_norm_var": 1.0511555989583334, + "learning_rate": 0.0003, + "loss": 11.7531, + "loss/aux_loss": 0.048094474151730536, + "loss/crossentropy": 2.6937114894390106, + "loss/logits": 0.8777317255735397, + "step": 22330 + }, + { + "epoch": 0.2234, + "grad_norm": 13.0, + "grad_norm_var": 0.5492024739583333, + "learning_rate": 0.0003, + "loss": 11.6566, + "loss/aux_loss": 0.048084880039095876, + "loss/crossentropy": 2.72471564412117, + "loss/logits": 0.8733905553817749, + "step": 22340 + }, + { + "epoch": 0.2235, + "grad_norm": 11.5625, + "grad_norm_var": 0.867431640625, + "learning_rate": 0.0003, + "loss": 11.6595, + "loss/aux_loss": 0.048089952766895296, + "loss/crossentropy": 2.840152883529663, + "loss/logits": 0.9174001008272171, + "step": 22350 + }, + { + "epoch": 0.2236, + "grad_norm": 12.1875, + "grad_norm_var": 0.560400390625, + "learning_rate": 0.0003, + "loss": 11.7196, + "loss/aux_loss": 0.0480857165530324, + "loss/crossentropy": 2.8605542302131655, + "loss/logits": 0.9183370441198349, + "step": 22360 + }, + { + "epoch": 0.2237, + "grad_norm": 11.5625, + "grad_norm_var": 0.40740559895833334, + "learning_rate": 0.0003, + "loss": 11.7638, + "loss/aux_loss": 0.048090421594679356, + "loss/crossentropy": 2.639111566543579, + "loss/logits": 0.8860017955303192, + "step": 22370 + }, + { + "epoch": 0.2238, + "grad_norm": 11.6875, + "grad_norm_var": 0.20128580729166667, + "learning_rate": 0.0003, + "loss": 11.9126, + "loss/aux_loss": 0.04809177350252867, + "loss/crossentropy": 2.785674238204956, + "loss/logits": 0.8810646086931229, + "step": 22380 + }, + { + "epoch": 0.2239, + "grad_norm": 12.5, + "grad_norm_var": 0.218603515625, + "learning_rate": 0.0003, + "loss": 11.6681, + "loss/aux_loss": 0.048096783272922036, + "loss/crossentropy": 2.7202962040901184, + "loss/logits": 0.8531753093004226, + "step": 22390 + }, + { + "epoch": 0.224, + "grad_norm": 13.4375, + "grad_norm_var": 2.6541015625, + "learning_rate": 0.0003, + "loss": 11.9144, + "loss/aux_loss": 0.0480857115238905, + "loss/crossentropy": 2.794497346878052, + "loss/logits": 0.8948001682758331, + "step": 22400 + }, + { + "epoch": 0.2241, + "grad_norm": 16.75, + "grad_norm_var": 3.6030598958333333, + "learning_rate": 0.0003, + "loss": 11.7171, + "loss/aux_loss": 0.04808383211493492, + "loss/crossentropy": 2.809972804784775, + "loss/logits": 0.9038825124502182, + "step": 22410 + }, + { + "epoch": 0.2242, + "grad_norm": 13.25, + "grad_norm_var": 1.80625, + "learning_rate": 0.0003, + "loss": 11.4694, + "loss/aux_loss": 0.04809340089559555, + "loss/crossentropy": 2.78351212143898, + "loss/logits": 0.8861901849508286, + "step": 22420 + }, + { + "epoch": 0.2243, + "grad_norm": 12.125, + "grad_norm_var": 0.8671223958333333, + "learning_rate": 0.0003, + "loss": 11.6393, + "loss/aux_loss": 0.04809296205639839, + "loss/crossentropy": 2.749815273284912, + "loss/logits": 0.9010616183280945, + "step": 22430 + }, + { + "epoch": 0.2244, + "grad_norm": 12.0, + "grad_norm_var": 9.208837890625, + "learning_rate": 0.0003, + "loss": 11.7555, + "loss/aux_loss": 0.04808775205165148, + "loss/crossentropy": 2.968637430667877, + "loss/logits": 0.9231620490550995, + "step": 22440 + }, + { + "epoch": 0.2245, + "grad_norm": 12.1875, + "grad_norm_var": 0.4400390625, + "learning_rate": 0.0003, + "loss": 11.6987, + "loss/aux_loss": 0.0480935113504529, + "loss/crossentropy": 2.851404082775116, + "loss/logits": 0.897919625043869, + "step": 22450 + }, + { + "epoch": 0.2246, + "grad_norm": 11.875, + "grad_norm_var": 0.2806640625, + "learning_rate": 0.0003, + "loss": 11.7398, + "loss/aux_loss": 0.04809074774384499, + "loss/crossentropy": 2.65076659321785, + "loss/logits": 0.8881769001483917, + "step": 22460 + }, + { + "epoch": 0.2247, + "grad_norm": 12.0625, + "grad_norm_var": 1.1931640625, + "learning_rate": 0.0003, + "loss": 11.6777, + "loss/aux_loss": 0.04809971358627081, + "loss/crossentropy": 2.645810514688492, + "loss/logits": 0.8362853050231933, + "step": 22470 + }, + { + "epoch": 0.2248, + "grad_norm": 14.8125, + "grad_norm_var": 103.21354166666667, + "learning_rate": 0.0003, + "loss": 11.8468, + "loss/aux_loss": 0.048092149384319785, + "loss/crossentropy": 2.8907059490680695, + "loss/logits": 0.8786774843931198, + "step": 22480 + }, + { + "epoch": 0.2249, + "grad_norm": 13.0625, + "grad_norm_var": 100.78019205729167, + "learning_rate": 0.0003, + "loss": 11.7205, + "loss/aux_loss": 0.04809787534177303, + "loss/crossentropy": 2.716031605005264, + "loss/logits": 0.8944111734628677, + "step": 22490 + }, + { + "epoch": 0.225, + "grad_norm": 13.75, + "grad_norm_var": 1.50625, + "learning_rate": 0.0003, + "loss": 11.6669, + "loss/aux_loss": 0.048082894459366796, + "loss/crossentropy": 2.7395161747932435, + "loss/logits": 0.8643982857465744, + "step": 22500 + }, + { + "epoch": 0.2251, + "grad_norm": 12.375, + "grad_norm_var": 1.5290201822916667, + "learning_rate": 0.0003, + "loss": 11.5853, + "loss/aux_loss": 0.04808229207992554, + "loss/crossentropy": 2.7536255359649657, + "loss/logits": 0.881432518362999, + "step": 22510 + }, + { + "epoch": 0.2252, + "grad_norm": 11.625, + "grad_norm_var": 0.448681640625, + "learning_rate": 0.0003, + "loss": 11.7597, + "loss/aux_loss": 0.04810373391956091, + "loss/crossentropy": 2.747165524959564, + "loss/logits": 0.8833198219537735, + "step": 22520 + }, + { + "epoch": 0.2253, + "grad_norm": 11.8125, + "grad_norm_var": 0.46432291666666664, + "learning_rate": 0.0003, + "loss": 11.8328, + "loss/aux_loss": 0.04809314012527466, + "loss/crossentropy": 2.67395840883255, + "loss/logits": 0.9117985635995864, + "step": 22530 + }, + { + "epoch": 0.2254, + "grad_norm": 12.6875, + "grad_norm_var": 0.15987955729166667, + "learning_rate": 0.0003, + "loss": 11.6949, + "loss/aux_loss": 0.04808804150670767, + "loss/crossentropy": 2.7599778056144713, + "loss/logits": 0.8923967123031616, + "step": 22540 + }, + { + "epoch": 0.2255, + "grad_norm": 12.4375, + "grad_norm_var": 0.5255045572916667, + "learning_rate": 0.0003, + "loss": 11.6803, + "loss/aux_loss": 0.0480937123298645, + "loss/crossentropy": 2.731845957040787, + "loss/logits": 0.9252343803644181, + "step": 22550 + }, + { + "epoch": 0.2256, + "grad_norm": 13.125, + "grad_norm_var": 0.7700520833333333, + "learning_rate": 0.0003, + "loss": 11.8578, + "loss/aux_loss": 0.04808880146592855, + "loss/crossentropy": 2.9239902973175047, + "loss/logits": 0.8844455033540726, + "step": 22560 + }, + { + "epoch": 0.2257, + "grad_norm": 11.625, + "grad_norm_var": 1.0207682291666667, + "learning_rate": 0.0003, + "loss": 11.7067, + "loss/aux_loss": 0.04809002634137869, + "loss/crossentropy": 2.89136780500412, + "loss/logits": 0.9194484144449234, + "step": 22570 + }, + { + "epoch": 0.2258, + "grad_norm": 12.0625, + "grad_norm_var": 1.0624348958333334, + "learning_rate": 0.0003, + "loss": 11.5666, + "loss/aux_loss": 0.048089371994137764, + "loss/crossentropy": 2.775197160243988, + "loss/logits": 0.8997502565383911, + "step": 22580 + }, + { + "epoch": 0.2259, + "grad_norm": 11.9375, + "grad_norm_var": 2.983056640625, + "learning_rate": 0.0003, + "loss": 11.7513, + "loss/aux_loss": 0.04809422306716442, + "loss/crossentropy": 2.7966256499290467, + "loss/logits": 0.8797528147697449, + "step": 22590 + }, + { + "epoch": 0.226, + "grad_norm": 11.6875, + "grad_norm_var": 1.4276041666666666, + "learning_rate": 0.0003, + "loss": 11.6478, + "loss/aux_loss": 0.048089210875332355, + "loss/crossentropy": 2.7377444982528685, + "loss/logits": 0.9153787553310394, + "step": 22600 + }, + { + "epoch": 0.2261, + "grad_norm": 14.75, + "grad_norm_var": 0.5952962239583334, + "learning_rate": 0.0003, + "loss": 11.7362, + "loss/aux_loss": 0.0480865390971303, + "loss/crossentropy": 2.7379279255867006, + "loss/logits": 0.8918594628572464, + "step": 22610 + }, + { + "epoch": 0.2262, + "grad_norm": 14.25, + "grad_norm_var": 2.0085774739583333, + "learning_rate": 0.0003, + "loss": 11.7772, + "loss/aux_loss": 0.048102805577218535, + "loss/crossentropy": 2.7676973700523377, + "loss/logits": 0.9144205540418625, + "step": 22620 + }, + { + "epoch": 0.2263, + "grad_norm": 11.625, + "grad_norm_var": 0.8378743489583333, + "learning_rate": 0.0003, + "loss": 11.7351, + "loss/aux_loss": 0.04808990322053432, + "loss/crossentropy": 2.763928699493408, + "loss/logits": 0.9580153465270996, + "step": 22630 + }, + { + "epoch": 0.2264, + "grad_norm": 11.875, + "grad_norm_var": 0.2749837239583333, + "learning_rate": 0.0003, + "loss": 11.7633, + "loss/aux_loss": 0.04808971676975489, + "loss/crossentropy": 2.798508107662201, + "loss/logits": 0.9377011686563492, + "step": 22640 + }, + { + "epoch": 0.2265, + "grad_norm": 11.8125, + "grad_norm_var": 0.5379557291666667, + "learning_rate": 0.0003, + "loss": 11.6478, + "loss/aux_loss": 0.04807438086718321, + "loss/crossentropy": 2.628761428594589, + "loss/logits": 0.901868748664856, + "step": 22650 + }, + { + "epoch": 0.2266, + "grad_norm": 11.625, + "grad_norm_var": 0.8390462239583333, + "learning_rate": 0.0003, + "loss": 11.7954, + "loss/aux_loss": 0.04809013176709413, + "loss/crossentropy": 2.828604358434677, + "loss/logits": 0.9256632804870606, + "step": 22660 + }, + { + "epoch": 0.2267, + "grad_norm": 12.625, + "grad_norm_var": 0.5305826822916667, + "learning_rate": 0.0003, + "loss": 11.6155, + "loss/aux_loss": 0.048091720603406427, + "loss/crossentropy": 2.6923003435134887, + "loss/logits": 0.9034017592668533, + "step": 22670 + }, + { + "epoch": 0.2268, + "grad_norm": 11.8125, + "grad_norm_var": 0.2908854166666667, + "learning_rate": 0.0003, + "loss": 11.7298, + "loss/aux_loss": 0.04808386079967022, + "loss/crossentropy": 2.871219742298126, + "loss/logits": 0.9219763696193695, + "step": 22680 + }, + { + "epoch": 0.2269, + "grad_norm": 12.375, + "grad_norm_var": 0.4014973958333333, + "learning_rate": 0.0003, + "loss": 11.6765, + "loss/aux_loss": 0.048084440641105175, + "loss/crossentropy": 2.6931581676006315, + "loss/logits": 0.8805693238973618, + "step": 22690 + }, + { + "epoch": 0.227, + "grad_norm": 13.6875, + "grad_norm_var": 1.3969889322916667, + "learning_rate": 0.0003, + "loss": 11.7598, + "loss/aux_loss": 0.04809512868523598, + "loss/crossentropy": 2.6413376092910767, + "loss/logits": 0.8963235735893249, + "step": 22700 + }, + { + "epoch": 0.2271, + "grad_norm": 12.6875, + "grad_norm_var": 0.39296875, + "learning_rate": 0.0003, + "loss": 11.8279, + "loss/aux_loss": 0.04809400998055935, + "loss/crossentropy": 2.791292655467987, + "loss/logits": 0.8894282549619674, + "step": 22710 + }, + { + "epoch": 0.2272, + "grad_norm": 12.4375, + "grad_norm_var": 0.3333333333333333, + "learning_rate": 0.0003, + "loss": 11.8071, + "loss/aux_loss": 0.048091070353984834, + "loss/crossentropy": 2.6207732558250427, + "loss/logits": 0.8468140810728073, + "step": 22720 + }, + { + "epoch": 0.2273, + "grad_norm": 11.3125, + "grad_norm_var": 0.35128580729166664, + "learning_rate": 0.0003, + "loss": 11.6102, + "loss/aux_loss": 0.04808534793555737, + "loss/crossentropy": 2.6754838645458223, + "loss/logits": 0.9055883139371872, + "step": 22730 + }, + { + "epoch": 0.2274, + "grad_norm": 11.6875, + "grad_norm_var": 0.44217122395833336, + "learning_rate": 0.0003, + "loss": 11.5479, + "loss/aux_loss": 0.04807910211384296, + "loss/crossentropy": 2.8112324655056, + "loss/logits": 0.8994197815656662, + "step": 22740 + }, + { + "epoch": 0.2275, + "grad_norm": 13.75, + "grad_norm_var": 192.89264322916668, + "learning_rate": 0.0003, + "loss": 11.6927, + "loss/aux_loss": 0.04809170123189688, + "loss/crossentropy": 2.8299485445022583, + "loss/logits": 0.8875698268413543, + "step": 22750 + }, + { + "epoch": 0.2276, + "grad_norm": 13.125, + "grad_norm_var": 1.995947265625, + "learning_rate": 0.0003, + "loss": 11.9368, + "loss/aux_loss": 0.04808708317577839, + "loss/crossentropy": 2.8931700348854066, + "loss/logits": 0.9314892888069153, + "step": 22760 + }, + { + "epoch": 0.2277, + "grad_norm": 13.5, + "grad_norm_var": 0.3824055989583333, + "learning_rate": 0.0003, + "loss": 11.7043, + "loss/aux_loss": 0.048081159219145776, + "loss/crossentropy": 2.8375320076942443, + "loss/logits": 0.8928394854068756, + "step": 22770 + }, + { + "epoch": 0.2278, + "grad_norm": 13.5, + "grad_norm_var": 0.5559895833333334, + "learning_rate": 0.0003, + "loss": 11.6841, + "loss/aux_loss": 0.048088202998042104, + "loss/crossentropy": 2.8470484018325806, + "loss/logits": 0.8978864282369614, + "step": 22780 + }, + { + "epoch": 0.2279, + "grad_norm": 12.5625, + "grad_norm_var": 0.7514973958333333, + "learning_rate": 0.0003, + "loss": 11.7581, + "loss/aux_loss": 0.048088363744318484, + "loss/crossentropy": 2.70345995426178, + "loss/logits": 0.8970031559467315, + "step": 22790 + }, + { + "epoch": 0.228, + "grad_norm": 13.0625, + "grad_norm_var": 0.5981770833333333, + "learning_rate": 0.0003, + "loss": 11.7236, + "loss/aux_loss": 0.048087395168840884, + "loss/crossentropy": 2.7649930655956267, + "loss/logits": 0.877352437376976, + "step": 22800 + }, + { + "epoch": 0.2281, + "grad_norm": 12.5, + "grad_norm_var": 0.2879557291666667, + "learning_rate": 0.0003, + "loss": 11.7192, + "loss/aux_loss": 0.048083343915641306, + "loss/crossentropy": 2.756735974550247, + "loss/logits": 0.8518844783306122, + "step": 22810 + }, + { + "epoch": 0.2282, + "grad_norm": 12.4375, + "grad_norm_var": 0.23995768229166667, + "learning_rate": 0.0003, + "loss": 11.7118, + "loss/aux_loss": 0.04808799996972084, + "loss/crossentropy": 2.756363260746002, + "loss/logits": 0.907541635632515, + "step": 22820 + }, + { + "epoch": 0.2283, + "grad_norm": 13.25, + "grad_norm_var": 33.850244140625, + "learning_rate": 0.0003, + "loss": 11.7879, + "loss/aux_loss": 0.04809822123497724, + "loss/crossentropy": 2.733557677268982, + "loss/logits": 0.896966302394867, + "step": 22830 + }, + { + "epoch": 0.2284, + "grad_norm": 12.25, + "grad_norm_var": 32.6337890625, + "learning_rate": 0.0003, + "loss": 11.7543, + "loss/aux_loss": 0.0481018140912056, + "loss/crossentropy": 2.759879392385483, + "loss/logits": 0.8836588621139526, + "step": 22840 + }, + { + "epoch": 0.2285, + "grad_norm": 12.4375, + "grad_norm_var": 0.12433268229166666, + "learning_rate": 0.0003, + "loss": 11.7755, + "loss/aux_loss": 0.04808047190308571, + "loss/crossentropy": 2.8531015515327454, + "loss/logits": 0.9062738597393036, + "step": 22850 + }, + { + "epoch": 0.2286, + "grad_norm": 12.9375, + "grad_norm_var": 0.10519205729166667, + "learning_rate": 0.0003, + "loss": 11.682, + "loss/aux_loss": 0.04810410905629396, + "loss/crossentropy": 2.9115795135498046, + "loss/logits": 0.9097014546394349, + "step": 22860 + }, + { + "epoch": 0.2287, + "grad_norm": 13.1875, + "grad_norm_var": 2.2611979166666667, + "learning_rate": 0.0003, + "loss": 11.6901, + "loss/aux_loss": 0.048095389269292355, + "loss/crossentropy": 2.8267542123794556, + "loss/logits": 0.9044133692979812, + "step": 22870 + }, + { + "epoch": 0.2288, + "grad_norm": 12.8125, + "grad_norm_var": 0.6259765625, + "learning_rate": 0.0003, + "loss": 11.541, + "loss/aux_loss": 0.04808672312647104, + "loss/crossentropy": 2.6773048043251038, + "loss/logits": 0.876294469833374, + "step": 22880 + }, + { + "epoch": 0.2289, + "grad_norm": 12.5, + "grad_norm_var": 0.6655598958333333, + "learning_rate": 0.0003, + "loss": 11.7934, + "loss/aux_loss": 0.04808298200368881, + "loss/crossentropy": 2.837724781036377, + "loss/logits": 0.9370515316724777, + "step": 22890 + }, + { + "epoch": 0.229, + "grad_norm": 11.875, + "grad_norm_var": 0.620556640625, + "learning_rate": 0.0003, + "loss": 11.8536, + "loss/aux_loss": 0.04808994997292757, + "loss/crossentropy": 2.795435976982117, + "loss/logits": 0.8800392180681229, + "step": 22900 + }, + { + "epoch": 0.2291, + "grad_norm": 13.0625, + "grad_norm_var": 0.425634765625, + "learning_rate": 0.0003, + "loss": 11.5746, + "loss/aux_loss": 0.04808636344969273, + "loss/crossentropy": 2.9793556809425352, + "loss/logits": 0.86444131731987, + "step": 22910 + }, + { + "epoch": 0.2292, + "grad_norm": 11.9375, + "grad_norm_var": 0.29375, + "learning_rate": 0.0003, + "loss": 11.7719, + "loss/aux_loss": 0.048105028085410596, + "loss/crossentropy": 2.7319081902503966, + "loss/logits": 0.8803468406200409, + "step": 22920 + }, + { + "epoch": 0.2293, + "grad_norm": 11.9375, + "grad_norm_var": 0.30441080729166664, + "learning_rate": 0.0003, + "loss": 11.8234, + "loss/aux_loss": 0.04808462020009756, + "loss/crossentropy": 2.821427547931671, + "loss/logits": 0.8934536874294281, + "step": 22930 + }, + { + "epoch": 0.2294, + "grad_norm": 13.4375, + "grad_norm_var": 0.47537434895833336, + "learning_rate": 0.0003, + "loss": 11.7563, + "loss/aux_loss": 0.048085405677556994, + "loss/crossentropy": 2.751129651069641, + "loss/logits": 0.9135142832994461, + "step": 22940 + }, + { + "epoch": 0.2295, + "grad_norm": 11.6875, + "grad_norm_var": 0.386181640625, + "learning_rate": 0.0003, + "loss": 11.6328, + "loss/aux_loss": 0.04809577204287052, + "loss/crossentropy": 2.6600105464458466, + "loss/logits": 0.8954817146062851, + "step": 22950 + }, + { + "epoch": 0.2296, + "grad_norm": 12.25, + "grad_norm_var": 0.460400390625, + "learning_rate": 0.0003, + "loss": 11.9641, + "loss/aux_loss": 0.04808154255151749, + "loss/crossentropy": 2.909790873527527, + "loss/logits": 0.9102999448776246, + "step": 22960 + }, + { + "epoch": 0.2297, + "grad_norm": 12.4375, + "grad_norm_var": 0.38483072916666666, + "learning_rate": 0.0003, + "loss": 11.5812, + "loss/aux_loss": 0.04809312988072634, + "loss/crossentropy": 2.7076492428779604, + "loss/logits": 0.8611445337533951, + "step": 22970 + }, + { + "epoch": 0.2298, + "grad_norm": 12.625, + "grad_norm_var": 1.0254557291666666, + "learning_rate": 0.0003, + "loss": 11.8651, + "loss/aux_loss": 0.0480959540233016, + "loss/crossentropy": 2.900135505199432, + "loss/logits": 0.8992374151945114, + "step": 22980 + }, + { + "epoch": 0.2299, + "grad_norm": 11.5, + "grad_norm_var": 1.0036458333333333, + "learning_rate": 0.0003, + "loss": 11.787, + "loss/aux_loss": 0.0480839628726244, + "loss/crossentropy": 2.7226045966148376, + "loss/logits": 0.9184604525566101, + "step": 22990 + }, + { + "epoch": 0.23, + "grad_norm": 12.625, + "grad_norm_var": 0.5655598958333333, + "learning_rate": 0.0003, + "loss": 11.6746, + "loss/aux_loss": 0.0480868011713028, + "loss/crossentropy": 2.7485710740089417, + "loss/logits": 0.8804697394371033, + "step": 23000 + }, + { + "epoch": 0.2301, + "grad_norm": 12.75, + "grad_norm_var": 0.434228515625, + "learning_rate": 0.0003, + "loss": 11.6392, + "loss/aux_loss": 0.04808677285909653, + "loss/crossentropy": 2.898291528224945, + "loss/logits": 0.8929012924432754, + "step": 23010 + }, + { + "epoch": 0.2302, + "grad_norm": 11.9375, + "grad_norm_var": 0.2840983072916667, + "learning_rate": 0.0003, + "loss": 11.7874, + "loss/aux_loss": 0.04808888360857964, + "loss/crossentropy": 2.7599457263946534, + "loss/logits": 0.9127897024154663, + "step": 23020 + }, + { + "epoch": 0.2303, + "grad_norm": 12.625, + "grad_norm_var": 0.23567708333333334, + "learning_rate": 0.0003, + "loss": 11.8279, + "loss/aux_loss": 0.04809257406741381, + "loss/crossentropy": 2.8466604590415954, + "loss/logits": 0.9271048754453659, + "step": 23030 + }, + { + "epoch": 0.2304, + "grad_norm": 12.375, + "grad_norm_var": 0.42942708333333335, + "learning_rate": 0.0003, + "loss": 11.5945, + "loss/aux_loss": 0.04807973112910986, + "loss/crossentropy": 2.6964468479156496, + "loss/logits": 0.8535116940736771, + "step": 23040 + }, + { + "epoch": 0.2305, + "grad_norm": 12.5, + "grad_norm_var": 0.4483723958333333, + "learning_rate": 0.0003, + "loss": 11.7818, + "loss/aux_loss": 0.04808500371873379, + "loss/crossentropy": 2.773308277130127, + "loss/logits": 0.8800354272127151, + "step": 23050 + }, + { + "epoch": 0.2306, + "grad_norm": 13.0, + "grad_norm_var": 0.8705729166666667, + "learning_rate": 0.0003, + "loss": 11.7319, + "loss/aux_loss": 0.04808144625276327, + "loss/crossentropy": 2.7295325756073, + "loss/logits": 0.9066801935434341, + "step": 23060 + }, + { + "epoch": 0.2307, + "grad_norm": 12.375, + "grad_norm_var": 0.8344889322916667, + "learning_rate": 0.0003, + "loss": 11.7069, + "loss/aux_loss": 0.048089582659304145, + "loss/crossentropy": 2.673926168680191, + "loss/logits": 0.8846386224031448, + "step": 23070 + }, + { + "epoch": 0.2308, + "grad_norm": 12.75, + "grad_norm_var": 0.4093098958333333, + "learning_rate": 0.0003, + "loss": 11.6815, + "loss/aux_loss": 0.048079993948340415, + "loss/crossentropy": 2.838780736923218, + "loss/logits": 0.8946847975254059, + "step": 23080 + }, + { + "epoch": 0.2309, + "grad_norm": 11.8125, + "grad_norm_var": 0.2999348958333333, + "learning_rate": 0.0003, + "loss": 11.76, + "loss/aux_loss": 0.04808596204966307, + "loss/crossentropy": 2.7160808563232424, + "loss/logits": 0.9145932257175445, + "step": 23090 + }, + { + "epoch": 0.231, + "grad_norm": 11.75, + "grad_norm_var": 0.32493489583333335, + "learning_rate": 0.0003, + "loss": 11.6019, + "loss/aux_loss": 0.04809281267225742, + "loss/crossentropy": 2.861774879693985, + "loss/logits": 0.8780633181333541, + "step": 23100 + }, + { + "epoch": 0.2311, + "grad_norm": 11.875, + "grad_norm_var": 0.43802083333333336, + "learning_rate": 0.0003, + "loss": 11.8316, + "loss/aux_loss": 0.04808343816548586, + "loss/crossentropy": 2.7952277660369873, + "loss/logits": 0.9055339187383652, + "step": 23110 + }, + { + "epoch": 0.2312, + "grad_norm": 12.3125, + "grad_norm_var": 0.670947265625, + "learning_rate": 0.0003, + "loss": 11.8885, + "loss/aux_loss": 0.048087695986032485, + "loss/crossentropy": 2.7752737283706663, + "loss/logits": 0.9146613448858261, + "step": 23120 + }, + { + "epoch": 0.2313, + "grad_norm": 12.9375, + "grad_norm_var": 0.6710774739583333, + "learning_rate": 0.0003, + "loss": 11.7875, + "loss/aux_loss": 0.04809215571731329, + "loss/crossentropy": 2.8519309163093567, + "loss/logits": 0.9128359079360961, + "step": 23130 + }, + { + "epoch": 0.2314, + "grad_norm": 13.25, + "grad_norm_var": 0.5870930989583333, + "learning_rate": 0.0003, + "loss": 11.5688, + "loss/aux_loss": 0.04807926807552576, + "loss/crossentropy": 2.764670741558075, + "loss/logits": 0.9091036021709442, + "step": 23140 + }, + { + "epoch": 0.2315, + "grad_norm": 12.3125, + "grad_norm_var": 0.349072265625, + "learning_rate": 0.0003, + "loss": 11.7851, + "loss/aux_loss": 0.04807650428265333, + "loss/crossentropy": 2.707452893257141, + "loss/logits": 0.894105252623558, + "step": 23150 + }, + { + "epoch": 0.2316, + "grad_norm": 13.0625, + "grad_norm_var": 133.3056640625, + "learning_rate": 0.0003, + "loss": 11.6686, + "loss/aux_loss": 0.04811877477914095, + "loss/crossentropy": 2.869442331790924, + "loss/logits": 0.8912162572145462, + "step": 23160 + }, + { + "epoch": 0.2317, + "grad_norm": 12.6875, + "grad_norm_var": 131.73795572916666, + "learning_rate": 0.0003, + "loss": 11.8418, + "loss/aux_loss": 0.0480883814394474, + "loss/crossentropy": 2.8353028416633608, + "loss/logits": 0.9513318210840225, + "step": 23170 + }, + { + "epoch": 0.2318, + "grad_norm": 12.75, + "grad_norm_var": 2.652718098958333, + "learning_rate": 0.0003, + "loss": 11.6498, + "loss/aux_loss": 0.0480922332033515, + "loss/crossentropy": 2.7743508577346803, + "loss/logits": 0.9048791795969009, + "step": 23180 + }, + { + "epoch": 0.2319, + "grad_norm": 12.125, + "grad_norm_var": 2.579931640625, + "learning_rate": 0.0003, + "loss": 11.5636, + "loss/aux_loss": 0.04809550289064646, + "loss/crossentropy": 2.796035075187683, + "loss/logits": 0.8494812101125717, + "step": 23190 + }, + { + "epoch": 0.232, + "grad_norm": 13.5, + "grad_norm_var": 0.26764322916666666, + "learning_rate": 0.0003, + "loss": 11.8704, + "loss/aux_loss": 0.048094166442751884, + "loss/crossentropy": 2.6794604539871214, + "loss/logits": 0.9145378708839417, + "step": 23200 + }, + { + "epoch": 0.2321, + "grad_norm": 13.125, + "grad_norm_var": 0.4744791666666667, + "learning_rate": 0.0003, + "loss": 11.7678, + "loss/aux_loss": 0.04808245878666639, + "loss/crossentropy": 2.75232680439949, + "loss/logits": 0.8650152295827865, + "step": 23210 + }, + { + "epoch": 0.2322, + "grad_norm": 11.625, + "grad_norm_var": 0.46243489583333336, + "learning_rate": 0.0003, + "loss": 11.6645, + "loss/aux_loss": 0.04808831550180912, + "loss/crossentropy": 2.7722482800483705, + "loss/logits": 0.8622186064720154, + "step": 23220 + }, + { + "epoch": 0.2323, + "grad_norm": 13.6875, + "grad_norm_var": 0.834619140625, + "learning_rate": 0.0003, + "loss": 11.7358, + "loss/aux_loss": 0.04809358511120081, + "loss/crossentropy": 2.8247627317905426, + "loss/logits": 0.8922833681106568, + "step": 23230 + }, + { + "epoch": 0.2324, + "grad_norm": 12.1875, + "grad_norm_var": 0.651806640625, + "learning_rate": 0.0003, + "loss": 11.6687, + "loss/aux_loss": 0.048091284930706024, + "loss/crossentropy": 2.7717152774333953, + "loss/logits": 0.8566128462553024, + "step": 23240 + }, + { + "epoch": 0.2325, + "grad_norm": 12.25, + "grad_norm_var": 0.3120930989583333, + "learning_rate": 0.0003, + "loss": 11.6438, + "loss/aux_loss": 0.04808084759861231, + "loss/crossentropy": 2.743647050857544, + "loss/logits": 0.8860394328832626, + "step": 23250 + }, + { + "epoch": 0.2326, + "grad_norm": 12.4375, + "grad_norm_var": 0.2906087239583333, + "learning_rate": 0.0003, + "loss": 11.7391, + "loss/aux_loss": 0.04808401893824339, + "loss/crossentropy": 2.8058079719543456, + "loss/logits": 0.9324862480163574, + "step": 23260 + }, + { + "epoch": 0.2327, + "grad_norm": 11.8125, + "grad_norm_var": 0.28878580729166664, + "learning_rate": 0.0003, + "loss": 11.6006, + "loss/aux_loss": 0.04809973333030939, + "loss/crossentropy": 2.797436898946762, + "loss/logits": 0.8818973273038864, + "step": 23270 + }, + { + "epoch": 0.2328, + "grad_norm": 12.375, + "grad_norm_var": 0.17237955729166668, + "learning_rate": 0.0003, + "loss": 11.6795, + "loss/aux_loss": 0.04809030685573816, + "loss/crossentropy": 2.8836780309677126, + "loss/logits": 0.9173092126846314, + "step": 23280 + }, + { + "epoch": 0.2329, + "grad_norm": 12.125, + "grad_norm_var": 0.27786458333333336, + "learning_rate": 0.0003, + "loss": 11.6142, + "loss/aux_loss": 0.04809027072042227, + "loss/crossentropy": 2.829053020477295, + "loss/logits": 0.8915682911872864, + "step": 23290 + }, + { + "epoch": 0.233, + "grad_norm": 12.3125, + "grad_norm_var": 0.172509765625, + "learning_rate": 0.0003, + "loss": 11.7196, + "loss/aux_loss": 0.04808690585196018, + "loss/crossentropy": 2.793965721130371, + "loss/logits": 0.8819243282079696, + "step": 23300 + }, + { + "epoch": 0.2331, + "grad_norm": 11.625, + "grad_norm_var": 0.4320149739583333, + "learning_rate": 0.0003, + "loss": 11.7091, + "loss/aux_loss": 0.04808926545083523, + "loss/crossentropy": 2.5778140842914583, + "loss/logits": 0.8577252298593521, + "step": 23310 + }, + { + "epoch": 0.2332, + "grad_norm": 13.375, + "grad_norm_var": 0.718212890625, + "learning_rate": 0.0003, + "loss": 11.8099, + "loss/aux_loss": 0.04810008257627487, + "loss/crossentropy": 2.9423258543014525, + "loss/logits": 0.9043860971927643, + "step": 23320 + }, + { + "epoch": 0.2333, + "grad_norm": 13.0, + "grad_norm_var": 0.8684895833333334, + "learning_rate": 0.0003, + "loss": 11.5931, + "loss/aux_loss": 0.04809061642736197, + "loss/crossentropy": 2.75088050365448, + "loss/logits": 0.8834013044834137, + "step": 23330 + }, + { + "epoch": 0.2334, + "grad_norm": 12.375, + "grad_norm_var": 0.7817057291666667, + "learning_rate": 0.0003, + "loss": 11.6771, + "loss/aux_loss": 0.0480886397883296, + "loss/crossentropy": 2.788175332546234, + "loss/logits": 0.9197595477104187, + "step": 23340 + }, + { + "epoch": 0.2335, + "grad_norm": 12.4375, + "grad_norm_var": 0.45305989583333334, + "learning_rate": 0.0003, + "loss": 11.807, + "loss/aux_loss": 0.04809667635709047, + "loss/crossentropy": 2.8132767200469972, + "loss/logits": 0.8944458961486816, + "step": 23350 + }, + { + "epoch": 0.2336, + "grad_norm": 11.625, + "grad_norm_var": 0.40729166666666666, + "learning_rate": 0.0003, + "loss": 11.7539, + "loss/aux_loss": 0.0480756500735879, + "loss/crossentropy": 2.86536762714386, + "loss/logits": 0.876064345240593, + "step": 23360 + }, + { + "epoch": 0.2337, + "grad_norm": 12.4375, + "grad_norm_var": 0.5337076822916667, + "learning_rate": 0.0003, + "loss": 11.6954, + "loss/aux_loss": 0.048088740557432175, + "loss/crossentropy": 2.861802363395691, + "loss/logits": 0.8744904607534408, + "step": 23370 + }, + { + "epoch": 0.2338, + "grad_norm": 12.1875, + "grad_norm_var": 0.4778645833333333, + "learning_rate": 0.0003, + "loss": 11.7553, + "loss/aux_loss": 0.0480877548456192, + "loss/crossentropy": 2.8678762316703796, + "loss/logits": 0.9301058530807496, + "step": 23380 + }, + { + "epoch": 0.2339, + "grad_norm": 12.625, + "grad_norm_var": 0.28899739583333334, + "learning_rate": 0.0003, + "loss": 11.7144, + "loss/aux_loss": 0.04808529950678349, + "loss/crossentropy": 2.858980119228363, + "loss/logits": 0.877686470746994, + "step": 23390 + }, + { + "epoch": 0.234, + "grad_norm": 12.0625, + "grad_norm_var": 0.184619140625, + "learning_rate": 0.0003, + "loss": 11.6092, + "loss/aux_loss": 0.048086689226329325, + "loss/crossentropy": 2.603729021549225, + "loss/logits": 0.850713437795639, + "step": 23400 + }, + { + "epoch": 0.2341, + "grad_norm": 12.25, + "grad_norm_var": 1.2035807291666667, + "learning_rate": 0.0003, + "loss": 11.6522, + "loss/aux_loss": 0.048092365451157096, + "loss/crossentropy": 2.9920172095298767, + "loss/logits": 0.8820174932479858, + "step": 23410 + }, + { + "epoch": 0.2342, + "grad_norm": 12.5, + "grad_norm_var": 0.3931640625, + "learning_rate": 0.0003, + "loss": 11.7154, + "loss/aux_loss": 0.048093152418732646, + "loss/crossentropy": 2.6669042885303496, + "loss/logits": 0.8748789399862289, + "step": 23420 + }, + { + "epoch": 0.2343, + "grad_norm": 12.3125, + "grad_norm_var": 0.329150390625, + "learning_rate": 0.0003, + "loss": 11.6354, + "loss/aux_loss": 0.04808267038315535, + "loss/crossentropy": 2.6751941323280333, + "loss/logits": 0.8559034675359726, + "step": 23430 + }, + { + "epoch": 0.2344, + "grad_norm": 11.625, + "grad_norm_var": 0.696728515625, + "learning_rate": 0.0003, + "loss": 11.6513, + "loss/aux_loss": 0.048090195283293724, + "loss/crossentropy": 2.750403940677643, + "loss/logits": 0.8850887566804886, + "step": 23440 + }, + { + "epoch": 0.2345, + "grad_norm": 12.375, + "grad_norm_var": 0.2567545572916667, + "learning_rate": 0.0003, + "loss": 11.8244, + "loss/aux_loss": 0.048084620386362076, + "loss/crossentropy": 2.9328520774841307, + "loss/logits": 0.9011586248874665, + "step": 23450 + }, + { + "epoch": 0.2346, + "grad_norm": 12.75, + "grad_norm_var": 0.09739583333333333, + "learning_rate": 0.0003, + "loss": 11.6918, + "loss/aux_loss": 0.04808659795671701, + "loss/crossentropy": 2.7696239829063414, + "loss/logits": 0.8988734126091004, + "step": 23460 + }, + { + "epoch": 0.2347, + "grad_norm": 13.125, + "grad_norm_var": 0.332666015625, + "learning_rate": 0.0003, + "loss": 11.8292, + "loss/aux_loss": 0.0480822155252099, + "loss/crossentropy": 2.910421371459961, + "loss/logits": 0.900927659869194, + "step": 23470 + }, + { + "epoch": 0.2348, + "grad_norm": 12.125, + "grad_norm_var": 0.54921875, + "learning_rate": 0.0003, + "loss": 11.6806, + "loss/aux_loss": 0.04809327684342861, + "loss/crossentropy": 2.8524921536445618, + "loss/logits": 0.9137292951345444, + "step": 23480 + }, + { + "epoch": 0.2349, + "grad_norm": 13.875, + "grad_norm_var": 2.0796223958333333, + "learning_rate": 0.0003, + "loss": 11.7016, + "loss/aux_loss": 0.048085125908255574, + "loss/crossentropy": 2.680436742305756, + "loss/logits": 0.8520400941371917, + "step": 23490 + }, + { + "epoch": 0.235, + "grad_norm": 12.8125, + "grad_norm_var": 2.0036295572916667, + "learning_rate": 0.0003, + "loss": 11.7285, + "loss/aux_loss": 0.04808577839285135, + "loss/crossentropy": 2.6893193125724792, + "loss/logits": 0.8656334489583969, + "step": 23500 + }, + { + "epoch": 0.2351, + "grad_norm": 12.25, + "grad_norm_var": 0.3738932291666667, + "learning_rate": 0.0003, + "loss": 11.7499, + "loss/aux_loss": 0.048086671903729436, + "loss/crossentropy": 2.8467262983322144, + "loss/logits": 0.8982198029756546, + "step": 23510 + }, + { + "epoch": 0.2352, + "grad_norm": 12.8125, + "grad_norm_var": 0.202197265625, + "learning_rate": 0.0003, + "loss": 11.8075, + "loss/aux_loss": 0.04808525312691927, + "loss/crossentropy": 2.8524319410324095, + "loss/logits": 0.8897974759340286, + "step": 23520 + }, + { + "epoch": 0.2353, + "grad_norm": 14.0625, + "grad_norm_var": 0.5054524739583334, + "learning_rate": 0.0003, + "loss": 11.951, + "loss/aux_loss": 0.048080182448029515, + "loss/crossentropy": 2.7926797032356263, + "loss/logits": 0.8885494351387024, + "step": 23530 + }, + { + "epoch": 0.2354, + "grad_norm": 11.8125, + "grad_norm_var": 0.5848307291666667, + "learning_rate": 0.0003, + "loss": 11.5039, + "loss/aux_loss": 0.048084812425076964, + "loss/crossentropy": 2.79399893283844, + "loss/logits": 0.8674466758966446, + "step": 23540 + }, + { + "epoch": 0.2355, + "grad_norm": 13.1875, + "grad_norm_var": 0.29816080729166666, + "learning_rate": 0.0003, + "loss": 11.7157, + "loss/aux_loss": 0.048091997392475605, + "loss/crossentropy": 2.812830251455307, + "loss/logits": 0.8933589518070221, + "step": 23550 + }, + { + "epoch": 0.2356, + "grad_norm": 13.0625, + "grad_norm_var": 0.20045572916666668, + "learning_rate": 0.0003, + "loss": 11.6117, + "loss/aux_loss": 0.048093314096331594, + "loss/crossentropy": 2.773878538608551, + "loss/logits": 0.8754092365503311, + "step": 23560 + }, + { + "epoch": 0.2357, + "grad_norm": 13.75, + "grad_norm_var": 1.3430826822916666, + "learning_rate": 0.0003, + "loss": 11.7499, + "loss/aux_loss": 0.04808519445359707, + "loss/crossentropy": 2.888442850112915, + "loss/logits": 0.8818228989839554, + "step": 23570 + }, + { + "epoch": 0.2358, + "grad_norm": 12.5625, + "grad_norm_var": 1.249072265625, + "learning_rate": 0.0003, + "loss": 11.8386, + "loss/aux_loss": 0.04807713199406862, + "loss/crossentropy": 2.6926519870758057, + "loss/logits": 0.8780468791723252, + "step": 23580 + }, + { + "epoch": 0.2359, + "grad_norm": 12.0, + "grad_norm_var": 0.44166666666666665, + "learning_rate": 0.0003, + "loss": 11.6326, + "loss/aux_loss": 0.048089764825999734, + "loss/crossentropy": 2.6616145730018617, + "loss/logits": 0.8930239170789719, + "step": 23590 + }, + { + "epoch": 0.236, + "grad_norm": 12.625, + "grad_norm_var": 0.453125, + "learning_rate": 0.0003, + "loss": 11.7866, + "loss/aux_loss": 0.048086699284613135, + "loss/crossentropy": 2.7168304443359377, + "loss/logits": 0.8998159736394882, + "step": 23600 + }, + { + "epoch": 0.2361, + "grad_norm": 13.5, + "grad_norm_var": 0.5286295572916667, + "learning_rate": 0.0003, + "loss": 11.906, + "loss/aux_loss": 0.048088240809738635, + "loss/crossentropy": 2.7172608613967895, + "loss/logits": 0.8923117220401764, + "step": 23610 + }, + { + "epoch": 0.2362, + "grad_norm": 12.8125, + "grad_norm_var": 0.30104166666666665, + "learning_rate": 0.0003, + "loss": 11.5223, + "loss/aux_loss": 0.04808646198362112, + "loss/crossentropy": 2.706549334526062, + "loss/logits": 0.9012427359819413, + "step": 23620 + }, + { + "epoch": 0.2363, + "grad_norm": 12.625, + "grad_norm_var": 0.30572916666666666, + "learning_rate": 0.0003, + "loss": 11.6055, + "loss/aux_loss": 0.04809609428048134, + "loss/crossentropy": 2.7896106839179993, + "loss/logits": 0.8902231156826019, + "step": 23630 + }, + { + "epoch": 0.2364, + "grad_norm": 12.6875, + "grad_norm_var": 0.37233072916666665, + "learning_rate": 0.0003, + "loss": 11.962, + "loss/aux_loss": 0.048088495060801505, + "loss/crossentropy": 3.007174789905548, + "loss/logits": 0.9304111152887344, + "step": 23640 + }, + { + "epoch": 0.2365, + "grad_norm": 12.3125, + "grad_norm_var": 0.286181640625, + "learning_rate": 0.0003, + "loss": 11.5338, + "loss/aux_loss": 0.04809539187699556, + "loss/crossentropy": 2.633792459964752, + "loss/logits": 0.8579708755016326, + "step": 23650 + }, + { + "epoch": 0.2366, + "grad_norm": 11.8125, + "grad_norm_var": 0.7730305989583334, + "learning_rate": 0.0003, + "loss": 11.6288, + "loss/aux_loss": 0.048088702373206615, + "loss/crossentropy": 2.738995945453644, + "loss/logits": 0.8920851528644562, + "step": 23660 + }, + { + "epoch": 0.2367, + "grad_norm": 11.4375, + "grad_norm_var": 0.74921875, + "learning_rate": 0.0003, + "loss": 11.7406, + "loss/aux_loss": 0.04808499738574028, + "loss/crossentropy": 2.750992178916931, + "loss/logits": 0.8741905808448791, + "step": 23670 + }, + { + "epoch": 0.2368, + "grad_norm": 12.5, + "grad_norm_var": 0.7702473958333333, + "learning_rate": 0.0003, + "loss": 11.7279, + "loss/aux_loss": 0.04809374678879976, + "loss/crossentropy": 3.0117597341537476, + "loss/logits": 0.9128781437873841, + "step": 23680 + }, + { + "epoch": 0.2369, + "grad_norm": 12.375, + "grad_norm_var": 1.1684733072916667, + "learning_rate": 0.0003, + "loss": 11.6513, + "loss/aux_loss": 0.04808376375585795, + "loss/crossentropy": 2.730088675022125, + "loss/logits": 0.8473275810480118, + "step": 23690 + }, + { + "epoch": 0.237, + "grad_norm": 11.5625, + "grad_norm_var": 1.056494140625, + "learning_rate": 0.0003, + "loss": 11.8007, + "loss/aux_loss": 0.048095212876796724, + "loss/crossentropy": 2.7834979057312013, + "loss/logits": 0.8746491730213165, + "step": 23700 + }, + { + "epoch": 0.2371, + "grad_norm": 13.875, + "grad_norm_var": 1.337353515625, + "learning_rate": 0.0003, + "loss": 11.5859, + "loss/aux_loss": 0.048090392164885996, + "loss/crossentropy": 2.772467577457428, + "loss/logits": 0.9062090307474137, + "step": 23710 + }, + { + "epoch": 0.2372, + "grad_norm": 12.5625, + "grad_norm_var": 0.7644368489583333, + "learning_rate": 0.0003, + "loss": 11.674, + "loss/aux_loss": 0.048084620386362076, + "loss/crossentropy": 2.814483368396759, + "loss/logits": 0.9223452210426331, + "step": 23720 + }, + { + "epoch": 0.2373, + "grad_norm": 12.1875, + "grad_norm_var": 0.1921875, + "learning_rate": 0.0003, + "loss": 11.652, + "loss/aux_loss": 0.048086386919021604, + "loss/crossentropy": 2.937320578098297, + "loss/logits": 0.867486959695816, + "step": 23730 + }, + { + "epoch": 0.2374, + "grad_norm": 13.375, + "grad_norm_var": 0.28098958333333335, + "learning_rate": 0.0003, + "loss": 11.6409, + "loss/aux_loss": 0.048084064945578577, + "loss/crossentropy": 2.8077134013175966, + "loss/logits": 0.8964003264904022, + "step": 23740 + }, + { + "epoch": 0.2375, + "grad_norm": 12.6875, + "grad_norm_var": 1.6445149739583333, + "learning_rate": 0.0003, + "loss": 11.8131, + "loss/aux_loss": 0.048087784089148045, + "loss/crossentropy": 2.9468029141426086, + "loss/logits": 0.9184634208679199, + "step": 23750 + }, + { + "epoch": 0.2376, + "grad_norm": 11.625, + "grad_norm_var": 1.9526041666666667, + "learning_rate": 0.0003, + "loss": 11.7089, + "loss/aux_loss": 0.04808371346443892, + "loss/crossentropy": 2.8894827246665953, + "loss/logits": 0.8926361262798309, + "step": 23760 + }, + { + "epoch": 0.2377, + "grad_norm": 11.375, + "grad_norm_var": 0.48899739583333335, + "learning_rate": 0.0003, + "loss": 11.7193, + "loss/aux_loss": 0.04808234348893166, + "loss/crossentropy": 2.8240739822387697, + "loss/logits": 0.8710032075643539, + "step": 23770 + }, + { + "epoch": 0.2378, + "grad_norm": 13.5, + "grad_norm_var": 2.7316243489583334, + "learning_rate": 0.0003, + "loss": 11.5324, + "loss/aux_loss": 0.0480916004627943, + "loss/crossentropy": 2.7174317240715027, + "loss/logits": 0.8585263520479203, + "step": 23780 + }, + { + "epoch": 0.2379, + "grad_norm": 12.0, + "grad_norm_var": 2.7860514322916665, + "learning_rate": 0.0003, + "loss": 11.8451, + "loss/aux_loss": 0.048090549744665624, + "loss/crossentropy": 2.843950593471527, + "loss/logits": 0.9064037382602692, + "step": 23790 + }, + { + "epoch": 0.238, + "grad_norm": 12.125, + "grad_norm_var": 0.31365559895833334, + "learning_rate": 0.0003, + "loss": 11.7494, + "loss/aux_loss": 0.04807556625455618, + "loss/crossentropy": 2.704611933231354, + "loss/logits": 0.8805167257785798, + "step": 23800 + }, + { + "epoch": 0.2381, + "grad_norm": 13.0, + "grad_norm_var": 0.40826822916666666, + "learning_rate": 0.0003, + "loss": 11.5183, + "loss/aux_loss": 0.04808481372892857, + "loss/crossentropy": 2.536171966791153, + "loss/logits": 0.8397417157888413, + "step": 23810 + }, + { + "epoch": 0.2382, + "grad_norm": 12.5, + "grad_norm_var": 0.27545572916666666, + "learning_rate": 0.0003, + "loss": 11.6364, + "loss/aux_loss": 0.048093268647789955, + "loss/crossentropy": 2.7942283511161805, + "loss/logits": 0.8568087071180344, + "step": 23820 + }, + { + "epoch": 0.2383, + "grad_norm": 12.25, + "grad_norm_var": 0.4332682291666667, + "learning_rate": 0.0003, + "loss": 11.6525, + "loss/aux_loss": 0.048078492656350134, + "loss/crossentropy": 2.7623124718666077, + "loss/logits": 0.891291829943657, + "step": 23830 + }, + { + "epoch": 0.2384, + "grad_norm": 12.5625, + "grad_norm_var": 0.34152018229166664, + "learning_rate": 0.0003, + "loss": 11.7087, + "loss/aux_loss": 0.048094586841762064, + "loss/crossentropy": 2.678860205411911, + "loss/logits": 0.8403998255729676, + "step": 23840 + }, + { + "epoch": 0.2385, + "grad_norm": 13.375, + "grad_norm_var": 0.4306640625, + "learning_rate": 0.0003, + "loss": 11.8878, + "loss/aux_loss": 0.04808190613985062, + "loss/crossentropy": 2.858857882022858, + "loss/logits": 0.8940188169479371, + "step": 23850 + }, + { + "epoch": 0.2386, + "grad_norm": 12.375, + "grad_norm_var": 14.811832682291667, + "learning_rate": 0.0003, + "loss": 11.7931, + "loss/aux_loss": 0.048093811981379984, + "loss/crossentropy": 2.911667358875275, + "loss/logits": 0.911825567483902, + "step": 23860 + }, + { + "epoch": 0.2387, + "grad_norm": 12.9375, + "grad_norm_var": 0.6860514322916667, + "learning_rate": 0.0003, + "loss": 11.6265, + "loss/aux_loss": 0.04809156283736229, + "loss/crossentropy": 2.897962212562561, + "loss/logits": 0.9157760441303253, + "step": 23870 + }, + { + "epoch": 0.2388, + "grad_norm": 13.5625, + "grad_norm_var": 0.5841145833333333, + "learning_rate": 0.0003, + "loss": 11.6992, + "loss/aux_loss": 0.048074356466531756, + "loss/crossentropy": 2.6709546744823456, + "loss/logits": 0.892021319270134, + "step": 23880 + }, + { + "epoch": 0.2389, + "grad_norm": 12.5625, + "grad_norm_var": 0.5063639322916667, + "learning_rate": 0.0003, + "loss": 11.9177, + "loss/aux_loss": 0.04809843823313713, + "loss/crossentropy": 2.818812572956085, + "loss/logits": 0.9256382822990418, + "step": 23890 + }, + { + "epoch": 0.239, + "grad_norm": 13.3125, + "grad_norm_var": 0.2604166666666667, + "learning_rate": 0.0003, + "loss": 11.6635, + "loss/aux_loss": 0.048081925325095656, + "loss/crossentropy": 2.768986976146698, + "loss/logits": 0.8817100405693055, + "step": 23900 + }, + { + "epoch": 0.2391, + "grad_norm": 13.0, + "grad_norm_var": 1.490869140625, + "learning_rate": 0.0003, + "loss": 11.6234, + "loss/aux_loss": 0.048085336573421955, + "loss/crossentropy": 2.748648244142532, + "loss/logits": 0.9171183824539184, + "step": 23910 + }, + { + "epoch": 0.2392, + "grad_norm": 12.25, + "grad_norm_var": 1.4512858072916666, + "learning_rate": 0.0003, + "loss": 11.7612, + "loss/aux_loss": 0.04809110928326845, + "loss/crossentropy": 2.7163472533226014, + "loss/logits": 0.8676630944013596, + "step": 23920 + }, + { + "epoch": 0.2393, + "grad_norm": 13.1875, + "grad_norm_var": 0.746728515625, + "learning_rate": 0.0003, + "loss": 11.7462, + "loss/aux_loss": 0.048089482076466086, + "loss/crossentropy": 2.930872416496277, + "loss/logits": 0.9010693699121475, + "step": 23930 + }, + { + "epoch": 0.2394, + "grad_norm": 12.6875, + "grad_norm_var": 0.5486979166666667, + "learning_rate": 0.0003, + "loss": 11.896, + "loss/aux_loss": 0.048085294850170615, + "loss/crossentropy": 2.798089528083801, + "loss/logits": 0.8853724330663681, + "step": 23940 + }, + { + "epoch": 0.2395, + "grad_norm": 12.5625, + "grad_norm_var": 0.3851399739583333, + "learning_rate": 0.0003, + "loss": 11.6158, + "loss/aux_loss": 0.04808676280081272, + "loss/crossentropy": 2.726479697227478, + "loss/logits": 0.8973057448863984, + "step": 23950 + }, + { + "epoch": 0.2396, + "grad_norm": 14.25, + "grad_norm_var": 0.3282389322916667, + "learning_rate": 0.0003, + "loss": 11.7098, + "loss/aux_loss": 0.048084568418562415, + "loss/crossentropy": 2.6635270595550535, + "loss/logits": 0.8485010534524917, + "step": 23960 + }, + { + "epoch": 0.2397, + "grad_norm": 11.5, + "grad_norm_var": 0.7620930989583333, + "learning_rate": 0.0003, + "loss": 11.5762, + "loss/aux_loss": 0.048088745586574076, + "loss/crossentropy": 2.744753432273865, + "loss/logits": 0.8812153309583663, + "step": 23970 + }, + { + "epoch": 0.2398, + "grad_norm": 12.8125, + "grad_norm_var": 0.4384765625, + "learning_rate": 0.0003, + "loss": 11.7152, + "loss/aux_loss": 0.04808169640600681, + "loss/crossentropy": 2.7333896338939665, + "loss/logits": 0.9171457648277282, + "step": 23980 + }, + { + "epoch": 0.2399, + "grad_norm": 29.5, + "grad_norm_var": 19.277067057291667, + "learning_rate": 0.0003, + "loss": 11.5902, + "loss/aux_loss": 0.04808990899473429, + "loss/crossentropy": 2.865362215042114, + "loss/logits": 0.8600284993648529, + "step": 23990 + }, + { + "epoch": 0.24, + "grad_norm": 12.6875, + "grad_norm_var": 18.717171223958335, + "learning_rate": 0.0003, + "loss": 11.785, + "loss/aux_loss": 0.04809808786958456, + "loss/crossentropy": 2.812199038267136, + "loss/logits": 0.8720352232456208, + "step": 24000 + }, + { + "epoch": 0.2401, + "grad_norm": 11.875, + "grad_norm_var": 0.16744791666666667, + "learning_rate": 0.0003, + "loss": 11.6551, + "loss/aux_loss": 0.04809410627931356, + "loss/crossentropy": 2.7996535181999205, + "loss/logits": 0.9151755809783936, + "step": 24010 + }, + { + "epoch": 0.2402, + "grad_norm": 13.125, + "grad_norm_var": 0.5535807291666667, + "learning_rate": 0.0003, + "loss": 11.7477, + "loss/aux_loss": 0.04809091780334711, + "loss/crossentropy": 2.7805619120597838, + "loss/logits": 0.9082524001598358, + "step": 24020 + }, + { + "epoch": 0.2403, + "grad_norm": 13.25, + "grad_norm_var": 0.4984375, + "learning_rate": 0.0003, + "loss": 11.6539, + "loss/aux_loss": 0.04808855000883341, + "loss/crossentropy": 2.7244292974472044, + "loss/logits": 0.8969215124845504, + "step": 24030 + }, + { + "epoch": 0.2404, + "grad_norm": 11.4375, + "grad_norm_var": 0.6372395833333333, + "learning_rate": 0.0003, + "loss": 11.8925, + "loss/aux_loss": 0.04808660857379436, + "loss/crossentropy": 2.8658366203308105, + "loss/logits": 0.9359738051891326, + "step": 24040 + }, + { + "epoch": 0.2405, + "grad_norm": 12.3125, + "grad_norm_var": 0.3150390625, + "learning_rate": 0.0003, + "loss": 11.6215, + "loss/aux_loss": 0.0480943713337183, + "loss/crossentropy": 2.7264646887779236, + "loss/logits": 0.8683656752109528, + "step": 24050 + }, + { + "epoch": 0.2406, + "grad_norm": 13.625, + "grad_norm_var": 0.8895182291666667, + "learning_rate": 0.0003, + "loss": 11.6396, + "loss/aux_loss": 0.04809534400701523, + "loss/crossentropy": 2.7673123121261596, + "loss/logits": 0.8471581250429153, + "step": 24060 + }, + { + "epoch": 0.2407, + "grad_norm": 11.8125, + "grad_norm_var": 0.6770182291666667, + "learning_rate": 0.0003, + "loss": 11.6868, + "loss/aux_loss": 0.04808705560863018, + "loss/crossentropy": 2.7488048553466795, + "loss/logits": 0.918352234363556, + "step": 24070 + }, + { + "epoch": 0.2408, + "grad_norm": 11.75, + "grad_norm_var": 0.4117024739583333, + "learning_rate": 0.0003, + "loss": 11.6628, + "loss/aux_loss": 0.048093785718083384, + "loss/crossentropy": 2.8444491744041445, + "loss/logits": 0.8840048730373382, + "step": 24080 + }, + { + "epoch": 0.2409, + "grad_norm": 12.0, + "grad_norm_var": 0.461962890625, + "learning_rate": 0.0003, + "loss": 11.5886, + "loss/aux_loss": 0.04807962272316217, + "loss/crossentropy": 2.580649846792221, + "loss/logits": 0.8638424456119538, + "step": 24090 + }, + { + "epoch": 0.241, + "grad_norm": 11.8125, + "grad_norm_var": 0.3580729166666667, + "learning_rate": 0.0003, + "loss": 11.7413, + "loss/aux_loss": 0.04808608740568161, + "loss/crossentropy": 2.8323341250419616, + "loss/logits": 0.8996293157339096, + "step": 24100 + }, + { + "epoch": 0.2411, + "grad_norm": 12.3125, + "grad_norm_var": 1.4559895833333334, + "learning_rate": 0.0003, + "loss": 11.6987, + "loss/aux_loss": 0.048095569014549255, + "loss/crossentropy": 2.7775806427001952, + "loss/logits": 0.8833230465650559, + "step": 24110 + }, + { + "epoch": 0.2412, + "grad_norm": 12.75, + "grad_norm_var": 0.4014973958333333, + "learning_rate": 0.0003, + "loss": 11.5617, + "loss/aux_loss": 0.04807928055524826, + "loss/crossentropy": 2.7209652066230774, + "loss/logits": 0.8995220333337783, + "step": 24120 + }, + { + "epoch": 0.2413, + "grad_norm": 13.6875, + "grad_norm_var": 0.43951822916666666, + "learning_rate": 0.0003, + "loss": 11.6276, + "loss/aux_loss": 0.04808956161141396, + "loss/crossentropy": 2.757280480861664, + "loss/logits": 0.8826348453760147, + "step": 24130 + }, + { + "epoch": 0.2414, + "grad_norm": 12.75, + "grad_norm_var": 0.33177083333333335, + "learning_rate": 0.0003, + "loss": 11.5764, + "loss/aux_loss": 0.048085734620690344, + "loss/crossentropy": 2.783533537387848, + "loss/logits": 0.8649254590272903, + "step": 24140 + }, + { + "epoch": 0.2415, + "grad_norm": 12.75, + "grad_norm_var": 3.777327473958333, + "learning_rate": 0.0003, + "loss": 11.6753, + "loss/aux_loss": 0.04808969590812921, + "loss/crossentropy": 2.678688037395477, + "loss/logits": 0.8812060475349426, + "step": 24150 + }, + { + "epoch": 0.2416, + "grad_norm": 13.5625, + "grad_norm_var": 3.59375, + "learning_rate": 0.0003, + "loss": 11.5482, + "loss/aux_loss": 0.0480934102088213, + "loss/crossentropy": 2.7929326593875885, + "loss/logits": 0.8826883345842361, + "step": 24160 + }, + { + "epoch": 0.2417, + "grad_norm": 12.125, + "grad_norm_var": 0.3504557291666667, + "learning_rate": 0.0003, + "loss": 11.3805, + "loss/aux_loss": 0.04809669218957424, + "loss/crossentropy": 2.5945322930812837, + "loss/logits": 0.8391418486833573, + "step": 24170 + }, + { + "epoch": 0.2418, + "grad_norm": 13.3125, + "grad_norm_var": 1.5546712239583333, + "learning_rate": 0.0003, + "loss": 11.5907, + "loss/aux_loss": 0.04808249343186617, + "loss/crossentropy": 2.734819310903549, + "loss/logits": 0.8693808823823929, + "step": 24180 + }, + { + "epoch": 0.2419, + "grad_norm": 12.0, + "grad_norm_var": 0.388525390625, + "learning_rate": 0.0003, + "loss": 11.6518, + "loss/aux_loss": 0.04808155260980129, + "loss/crossentropy": 2.7737566351890566, + "loss/logits": 0.8870420664548874, + "step": 24190 + }, + { + "epoch": 0.242, + "grad_norm": 12.5, + "grad_norm_var": 0.5936848958333333, + "learning_rate": 0.0003, + "loss": 11.6452, + "loss/aux_loss": 0.04808844365179539, + "loss/crossentropy": 2.78323655128479, + "loss/logits": 0.8678434014320373, + "step": 24200 + }, + { + "epoch": 0.2421, + "grad_norm": 12.1875, + "grad_norm_var": 0.5105305989583333, + "learning_rate": 0.0003, + "loss": 11.4859, + "loss/aux_loss": 0.0480864379554987, + "loss/crossentropy": 2.769151270389557, + "loss/logits": 0.8635666728019714, + "step": 24210 + }, + { + "epoch": 0.2422, + "grad_norm": 11.8125, + "grad_norm_var": 0.23917643229166666, + "learning_rate": 0.0003, + "loss": 11.772, + "loss/aux_loss": 0.048094228468835355, + "loss/crossentropy": 2.812956178188324, + "loss/logits": 0.9112226068973541, + "step": 24220 + }, + { + "epoch": 0.2423, + "grad_norm": 11.75, + "grad_norm_var": 0.328759765625, + "learning_rate": 0.0003, + "loss": 11.669, + "loss/aux_loss": 0.04808081742376089, + "loss/crossentropy": 2.7633156895637514, + "loss/logits": 0.8465212196111679, + "step": 24230 + }, + { + "epoch": 0.2424, + "grad_norm": 13.0625, + "grad_norm_var": 0.5065104166666666, + "learning_rate": 0.0003, + "loss": 11.6417, + "loss/aux_loss": 0.04809690322726965, + "loss/crossentropy": 2.744890737533569, + "loss/logits": 0.9320186167955399, + "step": 24240 + }, + { + "epoch": 0.2425, + "grad_norm": 12.125, + "grad_norm_var": 4.525374348958334, + "learning_rate": 0.0003, + "loss": 11.8212, + "loss/aux_loss": 0.04809857420623302, + "loss/crossentropy": 2.7761366605758666, + "loss/logits": 0.8958946943283081, + "step": 24250 + }, + { + "epoch": 0.2426, + "grad_norm": 13.0, + "grad_norm_var": 0.2875, + "learning_rate": 0.0003, + "loss": 11.631, + "loss/aux_loss": 0.04808212518692016, + "loss/crossentropy": 2.8316932320594788, + "loss/logits": 0.8923729687929154, + "step": 24260 + }, + { + "epoch": 0.2427, + "grad_norm": 12.6875, + "grad_norm_var": 0.431884765625, + "learning_rate": 0.0003, + "loss": 11.6632, + "loss/aux_loss": 0.048090006597340106, + "loss/crossentropy": 2.7280545473098754, + "loss/logits": 0.8711204528808594, + "step": 24270 + }, + { + "epoch": 0.2428, + "grad_norm": 13.0, + "grad_norm_var": 0.3106770833333333, + "learning_rate": 0.0003, + "loss": 11.6513, + "loss/aux_loss": 0.048082527332007886, + "loss/crossentropy": 2.7286766350269316, + "loss/logits": 0.8891306400299073, + "step": 24280 + }, + { + "epoch": 0.2429, + "grad_norm": 13.6875, + "grad_norm_var": 0.24837239583333334, + "learning_rate": 0.0003, + "loss": 11.6466, + "loss/aux_loss": 0.048082358203828335, + "loss/crossentropy": 2.9011032223701476, + "loss/logits": 0.8931461691856384, + "step": 24290 + }, + { + "epoch": 0.243, + "grad_norm": 12.375, + "grad_norm_var": 0.2994140625, + "learning_rate": 0.0003, + "loss": 11.5106, + "loss/aux_loss": 0.048092238046228884, + "loss/crossentropy": 2.682009291648865, + "loss/logits": 0.8750419646501542, + "step": 24300 + }, + { + "epoch": 0.2431, + "grad_norm": 13.0625, + "grad_norm_var": 0.26443684895833336, + "learning_rate": 0.0003, + "loss": 11.7725, + "loss/aux_loss": 0.04807449225336313, + "loss/crossentropy": 2.79811235666275, + "loss/logits": 0.8686894834041595, + "step": 24310 + }, + { + "epoch": 0.2432, + "grad_norm": 12.125, + "grad_norm_var": 0.4176432291666667, + "learning_rate": 0.0003, + "loss": 11.6726, + "loss/aux_loss": 0.04808956328779459, + "loss/crossentropy": 2.6413731455802916, + "loss/logits": 0.8763428032398224, + "step": 24320 + }, + { + "epoch": 0.2433, + "grad_norm": 12.6875, + "grad_norm_var": 0.5105305989583333, + "learning_rate": 0.0003, + "loss": 11.6373, + "loss/aux_loss": 0.048085764050483704, + "loss/crossentropy": 2.6801956832408904, + "loss/logits": 0.8385010361671448, + "step": 24330 + }, + { + "epoch": 0.2434, + "grad_norm": 12.375, + "grad_norm_var": 1.7222493489583333, + "learning_rate": 0.0003, + "loss": 11.6253, + "loss/aux_loss": 0.04808337558060884, + "loss/crossentropy": 2.7743629932403566, + "loss/logits": 0.9043363749980926, + "step": 24340 + }, + { + "epoch": 0.2435, + "grad_norm": 12.1875, + "grad_norm_var": 0.13566080729166666, + "learning_rate": 0.0003, + "loss": 11.6274, + "loss/aux_loss": 0.048089998215436934, + "loss/crossentropy": 2.8051861047744753, + "loss/logits": 0.8626913219690323, + "step": 24350 + }, + { + "epoch": 0.2436, + "grad_norm": 13.0, + "grad_norm_var": 0.408447265625, + "learning_rate": 0.0003, + "loss": 11.6349, + "loss/aux_loss": 0.048083267733454706, + "loss/crossentropy": 2.909751272201538, + "loss/logits": 0.9241881400346756, + "step": 24360 + }, + { + "epoch": 0.2437, + "grad_norm": 12.75, + "grad_norm_var": 0.28854166666666664, + "learning_rate": 0.0003, + "loss": 11.83, + "loss/aux_loss": 0.048077669180929664, + "loss/crossentropy": 2.8290556192398073, + "loss/logits": 0.8948422998189927, + "step": 24370 + }, + { + "epoch": 0.2438, + "grad_norm": 12.3125, + "grad_norm_var": 0.49420572916666666, + "learning_rate": 0.0003, + "loss": 11.7276, + "loss/aux_loss": 0.048091036081314084, + "loss/crossentropy": 2.913513660430908, + "loss/logits": 0.9211061328649521, + "step": 24380 + }, + { + "epoch": 0.2439, + "grad_norm": 12.5, + "grad_norm_var": 0.4315104166666667, + "learning_rate": 0.0003, + "loss": 11.6142, + "loss/aux_loss": 0.04808881543576717, + "loss/crossentropy": 2.8371637940406798, + "loss/logits": 0.8743333727121353, + "step": 24390 + }, + { + "epoch": 0.244, + "grad_norm": 11.375, + "grad_norm_var": 0.3895182291666667, + "learning_rate": 0.0003, + "loss": 11.6272, + "loss/aux_loss": 0.04808051008731127, + "loss/crossentropy": 2.772133195400238, + "loss/logits": 0.8708338439464569, + "step": 24400 + }, + { + "epoch": 0.2441, + "grad_norm": 15.25, + "grad_norm_var": 0.8311848958333333, + "learning_rate": 0.0003, + "loss": 11.7284, + "loss/aux_loss": 0.048093300126492974, + "loss/crossentropy": 2.814771521091461, + "loss/logits": 0.8521833211183548, + "step": 24410 + }, + { + "epoch": 0.2442, + "grad_norm": 12.125, + "grad_norm_var": 0.7298014322916667, + "learning_rate": 0.0003, + "loss": 11.5798, + "loss/aux_loss": 0.0480850936844945, + "loss/crossentropy": 2.5935686111450194, + "loss/logits": 0.8524984180927276, + "step": 24420 + }, + { + "epoch": 0.2443, + "grad_norm": 13.0, + "grad_norm_var": 0.20519205729166667, + "learning_rate": 0.0003, + "loss": 11.5651, + "loss/aux_loss": 0.04809098821133375, + "loss/crossentropy": 2.702817916870117, + "loss/logits": 0.8753552913665772, + "step": 24430 + }, + { + "epoch": 0.2444, + "grad_norm": 13.125, + "grad_norm_var": 0.23098958333333333, + "learning_rate": 0.0003, + "loss": 11.7146, + "loss/aux_loss": 0.048081192560493945, + "loss/crossentropy": 2.789567303657532, + "loss/logits": 0.8468646883964539, + "step": 24440 + }, + { + "epoch": 0.2445, + "grad_norm": 16.5, + "grad_norm_var": 1.030712890625, + "learning_rate": 0.0003, + "loss": 11.7024, + "loss/aux_loss": 0.04808731079101562, + "loss/crossentropy": 2.7426182508468626, + "loss/logits": 0.8534230351448059, + "step": 24450 + }, + { + "epoch": 0.2446, + "grad_norm": 12.6875, + "grad_norm_var": 1.5005208333333333, + "learning_rate": 0.0003, + "loss": 11.6524, + "loss/aux_loss": 0.04808608312159777, + "loss/crossentropy": 2.8565701603889466, + "loss/logits": 0.8676398396492004, + "step": 24460 + }, + { + "epoch": 0.2447, + "grad_norm": 14.625, + "grad_norm_var": 1.1264973958333333, + "learning_rate": 0.0003, + "loss": 11.4879, + "loss/aux_loss": 0.048085580207407476, + "loss/crossentropy": 2.898229694366455, + "loss/logits": 0.8859318733215332, + "step": 24470 + }, + { + "epoch": 0.2448, + "grad_norm": 13.1875, + "grad_norm_var": 0.4410807291666667, + "learning_rate": 0.0003, + "loss": 11.6515, + "loss/aux_loss": 0.04807874243706465, + "loss/crossentropy": 2.850202000141144, + "loss/logits": 0.88456309735775, + "step": 24480 + }, + { + "epoch": 0.2449, + "grad_norm": 11.375, + "grad_norm_var": 0.3223958333333333, + "learning_rate": 0.0003, + "loss": 11.7189, + "loss/aux_loss": 0.048079111985862254, + "loss/crossentropy": 2.808897280693054, + "loss/logits": 0.8941350758075715, + "step": 24490 + }, + { + "epoch": 0.245, + "grad_norm": 13.0, + "grad_norm_var": 1.0204264322916667, + "learning_rate": 0.0003, + "loss": 11.5987, + "loss/aux_loss": 0.04809224735945463, + "loss/crossentropy": 2.7774929463863374, + "loss/logits": 0.8461995214223862, + "step": 24500 + }, + { + "epoch": 0.2451, + "grad_norm": 13.25, + "grad_norm_var": 1.1275390625, + "learning_rate": 0.0003, + "loss": 11.5477, + "loss/aux_loss": 0.04809009712189436, + "loss/crossentropy": 2.791159617900848, + "loss/logits": 0.8854242950677872, + "step": 24510 + }, + { + "epoch": 0.2452, + "grad_norm": 12.5625, + "grad_norm_var": 0.5010416666666667, + "learning_rate": 0.0003, + "loss": 11.7486, + "loss/aux_loss": 0.048082375340163706, + "loss/crossentropy": 2.664378434419632, + "loss/logits": 0.8719703197479248, + "step": 24520 + }, + { + "epoch": 0.2453, + "grad_norm": 45.25, + "grad_norm_var": 66.98527018229167, + "learning_rate": 0.0003, + "loss": 11.9223, + "loss/aux_loss": 0.048085335083305834, + "loss/crossentropy": 2.715837526321411, + "loss/logits": 0.9175373882055282, + "step": 24530 + }, + { + "epoch": 0.2454, + "grad_norm": 13.1875, + "grad_norm_var": 64.931103515625, + "learning_rate": 0.0003, + "loss": 11.5829, + "loss/aux_loss": 0.04809392262250185, + "loss/crossentropy": 2.799639356136322, + "loss/logits": 0.8938455194234848, + "step": 24540 + }, + { + "epoch": 0.2455, + "grad_norm": 13.0625, + "grad_norm_var": 0.3728515625, + "learning_rate": 0.0003, + "loss": 11.7221, + "loss/aux_loss": 0.04808378964662552, + "loss/crossentropy": 2.777541899681091, + "loss/logits": 0.8686657905578613, + "step": 24550 + }, + { + "epoch": 0.2456, + "grad_norm": 12.875, + "grad_norm_var": 0.32355143229166666, + "learning_rate": 0.0003, + "loss": 11.8611, + "loss/aux_loss": 0.048089235462248324, + "loss/crossentropy": 2.872539556026459, + "loss/logits": 0.9508152902126312, + "step": 24560 + }, + { + "epoch": 0.2457, + "grad_norm": 12.9375, + "grad_norm_var": 0.30987955729166666, + "learning_rate": 0.0003, + "loss": 11.5595, + "loss/aux_loss": 0.04808945395052433, + "loss/crossentropy": 2.9004230976104735, + "loss/logits": 0.9007263153791427, + "step": 24570 + }, + { + "epoch": 0.2458, + "grad_norm": 11.6875, + "grad_norm_var": 0.2916015625, + "learning_rate": 0.0003, + "loss": 11.7691, + "loss/aux_loss": 0.048084504902362823, + "loss/crossentropy": 2.81181880235672, + "loss/logits": 0.8950879544019699, + "step": 24580 + }, + { + "epoch": 0.2459, + "grad_norm": 12.6875, + "grad_norm_var": 2.4169270833333334, + "learning_rate": 0.0003, + "loss": 11.7429, + "loss/aux_loss": 0.04808555655181408, + "loss/crossentropy": 2.890042209625244, + "loss/logits": 0.8888252973556519, + "step": 24590 + }, + { + "epoch": 0.246, + "grad_norm": 13.125, + "grad_norm_var": 1.9864420572916666, + "learning_rate": 0.0003, + "loss": 11.7697, + "loss/aux_loss": 0.04808424487709999, + "loss/crossentropy": 2.8062154173851015, + "loss/logits": 0.8845973283052444, + "step": 24600 + }, + { + "epoch": 0.2461, + "grad_norm": 12.3125, + "grad_norm_var": 0.47342122395833336, + "learning_rate": 0.0003, + "loss": 11.6025, + "loss/aux_loss": 0.04808759950101375, + "loss/crossentropy": 2.638791638612747, + "loss/logits": 0.8616722971200943, + "step": 24610 + }, + { + "epoch": 0.2462, + "grad_norm": 12.0, + "grad_norm_var": 2.019514973958333, + "learning_rate": 0.0003, + "loss": 11.3808, + "loss/aux_loss": 0.0480861397460103, + "loss/crossentropy": 2.6739172518253325, + "loss/logits": 0.857967483997345, + "step": 24620 + }, + { + "epoch": 0.2463, + "grad_norm": 14.125, + "grad_norm_var": 11.3953125, + "learning_rate": 0.0003, + "loss": 11.7237, + "loss/aux_loss": 0.048097777739167215, + "loss/crossentropy": 2.6798054337501527, + "loss/logits": 0.9119128674268723, + "step": 24630 + }, + { + "epoch": 0.2464, + "grad_norm": 12.75, + "grad_norm_var": 3.7905598958333333, + "learning_rate": 0.0003, + "loss": 11.5751, + "loss/aux_loss": 0.04808594770729542, + "loss/crossentropy": 2.6202758669853212, + "loss/logits": 0.8779011040925979, + "step": 24640 + }, + { + "epoch": 0.2465, + "grad_norm": 13.75, + "grad_norm_var": 4.788004557291667, + "learning_rate": 0.0003, + "loss": 11.5291, + "loss/aux_loss": 0.0480955732986331, + "loss/crossentropy": 2.6888505935668947, + "loss/logits": 0.8404242038726807, + "step": 24650 + }, + { + "epoch": 0.2466, + "grad_norm": 11.875, + "grad_norm_var": 0.5864583333333333, + "learning_rate": 0.0003, + "loss": 11.644, + "loss/aux_loss": 0.0480740413069725, + "loss/crossentropy": 2.9277888417243956, + "loss/logits": 0.8985714882612228, + "step": 24660 + }, + { + "epoch": 0.2467, + "grad_norm": 11.5, + "grad_norm_var": 0.4141764322916667, + "learning_rate": 0.0003, + "loss": 11.7472, + "loss/aux_loss": 0.048084502667188646, + "loss/crossentropy": 2.661174988746643, + "loss/logits": 0.8858565300703048, + "step": 24670 + }, + { + "epoch": 0.2468, + "grad_norm": 11.9375, + "grad_norm_var": 0.5109212239583333, + "learning_rate": 0.0003, + "loss": 11.7638, + "loss/aux_loss": 0.048086884804069994, + "loss/crossentropy": 2.990370142459869, + "loss/logits": 0.9090431898832321, + "step": 24680 + }, + { + "epoch": 0.2469, + "grad_norm": 12.9375, + "grad_norm_var": 0.48333333333333334, + "learning_rate": 0.0003, + "loss": 11.7163, + "loss/aux_loss": 0.04808713924139738, + "loss/crossentropy": 2.6298464059829714, + "loss/logits": 0.8759280443191528, + "step": 24690 + }, + { + "epoch": 0.247, + "grad_norm": 13.875, + "grad_norm_var": 11.1150390625, + "learning_rate": 0.0003, + "loss": 11.6638, + "loss/aux_loss": 0.04808840285986662, + "loss/crossentropy": 2.8006470084190367, + "loss/logits": 0.8584316343069076, + "step": 24700 + }, + { + "epoch": 0.2471, + "grad_norm": 14.375, + "grad_norm_var": 10.780208333333333, + "learning_rate": 0.0003, + "loss": 11.7515, + "loss/aux_loss": 0.04808585830032826, + "loss/crossentropy": 2.7873790740966795, + "loss/logits": 0.8926585078239441, + "step": 24710 + }, + { + "epoch": 0.2472, + "grad_norm": 12.4375, + "grad_norm_var": 0.2879557291666667, + "learning_rate": 0.0003, + "loss": 11.6114, + "loss/aux_loss": 0.048089620657265186, + "loss/crossentropy": 2.5645355701446535, + "loss/logits": 0.8463786870241166, + "step": 24720 + }, + { + "epoch": 0.2473, + "grad_norm": 12.25, + "grad_norm_var": 0.454541015625, + "learning_rate": 0.0003, + "loss": 11.6297, + "loss/aux_loss": 0.04808438029140234, + "loss/crossentropy": 2.8039010167121887, + "loss/logits": 0.9113354772329331, + "step": 24730 + }, + { + "epoch": 0.2474, + "grad_norm": 13.5, + "grad_norm_var": 0.4088541666666667, + "learning_rate": 0.0003, + "loss": 11.5603, + "loss/aux_loss": 0.04807938225567341, + "loss/crossentropy": 2.6084700644016268, + "loss/logits": 0.869733153283596, + "step": 24740 + }, + { + "epoch": 0.2475, + "grad_norm": 12.6875, + "grad_norm_var": 0.48826497395833335, + "learning_rate": 0.0003, + "loss": 11.5865, + "loss/aux_loss": 0.04809128176420927, + "loss/crossentropy": 2.571286141872406, + "loss/logits": 0.8443722426891327, + "step": 24750 + }, + { + "epoch": 0.2476, + "grad_norm": 12.875, + "grad_norm_var": 1.0113932291666667, + "learning_rate": 0.0003, + "loss": 11.5163, + "loss/aux_loss": 0.04808931332081556, + "loss/crossentropy": 2.765368914604187, + "loss/logits": 0.8564152508974076, + "step": 24760 + }, + { + "epoch": 0.2477, + "grad_norm": 12.5625, + "grad_norm_var": 0.372119140625, + "learning_rate": 0.0003, + "loss": 11.5343, + "loss/aux_loss": 0.04808468669652939, + "loss/crossentropy": 2.847411096096039, + "loss/logits": 0.9346977740526199, + "step": 24770 + }, + { + "epoch": 0.2478, + "grad_norm": 11.875, + "grad_norm_var": 0.7286458333333333, + "learning_rate": 0.0003, + "loss": 11.6268, + "loss/aux_loss": 0.0480878546833992, + "loss/crossentropy": 2.6432011306285856, + "loss/logits": 0.8749868780374527, + "step": 24780 + }, + { + "epoch": 0.2479, + "grad_norm": 12.875, + "grad_norm_var": 0.5903483072916667, + "learning_rate": 0.0003, + "loss": 11.6199, + "loss/aux_loss": 0.04810009114444256, + "loss/crossentropy": 2.712999904155731, + "loss/logits": 0.8862002283334732, + "step": 24790 + }, + { + "epoch": 0.248, + "grad_norm": 12.125, + "grad_norm_var": 0.41730143229166666, + "learning_rate": 0.0003, + "loss": 11.6145, + "loss/aux_loss": 0.048082103952765465, + "loss/crossentropy": 2.768636167049408, + "loss/logits": 0.862320426106453, + "step": 24800 + }, + { + "epoch": 0.2481, + "grad_norm": 12.875, + "grad_norm_var": 0.48370768229166666, + "learning_rate": 0.0003, + "loss": 11.7063, + "loss/aux_loss": 0.048084338754415513, + "loss/crossentropy": 2.689415818452835, + "loss/logits": 0.8696422547101974, + "step": 24810 + }, + { + "epoch": 0.2482, + "grad_norm": 13.375, + "grad_norm_var": 0.5311848958333333, + "learning_rate": 0.0003, + "loss": 11.7926, + "loss/aux_loss": 0.04808789137750864, + "loss/crossentropy": 2.7690405011177064, + "loss/logits": 0.8735806256532669, + "step": 24820 + }, + { + "epoch": 0.2483, + "grad_norm": 12.875, + "grad_norm_var": 0.32745768229166666, + "learning_rate": 0.0003, + "loss": 11.5478, + "loss/aux_loss": 0.04808993488550186, + "loss/crossentropy": 2.7007332861423494, + "loss/logits": 0.9016262739896774, + "step": 24830 + }, + { + "epoch": 0.2484, + "grad_norm": 13.25, + "grad_norm_var": 693.0523274739584, + "learning_rate": 0.0003, + "loss": 11.5585, + "loss/aux_loss": 0.04810011051595211, + "loss/crossentropy": 2.5810685038566588, + "loss/logits": 0.864134407043457, + "step": 24840 + }, + { + "epoch": 0.2485, + "grad_norm": 12.8125, + "grad_norm_var": 0.8244140625, + "learning_rate": 0.0003, + "loss": 11.6735, + "loss/aux_loss": 0.048092559725046155, + "loss/crossentropy": 2.752824580669403, + "loss/logits": 0.8686655551195145, + "step": 24850 + }, + { + "epoch": 0.2486, + "grad_norm": 12.9375, + "grad_norm_var": 0.671337890625, + "learning_rate": 0.0003, + "loss": 11.8462, + "loss/aux_loss": 0.04807847458869219, + "loss/crossentropy": 2.9063711762428284, + "loss/logits": 0.9188533276319504, + "step": 24860 + }, + { + "epoch": 0.2487, + "grad_norm": 12.4375, + "grad_norm_var": 0.27459309895833334, + "learning_rate": 0.0003, + "loss": 11.644, + "loss/aux_loss": 0.04809844493865967, + "loss/crossentropy": 2.7648052334785462, + "loss/logits": 0.867266783118248, + "step": 24870 + }, + { + "epoch": 0.2488, + "grad_norm": 13.6875, + "grad_norm_var": 3.9888020833333333, + "learning_rate": 0.0003, + "loss": 11.5867, + "loss/aux_loss": 0.04808073379099369, + "loss/crossentropy": 2.7701613664627076, + "loss/logits": 0.9056573390960694, + "step": 24880 + }, + { + "epoch": 0.2489, + "grad_norm": 11.9375, + "grad_norm_var": 2.978385416666667, + "learning_rate": 0.0003, + "loss": 11.5873, + "loss/aux_loss": 0.04808578360825777, + "loss/crossentropy": 2.8290310978889464, + "loss/logits": 0.8755116432905197, + "step": 24890 + }, + { + "epoch": 0.249, + "grad_norm": 13.625, + "grad_norm_var": 0.8264973958333334, + "learning_rate": 0.0003, + "loss": 11.6796, + "loss/aux_loss": 0.04808125514537096, + "loss/crossentropy": 2.816509687900543, + "loss/logits": 0.9027190536260605, + "step": 24900 + }, + { + "epoch": 0.2491, + "grad_norm": 12.0625, + "grad_norm_var": 0.8419270833333333, + "learning_rate": 0.0003, + "loss": 11.5386, + "loss/aux_loss": 0.0480922881513834, + "loss/crossentropy": 2.76520716547966, + "loss/logits": 0.892632269859314, + "step": 24910 + }, + { + "epoch": 0.2492, + "grad_norm": 11.75, + "grad_norm_var": 0.30193684895833334, + "learning_rate": 0.0003, + "loss": 11.693, + "loss/aux_loss": 0.0480814853683114, + "loss/crossentropy": 2.720960557460785, + "loss/logits": 0.8856659799814224, + "step": 24920 + }, + { + "epoch": 0.2493, + "grad_norm": 14.25, + "grad_norm_var": 0.8380208333333333, + "learning_rate": 0.0003, + "loss": 11.5816, + "loss/aux_loss": 0.04809188954532147, + "loss/crossentropy": 2.646233397722244, + "loss/logits": 0.8720506697893142, + "step": 24930 + }, + { + "epoch": 0.2494, + "grad_norm": 12.0625, + "grad_norm_var": 1.5056640625, + "learning_rate": 0.0003, + "loss": 11.7634, + "loss/aux_loss": 0.04808452129364014, + "loss/crossentropy": 2.83508540391922, + "loss/logits": 0.8792938023805619, + "step": 24940 + }, + { + "epoch": 0.2495, + "grad_norm": 12.875, + "grad_norm_var": 0.49733072916666665, + "learning_rate": 0.0003, + "loss": 11.4464, + "loss/aux_loss": 0.048088039830327035, + "loss/crossentropy": 2.7586292266845702, + "loss/logits": 0.8813098579645157, + "step": 24950 + }, + { + "epoch": 0.2496, + "grad_norm": 13.4375, + "grad_norm_var": 1.2320149739583333, + "learning_rate": 0.0003, + "loss": 11.6449, + "loss/aux_loss": 0.04809652119874954, + "loss/crossentropy": 2.7922417759895324, + "loss/logits": 0.9288152068853378, + "step": 24960 + }, + { + "epoch": 0.2497, + "grad_norm": 13.0625, + "grad_norm_var": 0.31027018229166664, + "learning_rate": 0.0003, + "loss": 11.5653, + "loss/aux_loss": 0.048079832829535006, + "loss/crossentropy": 2.713176792860031, + "loss/logits": 0.8755482017993927, + "step": 24970 + }, + { + "epoch": 0.2498, + "grad_norm": 12.625, + "grad_norm_var": 0.5158854166666667, + "learning_rate": 0.0003, + "loss": 11.6771, + "loss/aux_loss": 0.04808032140135765, + "loss/crossentropy": 2.844007110595703, + "loss/logits": 0.8661886304616928, + "step": 24980 + }, + { + "epoch": 0.2499, + "grad_norm": 11.625, + "grad_norm_var": 0.5384765625, + "learning_rate": 0.0003, + "loss": 11.4978, + "loss/aux_loss": 0.048091298528015615, + "loss/crossentropy": 2.7953576683998107, + "loss/logits": 0.8499617218971253, + "step": 24990 + }, + { + "epoch": 0.25, + "grad_norm": 12.875, + "grad_norm_var": 0.9852701822916666, + "learning_rate": 0.0003, + "loss": 11.5883, + "loss/aux_loss": 0.04808867685496807, + "loss/crossentropy": 2.528530162572861, + "loss/logits": 0.854027372598648, + "step": 25000 + }, + { + "epoch": 0.2501, + "grad_norm": 11.25, + "grad_norm_var": 0.9353515625, + "learning_rate": 0.0003, + "loss": 11.6363, + "loss/aux_loss": 0.04808705858886242, + "loss/crossentropy": 2.843946361541748, + "loss/logits": 0.8924509882926941, + "step": 25010 + }, + { + "epoch": 0.2502, + "grad_norm": 12.3125, + "grad_norm_var": 0.35349934895833335, + "learning_rate": 0.0003, + "loss": 11.8235, + "loss/aux_loss": 0.04808267503976822, + "loss/crossentropy": 2.891792821884155, + "loss/logits": 0.9047430366277694, + "step": 25020 + }, + { + "epoch": 0.2503, + "grad_norm": 12.75, + "grad_norm_var": 0.2886555989583333, + "learning_rate": 0.0003, + "loss": 11.6444, + "loss/aux_loss": 0.048091215640306474, + "loss/crossentropy": 2.904352879524231, + "loss/logits": 0.877800577878952, + "step": 25030 + }, + { + "epoch": 0.2504, + "grad_norm": 12.1875, + "grad_norm_var": 0.33318684895833334, + "learning_rate": 0.0003, + "loss": 11.6522, + "loss/aux_loss": 0.04808731395751238, + "loss/crossentropy": 2.7148699164390564, + "loss/logits": 0.8997314661741257, + "step": 25040 + }, + { + "epoch": 0.2505, + "grad_norm": 13.8125, + "grad_norm_var": 0.37303059895833335, + "learning_rate": 0.0003, + "loss": 11.8294, + "loss/aux_loss": 0.04808868896216154, + "loss/crossentropy": 2.790895849466324, + "loss/logits": 0.9084702879190445, + "step": 25050 + }, + { + "epoch": 0.2506, + "grad_norm": 12.3125, + "grad_norm_var": 0.36920572916666666, + "learning_rate": 0.0003, + "loss": 11.7443, + "loss/aux_loss": 0.04809939563274383, + "loss/crossentropy": 2.911231255531311, + "loss/logits": 0.8896218776702881, + "step": 25060 + }, + { + "epoch": 0.2507, + "grad_norm": 13.0, + "grad_norm_var": 0.3450520833333333, + "learning_rate": 0.0003, + "loss": 11.5574, + "loss/aux_loss": 0.048084039054811, + "loss/crossentropy": 2.8057745695114136, + "loss/logits": 0.8932328909635544, + "step": 25070 + }, + { + "epoch": 0.2508, + "grad_norm": 12.6875, + "grad_norm_var": 0.4593587239583333, + "learning_rate": 0.0003, + "loss": 11.8264, + "loss/aux_loss": 0.04808835387229919, + "loss/crossentropy": 2.9282448649406434, + "loss/logits": 0.9194880992174148, + "step": 25080 + }, + { + "epoch": 0.2509, + "grad_norm": 11.8125, + "grad_norm_var": 0.40167643229166666, + "learning_rate": 0.0003, + "loss": 11.5529, + "loss/aux_loss": 0.04807906914502382, + "loss/crossentropy": 2.532350409030914, + "loss/logits": 0.8764607399702072, + "step": 25090 + }, + { + "epoch": 0.251, + "grad_norm": 12.25, + "grad_norm_var": 0.2752604166666667, + "learning_rate": 0.0003, + "loss": 11.5749, + "loss/aux_loss": 0.048076278157532217, + "loss/crossentropy": 2.7150728702545166, + "loss/logits": 0.8551843196153641, + "step": 25100 + }, + { + "epoch": 0.2511, + "grad_norm": 12.375, + "grad_norm_var": 0.14368489583333333, + "learning_rate": 0.0003, + "loss": 11.6707, + "loss/aux_loss": 0.048083152808249, + "loss/crossentropy": 2.6560796737670898, + "loss/logits": 0.8625768065452576, + "step": 25110 + }, + { + "epoch": 0.2512, + "grad_norm": 12.375, + "grad_norm_var": 0.238525390625, + "learning_rate": 0.0003, + "loss": 11.31, + "loss/aux_loss": 0.04808560535311699, + "loss/crossentropy": 2.692982625961304, + "loss/logits": 0.8346902966499329, + "step": 25120 + }, + { + "epoch": 0.2513, + "grad_norm": 12.9375, + "grad_norm_var": 0.13326822916666667, + "learning_rate": 0.0003, + "loss": 11.6084, + "loss/aux_loss": 0.04809413086622953, + "loss/crossentropy": 2.8028789699077605, + "loss/logits": 0.8861746788024902, + "step": 25130 + }, + { + "epoch": 0.2514, + "grad_norm": 13.5, + "grad_norm_var": 0.17068684895833333, + "learning_rate": 0.0003, + "loss": 11.7255, + "loss/aux_loss": 0.04807902295142412, + "loss/crossentropy": 2.77418338060379, + "loss/logits": 0.8591089010238647, + "step": 25140 + }, + { + "epoch": 0.2515, + "grad_norm": 12.0, + "grad_norm_var": 14.376497395833333, + "learning_rate": 0.0003, + "loss": 11.5784, + "loss/aux_loss": 0.04810504075139761, + "loss/crossentropy": 2.6118695974349975, + "loss/logits": 0.8479388684034348, + "step": 25150 + }, + { + "epoch": 0.2516, + "grad_norm": 12.0625, + "grad_norm_var": 14.047119140625, + "learning_rate": 0.0003, + "loss": 11.6321, + "loss/aux_loss": 0.04807586278766394, + "loss/crossentropy": 2.8019371032714844, + "loss/logits": 0.9008934259414673, + "step": 25160 + }, + { + "epoch": 0.2517, + "grad_norm": 12.625, + "grad_norm_var": 5.012955729166666, + "learning_rate": 0.0003, + "loss": 11.6511, + "loss/aux_loss": 0.048094934225082396, + "loss/crossentropy": 2.6834351480007173, + "loss/logits": 0.8760968536138535, + "step": 25170 + }, + { + "epoch": 0.2518, + "grad_norm": 13.3125, + "grad_norm_var": 2.4541015625, + "learning_rate": 0.0003, + "loss": 11.7221, + "loss/aux_loss": 0.04808460958302021, + "loss/crossentropy": 2.778984820842743, + "loss/logits": 0.8968686580657959, + "step": 25180 + }, + { + "epoch": 0.2519, + "grad_norm": 12.5625, + "grad_norm_var": 0.365625, + "learning_rate": 0.0003, + "loss": 11.6415, + "loss/aux_loss": 0.04808416347950697, + "loss/crossentropy": 2.6761425912380217, + "loss/logits": 0.8864099949598312, + "step": 25190 + }, + { + "epoch": 0.252, + "grad_norm": 12.6875, + "grad_norm_var": 0.19620768229166666, + "learning_rate": 0.0003, + "loss": 11.7456, + "loss/aux_loss": 0.04808526486158371, + "loss/crossentropy": 2.809593695402145, + "loss/logits": 0.9087226182222367, + "step": 25200 + }, + { + "epoch": 0.2521, + "grad_norm": 12.875, + "grad_norm_var": 0.4205729166666667, + "learning_rate": 0.0003, + "loss": 11.649, + "loss/aux_loss": 0.04808737710118294, + "loss/crossentropy": 2.8328150868415833, + "loss/logits": 0.8956249594688416, + "step": 25210 + }, + { + "epoch": 0.2522, + "grad_norm": 11.8125, + "grad_norm_var": 0.3546223958333333, + "learning_rate": 0.0003, + "loss": 11.6388, + "loss/aux_loss": 0.048089083097875115, + "loss/crossentropy": 2.736672604084015, + "loss/logits": 0.8746443182229996, + "step": 25220 + }, + { + "epoch": 0.2523, + "grad_norm": 12.8125, + "grad_norm_var": 0.3624348958333333, + "learning_rate": 0.0003, + "loss": 11.6174, + "loss/aux_loss": 0.04808070510625839, + "loss/crossentropy": 3.001026463508606, + "loss/logits": 0.8746398031711579, + "step": 25230 + }, + { + "epoch": 0.2524, + "grad_norm": 12.4375, + "grad_norm_var": 3.6433430989583333, + "learning_rate": 0.0003, + "loss": 11.7027, + "loss/aux_loss": 0.048089549690485, + "loss/crossentropy": 2.7485376834869384, + "loss/logits": 0.8798061728477478, + "step": 25240 + }, + { + "epoch": 0.2525, + "grad_norm": 15.25, + "grad_norm_var": 1.2581868489583334, + "learning_rate": 0.0003, + "loss": 11.8481, + "loss/aux_loss": 0.048088116385042665, + "loss/crossentropy": 2.907766008377075, + "loss/logits": 0.927979850769043, + "step": 25250 + }, + { + "epoch": 0.2526, + "grad_norm": 12.75, + "grad_norm_var": 0.674853515625, + "learning_rate": 0.0003, + "loss": 11.5283, + "loss/aux_loss": 0.04809806887060404, + "loss/crossentropy": 2.813064420223236, + "loss/logits": 0.8849809437990188, + "step": 25260 + }, + { + "epoch": 0.2527, + "grad_norm": 13.5625, + "grad_norm_var": 0.222119140625, + "learning_rate": 0.0003, + "loss": 11.4406, + "loss/aux_loss": 0.048092986829578875, + "loss/crossentropy": 2.680801993608475, + "loss/logits": 0.8623011440038681, + "step": 25270 + }, + { + "epoch": 0.2528, + "grad_norm": 12.5625, + "grad_norm_var": 0.28541666666666665, + "learning_rate": 0.0003, + "loss": 11.6218, + "loss/aux_loss": 0.04807704258710146, + "loss/crossentropy": 2.6295208811759947, + "loss/logits": 0.8690453052520752, + "step": 25280 + }, + { + "epoch": 0.2529, + "grad_norm": 13.0, + "grad_norm_var": 0.35154622395833335, + "learning_rate": 0.0003, + "loss": 11.6239, + "loss/aux_loss": 0.048089309222996236, + "loss/crossentropy": 2.666023552417755, + "loss/logits": 0.8957912296056747, + "step": 25290 + }, + { + "epoch": 0.253, + "grad_norm": 12.875, + "grad_norm_var": 0.26666666666666666, + "learning_rate": 0.0003, + "loss": 11.4241, + "loss/aux_loss": 0.048083115555346015, + "loss/crossentropy": 2.83921422958374, + "loss/logits": 0.883840236067772, + "step": 25300 + }, + { + "epoch": 0.2531, + "grad_norm": 12.625, + "grad_norm_var": 0.5619791666666667, + "learning_rate": 0.0003, + "loss": 11.6852, + "loss/aux_loss": 0.048087958991527555, + "loss/crossentropy": 2.8075567483901978, + "loss/logits": 0.8912134855985642, + "step": 25310 + }, + { + "epoch": 0.2532, + "grad_norm": 12.0625, + "grad_norm_var": 0.5716145833333334, + "learning_rate": 0.0003, + "loss": 11.6948, + "loss/aux_loss": 0.048075183667242526, + "loss/crossentropy": 2.834882414340973, + "loss/logits": 0.9086032390594483, + "step": 25320 + }, + { + "epoch": 0.2533, + "grad_norm": 12.0, + "grad_norm_var": 0.39993489583333336, + "learning_rate": 0.0003, + "loss": 11.5648, + "loss/aux_loss": 0.04808804150670767, + "loss/crossentropy": 2.7690355598926546, + "loss/logits": 0.8761381387710572, + "step": 25330 + }, + { + "epoch": 0.2534, + "grad_norm": 12.8125, + "grad_norm_var": 208.065625, + "learning_rate": 0.0003, + "loss": 11.8033, + "loss/aux_loss": 0.048099280893802644, + "loss/crossentropy": 2.8881313681602476, + "loss/logits": 0.9373392134904861, + "step": 25340 + }, + { + "epoch": 0.2535, + "grad_norm": 12.4375, + "grad_norm_var": 0.7734212239583333, + "learning_rate": 0.0003, + "loss": 11.6623, + "loss/aux_loss": 0.04808682128787041, + "loss/crossentropy": 2.7554580926895142, + "loss/logits": 0.8729925930500031, + "step": 25350 + }, + { + "epoch": 0.2536, + "grad_norm": 12.125, + "grad_norm_var": 0.687744140625, + "learning_rate": 0.0003, + "loss": 11.6251, + "loss/aux_loss": 0.048085390403866765, + "loss/crossentropy": 2.6890475332736967, + "loss/logits": 0.8627552896738052, + "step": 25360 + }, + { + "epoch": 0.2537, + "grad_norm": 12.9375, + "grad_norm_var": 0.4130045572916667, + "learning_rate": 0.0003, + "loss": 11.6627, + "loss/aux_loss": 0.04808287601917982, + "loss/crossentropy": 2.9235028862953185, + "loss/logits": 0.9073904246091843, + "step": 25370 + }, + { + "epoch": 0.2538, + "grad_norm": 12.875, + "grad_norm_var": 0.6108723958333333, + "learning_rate": 0.0003, + "loss": 11.6099, + "loss/aux_loss": 0.0480959203094244, + "loss/crossentropy": 2.6273869574069977, + "loss/logits": 0.8745059370994568, + "step": 25380 + }, + { + "epoch": 0.2539, + "grad_norm": 12.0625, + "grad_norm_var": 0.6755208333333333, + "learning_rate": 0.0003, + "loss": 11.6251, + "loss/aux_loss": 0.04807964153587818, + "loss/crossentropy": 2.7656370401382446, + "loss/logits": 0.8815957188606263, + "step": 25390 + }, + { + "epoch": 0.254, + "grad_norm": 12.8125, + "grad_norm_var": 4.477018229166666, + "learning_rate": 0.0003, + "loss": 11.8206, + "loss/aux_loss": 0.048090039566159246, + "loss/crossentropy": 2.8786653161048887, + "loss/logits": 0.8921761155128479, + "step": 25400 + }, + { + "epoch": 0.2541, + "grad_norm": 13.0625, + "grad_norm_var": 0.45514322916666666, + "learning_rate": 0.0003, + "loss": 11.6351, + "loss/aux_loss": 0.048083982057869436, + "loss/crossentropy": 2.8235522508621216, + "loss/logits": 0.914072972536087, + "step": 25410 + }, + { + "epoch": 0.2542, + "grad_norm": 12.5, + "grad_norm_var": 0.38671875, + "learning_rate": 0.0003, + "loss": 11.5798, + "loss/aux_loss": 0.04809177853167057, + "loss/crossentropy": 2.6220124840736387, + "loss/logits": 0.8807148039340973, + "step": 25420 + }, + { + "epoch": 0.2543, + "grad_norm": 12.875, + "grad_norm_var": 0.16027018229166667, + "learning_rate": 0.0003, + "loss": 11.6279, + "loss/aux_loss": 0.04809038415551185, + "loss/crossentropy": 2.70097331404686, + "loss/logits": 0.8611227154731751, + "step": 25430 + }, + { + "epoch": 0.2544, + "grad_norm": 15.0625, + "grad_norm_var": 1.840869140625, + "learning_rate": 0.0003, + "loss": 11.5793, + "loss/aux_loss": 0.04807671457529068, + "loss/crossentropy": 2.6667848229408264, + "loss/logits": 0.8680727303028106, + "step": 25440 + }, + { + "epoch": 0.2545, + "grad_norm": 12.3125, + "grad_norm_var": 2.2770670572916667, + "learning_rate": 0.0003, + "loss": 11.5694, + "loss/aux_loss": 0.04809817839413881, + "loss/crossentropy": 2.905455070734024, + "loss/logits": 0.9042976140975952, + "step": 25450 + }, + { + "epoch": 0.2546, + "grad_norm": 12.125, + "grad_norm_var": 1.07265625, + "learning_rate": 0.0003, + "loss": 11.7353, + "loss/aux_loss": 0.04808375872671604, + "loss/crossentropy": 2.9227493464946748, + "loss/logits": 0.9188271731138229, + "step": 25460 + }, + { + "epoch": 0.2547, + "grad_norm": 13.75, + "grad_norm_var": 1.6091145833333333, + "learning_rate": 0.0003, + "loss": 11.8679, + "loss/aux_loss": 0.048084712401032445, + "loss/crossentropy": 2.888567340373993, + "loss/logits": 0.8932348757982254, + "step": 25470 + }, + { + "epoch": 0.2548, + "grad_norm": 12.125, + "grad_norm_var": 2.026416015625, + "learning_rate": 0.0003, + "loss": 11.6388, + "loss/aux_loss": 0.04808471277356148, + "loss/crossentropy": 2.767711889743805, + "loss/logits": 0.8723369985818863, + "step": 25480 + }, + { + "epoch": 0.2549, + "grad_norm": 13.0, + "grad_norm_var": 0.26458333333333334, + "learning_rate": 0.0003, + "loss": 11.6305, + "loss/aux_loss": 0.048084079287946224, + "loss/crossentropy": 2.7936369240283967, + "loss/logits": 0.9237923324108124, + "step": 25490 + }, + { + "epoch": 0.255, + "grad_norm": 12.8125, + "grad_norm_var": 0.2837890625, + "learning_rate": 0.0003, + "loss": 11.5733, + "loss/aux_loss": 0.04808622878044844, + "loss/crossentropy": 2.749346935749054, + "loss/logits": 0.9049636125564575, + "step": 25500 + }, + { + "epoch": 0.2551, + "grad_norm": 11.3125, + "grad_norm_var": 0.6278483072916666, + "learning_rate": 0.0003, + "loss": 11.6192, + "loss/aux_loss": 0.048095759376883505, + "loss/crossentropy": 2.862506020069122, + "loss/logits": 0.9004225820302963, + "step": 25510 + }, + { + "epoch": 0.2552, + "grad_norm": 13.4375, + "grad_norm_var": 0.5340983072916666, + "learning_rate": 0.0003, + "loss": 11.6151, + "loss/aux_loss": 0.048074688762426376, + "loss/crossentropy": 2.6472591876983644, + "loss/logits": 0.8818151533603669, + "step": 25520 + }, + { + "epoch": 0.2553, + "grad_norm": 12.5625, + "grad_norm_var": 0.26764322916666666, + "learning_rate": 0.0003, + "loss": 11.7054, + "loss/aux_loss": 0.048086194694042204, + "loss/crossentropy": 2.7406247556209564, + "loss/logits": 0.868209832906723, + "step": 25530 + }, + { + "epoch": 0.2554, + "grad_norm": 12.375, + "grad_norm_var": 0.19036458333333334, + "learning_rate": 0.0003, + "loss": 11.6693, + "loss/aux_loss": 0.04809033088386059, + "loss/crossentropy": 2.64967337846756, + "loss/logits": 0.8649856716394424, + "step": 25540 + }, + { + "epoch": 0.2555, + "grad_norm": 14.0625, + "grad_norm_var": 0.5400390625, + "learning_rate": 0.0003, + "loss": 11.6214, + "loss/aux_loss": 0.04808993134647608, + "loss/crossentropy": 2.640416944026947, + "loss/logits": 0.85947944521904, + "step": 25550 + }, + { + "epoch": 0.2556, + "grad_norm": 15.0625, + "grad_norm_var": 0.6822265625, + "learning_rate": 0.0003, + "loss": 11.5651, + "loss/aux_loss": 0.04808539263904095, + "loss/crossentropy": 2.7396462678909304, + "loss/logits": 0.8959068596363068, + "step": 25560 + }, + { + "epoch": 0.2557, + "grad_norm": 11.5625, + "grad_norm_var": 1.1291666666666667, + "learning_rate": 0.0003, + "loss": 11.3876, + "loss/aux_loss": 0.048090565018355846, + "loss/crossentropy": 2.7080013751983643, + "loss/logits": 0.8827524572610855, + "step": 25570 + }, + { + "epoch": 0.2558, + "grad_norm": 12.375, + "grad_norm_var": 1.1947916666666667, + "learning_rate": 0.0003, + "loss": 11.7393, + "loss/aux_loss": 0.0480854069814086, + "loss/crossentropy": 2.8358673214912415, + "loss/logits": 0.9099754065275192, + "step": 25580 + }, + { + "epoch": 0.2559, + "grad_norm": 12.625, + "grad_norm_var": 0.8205729166666667, + "learning_rate": 0.0003, + "loss": 11.5309, + "loss/aux_loss": 0.048086575232446194, + "loss/crossentropy": 2.6382982313632963, + "loss/logits": 0.8928437829017639, + "step": 25590 + }, + { + "epoch": 0.256, + "grad_norm": 12.4375, + "grad_norm_var": 0.19763997395833333, + "learning_rate": 0.0003, + "loss": 11.5599, + "loss/aux_loss": 0.0480822779238224, + "loss/crossentropy": 2.821625292301178, + "loss/logits": 0.8573364794254303, + "step": 25600 + }, + { + "epoch": 0.2561, + "grad_norm": 12.625, + "grad_norm_var": 0.8898274739583333, + "learning_rate": 0.0003, + "loss": 11.5878, + "loss/aux_loss": 0.04809480402618647, + "loss/crossentropy": 2.7832887768745422, + "loss/logits": 0.8507031291723252, + "step": 25610 + }, + { + "epoch": 0.2562, + "grad_norm": 13.125, + "grad_norm_var": 0.9608723958333333, + "learning_rate": 0.0003, + "loss": 11.5664, + "loss/aux_loss": 0.04807593021541834, + "loss/crossentropy": 2.731532007455826, + "loss/logits": 0.8629953473806381, + "step": 25620 + }, + { + "epoch": 0.2563, + "grad_norm": 11.8125, + "grad_norm_var": 0.45558268229166665, + "learning_rate": 0.0003, + "loss": 11.759, + "loss/aux_loss": 0.048087543621659276, + "loss/crossentropy": 2.854618912935257, + "loss/logits": 0.9120256692171097, + "step": 25630 + }, + { + "epoch": 0.2564, + "grad_norm": 11.5625, + "grad_norm_var": 0.35390625, + "learning_rate": 0.0003, + "loss": 11.6293, + "loss/aux_loss": 0.04808175507932901, + "loss/crossentropy": 2.798936349153519, + "loss/logits": 0.8882687538862228, + "step": 25640 + }, + { + "epoch": 0.2565, + "grad_norm": 12.3125, + "grad_norm_var": 1.4824055989583333, + "learning_rate": 0.0003, + "loss": 11.5947, + "loss/aux_loss": 0.04808369390666485, + "loss/crossentropy": 2.745366007089615, + "loss/logits": 0.8873745143413544, + "step": 25650 + }, + { + "epoch": 0.2566, + "grad_norm": 13.75, + "grad_norm_var": 1.8715983072916667, + "learning_rate": 0.0003, + "loss": 11.6324, + "loss/aux_loss": 0.04809246361255646, + "loss/crossentropy": 2.9094391226768495, + "loss/logits": 0.890197029709816, + "step": 25660 + }, + { + "epoch": 0.2567, + "grad_norm": 13.5, + "grad_norm_var": 0.9082682291666667, + "learning_rate": 0.0003, + "loss": 11.5822, + "loss/aux_loss": 0.04809121619910002, + "loss/crossentropy": 2.8318686723709106, + "loss/logits": 0.8528442829847336, + "step": 25670 + }, + { + "epoch": 0.2568, + "grad_norm": 13.0, + "grad_norm_var": 0.335400390625, + "learning_rate": 0.0003, + "loss": 11.6176, + "loss/aux_loss": 0.04808668699115515, + "loss/crossentropy": 2.590553969144821, + "loss/logits": 0.8228752464056015, + "step": 25680 + }, + { + "epoch": 0.2569, + "grad_norm": 12.9375, + "grad_norm_var": 0.34427083333333336, + "learning_rate": 0.0003, + "loss": 11.8269, + "loss/aux_loss": 0.04809065740555525, + "loss/crossentropy": 2.8544575750827788, + "loss/logits": 0.8806323766708374, + "step": 25690 + }, + { + "epoch": 0.257, + "grad_norm": 13.3125, + "grad_norm_var": 14.973177083333333, + "learning_rate": 0.0003, + "loss": 11.6281, + "loss/aux_loss": 0.048080390132963655, + "loss/crossentropy": 2.8158641815185548, + "loss/logits": 0.9019149035215378, + "step": 25700 + }, + { + "epoch": 0.2571, + "grad_norm": 14.5625, + "grad_norm_var": 14.9875, + "learning_rate": 0.0003, + "loss": 11.7727, + "loss/aux_loss": 0.04808665197342634, + "loss/crossentropy": 2.745079851150513, + "loss/logits": 0.9046284675598144, + "step": 25710 + }, + { + "epoch": 0.2572, + "grad_norm": 13.4375, + "grad_norm_var": 1.815869140625, + "learning_rate": 0.0003, + "loss": 11.6452, + "loss/aux_loss": 0.048082825168967246, + "loss/crossentropy": 2.678209352493286, + "loss/logits": 0.8738999456167221, + "step": 25720 + }, + { + "epoch": 0.2573, + "grad_norm": 12.25, + "grad_norm_var": 0.8992024739583333, + "learning_rate": 0.0003, + "loss": 11.5744, + "loss/aux_loss": 0.04809469617903232, + "loss/crossentropy": 2.65126051902771, + "loss/logits": 0.8762364238500595, + "step": 25730 + }, + { + "epoch": 0.2574, + "grad_norm": 13.25, + "grad_norm_var": 0.5166015625, + "learning_rate": 0.0003, + "loss": 11.5408, + "loss/aux_loss": 0.04808213766664267, + "loss/crossentropy": 2.7597130656242372, + "loss/logits": 0.8789749413728714, + "step": 25740 + }, + { + "epoch": 0.2575, + "grad_norm": 12.875, + "grad_norm_var": 0.3753743489583333, + "learning_rate": 0.0003, + "loss": 11.5427, + "loss/aux_loss": 0.0480932829901576, + "loss/crossentropy": 2.783565378189087, + "loss/logits": 0.8482154309749603, + "step": 25750 + }, + { + "epoch": 0.2576, + "grad_norm": 13.5, + "grad_norm_var": 0.131103515625, + "learning_rate": 0.0003, + "loss": 11.6673, + "loss/aux_loss": 0.04809057265520096, + "loss/crossentropy": 2.7950429677963258, + "loss/logits": 0.8746149778366089, + "step": 25760 + }, + { + "epoch": 0.2577, + "grad_norm": 13.375, + "grad_norm_var": 0.22864583333333333, + "learning_rate": 0.0003, + "loss": 11.7949, + "loss/aux_loss": 0.04808306787163019, + "loss/crossentropy": 2.7718122243881225, + "loss/logits": 0.9080736309289932, + "step": 25770 + }, + { + "epoch": 0.2578, + "grad_norm": 13.4375, + "grad_norm_var": 0.50625, + "learning_rate": 0.0003, + "loss": 11.6395, + "loss/aux_loss": 0.04808392096310854, + "loss/crossentropy": 2.849545049667358, + "loss/logits": 0.9059659481048584, + "step": 25780 + }, + { + "epoch": 0.2579, + "grad_norm": 13.375, + "grad_norm_var": 0.4791666666666667, + "learning_rate": 0.0003, + "loss": 11.6708, + "loss/aux_loss": 0.048084107041358945, + "loss/crossentropy": 2.835736083984375, + "loss/logits": 0.8999375015497207, + "step": 25790 + }, + { + "epoch": 0.258, + "grad_norm": 13.625, + "grad_norm_var": 0.36638997395833334, + "learning_rate": 0.0003, + "loss": 11.6178, + "loss/aux_loss": 0.048090285435318944, + "loss/crossentropy": 2.8020094275474547, + "loss/logits": 0.8916181594133377, + "step": 25800 + }, + { + "epoch": 0.2581, + "grad_norm": 12.375, + "grad_norm_var": 0.3465983072916667, + "learning_rate": 0.0003, + "loss": 11.4354, + "loss/aux_loss": 0.04808872099965811, + "loss/crossentropy": 2.652283215522766, + "loss/logits": 0.8868981301784515, + "step": 25810 + }, + { + "epoch": 0.2582, + "grad_norm": 12.75, + "grad_norm_var": 0.18787434895833333, + "learning_rate": 0.0003, + "loss": 11.6028, + "loss/aux_loss": 0.04808285553008318, + "loss/crossentropy": 2.6281425058841705, + "loss/logits": 0.8745265692472458, + "step": 25820 + }, + { + "epoch": 0.2583, + "grad_norm": 13.3125, + "grad_norm_var": 0.23671875, + "learning_rate": 0.0003, + "loss": 11.7574, + "loss/aux_loss": 0.048094558902084826, + "loss/crossentropy": 2.900680327415466, + "loss/logits": 0.8990511387586594, + "step": 25830 + }, + { + "epoch": 0.2584, + "grad_norm": 13.25, + "grad_norm_var": 12.904166666666667, + "learning_rate": 0.0003, + "loss": 11.5865, + "loss/aux_loss": 0.0480920797213912, + "loss/crossentropy": 2.7239894449710844, + "loss/logits": 0.8879533141851426, + "step": 25840 + }, + { + "epoch": 0.2585, + "grad_norm": 14.3125, + "grad_norm_var": 12.797379557291666, + "learning_rate": 0.0003, + "loss": 11.4818, + "loss/aux_loss": 0.0480831490829587, + "loss/crossentropy": 2.729900598526001, + "loss/logits": 0.8634632736444473, + "step": 25850 + }, + { + "epoch": 0.2586, + "grad_norm": 13.5625, + "grad_norm_var": 0.5343098958333333, + "learning_rate": 0.0003, + "loss": 11.6186, + "loss/aux_loss": 0.048087738640606406, + "loss/crossentropy": 2.9004143357276915, + "loss/logits": 0.9239776521921158, + "step": 25860 + }, + { + "epoch": 0.2587, + "grad_norm": 11.9375, + "grad_norm_var": 0.45286458333333335, + "learning_rate": 0.0003, + "loss": 11.5712, + "loss/aux_loss": 0.04809346161782742, + "loss/crossentropy": 2.6097486078739167, + "loss/logits": 0.8526429146528244, + "step": 25870 + }, + { + "epoch": 0.2588, + "grad_norm": 14.25, + "grad_norm_var": 0.6369140625, + "learning_rate": 0.0003, + "loss": 11.6677, + "loss/aux_loss": 0.04809065666049719, + "loss/crossentropy": 2.687888467311859, + "loss/logits": 0.857833543419838, + "step": 25880 + }, + { + "epoch": 0.2589, + "grad_norm": 13.5, + "grad_norm_var": 0.5983723958333333, + "learning_rate": 0.0003, + "loss": 11.7092, + "loss/aux_loss": 0.04807872045785189, + "loss/crossentropy": 2.8037472486495973, + "loss/logits": 0.8581808120012283, + "step": 25890 + }, + { + "epoch": 0.259, + "grad_norm": 12.5625, + "grad_norm_var": 0.6403483072916667, + "learning_rate": 0.0003, + "loss": 11.6719, + "loss/aux_loss": 0.04809915721416473, + "loss/crossentropy": 2.8186557054519654, + "loss/logits": 0.8756254881620407, + "step": 25900 + }, + { + "epoch": 0.2591, + "grad_norm": 12.5625, + "grad_norm_var": 0.6071451822916667, + "learning_rate": 0.0003, + "loss": 11.6912, + "loss/aux_loss": 0.04807936530560255, + "loss/crossentropy": 2.649220663309097, + "loss/logits": 0.8669601738452911, + "step": 25910 + }, + { + "epoch": 0.2592, + "grad_norm": 13.1875, + "grad_norm_var": 0.38136393229166665, + "learning_rate": 0.0003, + "loss": 11.5527, + "loss/aux_loss": 0.04808104652911425, + "loss/crossentropy": 2.690031635761261, + "loss/logits": 0.8827971279621124, + "step": 25920 + }, + { + "epoch": 0.2593, + "grad_norm": 12.75, + "grad_norm_var": 0.2999348958333333, + "learning_rate": 0.0003, + "loss": 11.5134, + "loss/aux_loss": 0.04808360021561384, + "loss/crossentropy": 2.661074286699295, + "loss/logits": 0.8508890032768249, + "step": 25930 + }, + { + "epoch": 0.2594, + "grad_norm": 11.9375, + "grad_norm_var": 0.33541666666666664, + "learning_rate": 0.0003, + "loss": 11.6292, + "loss/aux_loss": 0.04808128103613853, + "loss/crossentropy": 2.8447935700416567, + "loss/logits": 0.9187151223421097, + "step": 25940 + }, + { + "epoch": 0.2595, + "grad_norm": 13.1875, + "grad_norm_var": 67.79713541666666, + "learning_rate": 0.0003, + "loss": 11.6673, + "loss/aux_loss": 0.04809282161295414, + "loss/crossentropy": 2.7433357715606688, + "loss/logits": 0.8858923971652984, + "step": 25950 + }, + { + "epoch": 0.2596, + "grad_norm": 12.5625, + "grad_norm_var": 67.778369140625, + "learning_rate": 0.0003, + "loss": 11.5163, + "loss/aux_loss": 0.04808433465659619, + "loss/crossentropy": 2.951106083393097, + "loss/logits": 0.8900610029697418, + "step": 25960 + }, + { + "epoch": 0.2597, + "grad_norm": 12.875, + "grad_norm_var": 0.17355143229166667, + "learning_rate": 0.0003, + "loss": 11.6586, + "loss/aux_loss": 0.048085383325815204, + "loss/crossentropy": 2.7705465078353884, + "loss/logits": 0.873355257511139, + "step": 25970 + }, + { + "epoch": 0.2598, + "grad_norm": 12.375, + "grad_norm_var": 0.315478515625, + "learning_rate": 0.0003, + "loss": 11.5712, + "loss/aux_loss": 0.04809144306927919, + "loss/crossentropy": 2.6246292769908903, + "loss/logits": 0.8274095565080642, + "step": 25980 + }, + { + "epoch": 0.2599, + "grad_norm": 12.5625, + "grad_norm_var": 0.2718098958333333, + "learning_rate": 0.0003, + "loss": 11.6178, + "loss/aux_loss": 0.04809448383748531, + "loss/crossentropy": 2.7689568281173704, + "loss/logits": 0.8811231583356858, + "step": 25990 + }, + { + "epoch": 0.26, + "grad_norm": 12.6875, + "grad_norm_var": 0.435400390625, + "learning_rate": 0.0003, + "loss": 11.5572, + "loss/aux_loss": 0.04808851294219494, + "loss/crossentropy": 2.837930643558502, + "loss/logits": 0.8556112885475159, + "step": 26000 + }, + { + "epoch": 0.2601, + "grad_norm": 12.75, + "grad_norm_var": 0.48951822916666665, + "learning_rate": 0.0003, + "loss": 11.4797, + "loss/aux_loss": 0.04808756932616234, + "loss/crossentropy": 2.7235122978687287, + "loss/logits": 0.846416375041008, + "step": 26010 + }, + { + "epoch": 0.2602, + "grad_norm": 12.75, + "grad_norm_var": 0.22369791666666666, + "learning_rate": 0.0003, + "loss": 11.4731, + "loss/aux_loss": 0.04808957688510418, + "loss/crossentropy": 2.7550492763519285, + "loss/logits": 0.8788383483886719, + "step": 26020 + }, + { + "epoch": 0.2603, + "grad_norm": 11.6875, + "grad_norm_var": 0.282275390625, + "learning_rate": 0.0003, + "loss": 11.447, + "loss/aux_loss": 0.04809210356324911, + "loss/crossentropy": 2.713481593132019, + "loss/logits": 0.8958616107702255, + "step": 26030 + }, + { + "epoch": 0.2604, + "grad_norm": 14.5625, + "grad_norm_var": 0.9640625, + "learning_rate": 0.0003, + "loss": 11.7818, + "loss/aux_loss": 0.04807912241667509, + "loss/crossentropy": 2.9397593259811403, + "loss/logits": 0.8796298056840897, + "step": 26040 + }, + { + "epoch": 0.2605, + "grad_norm": 13.375, + "grad_norm_var": 0.8477701822916667, + "learning_rate": 0.0003, + "loss": 11.5145, + "loss/aux_loss": 0.048083426989614965, + "loss/crossentropy": 2.6227431118488314, + "loss/logits": 0.9062366098165512, + "step": 26050 + }, + { + "epoch": 0.2606, + "grad_norm": 11.0625, + "grad_norm_var": 0.5407389322916667, + "learning_rate": 0.0003, + "loss": 11.7285, + "loss/aux_loss": 0.048087958618998526, + "loss/crossentropy": 2.8171847641468046, + "loss/logits": 0.8737149238586426, + "step": 26060 + }, + { + "epoch": 0.2607, + "grad_norm": 13.125, + "grad_norm_var": 0.6838541666666667, + "learning_rate": 0.0003, + "loss": 11.591, + "loss/aux_loss": 0.04808755628764629, + "loss/crossentropy": 2.6662391245365145, + "loss/logits": 0.8623921722173691, + "step": 26070 + }, + { + "epoch": 0.2608, + "grad_norm": 12.875, + "grad_norm_var": 0.23904622395833333, + "learning_rate": 0.0003, + "loss": 11.3801, + "loss/aux_loss": 0.04808433074504137, + "loss/crossentropy": 2.7102749705314637, + "loss/logits": 0.8698142647743226, + "step": 26080 + }, + { + "epoch": 0.2609, + "grad_norm": 12.625, + "grad_norm_var": 0.2552083333333333, + "learning_rate": 0.0003, + "loss": 11.6073, + "loss/aux_loss": 0.048082430846989155, + "loss/crossentropy": 2.717993235588074, + "loss/logits": 0.9071961134672165, + "step": 26090 + }, + { + "epoch": 0.261, + "grad_norm": 12.8125, + "grad_norm_var": 0.5120930989583333, + "learning_rate": 0.0003, + "loss": 11.5483, + "loss/aux_loss": 0.048091440461575985, + "loss/crossentropy": 2.642581123113632, + "loss/logits": 0.8533870339393616, + "step": 26100 + }, + { + "epoch": 0.2611, + "grad_norm": 13.0625, + "grad_norm_var": 0.5610514322916667, + "learning_rate": 0.0003, + "loss": 11.5351, + "loss/aux_loss": 0.04807432852685452, + "loss/crossentropy": 2.754035633802414, + "loss/logits": 0.8514756739139557, + "step": 26110 + }, + { + "epoch": 0.2612, + "grad_norm": 12.75, + "grad_norm_var": 0.4058430989583333, + "learning_rate": 0.0003, + "loss": 11.5255, + "loss/aux_loss": 0.048089875094592574, + "loss/crossentropy": 2.742984265089035, + "loss/logits": 0.8871752589941024, + "step": 26120 + }, + { + "epoch": 0.2613, + "grad_norm": 11.0, + "grad_norm_var": 0.46640625, + "learning_rate": 0.0003, + "loss": 11.619, + "loss/aux_loss": 0.04808270148932934, + "loss/crossentropy": 2.87599663734436, + "loss/logits": 0.887411966919899, + "step": 26130 + }, + { + "epoch": 0.2614, + "grad_norm": 13.875, + "grad_norm_var": 0.481494140625, + "learning_rate": 0.0003, + "loss": 11.7214, + "loss/aux_loss": 0.04808396827429533, + "loss/crossentropy": 2.8104442954063416, + "loss/logits": 0.9045211106538773, + "step": 26140 + }, + { + "epoch": 0.2615, + "grad_norm": 12.5, + "grad_norm_var": 0.326025390625, + "learning_rate": 0.0003, + "loss": 11.5677, + "loss/aux_loss": 0.048081147111952306, + "loss/crossentropy": 2.7364437103271486, + "loss/logits": 0.8579443216323852, + "step": 26150 + }, + { + "epoch": 0.2616, + "grad_norm": 12.5, + "grad_norm_var": 0.48587239583333336, + "learning_rate": 0.0003, + "loss": 11.7239, + "loss/aux_loss": 0.04808290041983128, + "loss/crossentropy": 2.8139244556427, + "loss/logits": 0.9172647058963775, + "step": 26160 + }, + { + "epoch": 0.2617, + "grad_norm": 12.375, + "grad_norm_var": 0.2626139322916667, + "learning_rate": 0.0003, + "loss": 11.6432, + "loss/aux_loss": 0.04808450732380152, + "loss/crossentropy": 2.843486213684082, + "loss/logits": 0.8691521465778351, + "step": 26170 + }, + { + "epoch": 0.2618, + "grad_norm": 13.1875, + "grad_norm_var": 0.27615559895833336, + "learning_rate": 0.0003, + "loss": 11.6097, + "loss/aux_loss": 0.04808285385370255, + "loss/crossentropy": 2.858822929859161, + "loss/logits": 0.9088696330785752, + "step": 26180 + }, + { + "epoch": 0.2619, + "grad_norm": 13.25, + "grad_norm_var": 5.26015625, + "learning_rate": 0.0003, + "loss": 11.6214, + "loss/aux_loss": 0.048090561851859094, + "loss/crossentropy": 2.7624371886253356, + "loss/logits": 0.8909125179052353, + "step": 26190 + }, + { + "epoch": 0.262, + "grad_norm": 12.6875, + "grad_norm_var": 0.24112955729166666, + "learning_rate": 0.0003, + "loss": 11.5151, + "loss/aux_loss": 0.04809354785829782, + "loss/crossentropy": 2.7231006741523744, + "loss/logits": 0.8609473258256912, + "step": 26200 + }, + { + "epoch": 0.2621, + "grad_norm": 12.625, + "grad_norm_var": 0.21354166666666666, + "learning_rate": 0.0003, + "loss": 11.5397, + "loss/aux_loss": 0.048093443363904954, + "loss/crossentropy": 2.785054862499237, + "loss/logits": 0.8880521357059479, + "step": 26210 + }, + { + "epoch": 0.2622, + "grad_norm": 14.5625, + "grad_norm_var": 0.42849934895833336, + "learning_rate": 0.0003, + "loss": 11.6959, + "loss/aux_loss": 0.04808842465281486, + "loss/crossentropy": 2.5711670517921448, + "loss/logits": 0.8366287380456925, + "step": 26220 + }, + { + "epoch": 0.2623, + "grad_norm": 13.125, + "grad_norm_var": 0.404931640625, + "learning_rate": 0.0003, + "loss": 11.5461, + "loss/aux_loss": 0.04808579571545124, + "loss/crossentropy": 2.6578098058700563, + "loss/logits": 0.826743358373642, + "step": 26230 + }, + { + "epoch": 0.2624, + "grad_norm": 12.5, + "grad_norm_var": 0.44212239583333335, + "learning_rate": 0.0003, + "loss": 11.7337, + "loss/aux_loss": 0.04809788726270199, + "loss/crossentropy": 2.923408627510071, + "loss/logits": 0.9030967593193054, + "step": 26240 + }, + { + "epoch": 0.2625, + "grad_norm": 12.375, + "grad_norm_var": 0.3395182291666667, + "learning_rate": 0.0003, + "loss": 11.5184, + "loss/aux_loss": 0.04807977341115475, + "loss/crossentropy": 2.7566078901290894, + "loss/logits": 0.8837383359670639, + "step": 26250 + }, + { + "epoch": 0.2626, + "grad_norm": 12.3125, + "grad_norm_var": 0.50546875, + "learning_rate": 0.0003, + "loss": 11.4649, + "loss/aux_loss": 0.04808521345257759, + "loss/crossentropy": 2.762271225452423, + "loss/logits": 0.849093359708786, + "step": 26260 + }, + { + "epoch": 0.2627, + "grad_norm": 12.9375, + "grad_norm_var": 0.6986979166666667, + "learning_rate": 0.0003, + "loss": 11.5331, + "loss/aux_loss": 0.04808817598968744, + "loss/crossentropy": 2.7467776775360107, + "loss/logits": 0.875149542093277, + "step": 26270 + }, + { + "epoch": 0.2628, + "grad_norm": 13.8125, + "grad_norm_var": 0.4869140625, + "learning_rate": 0.0003, + "loss": 11.5921, + "loss/aux_loss": 0.04808767884969711, + "loss/crossentropy": 2.7188161492347716, + "loss/logits": 0.8897079229354858, + "step": 26280 + }, + { + "epoch": 0.2629, + "grad_norm": 14.5625, + "grad_norm_var": 5.152018229166667, + "learning_rate": 0.0003, + "loss": 11.5294, + "loss/aux_loss": 0.048097186163067816, + "loss/crossentropy": 2.8574177980422975, + "loss/logits": 0.8552042752504349, + "step": 26290 + }, + { + "epoch": 0.263, + "grad_norm": 12.625, + "grad_norm_var": 5.252978515625, + "learning_rate": 0.0003, + "loss": 11.7339, + "loss/aux_loss": 0.04809401351958513, + "loss/crossentropy": 2.900586748123169, + "loss/logits": 0.9175508260726929, + "step": 26300 + }, + { + "epoch": 0.2631, + "grad_norm": 12.4375, + "grad_norm_var": 0.36300455729166664, + "learning_rate": 0.0003, + "loss": 11.8741, + "loss/aux_loss": 0.04807520993053913, + "loss/crossentropy": 2.955962133407593, + "loss/logits": 0.8903858751058579, + "step": 26310 + }, + { + "epoch": 0.2632, + "grad_norm": 13.375, + "grad_norm_var": 0.33006184895833335, + "learning_rate": 0.0003, + "loss": 11.6969, + "loss/aux_loss": 0.04809126146137714, + "loss/crossentropy": 2.8080713510513307, + "loss/logits": 0.8494821518659592, + "step": 26320 + }, + { + "epoch": 0.2633, + "grad_norm": 12.1875, + "grad_norm_var": 0.5207682291666667, + "learning_rate": 0.0003, + "loss": 11.6019, + "loss/aux_loss": 0.04808663856238127, + "loss/crossentropy": 2.8642470121383665, + "loss/logits": 0.869270795583725, + "step": 26330 + }, + { + "epoch": 0.2634, + "grad_norm": 12.5625, + "grad_norm_var": 0.13592122395833334, + "learning_rate": 0.0003, + "loss": 11.6974, + "loss/aux_loss": 0.04808529764413834, + "loss/crossentropy": 2.7711823523044585, + "loss/logits": 0.89097281396389, + "step": 26340 + }, + { + "epoch": 0.2635, + "grad_norm": 11.9375, + "grad_norm_var": 0.22342122395833333, + "learning_rate": 0.0003, + "loss": 11.6506, + "loss/aux_loss": 0.04808674547821283, + "loss/crossentropy": 2.8846903085708617, + "loss/logits": 0.8763628363609314, + "step": 26350 + }, + { + "epoch": 0.2636, + "grad_norm": 12.3125, + "grad_norm_var": 0.3492024739583333, + "learning_rate": 0.0003, + "loss": 11.6322, + "loss/aux_loss": 0.048084713704884055, + "loss/crossentropy": 2.593327397108078, + "loss/logits": 0.8438139349222183, + "step": 26360 + }, + { + "epoch": 0.2637, + "grad_norm": 11.875, + "grad_norm_var": 0.308056640625, + "learning_rate": 0.0003, + "loss": 11.7669, + "loss/aux_loss": 0.048082432709634304, + "loss/crossentropy": 2.8110472738742827, + "loss/logits": 0.9003132045269012, + "step": 26370 + }, + { + "epoch": 0.2638, + "grad_norm": 12.0625, + "grad_norm_var": 0.28177083333333336, + "learning_rate": 0.0003, + "loss": 11.6746, + "loss/aux_loss": 0.04809358026832342, + "loss/crossentropy": 2.8158118963241576, + "loss/logits": 0.8962694942951203, + "step": 26380 + }, + { + "epoch": 0.2639, + "grad_norm": 14.3125, + "grad_norm_var": 0.706103515625, + "learning_rate": 0.0003, + "loss": 11.6306, + "loss/aux_loss": 0.048089638352394104, + "loss/crossentropy": 2.935623896121979, + "loss/logits": 0.8567668348550797, + "step": 26390 + }, + { + "epoch": 0.264, + "grad_norm": 12.4375, + "grad_norm_var": 0.8358723958333333, + "learning_rate": 0.0003, + "loss": 11.6547, + "loss/aux_loss": 0.04807798489928246, + "loss/crossentropy": 2.7165545761585235, + "loss/logits": 0.8737062573432922, + "step": 26400 + }, + { + "epoch": 0.2641, + "grad_norm": 12.1875, + "grad_norm_var": 0.24270833333333333, + "learning_rate": 0.0003, + "loss": 11.5237, + "loss/aux_loss": 0.048086860589683054, + "loss/crossentropy": 2.733562481403351, + "loss/logits": 0.8268854200839997, + "step": 26410 + }, + { + "epoch": 0.2642, + "grad_norm": 14.125, + "grad_norm_var": 0.341650390625, + "learning_rate": 0.0003, + "loss": 11.6531, + "loss/aux_loss": 0.048096845485270025, + "loss/crossentropy": 2.649859589338303, + "loss/logits": 0.8604597598314285, + "step": 26420 + }, + { + "epoch": 0.2643, + "grad_norm": 13.5, + "grad_norm_var": 0.2969889322916667, + "learning_rate": 0.0003, + "loss": 11.4639, + "loss/aux_loss": 0.0480915080755949, + "loss/crossentropy": 2.7871899247169494, + "loss/logits": 0.8752772063016891, + "step": 26430 + }, + { + "epoch": 0.2644, + "grad_norm": 13.4375, + "grad_norm_var": 0.21451822916666666, + "learning_rate": 0.0003, + "loss": 11.6634, + "loss/aux_loss": 0.04808179382234812, + "loss/crossentropy": 2.647487211227417, + "loss/logits": 0.8647993594408036, + "step": 26440 + }, + { + "epoch": 0.2645, + "grad_norm": 13.0625, + "grad_norm_var": 0.248681640625, + "learning_rate": 0.0003, + "loss": 11.5747, + "loss/aux_loss": 0.0480790950357914, + "loss/crossentropy": 2.751650595664978, + "loss/logits": 0.9247609049081802, + "step": 26450 + }, + { + "epoch": 0.2646, + "grad_norm": 12.375, + "grad_norm_var": 0.15052083333333333, + "learning_rate": 0.0003, + "loss": 11.3959, + "loss/aux_loss": 0.04808844346553087, + "loss/crossentropy": 2.733932113647461, + "loss/logits": 0.854560700058937, + "step": 26460 + }, + { + "epoch": 0.2647, + "grad_norm": 13.0625, + "grad_norm_var": 1.4692057291666667, + "learning_rate": 0.0003, + "loss": 11.6134, + "loss/aux_loss": 0.04808341935276985, + "loss/crossentropy": 2.7537252068519593, + "loss/logits": 0.8283010810613632, + "step": 26470 + }, + { + "epoch": 0.2648, + "grad_norm": 12.625, + "grad_norm_var": 0.579541015625, + "learning_rate": 0.0003, + "loss": 11.6445, + "loss/aux_loss": 0.04808064494282007, + "loss/crossentropy": 2.715360426902771, + "loss/logits": 0.8789581745862961, + "step": 26480 + }, + { + "epoch": 0.2649, + "grad_norm": 13.5625, + "grad_norm_var": 0.43683268229166666, + "learning_rate": 0.0003, + "loss": 11.5003, + "loss/aux_loss": 0.04808939713984728, + "loss/crossentropy": 2.752163290977478, + "loss/logits": 0.8825481355190277, + "step": 26490 + }, + { + "epoch": 0.265, + "grad_norm": 12.1875, + "grad_norm_var": 0.44138997395833335, + "learning_rate": 0.0003, + "loss": 11.4298, + "loss/aux_loss": 0.04808378238230944, + "loss/crossentropy": 2.66518457531929, + "loss/logits": 0.8795353204011918, + "step": 26500 + }, + { + "epoch": 0.2651, + "grad_norm": 13.375, + "grad_norm_var": 1.1955729166666667, + "learning_rate": 0.0003, + "loss": 11.5447, + "loss/aux_loss": 0.048084886930882934, + "loss/crossentropy": 2.7104385554790498, + "loss/logits": 0.8602730393409729, + "step": 26510 + }, + { + "epoch": 0.2652, + "grad_norm": 12.1875, + "grad_norm_var": 1.5015462239583333, + "learning_rate": 0.0003, + "loss": 11.5465, + "loss/aux_loss": 0.04808354377746582, + "loss/crossentropy": 2.6698466658592226, + "loss/logits": 0.8915825933218002, + "step": 26520 + }, + { + "epoch": 0.2653, + "grad_norm": 13.25, + "grad_norm_var": 0.8618326822916667, + "learning_rate": 0.0003, + "loss": 11.3179, + "loss/aux_loss": 0.04808184951543808, + "loss/crossentropy": 2.778967833518982, + "loss/logits": 0.8605857610702514, + "step": 26530 + }, + { + "epoch": 0.2654, + "grad_norm": 12.4375, + "grad_norm_var": 0.7886555989583334, + "learning_rate": 0.0003, + "loss": 11.6822, + "loss/aux_loss": 0.04807785861194134, + "loss/crossentropy": 2.9185683012008665, + "loss/logits": 0.876692533493042, + "step": 26540 + }, + { + "epoch": 0.2655, + "grad_norm": 12.9375, + "grad_norm_var": 0.366650390625, + "learning_rate": 0.0003, + "loss": 11.6013, + "loss/aux_loss": 0.04808231629431248, + "loss/crossentropy": 2.7370326638221742, + "loss/logits": 0.867209044098854, + "step": 26550 + }, + { + "epoch": 0.2656, + "grad_norm": 14.125, + "grad_norm_var": 0.47784830729166666, + "learning_rate": 0.0003, + "loss": 11.5959, + "loss/aux_loss": 0.0480883315205574, + "loss/crossentropy": 2.799232506752014, + "loss/logits": 0.8930830955505371, + "step": 26560 + }, + { + "epoch": 0.2657, + "grad_norm": 12.125, + "grad_norm_var": 0.6108723958333333, + "learning_rate": 0.0003, + "loss": 11.6437, + "loss/aux_loss": 0.04807663895189762, + "loss/crossentropy": 2.7677125334739685, + "loss/logits": 0.9062906086444855, + "step": 26570 + }, + { + "epoch": 0.2658, + "grad_norm": 12.375, + "grad_norm_var": 0.232275390625, + "learning_rate": 0.0003, + "loss": 11.4253, + "loss/aux_loss": 0.04809227138757706, + "loss/crossentropy": 2.684408128261566, + "loss/logits": 0.878808343410492, + "step": 26580 + }, + { + "epoch": 0.2659, + "grad_norm": 13.75, + "grad_norm_var": 0.4923014322916667, + "learning_rate": 0.0003, + "loss": 11.5427, + "loss/aux_loss": 0.04807818587869406, + "loss/crossentropy": 2.9242777824401855, + "loss/logits": 0.8927222698926925, + "step": 26590 + }, + { + "epoch": 0.266, + "grad_norm": 13.625, + "grad_norm_var": 0.483056640625, + "learning_rate": 0.0003, + "loss": 11.5401, + "loss/aux_loss": 0.04808124490082264, + "loss/crossentropy": 2.8354394733905792, + "loss/logits": 0.8955871939659119, + "step": 26600 + }, + { + "epoch": 0.2661, + "grad_norm": 13.6875, + "grad_norm_var": 0.9447916666666667, + "learning_rate": 0.0003, + "loss": 11.5013, + "loss/aux_loss": 0.04808798339217901, + "loss/crossentropy": 2.9905380249023437, + "loss/logits": 0.9007535576820374, + "step": 26610 + }, + { + "epoch": 0.2662, + "grad_norm": 11.875, + "grad_norm_var": 0.6120930989583333, + "learning_rate": 0.0003, + "loss": 11.4842, + "loss/aux_loss": 0.04808419458568096, + "loss/crossentropy": 2.6949995160102844, + "loss/logits": 0.8447135239839554, + "step": 26620 + }, + { + "epoch": 0.2663, + "grad_norm": 11.5, + "grad_norm_var": 0.5355305989583333, + "learning_rate": 0.0003, + "loss": 11.5274, + "loss/aux_loss": 0.048078814335167405, + "loss/crossentropy": 2.6906064808368684, + "loss/logits": 0.9114213407039642, + "step": 26630 + }, + { + "epoch": 0.2664, + "grad_norm": 15.625, + "grad_norm_var": 0.9187337239583333, + "learning_rate": 0.0003, + "loss": 11.6276, + "loss/aux_loss": 0.048075567744672296, + "loss/crossentropy": 2.8382048666477204, + "loss/logits": 0.8896595865488053, + "step": 26640 + }, + { + "epoch": 0.2665, + "grad_norm": 12.9375, + "grad_norm_var": 0.5497395833333333, + "learning_rate": 0.0003, + "loss": 11.7462, + "loss/aux_loss": 0.048085755482316014, + "loss/crossentropy": 2.9384037852287292, + "loss/logits": 0.8757476270198822, + "step": 26650 + }, + { + "epoch": 0.2666, + "grad_norm": 14.375, + "grad_norm_var": 0.6030598958333333, + "learning_rate": 0.0003, + "loss": 11.578, + "loss/aux_loss": 0.04808356873691082, + "loss/crossentropy": 2.7069952189922333, + "loss/logits": 0.8372669726610183, + "step": 26660 + }, + { + "epoch": 0.2667, + "grad_norm": 13.6875, + "grad_norm_var": 0.6320149739583333, + "learning_rate": 0.0003, + "loss": 11.6305, + "loss/aux_loss": 0.04808407332748175, + "loss/crossentropy": 2.866211920976639, + "loss/logits": 0.926874178647995, + "step": 26670 + }, + { + "epoch": 0.2668, + "grad_norm": 12.4375, + "grad_norm_var": 0.42649739583333335, + "learning_rate": 0.0003, + "loss": 11.512, + "loss/aux_loss": 0.048087192699313164, + "loss/crossentropy": 2.779119974374771, + "loss/logits": 0.8863234221935272, + "step": 26680 + }, + { + "epoch": 0.2669, + "grad_norm": 12.25, + "grad_norm_var": 5.164957682291667, + "learning_rate": 0.0003, + "loss": 11.529, + "loss/aux_loss": 0.048087614588439465, + "loss/crossentropy": 2.786206305027008, + "loss/logits": 0.8752316683530807, + "step": 26690 + }, + { + "epoch": 0.267, + "grad_norm": 14.1875, + "grad_norm_var": 0.3640625, + "learning_rate": 0.0003, + "loss": 11.6246, + "loss/aux_loss": 0.048085883259773254, + "loss/crossentropy": 3.017382597923279, + "loss/logits": 0.9026335388422012, + "step": 26700 + }, + { + "epoch": 0.2671, + "grad_norm": 12.75, + "grad_norm_var": 0.36847330729166666, + "learning_rate": 0.0003, + "loss": 11.7263, + "loss/aux_loss": 0.04808237832039595, + "loss/crossentropy": 2.7911539018154143, + "loss/logits": 0.8946028083562851, + "step": 26710 + }, + { + "epoch": 0.2672, + "grad_norm": 12.5625, + "grad_norm_var": 0.3453125, + "learning_rate": 0.0003, + "loss": 11.5712, + "loss/aux_loss": 0.04808432050049305, + "loss/crossentropy": 2.7174662709236146, + "loss/logits": 0.862998154759407, + "step": 26720 + }, + { + "epoch": 0.2673, + "grad_norm": 14.4375, + "grad_norm_var": 0.49609375, + "learning_rate": 0.0003, + "loss": 11.697, + "loss/aux_loss": 0.04808508008718491, + "loss/crossentropy": 2.8050862312316895, + "loss/logits": 0.8875089168548584, + "step": 26730 + }, + { + "epoch": 0.2674, + "grad_norm": 13.5, + "grad_norm_var": 0.591650390625, + "learning_rate": 0.0003, + "loss": 11.6684, + "loss/aux_loss": 0.04808866996318102, + "loss/crossentropy": 2.8705193996429443, + "loss/logits": 0.8535552382469177, + "step": 26740 + }, + { + "epoch": 0.2675, + "grad_norm": 12.5625, + "grad_norm_var": 0.2669270833333333, + "learning_rate": 0.0003, + "loss": 11.584, + "loss/aux_loss": 0.04808246474713087, + "loss/crossentropy": 2.6758610129356386, + "loss/logits": 0.8840056896209717, + "step": 26750 + }, + { + "epoch": 0.2676, + "grad_norm": 12.25, + "grad_norm_var": 0.265625, + "learning_rate": 0.0003, + "loss": 11.5109, + "loss/aux_loss": 0.04808396678417921, + "loss/crossentropy": 2.7247723996639253, + "loss/logits": 0.8758624956011772, + "step": 26760 + }, + { + "epoch": 0.2677, + "grad_norm": 13.3125, + "grad_norm_var": 0.5591145833333333, + "learning_rate": 0.0003, + "loss": 11.5339, + "loss/aux_loss": 0.04808528199791908, + "loss/crossentropy": 2.6667077600955964, + "loss/logits": 0.8680305898189544, + "step": 26770 + }, + { + "epoch": 0.2678, + "grad_norm": 12.125, + "grad_norm_var": 0.2708333333333333, + "learning_rate": 0.0003, + "loss": 11.5729, + "loss/aux_loss": 0.048083195276558396, + "loss/crossentropy": 2.646883499622345, + "loss/logits": 0.8598526418209076, + "step": 26780 + }, + { + "epoch": 0.2679, + "grad_norm": 12.5, + "grad_norm_var": 0.31573893229166666, + "learning_rate": 0.0003, + "loss": 11.4182, + "loss/aux_loss": 0.04808151088654995, + "loss/crossentropy": 2.8521530270576476, + "loss/logits": 0.8823621451854706, + "step": 26790 + }, + { + "epoch": 0.268, + "grad_norm": 12.6875, + "grad_norm_var": 0.250634765625, + "learning_rate": 0.0003, + "loss": 11.442, + "loss/aux_loss": 0.0480816463008523, + "loss/crossentropy": 2.7866445600986482, + "loss/logits": 0.8612229824066162, + "step": 26800 + }, + { + "epoch": 0.2681, + "grad_norm": 12.75, + "grad_norm_var": 0.379931640625, + "learning_rate": 0.0003, + "loss": 11.5301, + "loss/aux_loss": 0.048094099201261994, + "loss/crossentropy": 2.8141289949417114, + "loss/logits": 0.8547409534454345, + "step": 26810 + }, + { + "epoch": 0.2682, + "grad_norm": 13.6875, + "grad_norm_var": 0.29933268229166665, + "learning_rate": 0.0003, + "loss": 11.7054, + "loss/aux_loss": 0.048088458552956584, + "loss/crossentropy": 2.8167191982269286, + "loss/logits": 0.9029836922883987, + "step": 26820 + }, + { + "epoch": 0.2683, + "grad_norm": 12.75, + "grad_norm_var": 0.539697265625, + "learning_rate": 0.0003, + "loss": 11.5224, + "loss/aux_loss": 0.0480873117223382, + "loss/crossentropy": 2.6812549769878387, + "loss/logits": 0.8772442221641541, + "step": 26830 + }, + { + "epoch": 0.2684, + "grad_norm": 13.1875, + "grad_norm_var": 0.6906087239583333, + "learning_rate": 0.0003, + "loss": 11.565, + "loss/aux_loss": 0.04808258228003979, + "loss/crossentropy": 2.7515992164611816, + "loss/logits": 0.879136735200882, + "step": 26840 + }, + { + "epoch": 0.2685, + "grad_norm": 12.75, + "grad_norm_var": 1.274462890625, + "learning_rate": 0.0003, + "loss": 11.6051, + "loss/aux_loss": 0.048084885254502295, + "loss/crossentropy": 2.586995255947113, + "loss/logits": 0.8740639716386795, + "step": 26850 + }, + { + "epoch": 0.2686, + "grad_norm": 13.5, + "grad_norm_var": 0.2515625, + "learning_rate": 0.0003, + "loss": 11.389, + "loss/aux_loss": 0.0480819696560502, + "loss/crossentropy": 2.7247099459171293, + "loss/logits": 0.8685719013214112, + "step": 26860 + }, + { + "epoch": 0.2687, + "grad_norm": 12.375, + "grad_norm_var": 72.13385416666667, + "learning_rate": 0.0003, + "loss": 11.6388, + "loss/aux_loss": 0.04809704162180424, + "loss/crossentropy": 2.6560521006584166, + "loss/logits": 0.8820772796869278, + "step": 26870 + }, + { + "epoch": 0.2688, + "grad_norm": 12.9375, + "grad_norm_var": 0.1853515625, + "learning_rate": 0.0003, + "loss": 11.6079, + "loss/aux_loss": 0.04808767940849066, + "loss/crossentropy": 2.5908863723278044, + "loss/logits": 0.8665450185537338, + "step": 26880 + }, + { + "epoch": 0.2689, + "grad_norm": 13.125, + "grad_norm_var": 0.6077473958333334, + "learning_rate": 0.0003, + "loss": 11.6409, + "loss/aux_loss": 0.048092026449739936, + "loss/crossentropy": 2.7218611598014832, + "loss/logits": 0.8763234496116639, + "step": 26890 + }, + { + "epoch": 0.269, + "grad_norm": 13.0625, + "grad_norm_var": 0.35323893229166664, + "learning_rate": 0.0003, + "loss": 11.674, + "loss/aux_loss": 0.04808721747249365, + "loss/crossentropy": 2.7273535072803496, + "loss/logits": 0.8680451363325119, + "step": 26900 + }, + { + "epoch": 0.2691, + "grad_norm": 11.75, + "grad_norm_var": 2.484635416666667, + "learning_rate": 0.0003, + "loss": 11.6283, + "loss/aux_loss": 0.04808290395885706, + "loss/crossentropy": 2.7846306562423706, + "loss/logits": 0.909109690785408, + "step": 26910 + }, + { + "epoch": 0.2692, + "grad_norm": 12.25, + "grad_norm_var": 0.7494140625, + "learning_rate": 0.0003, + "loss": 11.3449, + "loss/aux_loss": 0.04808285906910896, + "loss/crossentropy": 2.779471981525421, + "loss/logits": 0.8647771954536438, + "step": 26920 + }, + { + "epoch": 0.2693, + "grad_norm": 13.375, + "grad_norm_var": 40.298160807291666, + "learning_rate": 0.0003, + "loss": 11.5546, + "loss/aux_loss": 0.0480886047706008, + "loss/crossentropy": 2.7506704151630403, + "loss/logits": 0.8976017504930496, + "step": 26930 + }, + { + "epoch": 0.2694, + "grad_norm": 13.4375, + "grad_norm_var": 0.459619140625, + "learning_rate": 0.0003, + "loss": 11.59, + "loss/aux_loss": 0.04809030499309301, + "loss/crossentropy": 2.809163624048233, + "loss/logits": 0.9011189788579941, + "step": 26940 + }, + { + "epoch": 0.2695, + "grad_norm": 13.3125, + "grad_norm_var": 0.374462890625, + "learning_rate": 0.0003, + "loss": 11.6427, + "loss/aux_loss": 0.04809495285153389, + "loss/crossentropy": 2.747515672445297, + "loss/logits": 0.8646955370903016, + "step": 26950 + }, + { + "epoch": 0.2696, + "grad_norm": 13.0625, + "grad_norm_var": 0.414306640625, + "learning_rate": 0.0003, + "loss": 11.4118, + "loss/aux_loss": 0.048085729405283925, + "loss/crossentropy": 2.776648241281509, + "loss/logits": 0.8744494527578354, + "step": 26960 + }, + { + "epoch": 0.2697, + "grad_norm": 12.625, + "grad_norm_var": 0.496337890625, + "learning_rate": 0.0003, + "loss": 11.6107, + "loss/aux_loss": 0.0480810709297657, + "loss/crossentropy": 2.7663753151893617, + "loss/logits": 0.8697874486446381, + "step": 26970 + }, + { + "epoch": 0.2698, + "grad_norm": 12.6875, + "grad_norm_var": 0.36013997395833336, + "learning_rate": 0.0003, + "loss": 11.4819, + "loss/aux_loss": 0.04809112492948771, + "loss/crossentropy": 2.5756009936332704, + "loss/logits": 0.832565313577652, + "step": 26980 + }, + { + "epoch": 0.2699, + "grad_norm": 13.4375, + "grad_norm_var": 0.3505045572916667, + "learning_rate": 0.0003, + "loss": 11.5591, + "loss/aux_loss": 0.04808456730097532, + "loss/crossentropy": 2.715240556001663, + "loss/logits": 0.9006956547498703, + "step": 26990 + }, + { + "epoch": 0.27, + "grad_norm": 12.6875, + "grad_norm_var": 8.5306640625, + "learning_rate": 0.0003, + "loss": 11.3831, + "loss/aux_loss": 0.04807609599083662, + "loss/crossentropy": 2.7678284883499145, + "loss/logits": 0.8785594999790192, + "step": 27000 + }, + { + "epoch": 0.2701, + "grad_norm": 13.6875, + "grad_norm_var": 0.16300455729166666, + "learning_rate": 0.0003, + "loss": 11.6529, + "loss/aux_loss": 0.0480866638943553, + "loss/crossentropy": 2.6905364990234375, + "loss/logits": 0.8590237915515899, + "step": 27010 + }, + { + "epoch": 0.2702, + "grad_norm": 12.4375, + "grad_norm_var": 0.3159993489583333, + "learning_rate": 0.0003, + "loss": 11.6173, + "loss/aux_loss": 0.048088221624493596, + "loss/crossentropy": 2.8152272939682006, + "loss/logits": 0.8619503259658814, + "step": 27020 + }, + { + "epoch": 0.2703, + "grad_norm": 13.375, + "grad_norm_var": 0.1962890625, + "learning_rate": 0.0003, + "loss": 11.6658, + "loss/aux_loss": 0.04808411095291376, + "loss/crossentropy": 2.842225217819214, + "loss/logits": 0.8590496510267258, + "step": 27030 + }, + { + "epoch": 0.2704, + "grad_norm": 13.125, + "grad_norm_var": 0.583447265625, + "learning_rate": 0.0003, + "loss": 11.4641, + "loss/aux_loss": 0.048092346824705604, + "loss/crossentropy": 2.7975880026817324, + "loss/logits": 0.8442713886499404, + "step": 27040 + }, + { + "epoch": 0.2705, + "grad_norm": 12.625, + "grad_norm_var": 0.9462076822916666, + "learning_rate": 0.0003, + "loss": 11.6847, + "loss/aux_loss": 0.04807819910347462, + "loss/crossentropy": 2.8695399880409242, + "loss/logits": 0.9112765967845917, + "step": 27050 + }, + { + "epoch": 0.2706, + "grad_norm": 12.6875, + "grad_norm_var": 0.490869140625, + "learning_rate": 0.0003, + "loss": 11.6033, + "loss/aux_loss": 0.048083419911563396, + "loss/crossentropy": 2.5501754522323608, + "loss/logits": 0.8562157094478607, + "step": 27060 + }, + { + "epoch": 0.2707, + "grad_norm": 13.125, + "grad_norm_var": 0.5098958333333333, + "learning_rate": 0.0003, + "loss": 11.768, + "loss/aux_loss": 0.04808535445481539, + "loss/crossentropy": 2.795117211341858, + "loss/logits": 0.8820730477571488, + "step": 27070 + }, + { + "epoch": 0.2708, + "grad_norm": 12.9375, + "grad_norm_var": 0.5040201822916667, + "learning_rate": 0.0003, + "loss": 11.5046, + "loss/aux_loss": 0.04808331392705441, + "loss/crossentropy": 2.6431259870529176, + "loss/logits": 0.8500302553176879, + "step": 27080 + }, + { + "epoch": 0.2709, + "grad_norm": 12.9375, + "grad_norm_var": 0.49724934895833334, + "learning_rate": 0.0003, + "loss": 11.5656, + "loss/aux_loss": 0.04808337949216366, + "loss/crossentropy": 2.9284351587295534, + "loss/logits": 0.9117402613162995, + "step": 27090 + }, + { + "epoch": 0.271, + "grad_norm": 14.0, + "grad_norm_var": 0.35857747395833334, + "learning_rate": 0.0003, + "loss": 11.4482, + "loss/aux_loss": 0.048084777966141704, + "loss/crossentropy": 2.5703269481658935, + "loss/logits": 0.8528429746627808, + "step": 27100 + }, + { + "epoch": 0.2711, + "grad_norm": 12.5625, + "grad_norm_var": 0.37916666666666665, + "learning_rate": 0.0003, + "loss": 11.4456, + "loss/aux_loss": 0.04808557964861393, + "loss/crossentropy": 2.910882604122162, + "loss/logits": 0.9175168991088867, + "step": 27110 + }, + { + "epoch": 0.2712, + "grad_norm": 14.0, + "grad_norm_var": 0.4205729166666667, + "learning_rate": 0.0003, + "loss": 11.5441, + "loss/aux_loss": 0.048091310635209085, + "loss/crossentropy": 2.818699848651886, + "loss/logits": 0.8743287414312363, + "step": 27120 + }, + { + "epoch": 0.2713, + "grad_norm": 12.375, + "grad_norm_var": 0.42389322916666666, + "learning_rate": 0.0003, + "loss": 11.433, + "loss/aux_loss": 0.04808477144688368, + "loss/crossentropy": 2.7713629007339478, + "loss/logits": 0.8480364233255386, + "step": 27130 + }, + { + "epoch": 0.2714, + "grad_norm": 12.3125, + "grad_norm_var": 0.49270833333333336, + "learning_rate": 0.0003, + "loss": 11.4776, + "loss/aux_loss": 0.048079821094870565, + "loss/crossentropy": 2.791437101364136, + "loss/logits": 0.8964347183704376, + "step": 27140 + }, + { + "epoch": 0.2715, + "grad_norm": 12.75, + "grad_norm_var": 0.180712890625, + "learning_rate": 0.0003, + "loss": 11.6727, + "loss/aux_loss": 0.04808723647147417, + "loss/crossentropy": 2.799302804470062, + "loss/logits": 0.9013757139444352, + "step": 27150 + }, + { + "epoch": 0.2716, + "grad_norm": 11.5, + "grad_norm_var": 0.32732747395833334, + "learning_rate": 0.0003, + "loss": 11.5774, + "loss/aux_loss": 0.04808212071657181, + "loss/crossentropy": 2.7711844205856324, + "loss/logits": 0.8806311905384063, + "step": 27160 + }, + { + "epoch": 0.2717, + "grad_norm": 12.125, + "grad_norm_var": 0.3702473958333333, + "learning_rate": 0.0003, + "loss": 11.6159, + "loss/aux_loss": 0.04808286111801863, + "loss/crossentropy": 2.7594713032245637, + "loss/logits": 0.8914604634046555, + "step": 27170 + }, + { + "epoch": 0.2718, + "grad_norm": 12.75, + "grad_norm_var": 0.3030598958333333, + "learning_rate": 0.0003, + "loss": 11.6285, + "loss/aux_loss": 0.04807987064123154, + "loss/crossentropy": 2.8910335302352905, + "loss/logits": 0.8775747418403625, + "step": 27180 + }, + { + "epoch": 0.2719, + "grad_norm": 13.0, + "grad_norm_var": 0.7816243489583333, + "learning_rate": 0.0003, + "loss": 11.5454, + "loss/aux_loss": 0.048089956678450105, + "loss/crossentropy": 2.863065016269684, + "loss/logits": 0.8842020243406296, + "step": 27190 + }, + { + "epoch": 0.272, + "grad_norm": 12.5, + "grad_norm_var": 0.403759765625, + "learning_rate": 0.0003, + "loss": 11.6229, + "loss/aux_loss": 0.048087510466575625, + "loss/crossentropy": 2.7746796369552613, + "loss/logits": 0.8964880555868149, + "step": 27200 + }, + { + "epoch": 0.2721, + "grad_norm": 11.8125, + "grad_norm_var": 0.4014973958333333, + "learning_rate": 0.0003, + "loss": 11.7884, + "loss/aux_loss": 0.04808600451797247, + "loss/crossentropy": 2.9064237117767333, + "loss/logits": 0.9102618426084519, + "step": 27210 + }, + { + "epoch": 0.2722, + "grad_norm": 14.1875, + "grad_norm_var": 0.48899739583333335, + "learning_rate": 0.0003, + "loss": 11.5522, + "loss/aux_loss": 0.04808280412107706, + "loss/crossentropy": 2.8749096155166627, + "loss/logits": 0.8867197275161743, + "step": 27220 + }, + { + "epoch": 0.2723, + "grad_norm": 13.75, + "grad_norm_var": 0.46868489583333334, + "learning_rate": 0.0003, + "loss": 11.6081, + "loss/aux_loss": 0.04809102062135935, + "loss/crossentropy": 2.806208127737045, + "loss/logits": 0.9116105139255524, + "step": 27230 + }, + { + "epoch": 0.2724, + "grad_norm": 14.25, + "grad_norm_var": 1.3843098958333333, + "learning_rate": 0.0003, + "loss": 11.5575, + "loss/aux_loss": 0.04807698503136635, + "loss/crossentropy": 2.9286911368370054, + "loss/logits": 0.8646731346845626, + "step": 27240 + }, + { + "epoch": 0.2725, + "grad_norm": 13.125, + "grad_norm_var": 1.3192057291666666, + "learning_rate": 0.0003, + "loss": 11.5521, + "loss/aux_loss": 0.04809784200042486, + "loss/crossentropy": 2.829662698507309, + "loss/logits": 0.9164007723331451, + "step": 27250 + }, + { + "epoch": 0.2726, + "grad_norm": 14.5, + "grad_norm_var": 0.9409993489583334, + "learning_rate": 0.0003, + "loss": 11.4188, + "loss/aux_loss": 0.04808580614626408, + "loss/crossentropy": 2.5694850265979765, + "loss/logits": 0.8638879209756851, + "step": 27260 + }, + { + "epoch": 0.2727, + "grad_norm": 12.75, + "grad_norm_var": 0.5489420572916667, + "learning_rate": 0.0003, + "loss": 11.5619, + "loss/aux_loss": 0.04808543249964714, + "loss/crossentropy": 2.8475801050662994, + "loss/logits": 0.8850974351167679, + "step": 27270 + }, + { + "epoch": 0.2728, + "grad_norm": 15.0, + "grad_norm_var": 2.805712890625, + "learning_rate": 0.0003, + "loss": 11.5083, + "loss/aux_loss": 0.04808332417160273, + "loss/crossentropy": 2.787931036949158, + "loss/logits": 0.9341086566448211, + "step": 27280 + }, + { + "epoch": 0.2729, + "grad_norm": 13.125, + "grad_norm_var": 2.713134765625, + "learning_rate": 0.0003, + "loss": 11.6033, + "loss/aux_loss": 0.048095279932022096, + "loss/crossentropy": 2.7088790059089662, + "loss/logits": 0.8688720375299454, + "step": 27290 + }, + { + "epoch": 0.273, + "grad_norm": 13.3125, + "grad_norm_var": 1.1643229166666667, + "learning_rate": 0.0003, + "loss": 11.469, + "loss/aux_loss": 0.04808818940073252, + "loss/crossentropy": 2.6304342091083526, + "loss/logits": 0.8546393603086472, + "step": 27300 + }, + { + "epoch": 0.2731, + "grad_norm": 12.875, + "grad_norm_var": 0.48170572916666665, + "learning_rate": 0.0003, + "loss": 11.4469, + "loss/aux_loss": 0.048087147809565065, + "loss/crossentropy": 2.6426671385765075, + "loss/logits": 0.8355433255434036, + "step": 27310 + }, + { + "epoch": 0.2732, + "grad_norm": 12.8125, + "grad_norm_var": 0.2900390625, + "learning_rate": 0.0003, + "loss": 11.5179, + "loss/aux_loss": 0.04808585401624441, + "loss/crossentropy": 2.7217608451843263, + "loss/logits": 0.8875219106674195, + "step": 27320 + }, + { + "epoch": 0.2733, + "grad_norm": 12.8125, + "grad_norm_var": 0.5149576822916667, + "learning_rate": 0.0003, + "loss": 11.5724, + "loss/aux_loss": 0.04808675888925791, + "loss/crossentropy": 2.8080302834510804, + "loss/logits": 0.8594749808311463, + "step": 27330 + }, + { + "epoch": 0.2734, + "grad_norm": 11.875, + "grad_norm_var": 0.49947916666666664, + "learning_rate": 0.0003, + "loss": 11.4152, + "loss/aux_loss": 0.048075826838612556, + "loss/crossentropy": 2.8058014869689942, + "loss/logits": 0.8780788242816925, + "step": 27340 + }, + { + "epoch": 0.2735, + "grad_norm": 14.75, + "grad_norm_var": 0.5893229166666667, + "learning_rate": 0.0003, + "loss": 11.516, + "loss/aux_loss": 0.04808777756989002, + "loss/crossentropy": 2.840649002790451, + "loss/logits": 0.8928281188011169, + "step": 27350 + }, + { + "epoch": 0.2736, + "grad_norm": 13.25, + "grad_norm_var": 1.003759765625, + "learning_rate": 0.0003, + "loss": 11.5198, + "loss/aux_loss": 0.048086445592343804, + "loss/crossentropy": 2.6759494841098785, + "loss/logits": 0.866712149977684, + "step": 27360 + }, + { + "epoch": 0.2737, + "grad_norm": 12.0625, + "grad_norm_var": 1.5731770833333334, + "learning_rate": 0.0003, + "loss": 11.5458, + "loss/aux_loss": 0.048074633441865446, + "loss/crossentropy": 2.805925118923187, + "loss/logits": 0.8581642717123031, + "step": 27370 + }, + { + "epoch": 0.2738, + "grad_norm": 13.4375, + "grad_norm_var": 1.6359375, + "learning_rate": 0.0003, + "loss": 11.5177, + "loss/aux_loss": 0.04809354469180107, + "loss/crossentropy": 2.7967172265052795, + "loss/logits": 0.8863823890686036, + "step": 27380 + }, + { + "epoch": 0.2739, + "grad_norm": 12.0, + "grad_norm_var": 0.6683430989583333, + "learning_rate": 0.0003, + "loss": 11.4643, + "loss/aux_loss": 0.04809064380824566, + "loss/crossentropy": 2.7077986001968384, + "loss/logits": 0.8588582128286362, + "step": 27390 + }, + { + "epoch": 0.274, + "grad_norm": 13.6875, + "grad_norm_var": 0.38671875, + "learning_rate": 0.0003, + "loss": 11.5795, + "loss/aux_loss": 0.048084030672907827, + "loss/crossentropy": 2.750448912382126, + "loss/logits": 0.8991485238075256, + "step": 27400 + }, + { + "epoch": 0.2741, + "grad_norm": 13.75, + "grad_norm_var": 0.5129557291666667, + "learning_rate": 0.0003, + "loss": 11.5598, + "loss/aux_loss": 0.048072746582329275, + "loss/crossentropy": 2.761543083190918, + "loss/logits": 0.8485326498746872, + "step": 27410 + }, + { + "epoch": 0.2742, + "grad_norm": 12.8125, + "grad_norm_var": 0.4556640625, + "learning_rate": 0.0003, + "loss": 11.3436, + "loss/aux_loss": 0.048084151931107044, + "loss/crossentropy": 2.604575699567795, + "loss/logits": 0.8465295255184173, + "step": 27420 + }, + { + "epoch": 0.2743, + "grad_norm": 13.5625, + "grad_norm_var": 0.5302083333333333, + "learning_rate": 0.0003, + "loss": 11.4867, + "loss/aux_loss": 0.04809482246637344, + "loss/crossentropy": 2.54980583190918, + "loss/logits": 0.8440865933895111, + "step": 27430 + }, + { + "epoch": 0.2744, + "grad_norm": 13.625, + "grad_norm_var": 4.076285807291667, + "learning_rate": 0.0003, + "loss": 11.5182, + "loss/aux_loss": 0.04807782378047705, + "loss/crossentropy": 2.7580845236778258, + "loss/logits": 0.882664081454277, + "step": 27440 + }, + { + "epoch": 0.2745, + "grad_norm": 12.9375, + "grad_norm_var": 0.379931640625, + "learning_rate": 0.0003, + "loss": 11.4551, + "loss/aux_loss": 0.0480885649099946, + "loss/crossentropy": 2.777578568458557, + "loss/logits": 0.87208411693573, + "step": 27450 + }, + { + "epoch": 0.2746, + "grad_norm": 12.6875, + "grad_norm_var": 0.34427083333333336, + "learning_rate": 0.0003, + "loss": 11.4538, + "loss/aux_loss": 0.04808246586471796, + "loss/crossentropy": 2.8558852434158326, + "loss/logits": 0.8860389828681946, + "step": 27460 + }, + { + "epoch": 0.2747, + "grad_norm": 12.6875, + "grad_norm_var": 0.2786458333333333, + "learning_rate": 0.0003, + "loss": 11.5559, + "loss/aux_loss": 0.048081047087907794, + "loss/crossentropy": 2.5927527368068697, + "loss/logits": 0.8575126707553864, + "step": 27470 + }, + { + "epoch": 0.2748, + "grad_norm": 13.25, + "grad_norm_var": 0.42185872395833335, + "learning_rate": 0.0003, + "loss": 11.4521, + "loss/aux_loss": 0.048087083548307416, + "loss/crossentropy": 2.752536880970001, + "loss/logits": 0.8572630852460861, + "step": 27480 + }, + { + "epoch": 0.2749, + "grad_norm": 13.5625, + "grad_norm_var": 0.46599934895833334, + "learning_rate": 0.0003, + "loss": 11.6157, + "loss/aux_loss": 0.04808045290410519, + "loss/crossentropy": 2.865806245803833, + "loss/logits": 0.9061722487211228, + "step": 27490 + }, + { + "epoch": 0.275, + "grad_norm": 12.5, + "grad_norm_var": 0.45753580729166665, + "learning_rate": 0.0003, + "loss": 11.6082, + "loss/aux_loss": 0.04807562381029129, + "loss/crossentropy": 2.7103756070137024, + "loss/logits": 0.9027006924152374, + "step": 27500 + }, + { + "epoch": 0.2751, + "grad_norm": 12.5625, + "grad_norm_var": 0.322119140625, + "learning_rate": 0.0003, + "loss": 11.5937, + "loss/aux_loss": 0.04807604216039181, + "loss/crossentropy": 2.841430151462555, + "loss/logits": 0.8734793215990067, + "step": 27510 + }, + { + "epoch": 0.2752, + "grad_norm": 13.625, + "grad_norm_var": 1.0101399739583334, + "learning_rate": 0.0003, + "loss": 11.642, + "loss/aux_loss": 0.048089314438402656, + "loss/crossentropy": 2.8229639172554015, + "loss/logits": 0.8981556743383408, + "step": 27520 + }, + { + "epoch": 0.2753, + "grad_norm": 13.9375, + "grad_norm_var": 2.7639973958333335, + "learning_rate": 0.0003, + "loss": 11.4326, + "loss/aux_loss": 0.04808274004608393, + "loss/crossentropy": 2.5911940157413484, + "loss/logits": 0.8717542558908462, + "step": 27530 + }, + { + "epoch": 0.2754, + "grad_norm": 13.25, + "grad_norm_var": 3.187760416666667, + "learning_rate": 0.0003, + "loss": 11.459, + "loss/aux_loss": 0.048087103292346, + "loss/crossentropy": 2.75337210893631, + "loss/logits": 0.8438648998737335, + "step": 27540 + }, + { + "epoch": 0.2755, + "grad_norm": 12.625, + "grad_norm_var": 0.6598795572916667, + "learning_rate": 0.0003, + "loss": 11.4768, + "loss/aux_loss": 0.04808942452073097, + "loss/crossentropy": 2.6577411115169527, + "loss/logits": 0.8484239518642426, + "step": 27550 + }, + { + "epoch": 0.2756, + "grad_norm": 13.5625, + "grad_norm_var": 0.30625, + "learning_rate": 0.0003, + "loss": 11.5486, + "loss/aux_loss": 0.048091153427958486, + "loss/crossentropy": 2.8733396172523498, + "loss/logits": 0.8842191725969315, + "step": 27560 + }, + { + "epoch": 0.2757, + "grad_norm": 12.1875, + "grad_norm_var": 0.292431640625, + "learning_rate": 0.0003, + "loss": 11.3524, + "loss/aux_loss": 0.048078698106110096, + "loss/crossentropy": 2.8211456060409548, + "loss/logits": 0.8710596203804016, + "step": 27570 + }, + { + "epoch": 0.2758, + "grad_norm": 12.625, + "grad_norm_var": 0.356494140625, + "learning_rate": 0.0003, + "loss": 11.3923, + "loss/aux_loss": 0.04809141457080841, + "loss/crossentropy": 2.810522723197937, + "loss/logits": 0.9148801237344741, + "step": 27580 + }, + { + "epoch": 0.2759, + "grad_norm": 13.125, + "grad_norm_var": 0.688525390625, + "learning_rate": 0.0003, + "loss": 11.46, + "loss/aux_loss": 0.04809119552373886, + "loss/crossentropy": 2.571478694677353, + "loss/logits": 0.8658771872520447, + "step": 27590 + }, + { + "epoch": 0.276, + "grad_norm": 11.875, + "grad_norm_var": 0.5340983072916666, + "learning_rate": 0.0003, + "loss": 11.4551, + "loss/aux_loss": 0.04807944148778916, + "loss/crossentropy": 2.8209518790245056, + "loss/logits": 0.8891745388507843, + "step": 27600 + }, + { + "epoch": 0.2761, + "grad_norm": 12.0, + "grad_norm_var": 0.3296875, + "learning_rate": 0.0003, + "loss": 11.5451, + "loss/aux_loss": 0.04807922653853893, + "loss/crossentropy": 2.714892899990082, + "loss/logits": 0.9096907198429107, + "step": 27610 + }, + { + "epoch": 0.2762, + "grad_norm": 13.75, + "grad_norm_var": 0.6673014322916667, + "learning_rate": 0.0003, + "loss": 11.5883, + "loss/aux_loss": 0.04808503799140453, + "loss/crossentropy": 2.7832123041152954, + "loss/logits": 0.894438949227333, + "step": 27620 + }, + { + "epoch": 0.2763, + "grad_norm": 12.3125, + "grad_norm_var": 0.39869791666666665, + "learning_rate": 0.0003, + "loss": 11.591, + "loss/aux_loss": 0.048086524568498136, + "loss/crossentropy": 2.882674145698547, + "loss/logits": 0.8659522473812103, + "step": 27630 + }, + { + "epoch": 0.2764, + "grad_norm": 12.9375, + "grad_norm_var": 0.4231770833333333, + "learning_rate": 0.0003, + "loss": 11.5889, + "loss/aux_loss": 0.048090490885078906, + "loss/crossentropy": 2.76504762172699, + "loss/logits": 0.8722820281982422, + "step": 27640 + }, + { + "epoch": 0.2765, + "grad_norm": 13.0, + "grad_norm_var": 0.4369140625, + "learning_rate": 0.0003, + "loss": 11.7117, + "loss/aux_loss": 0.04808298014104366, + "loss/crossentropy": 2.773081195354462, + "loss/logits": 0.8426672071218491, + "step": 27650 + }, + { + "epoch": 0.2766, + "grad_norm": 13.5625, + "grad_norm_var": 141.33151041666667, + "learning_rate": 0.0003, + "loss": 11.678, + "loss/aux_loss": 0.04809086322784424, + "loss/crossentropy": 2.782191741466522, + "loss/logits": 0.882819551229477, + "step": 27660 + }, + { + "epoch": 0.2767, + "grad_norm": 17.5, + "grad_norm_var": 1.9231608072916666, + "learning_rate": 0.0003, + "loss": 11.4237, + "loss/aux_loss": 0.04807958249002695, + "loss/crossentropy": 2.6636355757713317, + "loss/logits": 0.8572327792644501, + "step": 27670 + }, + { + "epoch": 0.2768, + "grad_norm": 13.0, + "grad_norm_var": 1.603125, + "learning_rate": 0.0003, + "loss": 11.4229, + "loss/aux_loss": 0.0480815913528204, + "loss/crossentropy": 2.890985882282257, + "loss/logits": 0.8461414545774459, + "step": 27680 + }, + { + "epoch": 0.2769, + "grad_norm": 13.8125, + "grad_norm_var": 1.2593098958333333, + "learning_rate": 0.0003, + "loss": 11.6608, + "loss/aux_loss": 0.048087593354284766, + "loss/crossentropy": 2.839232790470123, + "loss/logits": 0.8831172704696655, + "step": 27690 + }, + { + "epoch": 0.277, + "grad_norm": 13.1875, + "grad_norm_var": 1.5706868489583334, + "learning_rate": 0.0003, + "loss": 11.5061, + "loss/aux_loss": 0.04807878881692886, + "loss/crossentropy": 2.685603749752045, + "loss/logits": 0.877737945318222, + "step": 27700 + }, + { + "epoch": 0.2771, + "grad_norm": 13.0625, + "grad_norm_var": 0.9463541666666667, + "learning_rate": 0.0003, + "loss": 11.5132, + "loss/aux_loss": 0.04808571934700012, + "loss/crossentropy": 2.794572043418884, + "loss/logits": 0.8642860800027847, + "step": 27710 + }, + { + "epoch": 0.2772, + "grad_norm": 13.625, + "grad_norm_var": 0.5702473958333333, + "learning_rate": 0.0003, + "loss": 11.5303, + "loss/aux_loss": 0.04808823838829994, + "loss/crossentropy": 2.8599129617214203, + "loss/logits": 0.8836135894060135, + "step": 27720 + }, + { + "epoch": 0.2773, + "grad_norm": 13.0625, + "grad_norm_var": 0.462353515625, + "learning_rate": 0.0003, + "loss": 11.5011, + "loss/aux_loss": 0.048078597895801066, + "loss/crossentropy": 2.775701862573624, + "loss/logits": 0.8424900531768799, + "step": 27730 + }, + { + "epoch": 0.2774, + "grad_norm": 12.3125, + "grad_norm_var": 0.5702962239583333, + "learning_rate": 0.0003, + "loss": 11.6663, + "loss/aux_loss": 0.048077501729130744, + "loss/crossentropy": 2.7949552178382873, + "loss/logits": 0.8686194092035293, + "step": 27740 + }, + { + "epoch": 0.2775, + "grad_norm": 13.3125, + "grad_norm_var": 9.37421875, + "learning_rate": 0.0003, + "loss": 11.4591, + "loss/aux_loss": 0.048088216595351695, + "loss/crossentropy": 2.655610829591751, + "loss/logits": 0.853823122382164, + "step": 27750 + }, + { + "epoch": 0.2776, + "grad_norm": 13.5625, + "grad_norm_var": 1.8868326822916666, + "learning_rate": 0.0003, + "loss": 11.6049, + "loss/aux_loss": 0.04809031039476395, + "loss/crossentropy": 2.91897075176239, + "loss/logits": 0.8576211661100388, + "step": 27760 + }, + { + "epoch": 0.2777, + "grad_norm": 12.875, + "grad_norm_var": 1.4387858072916666, + "learning_rate": 0.0003, + "loss": 11.3556, + "loss/aux_loss": 0.04808913040906191, + "loss/crossentropy": 2.6004298627376556, + "loss/logits": 0.8527587816119194, + "step": 27770 + }, + { + "epoch": 0.2778, + "grad_norm": 12.8125, + "grad_norm_var": 0.44972330729166665, + "learning_rate": 0.0003, + "loss": 11.5785, + "loss/aux_loss": 0.04808642938733101, + "loss/crossentropy": 2.721281111240387, + "loss/logits": 0.8621364802122116, + "step": 27780 + }, + { + "epoch": 0.2779, + "grad_norm": 11.9375, + "grad_norm_var": 0.27024739583333335, + "learning_rate": 0.0003, + "loss": 11.4411, + "loss/aux_loss": 0.04807955492287874, + "loss/crossentropy": 2.6037085890769958, + "loss/logits": 0.8568143039941788, + "step": 27790 + }, + { + "epoch": 0.278, + "grad_norm": 13.8125, + "grad_norm_var": 0.540625, + "learning_rate": 0.0003, + "loss": 11.5396, + "loss/aux_loss": 0.048087948746979234, + "loss/crossentropy": 2.8853622317314147, + "loss/logits": 0.8670936018228531, + "step": 27800 + }, + { + "epoch": 0.2781, + "grad_norm": 12.0625, + "grad_norm_var": 0.20974934895833333, + "learning_rate": 0.0003, + "loss": 11.6926, + "loss/aux_loss": 0.048087144270539284, + "loss/crossentropy": 2.8916147112846375, + "loss/logits": 0.9140586674213409, + "step": 27810 + }, + { + "epoch": 0.2782, + "grad_norm": 12.75, + "grad_norm_var": 0.9538899739583333, + "learning_rate": 0.0003, + "loss": 11.4982, + "loss/aux_loss": 0.048076816648244855, + "loss/crossentropy": 2.742960512638092, + "loss/logits": 0.8774885207414627, + "step": 27820 + }, + { + "epoch": 0.2783, + "grad_norm": 13.3125, + "grad_norm_var": 2.1745930989583333, + "learning_rate": 0.0003, + "loss": 11.5583, + "loss/aux_loss": 0.048096643574535844, + "loss/crossentropy": 2.8718445897102356, + "loss/logits": 0.8958581119775773, + "step": 27830 + }, + { + "epoch": 0.2784, + "grad_norm": 14.125, + "grad_norm_var": 2.208268229166667, + "learning_rate": 0.0003, + "loss": 11.7273, + "loss/aux_loss": 0.04808414224535227, + "loss/crossentropy": 2.936895763874054, + "loss/logits": 0.8936193466186524, + "step": 27840 + }, + { + "epoch": 0.2785, + "grad_norm": 12.0625, + "grad_norm_var": 1.5369140625, + "learning_rate": 0.0003, + "loss": 11.42, + "loss/aux_loss": 0.04808822255581617, + "loss/crossentropy": 2.812196373939514, + "loss/logits": 0.8949258029460907, + "step": 27850 + }, + { + "epoch": 0.2786, + "grad_norm": 13.0625, + "grad_norm_var": 0.7958170572916666, + "learning_rate": 0.0003, + "loss": 11.3989, + "loss/aux_loss": 0.04807552136480808, + "loss/crossentropy": 2.788651943206787, + "loss/logits": 0.8348335802555085, + "step": 27860 + }, + { + "epoch": 0.2787, + "grad_norm": 13.6875, + "grad_norm_var": 0.7613932291666666, + "learning_rate": 0.0003, + "loss": 11.5888, + "loss/aux_loss": 0.048087479919195174, + "loss/crossentropy": 2.7045384287834167, + "loss/logits": 0.9338543623685837, + "step": 27870 + }, + { + "epoch": 0.2788, + "grad_norm": 13.25, + "grad_norm_var": 0.6075520833333333, + "learning_rate": 0.0003, + "loss": 11.5237, + "loss/aux_loss": 0.048085146211087705, + "loss/crossentropy": 2.7415911316871644, + "loss/logits": 0.856224250793457, + "step": 27880 + }, + { + "epoch": 0.2789, + "grad_norm": 12.3125, + "grad_norm_var": 0.38058268229166664, + "learning_rate": 0.0003, + "loss": 11.5602, + "loss/aux_loss": 0.04807989951223135, + "loss/crossentropy": 2.7379807472229003, + "loss/logits": 0.8650393694639206, + "step": 27890 + }, + { + "epoch": 0.279, + "grad_norm": 13.0, + "grad_norm_var": 0.1525390625, + "learning_rate": 0.0003, + "loss": 11.608, + "loss/aux_loss": 0.048083983920514585, + "loss/crossentropy": 2.8002022445201873, + "loss/logits": 0.8693450152873993, + "step": 27900 + }, + { + "epoch": 0.2791, + "grad_norm": 13.0625, + "grad_norm_var": 0.425244140625, + "learning_rate": 0.0003, + "loss": 11.5797, + "loss/aux_loss": 0.04808452669531107, + "loss/crossentropy": 2.7015219628810883, + "loss/logits": 0.874952495098114, + "step": 27910 + }, + { + "epoch": 0.2792, + "grad_norm": 12.5625, + "grad_norm_var": 0.667041015625, + "learning_rate": 0.0003, + "loss": 11.5391, + "loss/aux_loss": 0.04808107353746891, + "loss/crossentropy": 2.6511988162994387, + "loss/logits": 0.844877976179123, + "step": 27920 + }, + { + "epoch": 0.2793, + "grad_norm": 16.0, + "grad_norm_var": 3.456103515625, + "learning_rate": 0.0003, + "loss": 11.4723, + "loss/aux_loss": 0.0480935912579298, + "loss/crossentropy": 2.69124321937561, + "loss/logits": 0.8498719304800033, + "step": 27930 + }, + { + "epoch": 0.2794, + "grad_norm": 16.0, + "grad_norm_var": 2.3822265625, + "learning_rate": 0.0003, + "loss": 11.5932, + "loss/aux_loss": 0.0480877697467804, + "loss/crossentropy": 2.886867892742157, + "loss/logits": 0.8901501029729844, + "step": 27940 + }, + { + "epoch": 0.2795, + "grad_norm": 11.75, + "grad_norm_var": 0.9264973958333333, + "learning_rate": 0.0003, + "loss": 11.3919, + "loss/aux_loss": 0.0480851836502552, + "loss/crossentropy": 2.5583014130592345, + "loss/logits": 0.8257781475782394, + "step": 27950 + }, + { + "epoch": 0.2796, + "grad_norm": 12.6875, + "grad_norm_var": 0.704150390625, + "learning_rate": 0.0003, + "loss": 11.349, + "loss/aux_loss": 0.048085610195994374, + "loss/crossentropy": 2.756976544857025, + "loss/logits": 0.8353757977485656, + "step": 27960 + }, + { + "epoch": 0.2797, + "grad_norm": 14.125, + "grad_norm_var": 0.378369140625, + "learning_rate": 0.0003, + "loss": 11.5423, + "loss/aux_loss": 0.048080562800168994, + "loss/crossentropy": 2.733251041173935, + "loss/logits": 0.8563009589910507, + "step": 27970 + }, + { + "epoch": 0.2798, + "grad_norm": 13.375, + "grad_norm_var": 0.47369791666666666, + "learning_rate": 0.0003, + "loss": 11.4917, + "loss/aux_loss": 0.048089844174683094, + "loss/crossentropy": 2.684944635629654, + "loss/logits": 0.8707854568958282, + "step": 27980 + }, + { + "epoch": 0.2799, + "grad_norm": 12.0, + "grad_norm_var": 0.1666015625, + "learning_rate": 0.0003, + "loss": 11.6145, + "loss/aux_loss": 0.04807740524411201, + "loss/crossentropy": 2.8147059202194216, + "loss/logits": 0.8616000026464462, + "step": 27990 + }, + { + "epoch": 0.28, + "grad_norm": 13.8125, + "grad_norm_var": 0.35989583333333336, + "learning_rate": 0.0003, + "loss": 11.605, + "loss/aux_loss": 0.048080855049192905, + "loss/crossentropy": 2.7819701194763184, + "loss/logits": 0.8910420656204223, + "step": 28000 + }, + { + "epoch": 0.2801, + "grad_norm": 12.25, + "grad_norm_var": 1.1546223958333333, + "learning_rate": 0.0003, + "loss": 11.4798, + "loss/aux_loss": 0.048088941164314745, + "loss/crossentropy": 2.7400415241718292, + "loss/logits": 0.8723890751600265, + "step": 28010 + }, + { + "epoch": 0.2802, + "grad_norm": 13.5, + "grad_norm_var": 0.7181640625, + "learning_rate": 0.0003, + "loss": 11.5532, + "loss/aux_loss": 0.04808063674718142, + "loss/crossentropy": 2.8234307289123537, + "loss/logits": 0.8648925483226776, + "step": 28020 + }, + { + "epoch": 0.2803, + "grad_norm": 14.1875, + "grad_norm_var": 0.7884765625, + "learning_rate": 0.0003, + "loss": 11.5742, + "loss/aux_loss": 0.048078789934515954, + "loss/crossentropy": 2.8293231964111327, + "loss/logits": 0.9036984205245971, + "step": 28030 + }, + { + "epoch": 0.2804, + "grad_norm": 12.8125, + "grad_norm_var": 0.5999348958333334, + "learning_rate": 0.0003, + "loss": 11.5579, + "loss/aux_loss": 0.04807864893227816, + "loss/crossentropy": 2.8947394490242004, + "loss/logits": 0.8946270734071732, + "step": 28040 + }, + { + "epoch": 0.2805, + "grad_norm": 12.5625, + "grad_norm_var": 0.49635416666666665, + "learning_rate": 0.0003, + "loss": 11.5083, + "loss/aux_loss": 0.04808484595268965, + "loss/crossentropy": 2.6746840596199037, + "loss/logits": 0.8644850313663482, + "step": 28050 + }, + { + "epoch": 0.2806, + "grad_norm": 12.875, + "grad_norm_var": 1402.0190104166666, + "learning_rate": 0.0003, + "loss": 11.557, + "loss/aux_loss": 0.048079652898013595, + "loss/crossentropy": 2.6483142554759977, + "loss/logits": 0.8723117738962174, + "step": 28060 + }, + { + "epoch": 0.2807, + "grad_norm": 13.1875, + "grad_norm_var": 1397.2218587239583, + "learning_rate": 0.0003, + "loss": 11.4769, + "loss/aux_loss": 0.04809585195034742, + "loss/crossentropy": 2.778903841972351, + "loss/logits": 0.8626104056835174, + "step": 28070 + }, + { + "epoch": 0.2808, + "grad_norm": 12.6875, + "grad_norm_var": 0.4710774739583333, + "learning_rate": 0.0003, + "loss": 11.6971, + "loss/aux_loss": 0.048076757788658143, + "loss/crossentropy": 2.6813664495944978, + "loss/logits": 0.8790749669075012, + "step": 28080 + }, + { + "epoch": 0.2809, + "grad_norm": 13.5, + "grad_norm_var": 0.48170572916666665, + "learning_rate": 0.0003, + "loss": 11.5118, + "loss/aux_loss": 0.048085582070052625, + "loss/crossentropy": 2.8408753156661986, + "loss/logits": 0.8780440986156464, + "step": 28090 + }, + { + "epoch": 0.281, + "grad_norm": 13.8125, + "grad_norm_var": 0.3015625, + "learning_rate": 0.0003, + "loss": 11.3568, + "loss/aux_loss": 0.04808991327881813, + "loss/crossentropy": 2.5951784670352938, + "loss/logits": 0.8443504124879837, + "step": 28100 + }, + { + "epoch": 0.2811, + "grad_norm": 15.0, + "grad_norm_var": 0.451806640625, + "learning_rate": 0.0003, + "loss": 11.5474, + "loss/aux_loss": 0.048089342564344405, + "loss/crossentropy": 2.6413574039936067, + "loss/logits": 0.8657073110342026, + "step": 28110 + }, + { + "epoch": 0.2812, + "grad_norm": 12.375, + "grad_norm_var": 0.6270182291666667, + "learning_rate": 0.0003, + "loss": 11.3937, + "loss/aux_loss": 0.04809096623212099, + "loss/crossentropy": 2.7579336047172545, + "loss/logits": 0.8672685265541077, + "step": 28120 + }, + { + "epoch": 0.2813, + "grad_norm": 12.875, + "grad_norm_var": 0.3942057291666667, + "learning_rate": 0.0003, + "loss": 11.5618, + "loss/aux_loss": 0.04808580968528986, + "loss/crossentropy": 2.7464880406856538, + "loss/logits": 0.8809055328369141, + "step": 28130 + }, + { + "epoch": 0.2814, + "grad_norm": 12.1875, + "grad_norm_var": 0.3719889322916667, + "learning_rate": 0.0003, + "loss": 11.4683, + "loss/aux_loss": 0.04808620549738407, + "loss/crossentropy": 2.729203450679779, + "loss/logits": 0.8312252789735795, + "step": 28140 + }, + { + "epoch": 0.2815, + "grad_norm": 11.9375, + "grad_norm_var": 4.386832682291667, + "learning_rate": 0.0003, + "loss": 11.592, + "loss/aux_loss": 0.04809138756245375, + "loss/crossentropy": 2.76963392496109, + "loss/logits": 0.8744438081979752, + "step": 28150 + }, + { + "epoch": 0.2816, + "grad_norm": 13.0, + "grad_norm_var": 4.50078125, + "learning_rate": 0.0003, + "loss": 11.5057, + "loss/aux_loss": 0.04808358568698168, + "loss/crossentropy": 2.7290889263153075, + "loss/logits": 0.8577259719371796, + "step": 28160 + }, + { + "epoch": 0.2817, + "grad_norm": 12.875, + "grad_norm_var": 1.4513020833333334, + "learning_rate": 0.0003, + "loss": 11.7034, + "loss/aux_loss": 0.04809059873223305, + "loss/crossentropy": 2.6322677731513977, + "loss/logits": 0.8687084138393402, + "step": 28170 + }, + { + "epoch": 0.2818, + "grad_norm": 12.8125, + "grad_norm_var": 0.6677083333333333, + "learning_rate": 0.0003, + "loss": 11.4394, + "loss/aux_loss": 0.04808175694197416, + "loss/crossentropy": 2.655596649646759, + "loss/logits": 0.8649382144212723, + "step": 28180 + }, + { + "epoch": 0.2819, + "grad_norm": 12.375, + "grad_norm_var": 0.7869140625, + "learning_rate": 0.0003, + "loss": 11.5506, + "loss/aux_loss": 0.048091573640704155, + "loss/crossentropy": 2.83742733001709, + "loss/logits": 0.8902147322893142, + "step": 28190 + }, + { + "epoch": 0.282, + "grad_norm": 12.5, + "grad_norm_var": 0.4087890625, + "learning_rate": 0.0003, + "loss": 11.5759, + "loss/aux_loss": 0.048080408945679665, + "loss/crossentropy": 2.654254060983658, + "loss/logits": 0.8351120471954345, + "step": 28200 + }, + { + "epoch": 0.2821, + "grad_norm": 13.4375, + "grad_norm_var": 0.566650390625, + "learning_rate": 0.0003, + "loss": 11.6061, + "loss/aux_loss": 0.04808955620974302, + "loss/crossentropy": 2.756123435497284, + "loss/logits": 0.8621039360761642, + "step": 28210 + }, + { + "epoch": 0.2822, + "grad_norm": 16.125, + "grad_norm_var": 0.8916666666666667, + "learning_rate": 0.0003, + "loss": 11.7105, + "loss/aux_loss": 0.04808141849935055, + "loss/crossentropy": 2.6973861932754515, + "loss/logits": 0.9015775710344315, + "step": 28220 + }, + { + "epoch": 0.2823, + "grad_norm": 13.4375, + "grad_norm_var": 1.2152180989583334, + "learning_rate": 0.0003, + "loss": 11.5418, + "loss/aux_loss": 0.04808177202939987, + "loss/crossentropy": 2.7852281630039215, + "loss/logits": 0.8739712238311768, + "step": 28230 + }, + { + "epoch": 0.2824, + "grad_norm": 12.375, + "grad_norm_var": 0.35714518229166664, + "learning_rate": 0.0003, + "loss": 11.4382, + "loss/aux_loss": 0.0480819521471858, + "loss/crossentropy": 2.8439980030059813, + "loss/logits": 0.8818934857845306, + "step": 28240 + }, + { + "epoch": 0.2825, + "grad_norm": 12.5, + "grad_norm_var": 0.40310872395833336, + "learning_rate": 0.0003, + "loss": 11.3937, + "loss/aux_loss": 0.04809800013899803, + "loss/crossentropy": 2.752180802822113, + "loss/logits": 0.8844646722078323, + "step": 28250 + }, + { + "epoch": 0.2826, + "grad_norm": 12.3125, + "grad_norm_var": 0.5004557291666667, + "learning_rate": 0.0003, + "loss": 11.5381, + "loss/aux_loss": 0.04807610791176557, + "loss/crossentropy": 2.587927532196045, + "loss/logits": 0.8262291848659515, + "step": 28260 + }, + { + "epoch": 0.2827, + "grad_norm": 14.625, + "grad_norm_var": 0.5624348958333333, + "learning_rate": 0.0003, + "loss": 11.3631, + "loss/aux_loss": 0.04808882139623165, + "loss/crossentropy": 2.8169186234474184, + "loss/logits": 0.8660006016492844, + "step": 28270 + }, + { + "epoch": 0.2828, + "grad_norm": 13.4375, + "grad_norm_var": 0.6494791666666667, + "learning_rate": 0.0003, + "loss": 11.7342, + "loss/aux_loss": 0.04807655774056911, + "loss/crossentropy": 2.7526882588863373, + "loss/logits": 0.8869515836238862, + "step": 28280 + }, + { + "epoch": 0.2829, + "grad_norm": 12.5, + "grad_norm_var": 0.586181640625, + "learning_rate": 0.0003, + "loss": 11.759, + "loss/aux_loss": 0.048088379204273224, + "loss/crossentropy": 2.7451719284057616, + "loss/logits": 0.9054650783538818, + "step": 28290 + }, + { + "epoch": 0.283, + "grad_norm": 12.0, + "grad_norm_var": 0.5676920572916667, + "learning_rate": 0.0003, + "loss": 11.4817, + "loss/aux_loss": 0.04807801488786936, + "loss/crossentropy": 2.8209929168224335, + "loss/logits": 0.8833929538726807, + "step": 28300 + }, + { + "epoch": 0.2831, + "grad_norm": 14.375, + "grad_norm_var": 0.9614420572916667, + "learning_rate": 0.0003, + "loss": 11.4367, + "loss/aux_loss": 0.048084160685539244, + "loss/crossentropy": 2.6196862697601317, + "loss/logits": 0.8329427570104599, + "step": 28310 + }, + { + "epoch": 0.2832, + "grad_norm": 13.5, + "grad_norm_var": 0.5783854166666667, + "learning_rate": 0.0003, + "loss": 11.5229, + "loss/aux_loss": 0.0480828158557415, + "loss/crossentropy": 2.847772258520126, + "loss/logits": 0.8690467923879623, + "step": 28320 + }, + { + "epoch": 0.2833, + "grad_norm": 12.125, + "grad_norm_var": 0.8005045572916667, + "learning_rate": 0.0003, + "loss": 11.4262, + "loss/aux_loss": 0.048074822314083575, + "loss/crossentropy": 2.687458795309067, + "loss/logits": 0.8230527967214585, + "step": 28330 + }, + { + "epoch": 0.2834, + "grad_norm": 14.6875, + "grad_norm_var": 0.592041015625, + "learning_rate": 0.0003, + "loss": 11.6375, + "loss/aux_loss": 0.048090120404958726, + "loss/crossentropy": 2.898961102962494, + "loss/logits": 0.8803368806838989, + "step": 28340 + }, + { + "epoch": 0.2835, + "grad_norm": 13.375, + "grad_norm_var": 0.2587890625, + "learning_rate": 0.0003, + "loss": 11.6828, + "loss/aux_loss": 0.048081318661570546, + "loss/crossentropy": 2.6955320119857786, + "loss/logits": 0.8958701252937317, + "step": 28350 + }, + { + "epoch": 0.2836, + "grad_norm": 12.1875, + "grad_norm_var": 0.29420572916666665, + "learning_rate": 0.0003, + "loss": 11.6121, + "loss/aux_loss": 0.048087548650801185, + "loss/crossentropy": 2.8415299594402312, + "loss/logits": 0.8907667517662048, + "step": 28360 + }, + { + "epoch": 0.2837, + "grad_norm": 13.5625, + "grad_norm_var": 0.29659830729166664, + "learning_rate": 0.0003, + "loss": 11.6166, + "loss/aux_loss": 0.048079358972609045, + "loss/crossentropy": 2.7396019995212555, + "loss/logits": 0.8907454043626786, + "step": 28370 + }, + { + "epoch": 0.2838, + "grad_norm": 13.375, + "grad_norm_var": 0.152978515625, + "learning_rate": 0.0003, + "loss": 11.4829, + "loss/aux_loss": 0.04807760044932365, + "loss/crossentropy": 2.768324136734009, + "loss/logits": 0.8386527955532074, + "step": 28380 + }, + { + "epoch": 0.2839, + "grad_norm": 13.5625, + "grad_norm_var": 0.23865559895833333, + "learning_rate": 0.0003, + "loss": 11.6151, + "loss/aux_loss": 0.048082569241523744, + "loss/crossentropy": 2.878603792190552, + "loss/logits": 0.8644590139389038, + "step": 28390 + }, + { + "epoch": 0.284, + "grad_norm": 14.8125, + "grad_norm_var": 0.49680989583333335, + "learning_rate": 0.0003, + "loss": 11.6059, + "loss/aux_loss": 0.04808495007455349, + "loss/crossentropy": 2.6303452491760253, + "loss/logits": 0.8935995787382126, + "step": 28400 + }, + { + "epoch": 0.2841, + "grad_norm": 12.125, + "grad_norm_var": 0.733056640625, + "learning_rate": 0.0003, + "loss": 11.4731, + "loss/aux_loss": 0.048085509426891805, + "loss/crossentropy": 2.828030973672867, + "loss/logits": 0.8445266515016556, + "step": 28410 + }, + { + "epoch": 0.2842, + "grad_norm": 13.6875, + "grad_norm_var": 0.44503580729166664, + "learning_rate": 0.0003, + "loss": 11.4002, + "loss/aux_loss": 0.04808097891509533, + "loss/crossentropy": 2.6388884663581846, + "loss/logits": 0.9015711516141891, + "step": 28420 + }, + { + "epoch": 0.2843, + "grad_norm": 13.75, + "grad_norm_var": 0.4434733072916667, + "learning_rate": 0.0003, + "loss": 11.514, + "loss/aux_loss": 0.048092016205191615, + "loss/crossentropy": 2.815828490257263, + "loss/logits": 0.9084261149168015, + "step": 28430 + }, + { + "epoch": 0.2844, + "grad_norm": 12.625, + "grad_norm_var": 0.54453125, + "learning_rate": 0.0003, + "loss": 11.5181, + "loss/aux_loss": 0.04807435479015112, + "loss/crossentropy": 2.7728405237197875, + "loss/logits": 0.887555119395256, + "step": 28440 + }, + { + "epoch": 0.2845, + "grad_norm": 12.5625, + "grad_norm_var": 0.42916666666666664, + "learning_rate": 0.0003, + "loss": 11.5056, + "loss/aux_loss": 0.04808815475553274, + "loss/crossentropy": 2.8544474244117737, + "loss/logits": 0.8347731292247772, + "step": 28450 + }, + { + "epoch": 0.2846, + "grad_norm": 14.0625, + "grad_norm_var": 0.5046223958333333, + "learning_rate": 0.0003, + "loss": 11.5202, + "loss/aux_loss": 0.048085703514516354, + "loss/crossentropy": 2.570024532079697, + "loss/logits": 0.8466786921024323, + "step": 28460 + }, + { + "epoch": 0.2847, + "grad_norm": 14.375, + "grad_norm_var": 0.6171223958333333, + "learning_rate": 0.0003, + "loss": 11.319, + "loss/aux_loss": 0.04809032492339611, + "loss/crossentropy": 2.7953803539276123, + "loss/logits": 0.8648561179637909, + "step": 28470 + }, + { + "epoch": 0.2848, + "grad_norm": 13.625, + "grad_norm_var": 0.5113932291666666, + "learning_rate": 0.0003, + "loss": 11.452, + "loss/aux_loss": 0.04808381143957376, + "loss/crossentropy": 2.7199991762638094, + "loss/logits": 0.8637189954519272, + "step": 28480 + }, + { + "epoch": 0.2849, + "grad_norm": 12.625, + "grad_norm_var": 0.410400390625, + "learning_rate": 0.0003, + "loss": 11.4654, + "loss/aux_loss": 0.04808434545993805, + "loss/crossentropy": 2.8280713319778443, + "loss/logits": 0.9059916436672211, + "step": 28490 + }, + { + "epoch": 0.285, + "grad_norm": 51.75, + "grad_norm_var": 95.12967122395834, + "learning_rate": 0.0003, + "loss": 11.5128, + "loss/aux_loss": 0.0480845658108592, + "loss/crossentropy": 2.858335256576538, + "loss/logits": 0.9236481755971908, + "step": 28500 + }, + { + "epoch": 0.2851, + "grad_norm": 14.1875, + "grad_norm_var": 92.11041666666667, + "learning_rate": 0.0003, + "loss": 11.5787, + "loss/aux_loss": 0.04808925464749336, + "loss/crossentropy": 2.846384787559509, + "loss/logits": 0.8865832269191742, + "step": 28510 + }, + { + "epoch": 0.2852, + "grad_norm": 13.625, + "grad_norm_var": 0.3070149739583333, + "learning_rate": 0.0003, + "loss": 11.6155, + "loss/aux_loss": 0.048087059520184994, + "loss/crossentropy": 2.6891987919807434, + "loss/logits": 0.8724437922239303, + "step": 28520 + }, + { + "epoch": 0.2853, + "grad_norm": 13.1875, + "grad_norm_var": 0.28487955729166664, + "learning_rate": 0.0003, + "loss": 11.647, + "loss/aux_loss": 0.04809146039187908, + "loss/crossentropy": 2.803027904033661, + "loss/logits": 0.8911570340394974, + "step": 28530 + }, + { + "epoch": 0.2854, + "grad_norm": 13.0, + "grad_norm_var": 0.46796875, + "learning_rate": 0.0003, + "loss": 11.6234, + "loss/aux_loss": 0.04807390477508307, + "loss/crossentropy": 2.7482841432094576, + "loss/logits": 0.8875775545835495, + "step": 28540 + }, + { + "epoch": 0.2855, + "grad_norm": 12.75, + "grad_norm_var": 55.06139322916667, + "learning_rate": 0.0003, + "loss": 11.552, + "loss/aux_loss": 0.04810281321406364, + "loss/crossentropy": 2.6797729313373564, + "loss/logits": 0.8286285102367401, + "step": 28550 + }, + { + "epoch": 0.2856, + "grad_norm": 13.4375, + "grad_norm_var": 1.0009765625, + "learning_rate": 0.0003, + "loss": 11.489, + "loss/aux_loss": 0.048082736879587175, + "loss/crossentropy": 2.6085013091564178, + "loss/logits": 0.8426523476839065, + "step": 28560 + }, + { + "epoch": 0.2857, + "grad_norm": 15.625, + "grad_norm_var": 0.8051432291666667, + "learning_rate": 0.0003, + "loss": 11.5542, + "loss/aux_loss": 0.04808100238442421, + "loss/crossentropy": 2.788421058654785, + "loss/logits": 0.865365993976593, + "step": 28570 + }, + { + "epoch": 0.2858, + "grad_norm": 13.3125, + "grad_norm_var": 0.9296712239583333, + "learning_rate": 0.0003, + "loss": 11.5754, + "loss/aux_loss": 0.04808872230350971, + "loss/crossentropy": 2.6733221411705017, + "loss/logits": 0.8689684510231018, + "step": 28580 + }, + { + "epoch": 0.2859, + "grad_norm": 13.5, + "grad_norm_var": 0.5859375, + "learning_rate": 0.0003, + "loss": 11.3833, + "loss/aux_loss": 0.048073044046759605, + "loss/crossentropy": 2.7072140097618105, + "loss/logits": 0.8347415089607239, + "step": 28590 + }, + { + "epoch": 0.286, + "grad_norm": 13.0625, + "grad_norm_var": 0.8980305989583334, + "learning_rate": 0.0003, + "loss": 11.5353, + "loss/aux_loss": 0.04807990416884422, + "loss/crossentropy": 2.6441255509853363, + "loss/logits": 0.8408429473638535, + "step": 28600 + }, + { + "epoch": 0.2861, + "grad_norm": 13.1875, + "grad_norm_var": 0.32667643229166665, + "learning_rate": 0.0003, + "loss": 11.3859, + "loss/aux_loss": 0.04808638412505388, + "loss/crossentropy": 2.8064417958259584, + "loss/logits": 0.8815937727689743, + "step": 28610 + }, + { + "epoch": 0.2862, + "grad_norm": 13.6875, + "grad_norm_var": 0.1900390625, + "learning_rate": 0.0003, + "loss": 11.3424, + "loss/aux_loss": 0.04808064606040716, + "loss/crossentropy": 2.692779916524887, + "loss/logits": 0.8429404377937317, + "step": 28620 + }, + { + "epoch": 0.2863, + "grad_norm": 13.5625, + "grad_norm_var": 0.3089680989583333, + "learning_rate": 0.0003, + "loss": 11.656, + "loss/aux_loss": 0.04809319917112589, + "loss/crossentropy": 2.8748478055000306, + "loss/logits": 0.9063412040472031, + "step": 28630 + }, + { + "epoch": 0.2864, + "grad_norm": 16.875, + "grad_norm_var": 13.566650390625, + "learning_rate": 0.0003, + "loss": 11.4876, + "loss/aux_loss": 0.04807700905948877, + "loss/crossentropy": 2.779051947593689, + "loss/logits": 0.8662481039762497, + "step": 28640 + }, + { + "epoch": 0.2865, + "grad_norm": 15.0625, + "grad_norm_var": 12.745426432291667, + "learning_rate": 0.0003, + "loss": 11.4317, + "loss/aux_loss": 0.04808287639170885, + "loss/crossentropy": 2.825643515586853, + "loss/logits": 0.8664470076560974, + "step": 28650 + }, + { + "epoch": 0.2866, + "grad_norm": 13.1875, + "grad_norm_var": 0.5754557291666667, + "learning_rate": 0.0003, + "loss": 11.2821, + "loss/aux_loss": 0.04808809049427509, + "loss/crossentropy": 2.735125958919525, + "loss/logits": 0.8850825309753418, + "step": 28660 + }, + { + "epoch": 0.2867, + "grad_norm": 15.0, + "grad_norm_var": 1.239306640625, + "learning_rate": 0.0003, + "loss": 11.6871, + "loss/aux_loss": 0.04809844307601452, + "loss/crossentropy": 2.6700200915336607, + "loss/logits": 0.8919987231492996, + "step": 28670 + }, + { + "epoch": 0.2868, + "grad_norm": 14.75, + "grad_norm_var": 1.1426432291666666, + "learning_rate": 0.0003, + "loss": 11.4459, + "loss/aux_loss": 0.04808284323662519, + "loss/crossentropy": 2.639972817897797, + "loss/logits": 0.8575376510620117, + "step": 28680 + }, + { + "epoch": 0.2869, + "grad_norm": 13.1875, + "grad_norm_var": 0.633447265625, + "learning_rate": 0.0003, + "loss": 11.448, + "loss/aux_loss": 0.04808585159480572, + "loss/crossentropy": 2.684755891561508, + "loss/logits": 0.858421990275383, + "step": 28690 + }, + { + "epoch": 0.287, + "grad_norm": 13.3125, + "grad_norm_var": 1.0254557291666666, + "learning_rate": 0.0003, + "loss": 11.4013, + "loss/aux_loss": 0.04808926824480295, + "loss/crossentropy": 2.733264869451523, + "loss/logits": 0.8662783950567245, + "step": 28700 + }, + { + "epoch": 0.2871, + "grad_norm": 13.125, + "grad_norm_var": 0.4328125, + "learning_rate": 0.0003, + "loss": 11.6559, + "loss/aux_loss": 0.04808164779096842, + "loss/crossentropy": 2.625485306978226, + "loss/logits": 0.8678814113140106, + "step": 28710 + }, + { + "epoch": 0.2872, + "grad_norm": 12.75, + "grad_norm_var": 0.3489583333333333, + "learning_rate": 0.0003, + "loss": 11.5625, + "loss/aux_loss": 0.048087488114833835, + "loss/crossentropy": 2.7748125314712526, + "loss/logits": 0.8737635612487793, + "step": 28720 + }, + { + "epoch": 0.2873, + "grad_norm": 13.75, + "grad_norm_var": 0.5188639322916667, + "learning_rate": 0.0003, + "loss": 11.3701, + "loss/aux_loss": 0.04807881489396095, + "loss/crossentropy": 2.825051474571228, + "loss/logits": 0.869257315993309, + "step": 28730 + }, + { + "epoch": 0.2874, + "grad_norm": 12.75, + "grad_norm_var": 0.3101399739583333, + "learning_rate": 0.0003, + "loss": 11.5001, + "loss/aux_loss": 0.04808692578226328, + "loss/crossentropy": 2.682652533054352, + "loss/logits": 0.8922774195671082, + "step": 28740 + }, + { + "epoch": 0.2875, + "grad_norm": 12.875, + "grad_norm_var": 0.2384765625, + "learning_rate": 0.0003, + "loss": 11.5308, + "loss/aux_loss": 0.048078923113644124, + "loss/crossentropy": 2.748230826854706, + "loss/logits": 0.9064568638801574, + "step": 28750 + }, + { + "epoch": 0.2876, + "grad_norm": 12.375, + "grad_norm_var": 0.2869140625, + "learning_rate": 0.0003, + "loss": 11.4102, + "loss/aux_loss": 0.0480932604521513, + "loss/crossentropy": 2.6067364394664763, + "loss/logits": 0.8314082384109497, + "step": 28760 + }, + { + "epoch": 0.2877, + "grad_norm": 12.5625, + "grad_norm_var": 0.4200358072916667, + "learning_rate": 0.0003, + "loss": 11.4282, + "loss/aux_loss": 0.04807720612734556, + "loss/crossentropy": 2.7676464080810548, + "loss/logits": 0.8628965139389038, + "step": 28770 + }, + { + "epoch": 0.2878, + "grad_norm": 13.5, + "grad_norm_var": 0.6832682291666666, + "learning_rate": 0.0003, + "loss": 11.5929, + "loss/aux_loss": 0.048078482411801814, + "loss/crossentropy": 2.7174685835838317, + "loss/logits": 0.8926462024450302, + "step": 28780 + }, + { + "epoch": 0.2879, + "grad_norm": 13.875, + "grad_norm_var": 0.73671875, + "learning_rate": 0.0003, + "loss": 11.4883, + "loss/aux_loss": 0.04808929469436407, + "loss/crossentropy": 2.8711092829704286, + "loss/logits": 0.9005297362804413, + "step": 28790 + }, + { + "epoch": 0.288, + "grad_norm": 58.75, + "grad_norm_var": 130.48567708333334, + "learning_rate": 0.0003, + "loss": 11.499, + "loss/aux_loss": 0.04807561915367842, + "loss/crossentropy": 2.684372991323471, + "loss/logits": 0.8370014727115631, + "step": 28800 + }, + { + "epoch": 0.2881, + "grad_norm": 12.75, + "grad_norm_var": 130.08795572916668, + "learning_rate": 0.0003, + "loss": 11.4676, + "loss/aux_loss": 0.04809402357786894, + "loss/crossentropy": 2.8078381299972532, + "loss/logits": 0.8840030491352081, + "step": 28810 + }, + { + "epoch": 0.2882, + "grad_norm": 13.25, + "grad_norm_var": 0.5019368489583333, + "learning_rate": 0.0003, + "loss": 11.4451, + "loss/aux_loss": 0.04808465614914894, + "loss/crossentropy": 2.681842344999313, + "loss/logits": 0.882282269001007, + "step": 28820 + }, + { + "epoch": 0.2883, + "grad_norm": 12.8125, + "grad_norm_var": 0.2556640625, + "learning_rate": 0.0003, + "loss": 11.4593, + "loss/aux_loss": 0.04808338657021523, + "loss/crossentropy": 2.718055808544159, + "loss/logits": 0.8846965968608856, + "step": 28830 + }, + { + "epoch": 0.2884, + "grad_norm": 13.625, + "grad_norm_var": 3.116259765625, + "learning_rate": 0.0003, + "loss": 11.4866, + "loss/aux_loss": 0.048081991448998454, + "loss/crossentropy": 2.824300652742386, + "loss/logits": 0.8746830075979233, + "step": 28840 + }, + { + "epoch": 0.2885, + "grad_norm": 13.625, + "grad_norm_var": 0.311181640625, + "learning_rate": 0.0003, + "loss": 11.2453, + "loss/aux_loss": 0.04808319676667452, + "loss/crossentropy": 2.759380376338959, + "loss/logits": 0.8470451653003692, + "step": 28850 + }, + { + "epoch": 0.2886, + "grad_norm": 12.5625, + "grad_norm_var": 1.086181640625, + "learning_rate": 0.0003, + "loss": 11.5076, + "loss/aux_loss": 0.04808532185852528, + "loss/crossentropy": 2.7223631918430327, + "loss/logits": 0.8595778405666351, + "step": 28860 + }, + { + "epoch": 0.2887, + "grad_norm": 13.3125, + "grad_norm_var": 0.7536458333333333, + "learning_rate": 0.0003, + "loss": 11.5933, + "loss/aux_loss": 0.04808366596698761, + "loss/crossentropy": 2.6659576177597044, + "loss/logits": 0.8435241490602493, + "step": 28870 + }, + { + "epoch": 0.2888, + "grad_norm": 12.375, + "grad_norm_var": 1.361572265625, + "learning_rate": 0.0003, + "loss": 11.6656, + "loss/aux_loss": 0.04809127729386091, + "loss/crossentropy": 2.731011927127838, + "loss/logits": 0.8725592494010925, + "step": 28880 + }, + { + "epoch": 0.2889, + "grad_norm": 12.8125, + "grad_norm_var": 1.0494140625, + "learning_rate": 0.0003, + "loss": 11.3996, + "loss/aux_loss": 0.04808544144034386, + "loss/crossentropy": 2.852566087245941, + "loss/logits": 0.9123659133911133, + "step": 28890 + }, + { + "epoch": 0.289, + "grad_norm": 12.625, + "grad_norm_var": 0.5302083333333333, + "learning_rate": 0.0003, + "loss": 11.5145, + "loss/aux_loss": 0.04808180872350931, + "loss/crossentropy": 2.8115632593631745, + "loss/logits": 0.8549628496170044, + "step": 28900 + }, + { + "epoch": 0.2891, + "grad_norm": 12.625, + "grad_norm_var": 0.4416015625, + "learning_rate": 0.0003, + "loss": 11.4916, + "loss/aux_loss": 0.048086360283195975, + "loss/crossentropy": 2.8208987712860107, + "loss/logits": 0.8684911131858826, + "step": 28910 + }, + { + "epoch": 0.2892, + "grad_norm": 12.875, + "grad_norm_var": 0.306494140625, + "learning_rate": 0.0003, + "loss": 11.6515, + "loss/aux_loss": 0.048084983974695204, + "loss/crossentropy": 2.82762331366539, + "loss/logits": 0.8740989983081817, + "step": 28920 + }, + { + "epoch": 0.2893, + "grad_norm": 12.375, + "grad_norm_var": 0.20428059895833334, + "learning_rate": 0.0003, + "loss": 11.3879, + "loss/aux_loss": 0.048082150518894196, + "loss/crossentropy": 2.7328701674938203, + "loss/logits": 0.8231735050678253, + "step": 28930 + }, + { + "epoch": 0.2894, + "grad_norm": 13.375, + "grad_norm_var": 0.40572916666666664, + "learning_rate": 0.0003, + "loss": 11.4964, + "loss/aux_loss": 0.04808959234505892, + "loss/crossentropy": 2.681563550233841, + "loss/logits": 0.8461143642663955, + "step": 28940 + }, + { + "epoch": 0.2895, + "grad_norm": 12.5, + "grad_norm_var": 0.245947265625, + "learning_rate": 0.0003, + "loss": 11.3121, + "loss/aux_loss": 0.04808651022613049, + "loss/crossentropy": 2.6562454462051392, + "loss/logits": 0.8221059828996659, + "step": 28950 + }, + { + "epoch": 0.2896, + "grad_norm": 13.0625, + "grad_norm_var": 0.122509765625, + "learning_rate": 0.0003, + "loss": 11.4933, + "loss/aux_loss": 0.04808787871152163, + "loss/crossentropy": 2.725093901157379, + "loss/logits": 0.8643009692430497, + "step": 28960 + }, + { + "epoch": 0.2897, + "grad_norm": 12.8125, + "grad_norm_var": 0.23203125, + "learning_rate": 0.0003, + "loss": 11.4234, + "loss/aux_loss": 0.04808852039277554, + "loss/crossentropy": 2.7337252140045165, + "loss/logits": 0.8330873519182205, + "step": 28970 + }, + { + "epoch": 0.2898, + "grad_norm": 13.1875, + "grad_norm_var": 0.406494140625, + "learning_rate": 0.0003, + "loss": 11.53, + "loss/aux_loss": 0.048081529699265955, + "loss/crossentropy": 2.6677880942821504, + "loss/logits": 0.8496348142623902, + "step": 28980 + }, + { + "epoch": 0.2899, + "grad_norm": 12.4375, + "grad_norm_var": 0.3011555989583333, + "learning_rate": 0.0003, + "loss": 11.3731, + "loss/aux_loss": 0.04807930588722229, + "loss/crossentropy": 2.6456236064434053, + "loss/logits": 0.8759482502937317, + "step": 28990 + }, + { + "epoch": 0.29, + "grad_norm": 12.3125, + "grad_norm_var": 0.14021809895833334, + "learning_rate": 0.0003, + "loss": 11.5333, + "loss/aux_loss": 0.048075495101511476, + "loss/crossentropy": 2.8131125450134276, + "loss/logits": 0.8634077340364457, + "step": 29000 + }, + { + "epoch": 0.2901, + "grad_norm": 12.8125, + "grad_norm_var": 0.12342122395833334, + "learning_rate": 0.0003, + "loss": 11.5456, + "loss/aux_loss": 0.04809298645704985, + "loss/crossentropy": 2.8442670702934265, + "loss/logits": 0.8695224732160568, + "step": 29010 + }, + { + "epoch": 0.2902, + "grad_norm": 12.25, + "grad_norm_var": 0.21458333333333332, + "learning_rate": 0.0003, + "loss": 11.5506, + "loss/aux_loss": 0.04808041173964739, + "loss/crossentropy": 2.5905315399169924, + "loss/logits": 0.8509000718593598, + "step": 29020 + }, + { + "epoch": 0.2903, + "grad_norm": 12.25, + "grad_norm_var": 0.5184733072916666, + "learning_rate": 0.0003, + "loss": 11.5184, + "loss/aux_loss": 0.0480886347591877, + "loss/crossentropy": 2.892964768409729, + "loss/logits": 0.887509498000145, + "step": 29030 + }, + { + "epoch": 0.2904, + "grad_norm": 14.1875, + "grad_norm_var": 0.609375, + "learning_rate": 0.0003, + "loss": 11.6753, + "loss/aux_loss": 0.048083288595080376, + "loss/crossentropy": 2.7917010486125946, + "loss/logits": 0.9125192284584045, + "step": 29040 + }, + { + "epoch": 0.2905, + "grad_norm": 12.875, + "grad_norm_var": 0.8778483072916666, + "learning_rate": 0.0003, + "loss": 11.4772, + "loss/aux_loss": 0.04808119647204876, + "loss/crossentropy": 2.6203967094421388, + "loss/logits": 0.8430052489042282, + "step": 29050 + }, + { + "epoch": 0.2906, + "grad_norm": 13.125, + "grad_norm_var": 2.5011555989583334, + "learning_rate": 0.0003, + "loss": 11.3984, + "loss/aux_loss": 0.048092004284262656, + "loss/crossentropy": 2.7506280064582826, + "loss/logits": 0.8497364521026611, + "step": 29060 + }, + { + "epoch": 0.2907, + "grad_norm": 13.1875, + "grad_norm_var": 2.12578125, + "learning_rate": 0.0003, + "loss": 11.4993, + "loss/aux_loss": 0.04808201938867569, + "loss/crossentropy": 2.765266942977905, + "loss/logits": 0.8675076127052307, + "step": 29070 + }, + { + "epoch": 0.2908, + "grad_norm": 12.9375, + "grad_norm_var": 0.17838541666666666, + "learning_rate": 0.0003, + "loss": 11.4641, + "loss/aux_loss": 0.048099024966359136, + "loss/crossentropy": 2.6994349300861358, + "loss/logits": 0.8749856293201447, + "step": 29080 + }, + { + "epoch": 0.2909, + "grad_norm": 12.75, + "grad_norm_var": 0.386962890625, + "learning_rate": 0.0003, + "loss": 11.6602, + "loss/aux_loss": 0.048081963881850244, + "loss/crossentropy": 2.8470765471458437, + "loss/logits": 0.9124794363975525, + "step": 29090 + }, + { + "epoch": 0.291, + "grad_norm": 12.75, + "grad_norm_var": 0.6010416666666667, + "learning_rate": 0.0003, + "loss": 11.7205, + "loss/aux_loss": 0.0480800811201334, + "loss/crossentropy": 2.632298457622528, + "loss/logits": 0.8833822071552276, + "step": 29100 + }, + { + "epoch": 0.2911, + "grad_norm": 13.375, + "grad_norm_var": 0.9180826822916667, + "learning_rate": 0.0003, + "loss": 11.5494, + "loss/aux_loss": 0.0480863980948925, + "loss/crossentropy": 2.8006245315074922, + "loss/logits": 0.8473658740520478, + "step": 29110 + }, + { + "epoch": 0.2912, + "grad_norm": 15.375, + "grad_norm_var": 20.746614583333333, + "learning_rate": 0.0003, + "loss": 11.4375, + "loss/aux_loss": 0.0480819221585989, + "loss/crossentropy": 2.8713009297847747, + "loss/logits": 0.8780528694391251, + "step": 29120 + }, + { + "epoch": 0.2913, + "grad_norm": 13.6875, + "grad_norm_var": 20.1234375, + "learning_rate": 0.0003, + "loss": 11.3549, + "loss/aux_loss": 0.048090817779302596, + "loss/crossentropy": 2.890332305431366, + "loss/logits": 0.9046699106693268, + "step": 29130 + }, + { + "epoch": 0.2914, + "grad_norm": 13.25, + "grad_norm_var": 0.9503743489583333, + "learning_rate": 0.0003, + "loss": 11.377, + "loss/aux_loss": 0.048084873519837854, + "loss/crossentropy": 2.6816116988658907, + "loss/logits": 0.8458852350711823, + "step": 29140 + }, + { + "epoch": 0.2915, + "grad_norm": 12.75, + "grad_norm_var": 0.383447265625, + "learning_rate": 0.0003, + "loss": 11.5313, + "loss/aux_loss": 0.04807651937007904, + "loss/crossentropy": 2.791849434375763, + "loss/logits": 0.8508994936943054, + "step": 29150 + }, + { + "epoch": 0.2916, + "grad_norm": 13.75, + "grad_norm_var": 0.5989583333333334, + "learning_rate": 0.0003, + "loss": 11.5582, + "loss/aux_loss": 0.04808504190295935, + "loss/crossentropy": 2.727277672290802, + "loss/logits": 0.8817504495382309, + "step": 29160 + }, + { + "epoch": 0.2917, + "grad_norm": 13.3125, + "grad_norm_var": 1.3075358072916667, + "learning_rate": 0.0003, + "loss": 11.5291, + "loss/aux_loss": 0.04808583091944456, + "loss/crossentropy": 2.900036704540253, + "loss/logits": 0.8793569028377533, + "step": 29170 + }, + { + "epoch": 0.2918, + "grad_norm": 12.1875, + "grad_norm_var": 1.3009765625, + "learning_rate": 0.0003, + "loss": 11.5269, + "loss/aux_loss": 0.04807761292904615, + "loss/crossentropy": 2.790689837932587, + "loss/logits": 0.8850260347127914, + "step": 29180 + }, + { + "epoch": 0.2919, + "grad_norm": 12.4375, + "grad_norm_var": 0.7283854166666667, + "learning_rate": 0.0003, + "loss": 11.454, + "loss/aux_loss": 0.04808845371007919, + "loss/crossentropy": 2.3579376369714735, + "loss/logits": 0.795929902791977, + "step": 29190 + }, + { + "epoch": 0.292, + "grad_norm": 14.0, + "grad_norm_var": 0.2769368489583333, + "learning_rate": 0.0003, + "loss": 11.4508, + "loss/aux_loss": 0.04808519259095192, + "loss/crossentropy": 2.759959888458252, + "loss/logits": 0.8524115920066834, + "step": 29200 + }, + { + "epoch": 0.2921, + "grad_norm": 12.3125, + "grad_norm_var": 0.40349934895833334, + "learning_rate": 0.0003, + "loss": 11.5466, + "loss/aux_loss": 0.048085874505341054, + "loss/crossentropy": 2.7660795211791993, + "loss/logits": 0.8959324955940247, + "step": 29210 + }, + { + "epoch": 0.2922, + "grad_norm": 12.6875, + "grad_norm_var": 0.9738932291666667, + "learning_rate": 0.0003, + "loss": 11.5221, + "loss/aux_loss": 0.04807844292372465, + "loss/crossentropy": 2.739389771223068, + "loss/logits": 0.9002001017332077, + "step": 29220 + }, + { + "epoch": 0.2923, + "grad_norm": 12.6875, + "grad_norm_var": 0.8861979166666667, + "learning_rate": 0.0003, + "loss": 11.604, + "loss/aux_loss": 0.04809589311480522, + "loss/crossentropy": 2.8620386838912966, + "loss/logits": 0.8738722622394561, + "step": 29230 + }, + { + "epoch": 0.2924, + "grad_norm": 13.0, + "grad_norm_var": 0.8202473958333333, + "learning_rate": 0.0003, + "loss": 11.5345, + "loss/aux_loss": 0.048076955042779444, + "loss/crossentropy": 2.765863335132599, + "loss/logits": 0.8648845195770264, + "step": 29240 + }, + { + "epoch": 0.2925, + "grad_norm": 14.125, + "grad_norm_var": 0.6406087239583333, + "learning_rate": 0.0003, + "loss": 11.4273, + "loss/aux_loss": 0.04808403495699167, + "loss/crossentropy": 2.6642160415649414, + "loss/logits": 0.8542217493057251, + "step": 29250 + }, + { + "epoch": 0.2926, + "grad_norm": 14.125, + "grad_norm_var": 0.6924479166666667, + "learning_rate": 0.0003, + "loss": 11.4789, + "loss/aux_loss": 0.048092410899698734, + "loss/crossentropy": 2.6984796285629273, + "loss/logits": 0.865311412513256, + "step": 29260 + }, + { + "epoch": 0.2927, + "grad_norm": 14.1875, + "grad_norm_var": 0.48125, + "learning_rate": 0.0003, + "loss": 11.6032, + "loss/aux_loss": 0.04807758815586567, + "loss/crossentropy": 2.764070636034012, + "loss/logits": 0.9029274463653565, + "step": 29270 + }, + { + "epoch": 0.2928, + "grad_norm": 12.6875, + "grad_norm_var": 0.37005208333333334, + "learning_rate": 0.0003, + "loss": 11.4, + "loss/aux_loss": 0.04808840956538916, + "loss/crossentropy": 2.5958930790424346, + "loss/logits": 0.8585714161396026, + "step": 29280 + }, + { + "epoch": 0.2929, + "grad_norm": 13.3125, + "grad_norm_var": 0.3031087239583333, + "learning_rate": 0.0003, + "loss": 11.3794, + "loss/aux_loss": 0.048076996393501756, + "loss/crossentropy": 2.841508948802948, + "loss/logits": 0.8784020185470581, + "step": 29290 + }, + { + "epoch": 0.293, + "grad_norm": 13.375, + "grad_norm_var": 1.5706868489583334, + "learning_rate": 0.0003, + "loss": 11.445, + "loss/aux_loss": 0.04809955209493637, + "loss/crossentropy": 2.80951851606369, + "loss/logits": 0.8316282510757447, + "step": 29300 + }, + { + "epoch": 0.2931, + "grad_norm": 13.1875, + "grad_norm_var": 0.7716145833333333, + "learning_rate": 0.0003, + "loss": 11.5951, + "loss/aux_loss": 0.04807936865836382, + "loss/crossentropy": 2.7189249217510225, + "loss/logits": 0.8644407778978348, + "step": 29310 + }, + { + "epoch": 0.2932, + "grad_norm": 12.6875, + "grad_norm_var": 0.5526041666666667, + "learning_rate": 0.0003, + "loss": 11.5453, + "loss/aux_loss": 0.048080489970743656, + "loss/crossentropy": 2.8331545174121855, + "loss/logits": 0.8822837799787522, + "step": 29320 + }, + { + "epoch": 0.2933, + "grad_norm": 13.1875, + "grad_norm_var": 0.23229166666666667, + "learning_rate": 0.0003, + "loss": 11.7391, + "loss/aux_loss": 0.0480887183919549, + "loss/crossentropy": 2.7327269673347474, + "loss/logits": 0.8839978009462357, + "step": 29330 + }, + { + "epoch": 0.2934, + "grad_norm": 13.625, + "grad_norm_var": 0.22654622395833332, + "learning_rate": 0.0003, + "loss": 11.4887, + "loss/aux_loss": 0.04807803481817245, + "loss/crossentropy": 2.721933346986771, + "loss/logits": 0.8577464699745179, + "step": 29340 + }, + { + "epoch": 0.2935, + "grad_norm": 13.25, + "grad_norm_var": 0.18448893229166666, + "learning_rate": 0.0003, + "loss": 11.4648, + "loss/aux_loss": 0.04808668624609709, + "loss/crossentropy": 2.8373769760131835, + "loss/logits": 0.8713858962059021, + "step": 29350 + }, + { + "epoch": 0.2936, + "grad_norm": 13.1875, + "grad_norm_var": 1.0161458333333333, + "learning_rate": 0.0003, + "loss": 11.4781, + "loss/aux_loss": 0.04806650560349226, + "loss/crossentropy": 2.651980197429657, + "loss/logits": 0.8469345271587372, + "step": 29360 + }, + { + "epoch": 0.2937, + "grad_norm": 14.1875, + "grad_norm_var": 0.3165201822916667, + "learning_rate": 0.0003, + "loss": 11.4494, + "loss/aux_loss": 0.048085213825106624, + "loss/crossentropy": 2.8691640198230743, + "loss/logits": 0.8683050394058227, + "step": 29370 + }, + { + "epoch": 0.2938, + "grad_norm": 13.75, + "grad_norm_var": 0.42630208333333336, + "learning_rate": 0.0003, + "loss": 11.5254, + "loss/aux_loss": 0.04808463230729103, + "loss/crossentropy": 2.6619069993495943, + "loss/logits": 0.8452241331338882, + "step": 29380 + }, + { + "epoch": 0.2939, + "grad_norm": 12.625, + "grad_norm_var": 0.3916015625, + "learning_rate": 0.0003, + "loss": 11.4964, + "loss/aux_loss": 0.04807299487292767, + "loss/crossentropy": 2.63063805103302, + "loss/logits": 0.8311535373330117, + "step": 29390 + }, + { + "epoch": 0.294, + "grad_norm": 13.3125, + "grad_norm_var": 0.3087890625, + "learning_rate": 0.0003, + "loss": 11.437, + "loss/aux_loss": 0.048074229061603545, + "loss/crossentropy": 2.745287525653839, + "loss/logits": 0.8874899983406067, + "step": 29400 + }, + { + "epoch": 0.2941, + "grad_norm": 13.125, + "grad_norm_var": 0.2837890625, + "learning_rate": 0.0003, + "loss": 11.5053, + "loss/aux_loss": 0.04808676596730947, + "loss/crossentropy": 2.740684485435486, + "loss/logits": 0.8409494936466217, + "step": 29410 + }, + { + "epoch": 0.2942, + "grad_norm": 12.1875, + "grad_norm_var": 0.21920572916666667, + "learning_rate": 0.0003, + "loss": 11.5673, + "loss/aux_loss": 0.04807845540344715, + "loss/crossentropy": 2.8806114912033083, + "loss/logits": 0.8758183747529984, + "step": 29420 + }, + { + "epoch": 0.2943, + "grad_norm": 12.8125, + "grad_norm_var": 1.0633951822916667, + "learning_rate": 0.0003, + "loss": 11.4363, + "loss/aux_loss": 0.04808698520064354, + "loss/crossentropy": 2.717182183265686, + "loss/logits": 0.8745023101568222, + "step": 29430 + }, + { + "epoch": 0.2944, + "grad_norm": 13.25, + "grad_norm_var": 0.4212890625, + "learning_rate": 0.0003, + "loss": 11.5294, + "loss/aux_loss": 0.048078556172549726, + "loss/crossentropy": 2.6900423645973204, + "loss/logits": 0.8533391326665878, + "step": 29440 + }, + { + "epoch": 0.2945, + "grad_norm": 13.8125, + "grad_norm_var": 0.2962890625, + "learning_rate": 0.0003, + "loss": 11.4001, + "loss/aux_loss": 0.0480863269418478, + "loss/crossentropy": 2.7647065460681914, + "loss/logits": 0.8730087608098984, + "step": 29450 + }, + { + "epoch": 0.2946, + "grad_norm": 13.625, + "grad_norm_var": 1.1960774739583333, + "learning_rate": 0.0003, + "loss": 11.5183, + "loss/aux_loss": 0.0480956656858325, + "loss/crossentropy": 2.675736755132675, + "loss/logits": 0.8797785133123398, + "step": 29460 + }, + { + "epoch": 0.2947, + "grad_norm": 12.8125, + "grad_norm_var": 1.1954264322916666, + "learning_rate": 0.0003, + "loss": 11.3158, + "loss/aux_loss": 0.0480833875015378, + "loss/crossentropy": 2.6308836817741392, + "loss/logits": 0.8395634293556213, + "step": 29470 + }, + { + "epoch": 0.2948, + "grad_norm": 12.0625, + "grad_norm_var": 0.5860514322916667, + "learning_rate": 0.0003, + "loss": 11.2813, + "loss/aux_loss": 0.048089759424328804, + "loss/crossentropy": 2.599923449754715, + "loss/logits": 0.8390878111124038, + "step": 29480 + }, + { + "epoch": 0.2949, + "grad_norm": 13.6875, + "grad_norm_var": 0.5067545572916666, + "learning_rate": 0.0003, + "loss": 11.4816, + "loss/aux_loss": 0.0480830617249012, + "loss/crossentropy": 2.597203868627548, + "loss/logits": 0.8285312354564667, + "step": 29490 + }, + { + "epoch": 0.295, + "grad_norm": 12.5625, + "grad_norm_var": 0.3780598958333333, + "learning_rate": 0.0003, + "loss": 11.416, + "loss/aux_loss": 0.048088861629366875, + "loss/crossentropy": 2.8947019577026367, + "loss/logits": 0.8721114903688431, + "step": 29500 + }, + { + "epoch": 0.2951, + "grad_norm": 12.9375, + "grad_norm_var": 0.34698893229166666, + "learning_rate": 0.0003, + "loss": 11.4425, + "loss/aux_loss": 0.048086998984217644, + "loss/crossentropy": 2.6360188245773317, + "loss/logits": 0.848075520992279, + "step": 29510 + }, + { + "epoch": 0.2952, + "grad_norm": 12.125, + "grad_norm_var": 0.190869140625, + "learning_rate": 0.0003, + "loss": 11.3914, + "loss/aux_loss": 0.04809488840401173, + "loss/crossentropy": 2.63630353808403, + "loss/logits": 0.8704184353351593, + "step": 29520 + }, + { + "epoch": 0.2953, + "grad_norm": 12.6875, + "grad_norm_var": 0.31834309895833335, + "learning_rate": 0.0003, + "loss": 11.5943, + "loss/aux_loss": 0.04807685688138008, + "loss/crossentropy": 2.7015933096408844, + "loss/logits": 0.8864135921001435, + "step": 29530 + }, + { + "epoch": 0.2954, + "grad_norm": 12.6875, + "grad_norm_var": 0.263525390625, + "learning_rate": 0.0003, + "loss": 11.5153, + "loss/aux_loss": 0.04809237774461508, + "loss/crossentropy": 2.8147780299186707, + "loss/logits": 0.8690049260854721, + "step": 29540 + }, + { + "epoch": 0.2955, + "grad_norm": 15.375, + "grad_norm_var": 4.4134765625, + "learning_rate": 0.0003, + "loss": 11.5648, + "loss/aux_loss": 0.048073606193065645, + "loss/crossentropy": 2.8568318367004393, + "loss/logits": 0.9089872241020203, + "step": 29550 + }, + { + "epoch": 0.2956, + "grad_norm": 12.125, + "grad_norm_var": 4.423372395833334, + "learning_rate": 0.0003, + "loss": 11.4575, + "loss/aux_loss": 0.04808273408561945, + "loss/crossentropy": 2.7235109508037567, + "loss/logits": 0.8837243676185608, + "step": 29560 + }, + { + "epoch": 0.2957, + "grad_norm": 13.75, + "grad_norm_var": 0.4041015625, + "learning_rate": 0.0003, + "loss": 11.4504, + "loss/aux_loss": 0.04809108339250088, + "loss/crossentropy": 2.7183880388736723, + "loss/logits": 0.8825681626796722, + "step": 29570 + }, + { + "epoch": 0.2958, + "grad_norm": 12.875, + "grad_norm_var": 0.15149739583333333, + "learning_rate": 0.0003, + "loss": 11.4876, + "loss/aux_loss": 0.04807641636580229, + "loss/crossentropy": 2.8469328343868257, + "loss/logits": 0.8711580604314804, + "step": 29580 + }, + { + "epoch": 0.2959, + "grad_norm": 12.4375, + "grad_norm_var": 0.16300455729166666, + "learning_rate": 0.0003, + "loss": 11.5655, + "loss/aux_loss": 0.04808410815894604, + "loss/crossentropy": 2.7582703590393067, + "loss/logits": 0.8958453744649887, + "step": 29590 + }, + { + "epoch": 0.296, + "grad_norm": 14.3125, + "grad_norm_var": 0.378125, + "learning_rate": 0.0003, + "loss": 11.587, + "loss/aux_loss": 0.048079511523246764, + "loss/crossentropy": 2.830457305908203, + "loss/logits": 0.916206705570221, + "step": 29600 + }, + { + "epoch": 0.2961, + "grad_norm": 12.625, + "grad_norm_var": 0.6526041666666667, + "learning_rate": 0.0003, + "loss": 11.3545, + "loss/aux_loss": 0.048083196952939035, + "loss/crossentropy": 2.6322430610656737, + "loss/logits": 0.8604376584291458, + "step": 29610 + }, + { + "epoch": 0.2962, + "grad_norm": 13.625, + "grad_norm_var": 0.5140625, + "learning_rate": 0.0003, + "loss": 11.3875, + "loss/aux_loss": 0.048088392801582815, + "loss/crossentropy": 2.7959298372268675, + "loss/logits": 0.8871166080236434, + "step": 29620 + }, + { + "epoch": 0.2963, + "grad_norm": 18.875, + "grad_norm_var": 3.191389973958333, + "learning_rate": 0.0003, + "loss": 11.3276, + "loss/aux_loss": 0.04808803517371416, + "loss/crossentropy": 2.799188733100891, + "loss/logits": 0.8698725253343582, + "step": 29630 + }, + { + "epoch": 0.2964, + "grad_norm": 14.5625, + "grad_norm_var": 2.5827962239583333, + "learning_rate": 0.0003, + "loss": 11.51, + "loss/aux_loss": 0.04808425158262253, + "loss/crossentropy": 2.8158665776252745, + "loss/logits": 0.8623910456895828, + "step": 29640 + }, + { + "epoch": 0.2965, + "grad_norm": 15.3125, + "grad_norm_var": 0.7145833333333333, + "learning_rate": 0.0003, + "loss": 11.4356, + "loss/aux_loss": 0.0480889655649662, + "loss/crossentropy": 2.744274616241455, + "loss/logits": 0.8680987030267715, + "step": 29650 + }, + { + "epoch": 0.2966, + "grad_norm": 13.8125, + "grad_norm_var": 0.6032389322916667, + "learning_rate": 0.0003, + "loss": 11.3509, + "loss/aux_loss": 0.04808337744325399, + "loss/crossentropy": 2.6652339160442353, + "loss/logits": 0.8523492991924286, + "step": 29660 + }, + { + "epoch": 0.2967, + "grad_norm": 13.3125, + "grad_norm_var": 0.3489583333333333, + "learning_rate": 0.0003, + "loss": 11.4915, + "loss/aux_loss": 0.04808827750384807, + "loss/crossentropy": 2.8008382678031922, + "loss/logits": 0.894950145483017, + "step": 29670 + }, + { + "epoch": 0.2968, + "grad_norm": 15.375, + "grad_norm_var": 0.7530598958333333, + "learning_rate": 0.0003, + "loss": 11.5014, + "loss/aux_loss": 0.04807999767363071, + "loss/crossentropy": 2.7002371549606323, + "loss/logits": 0.8693482935428619, + "step": 29680 + }, + { + "epoch": 0.2969, + "grad_norm": 13.25, + "grad_norm_var": 0.5391764322916667, + "learning_rate": 0.0003, + "loss": 11.474, + "loss/aux_loss": 0.048070631176233294, + "loss/crossentropy": 2.921978032588959, + "loss/logits": 0.899678111076355, + "step": 29690 + }, + { + "epoch": 0.297, + "grad_norm": 14.0625, + "grad_norm_var": 0.4332682291666667, + "learning_rate": 0.0003, + "loss": 11.5034, + "loss/aux_loss": 0.048087138868868354, + "loss/crossentropy": 2.7084551751613617, + "loss/logits": 0.8722521513700485, + "step": 29700 + }, + { + "epoch": 0.2971, + "grad_norm": 12.9375, + "grad_norm_var": 0.5426432291666666, + "learning_rate": 0.0003, + "loss": 11.6086, + "loss/aux_loss": 0.04807428289204836, + "loss/crossentropy": 2.5567859768867494, + "loss/logits": 0.8739277720451355, + "step": 29710 + }, + { + "epoch": 0.2972, + "grad_norm": 12.5, + "grad_norm_var": 0.34427083333333336, + "learning_rate": 0.0003, + "loss": 11.3334, + "loss/aux_loss": 0.048082271590828896, + "loss/crossentropy": 2.7359997153282167, + "loss/logits": 0.8679700314998626, + "step": 29720 + }, + { + "epoch": 0.2973, + "grad_norm": 13.1875, + "grad_norm_var": 1.246337890625, + "learning_rate": 0.0003, + "loss": 11.3988, + "loss/aux_loss": 0.04808435477316379, + "loss/crossentropy": 2.658203488588333, + "loss/logits": 0.8306466698646545, + "step": 29730 + }, + { + "epoch": 0.2974, + "grad_norm": 13.5, + "grad_norm_var": 0.9030598958333333, + "learning_rate": 0.0003, + "loss": 11.5435, + "loss/aux_loss": 0.048076405003666875, + "loss/crossentropy": 2.683915287256241, + "loss/logits": 0.8744410634040832, + "step": 29740 + }, + { + "epoch": 0.2975, + "grad_norm": 16.125, + "grad_norm_var": 92.749462890625, + "learning_rate": 0.0003, + "loss": 11.5341, + "loss/aux_loss": 0.04808039367198944, + "loss/crossentropy": 2.7447816848754885, + "loss/logits": 0.8636388152837753, + "step": 29750 + }, + { + "epoch": 0.2976, + "grad_norm": 12.75, + "grad_norm_var": 92.86678059895833, + "learning_rate": 0.0003, + "loss": 11.381, + "loss/aux_loss": 0.048084843531250955, + "loss/crossentropy": 2.6227781534194947, + "loss/logits": 0.8834515571594238, + "step": 29760 + }, + { + "epoch": 0.2977, + "grad_norm": 12.625, + "grad_norm_var": 0.331884765625, + "learning_rate": 0.0003, + "loss": 11.4901, + "loss/aux_loss": 0.04807569459080696, + "loss/crossentropy": 2.6902407228946688, + "loss/logits": 0.8586381793022155, + "step": 29770 + }, + { + "epoch": 0.2978, + "grad_norm": 13.25, + "grad_norm_var": 0.506884765625, + "learning_rate": 0.0003, + "loss": 11.4579, + "loss/aux_loss": 0.04808189757168293, + "loss/crossentropy": 2.9000410437583923, + "loss/logits": 0.8773433297872544, + "step": 29780 + }, + { + "epoch": 0.2979, + "grad_norm": 12.875, + "grad_norm_var": 0.5541015625, + "learning_rate": 0.0003, + "loss": 11.4549, + "loss/aux_loss": 0.048081548884510994, + "loss/crossentropy": 2.702808624505997, + "loss/logits": 0.8589540451765061, + "step": 29790 + }, + { + "epoch": 0.298, + "grad_norm": 13.125, + "grad_norm_var": 0.4332682291666667, + "learning_rate": 0.0003, + "loss": 11.5048, + "loss/aux_loss": 0.04808537419885397, + "loss/crossentropy": 2.7269632279872895, + "loss/logits": 0.8750491231679917, + "step": 29800 + }, + { + "epoch": 0.2981, + "grad_norm": 13.0625, + "grad_norm_var": 0.23123372395833333, + "learning_rate": 0.0003, + "loss": 11.4269, + "loss/aux_loss": 0.04808894339948892, + "loss/crossentropy": 2.5503712058067323, + "loss/logits": 0.8601103842258453, + "step": 29810 + }, + { + "epoch": 0.2982, + "grad_norm": 13.5625, + "grad_norm_var": 0.6322265625, + "learning_rate": 0.0003, + "loss": 11.4354, + "loss/aux_loss": 0.048082937858998774, + "loss/crossentropy": 2.692414093017578, + "loss/logits": 0.8803679436445236, + "step": 29820 + }, + { + "epoch": 0.2983, + "grad_norm": 13.1875, + "grad_norm_var": 0.363525390625, + "learning_rate": 0.0003, + "loss": 11.4705, + "loss/aux_loss": 0.048084008321166036, + "loss/crossentropy": 2.779222333431244, + "loss/logits": 0.8870409220457077, + "step": 29830 + }, + { + "epoch": 0.2984, + "grad_norm": 12.5625, + "grad_norm_var": 0.25636393229166665, + "learning_rate": 0.0003, + "loss": 11.3471, + "loss/aux_loss": 0.04808189794421196, + "loss/crossentropy": 2.8359466314315798, + "loss/logits": 0.895777115225792, + "step": 29840 + }, + { + "epoch": 0.2985, + "grad_norm": 14.625, + "grad_norm_var": 0.4911295572916667, + "learning_rate": 0.0003, + "loss": 11.5757, + "loss/aux_loss": 0.048088085278868674, + "loss/crossentropy": 2.681327813863754, + "loss/logits": 0.8474095374345779, + "step": 29850 + }, + { + "epoch": 0.2986, + "grad_norm": 14.1875, + "grad_norm_var": 1.6374837239583333, + "learning_rate": 0.0003, + "loss": 11.4771, + "loss/aux_loss": 0.048079535365104675, + "loss/crossentropy": 2.74518221616745, + "loss/logits": 0.8559576362371445, + "step": 29860 + }, + { + "epoch": 0.2987, + "grad_norm": 12.6875, + "grad_norm_var": 0.528759765625, + "learning_rate": 0.0003, + "loss": 11.6162, + "loss/aux_loss": 0.04807321559637785, + "loss/crossentropy": 2.8557145297527313, + "loss/logits": 0.9057471811771393, + "step": 29870 + }, + { + "epoch": 0.2988, + "grad_norm": 12.875, + "grad_norm_var": 0.8013020833333333, + "learning_rate": 0.0003, + "loss": 11.6681, + "loss/aux_loss": 0.048088300973176956, + "loss/crossentropy": 2.751220625638962, + "loss/logits": 0.8958558738231659, + "step": 29880 + }, + { + "epoch": 0.2989, + "grad_norm": 11.75, + "grad_norm_var": 0.9364583333333333, + "learning_rate": 0.0003, + "loss": 11.4696, + "loss/aux_loss": 0.04808711316436529, + "loss/crossentropy": 2.7763246476650236, + "loss/logits": 0.8446446388959885, + "step": 29890 + }, + { + "epoch": 0.299, + "grad_norm": 13.25, + "grad_norm_var": 0.60078125, + "learning_rate": 0.0003, + "loss": 11.3327, + "loss/aux_loss": 0.048075996339321136, + "loss/crossentropy": 2.47065726518631, + "loss/logits": 0.8439252525568008, + "step": 29900 + }, + { + "epoch": 0.2991, + "grad_norm": 13.125, + "grad_norm_var": 0.4515462239583333, + "learning_rate": 0.0003, + "loss": 11.3397, + "loss/aux_loss": 0.04808225966989994, + "loss/crossentropy": 2.684304392337799, + "loss/logits": 0.8657424867153167, + "step": 29910 + }, + { + "epoch": 0.2992, + "grad_norm": 12.8125, + "grad_norm_var": 0.081884765625, + "learning_rate": 0.0003, + "loss": 11.4104, + "loss/aux_loss": 0.04808203261345625, + "loss/crossentropy": 2.857515978813171, + "loss/logits": 0.8850834548473359, + "step": 29920 + }, + { + "epoch": 0.2993, + "grad_norm": 13.625, + "grad_norm_var": 310.426806640625, + "learning_rate": 0.0003, + "loss": 11.3021, + "loss/aux_loss": 0.04808754250407219, + "loss/crossentropy": 2.865814244747162, + "loss/logits": 0.9045403331518174, + "step": 29930 + }, + { + "epoch": 0.2994, + "grad_norm": 13.0625, + "grad_norm_var": 1.0432291666666667, + "learning_rate": 0.0003, + "loss": 11.5943, + "loss/aux_loss": 0.04808537941426039, + "loss/crossentropy": 2.7755528509616854, + "loss/logits": 0.888872966170311, + "step": 29940 + }, + { + "epoch": 0.2995, + "grad_norm": 12.4375, + "grad_norm_var": 0.2540201822916667, + "learning_rate": 0.0003, + "loss": 11.3798, + "loss/aux_loss": 0.048091139644384384, + "loss/crossentropy": 2.5855644285678863, + "loss/logits": 0.8455839395523072, + "step": 29950 + }, + { + "epoch": 0.2996, + "grad_norm": 14.0, + "grad_norm_var": 1.2926432291666667, + "learning_rate": 0.0003, + "loss": 11.4159, + "loss/aux_loss": 0.04809227306395769, + "loss/crossentropy": 2.719691050052643, + "loss/logits": 0.8842676371335983, + "step": 29960 + }, + { + "epoch": 0.2997, + "grad_norm": 13.125, + "grad_norm_var": 1.37109375, + "learning_rate": 0.0003, + "loss": 11.4488, + "loss/aux_loss": 0.0480900889262557, + "loss/crossentropy": 2.6705852150917053, + "loss/logits": 0.8526766896247864, + "step": 29970 + }, + { + "epoch": 0.2998, + "grad_norm": 13.0, + "grad_norm_var": 0.16365559895833334, + "learning_rate": 0.0003, + "loss": 11.4354, + "loss/aux_loss": 0.04807566087692976, + "loss/crossentropy": 2.8326764822006227, + "loss/logits": 0.8908536106348037, + "step": 29980 + }, + { + "epoch": 0.2999, + "grad_norm": 14.25, + "grad_norm_var": 0.218603515625, + "learning_rate": 0.0003, + "loss": 11.4361, + "loss/aux_loss": 0.04809534475207329, + "loss/crossentropy": 2.653664433956146, + "loss/logits": 0.8664328694343567, + "step": 29990 + }, + { + "epoch": 0.3, + "grad_norm": 13.875, + "grad_norm_var": 16.411393229166666, + "learning_rate": 0.0003, + "loss": 11.5995, + "loss/aux_loss": 0.04808858595788479, + "loss/crossentropy": 2.733062154054642, + "loss/logits": 0.8457317858934402, + "step": 30000 + }, + { + "epoch": 0.3001, + "grad_norm": 21.875, + "grad_norm_var": 2748.9501139322915, + "learning_rate": 0.0003, + "loss": 11.4879, + "loss/aux_loss": 0.0480996148660779, + "loss/crossentropy": 2.7736578941345216, + "loss/logits": 0.8403934806585311, + "step": 30010 + }, + { + "epoch": 0.3002, + "grad_norm": 13.125, + "grad_norm_var": 19.3478515625, + "learning_rate": 0.0003, + "loss": 11.4726, + "loss/aux_loss": 0.048092894814908506, + "loss/crossentropy": 2.705011248588562, + "loss/logits": 0.8455720961093902, + "step": 30020 + }, + { + "epoch": 0.3003, + "grad_norm": 12.125, + "grad_norm_var": 37.18553059895833, + "learning_rate": 0.0003, + "loss": 11.4553, + "loss/aux_loss": 0.04807716105133295, + "loss/crossentropy": 2.9149852752685548, + "loss/logits": 0.8917495250701905, + "step": 30030 + }, + { + "epoch": 0.3004, + "grad_norm": 12.1875, + "grad_norm_var": 37.059488932291664, + "learning_rate": 0.0003, + "loss": 11.3983, + "loss/aux_loss": 0.04808226209133863, + "loss/crossentropy": 2.841688472032547, + "loss/logits": 0.8982161253690719, + "step": 30040 + }, + { + "epoch": 0.3005, + "grad_norm": 12.8125, + "grad_norm_var": 0.34451497395833336, + "learning_rate": 0.0003, + "loss": 11.6804, + "loss/aux_loss": 0.04808265995234251, + "loss/crossentropy": 2.737361544370651, + "loss/logits": 0.8148900896310807, + "step": 30050 + }, + { + "epoch": 0.3006, + "grad_norm": 14.3125, + "grad_norm_var": 0.5416015625, + "learning_rate": 0.0003, + "loss": 11.6761, + "loss/aux_loss": 0.04808789361268282, + "loss/crossentropy": 2.577936816215515, + "loss/logits": 0.8507110446691513, + "step": 30060 + }, + { + "epoch": 0.3007, + "grad_norm": 15.125, + "grad_norm_var": 4.1171875, + "learning_rate": 0.0003, + "loss": 11.6774, + "loss/aux_loss": 0.04809730667620897, + "loss/crossentropy": 2.743165111541748, + "loss/logits": 0.8715814143419266, + "step": 30070 + }, + { + "epoch": 0.3008, + "grad_norm": 12.625, + "grad_norm_var": 1.2132649739583334, + "learning_rate": 0.0003, + "loss": 11.6082, + "loss/aux_loss": 0.04807158131152391, + "loss/crossentropy": 2.795542907714844, + "loss/logits": 0.8549720883369446, + "step": 30080 + }, + { + "epoch": 0.3009, + "grad_norm": 12.375, + "grad_norm_var": 0.562744140625, + "learning_rate": 0.0003, + "loss": 11.5119, + "loss/aux_loss": 0.04808140005916357, + "loss/crossentropy": 2.8126461267471314, + "loss/logits": 0.8966263324022293, + "step": 30090 + }, + { + "epoch": 0.301, + "grad_norm": 12.75, + "grad_norm_var": 0.65390625, + "learning_rate": 0.0003, + "loss": 11.4426, + "loss/aux_loss": 0.048090949095785616, + "loss/crossentropy": 2.7090745508670806, + "loss/logits": 0.8385947048664093, + "step": 30100 + }, + { + "epoch": 0.3011, + "grad_norm": 12.6875, + "grad_norm_var": 0.6181640625, + "learning_rate": 0.0003, + "loss": 11.5944, + "loss/aux_loss": 0.04808566849678755, + "loss/crossentropy": 2.751387929916382, + "loss/logits": 0.8912309646606446, + "step": 30110 + }, + { + "epoch": 0.3012, + "grad_norm": 13.25, + "grad_norm_var": 0.54609375, + "learning_rate": 0.0003, + "loss": 11.3972, + "loss/aux_loss": 0.04808124527335167, + "loss/crossentropy": 2.841317903995514, + "loss/logits": 0.8788245469331741, + "step": 30120 + }, + { + "epoch": 0.3013, + "grad_norm": 13.375, + "grad_norm_var": 0.324462890625, + "learning_rate": 0.0003, + "loss": 11.5188, + "loss/aux_loss": 0.04808822274208069, + "loss/crossentropy": 2.6749501705169676, + "loss/logits": 0.8571932524442673, + "step": 30130 + }, + { + "epoch": 0.3014, + "grad_norm": 13.0, + "grad_norm_var": 0.18631184895833333, + "learning_rate": 0.0003, + "loss": 11.5919, + "loss/aux_loss": 0.04807674512267113, + "loss/crossentropy": 2.8263603806495667, + "loss/logits": 0.9167416036128998, + "step": 30140 + }, + { + "epoch": 0.3015, + "grad_norm": 13.25, + "grad_norm_var": 0.5544108072916667, + "learning_rate": 0.0003, + "loss": 11.3587, + "loss/aux_loss": 0.04808600451797247, + "loss/crossentropy": 2.689769744873047, + "loss/logits": 0.8513695240020752, + "step": 30150 + }, + { + "epoch": 0.3016, + "grad_norm": 13.5, + "grad_norm_var": 0.20271809895833334, + "learning_rate": 0.0003, + "loss": 11.4439, + "loss/aux_loss": 0.04808979425579309, + "loss/crossentropy": 2.705520159006119, + "loss/logits": 0.8287128508090973, + "step": 30160 + }, + { + "epoch": 0.3017, + "grad_norm": 13.375, + "grad_norm_var": 0.34373372395833335, + "learning_rate": 0.0003, + "loss": 11.3879, + "loss/aux_loss": 0.04808113612234592, + "loss/crossentropy": 2.59697830080986, + "loss/logits": 0.8457653447985649, + "step": 30170 + }, + { + "epoch": 0.3018, + "grad_norm": 13.3125, + "grad_norm_var": 0.40572916666666664, + "learning_rate": 0.0003, + "loss": 11.6186, + "loss/aux_loss": 0.04808783624321222, + "loss/crossentropy": 2.7700432360172274, + "loss/logits": 0.8764997065067291, + "step": 30180 + }, + { + "epoch": 0.3019, + "grad_norm": 13.375, + "grad_norm_var": 0.5805826822916667, + "learning_rate": 0.0003, + "loss": 11.4003, + "loss/aux_loss": 0.04809058122336864, + "loss/crossentropy": 2.7070785045623778, + "loss/logits": 0.8807005375623703, + "step": 30190 + }, + { + "epoch": 0.302, + "grad_norm": 12.0, + "grad_norm_var": 0.5976399739583333, + "learning_rate": 0.0003, + "loss": 11.2816, + "loss/aux_loss": 0.048085220903158185, + "loss/crossentropy": 2.625184786319733, + "loss/logits": 0.8463748693466187, + "step": 30200 + }, + { + "epoch": 0.3021, + "grad_norm": 13.875, + "grad_norm_var": 0.5113932291666666, + "learning_rate": 0.0003, + "loss": 11.4653, + "loss/aux_loss": 0.04808960650116205, + "loss/crossentropy": 2.690778136253357, + "loss/logits": 0.8736082255840302, + "step": 30210 + }, + { + "epoch": 0.3022, + "grad_norm": 13.375, + "grad_norm_var": 0.38605143229166666, + "learning_rate": 0.0003, + "loss": 11.5442, + "loss/aux_loss": 0.04808677174150944, + "loss/crossentropy": 2.769177794456482, + "loss/logits": 0.8669810116291046, + "step": 30220 + }, + { + "epoch": 0.3023, + "grad_norm": 13.875, + "grad_norm_var": 0.43279622395833334, + "learning_rate": 0.0003, + "loss": 11.3678, + "loss/aux_loss": 0.04807732906192541, + "loss/crossentropy": 2.66594517827034, + "loss/logits": 0.8435944467782974, + "step": 30230 + }, + { + "epoch": 0.3024, + "grad_norm": 13.625, + "grad_norm_var": 0.349462890625, + "learning_rate": 0.0003, + "loss": 11.5848, + "loss/aux_loss": 0.048086778819561006, + "loss/crossentropy": 2.83836350440979, + "loss/logits": 0.8659113794565201, + "step": 30240 + }, + { + "epoch": 0.3025, + "grad_norm": 13.5, + "grad_norm_var": 0.40305989583333335, + "learning_rate": 0.0003, + "loss": 11.3988, + "loss/aux_loss": 0.04808458890765906, + "loss/crossentropy": 2.716430550813675, + "loss/logits": 0.8613810330629349, + "step": 30250 + }, + { + "epoch": 0.3026, + "grad_norm": 65.0, + "grad_norm_var": 168.16276041666666, + "learning_rate": 0.0003, + "loss": 11.4989, + "loss/aux_loss": 0.04808981157839298, + "loss/crossentropy": 2.8385813772678374, + "loss/logits": 0.8725706160068512, + "step": 30260 + }, + { + "epoch": 0.3027, + "grad_norm": 13.25, + "grad_norm_var": 168.298681640625, + "learning_rate": 0.0003, + "loss": 11.4201, + "loss/aux_loss": 0.04808289185166359, + "loss/crossentropy": 2.7978179454803467, + "loss/logits": 0.8753843992948532, + "step": 30270 + }, + { + "epoch": 0.3028, + "grad_norm": 12.5625, + "grad_norm_var": 0.8785807291666666, + "learning_rate": 0.0003, + "loss": 11.5324, + "loss/aux_loss": 0.048077399097383025, + "loss/crossentropy": 2.7661708891391754, + "loss/logits": 0.8352512449026108, + "step": 30280 + }, + { + "epoch": 0.3029, + "grad_norm": 14.0, + "grad_norm_var": 0.48072916666666665, + "learning_rate": 0.0003, + "loss": 11.5311, + "loss/aux_loss": 0.04808757621794939, + "loss/crossentropy": 2.787020003795624, + "loss/logits": 0.8474883437156677, + "step": 30290 + }, + { + "epoch": 0.303, + "grad_norm": 13.875, + "grad_norm_var": 0.32146809895833334, + "learning_rate": 0.0003, + "loss": 11.5551, + "loss/aux_loss": 0.048091338202357295, + "loss/crossentropy": 2.6601632058620455, + "loss/logits": 0.8541019320487976, + "step": 30300 + }, + { + "epoch": 0.3031, + "grad_norm": 14.1875, + "grad_norm_var": 0.6218098958333333, + "learning_rate": 0.0003, + "loss": 11.4167, + "loss/aux_loss": 0.04807760640978813, + "loss/crossentropy": 2.621816486120224, + "loss/logits": 0.8466038852930069, + "step": 30310 + }, + { + "epoch": 0.3032, + "grad_norm": 13.5, + "grad_norm_var": 0.8030598958333334, + "learning_rate": 0.0003, + "loss": 11.6232, + "loss/aux_loss": 0.048088185116648675, + "loss/crossentropy": 2.719381844997406, + "loss/logits": 0.853942820429802, + "step": 30320 + }, + { + "epoch": 0.3033, + "grad_norm": 12.0625, + "grad_norm_var": 0.863134765625, + "learning_rate": 0.0003, + "loss": 11.4434, + "loss/aux_loss": 0.04807737711817026, + "loss/crossentropy": 2.677357393503189, + "loss/logits": 0.8405762702226639, + "step": 30330 + }, + { + "epoch": 0.3034, + "grad_norm": 12.875, + "grad_norm_var": 0.522119140625, + "learning_rate": 0.0003, + "loss": 11.3732, + "loss/aux_loss": 0.04808907844126224, + "loss/crossentropy": 2.676799476146698, + "loss/logits": 0.8608356237411499, + "step": 30340 + }, + { + "epoch": 0.3035, + "grad_norm": 13.5625, + "grad_norm_var": 0.465478515625, + "learning_rate": 0.0003, + "loss": 11.5126, + "loss/aux_loss": 0.048095157742500304, + "loss/crossentropy": 2.7750605642795563, + "loss/logits": 0.8369805574417114, + "step": 30350 + }, + { + "epoch": 0.3036, + "grad_norm": 13.125, + "grad_norm_var": 0.25, + "learning_rate": 0.0003, + "loss": 11.4669, + "loss/aux_loss": 0.04807246904820204, + "loss/crossentropy": 2.611363673210144, + "loss/logits": 0.8665720134973526, + "step": 30360 + }, + { + "epoch": 0.3037, + "grad_norm": 12.875, + "grad_norm_var": 0.29140625, + "learning_rate": 0.0003, + "loss": 11.637, + "loss/aux_loss": 0.048090710304677486, + "loss/crossentropy": 2.826841878890991, + "loss/logits": 0.8832567691802978, + "step": 30370 + }, + { + "epoch": 0.3038, + "grad_norm": 14.6875, + "grad_norm_var": 3.5155598958333334, + "learning_rate": 0.0003, + "loss": 11.542, + "loss/aux_loss": 0.04807953424751758, + "loss/crossentropy": 2.7730814576148988, + "loss/logits": 0.8635116755962372, + "step": 30380 + }, + { + "epoch": 0.3039, + "grad_norm": 16.375, + "grad_norm_var": 3.6962890625, + "learning_rate": 0.0003, + "loss": 11.4562, + "loss/aux_loss": 0.048072515055537224, + "loss/crossentropy": 2.73685462474823, + "loss/logits": 0.8686755329370499, + "step": 30390 + }, + { + "epoch": 0.304, + "grad_norm": 14.625, + "grad_norm_var": 0.9533854166666667, + "learning_rate": 0.0003, + "loss": 11.5056, + "loss/aux_loss": 0.048081375658512115, + "loss/crossentropy": 2.789488208293915, + "loss/logits": 0.8672916740179062, + "step": 30400 + }, + { + "epoch": 0.3041, + "grad_norm": 13.375, + "grad_norm_var": 0.4032389322916667, + "learning_rate": 0.0003, + "loss": 11.4649, + "loss/aux_loss": 0.048084143362939355, + "loss/crossentropy": 2.8491791486740112, + "loss/logits": 0.8795545041561127, + "step": 30410 + }, + { + "epoch": 0.3042, + "grad_norm": 12.3125, + "grad_norm_var": 0.6163899739583333, + "learning_rate": 0.0003, + "loss": 11.4793, + "loss/aux_loss": 0.04807379003614187, + "loss/crossentropy": 2.7467067003250123, + "loss/logits": 0.8532693386077881, + "step": 30420 + }, + { + "epoch": 0.3043, + "grad_norm": 12.75, + "grad_norm_var": 1.0354166666666667, + "learning_rate": 0.0003, + "loss": 11.3694, + "loss/aux_loss": 0.04808585681021214, + "loss/crossentropy": 2.574428778886795, + "loss/logits": 0.837331035733223, + "step": 30430 + }, + { + "epoch": 0.3044, + "grad_norm": 13.8125, + "grad_norm_var": 0.3973795572916667, + "learning_rate": 0.0003, + "loss": 11.5313, + "loss/aux_loss": 0.04808316696435213, + "loss/crossentropy": 2.8282381296157837, + "loss/logits": 0.8839503526687622, + "step": 30440 + }, + { + "epoch": 0.3045, + "grad_norm": 14.375, + "grad_norm_var": 0.5514973958333333, + "learning_rate": 0.0003, + "loss": 11.6023, + "loss/aux_loss": 0.048081176541745665, + "loss/crossentropy": 2.7252886414527895, + "loss/logits": 0.8965796858072281, + "step": 30450 + }, + { + "epoch": 0.3046, + "grad_norm": 13.5, + "grad_norm_var": 0.6479166666666667, + "learning_rate": 0.0003, + "loss": 11.549, + "loss/aux_loss": 0.048083511739969255, + "loss/crossentropy": 2.711775553226471, + "loss/logits": 0.8697979748249054, + "step": 30460 + }, + { + "epoch": 0.3047, + "grad_norm": 12.6875, + "grad_norm_var": 0.903125, + "learning_rate": 0.0003, + "loss": 11.5003, + "loss/aux_loss": 0.048078321292996405, + "loss/crossentropy": 2.7702556967735292, + "loss/logits": 0.8688585251569748, + "step": 30470 + }, + { + "epoch": 0.3048, + "grad_norm": 14.0625, + "grad_norm_var": 0.4832682291666667, + "learning_rate": 0.0003, + "loss": 11.461, + "loss/aux_loss": 0.04808118157088757, + "loss/crossentropy": 2.677464705705643, + "loss/logits": 0.8462830722332001, + "step": 30480 + }, + { + "epoch": 0.3049, + "grad_norm": 12.75, + "grad_norm_var": 0.42213541666666665, + "learning_rate": 0.0003, + "loss": 11.3989, + "loss/aux_loss": 0.04808572474867105, + "loss/crossentropy": 2.7207378327846525, + "loss/logits": 0.9032052427530288, + "step": 30490 + }, + { + "epoch": 0.305, + "grad_norm": 13.375, + "grad_norm_var": 0.32265625, + "learning_rate": 0.0003, + "loss": 11.4989, + "loss/aux_loss": 0.04807707965373993, + "loss/crossentropy": 2.855698162317276, + "loss/logits": 0.8542087256908417, + "step": 30500 + }, + { + "epoch": 0.3051, + "grad_norm": 13.125, + "grad_norm_var": 0.43411458333333336, + "learning_rate": 0.0003, + "loss": 11.5808, + "loss/aux_loss": 0.04808335490524769, + "loss/crossentropy": 2.7730906128883364, + "loss/logits": 0.8658265113830567, + "step": 30510 + }, + { + "epoch": 0.3052, + "grad_norm": 13.25, + "grad_norm_var": 0.4462890625, + "learning_rate": 0.0003, + "loss": 11.3824, + "loss/aux_loss": 0.04808741491287947, + "loss/crossentropy": 2.6597979426383973, + "loss/logits": 0.8335691154003143, + "step": 30520 + }, + { + "epoch": 0.3053, + "grad_norm": 14.0625, + "grad_norm_var": 0.2416015625, + "learning_rate": 0.0003, + "loss": 11.5599, + "loss/aux_loss": 0.0480789877474308, + "loss/crossentropy": 2.7108544588088987, + "loss/logits": 0.8437099695205689, + "step": 30530 + }, + { + "epoch": 0.3054, + "grad_norm": 14.25, + "grad_norm_var": 0.5458333333333333, + "learning_rate": 0.0003, + "loss": 11.5408, + "loss/aux_loss": 0.048078577406704424, + "loss/crossentropy": 2.8623337388038634, + "loss/logits": 0.8665123015642167, + "step": 30540 + }, + { + "epoch": 0.3055, + "grad_norm": 13.125, + "grad_norm_var": 0.7640462239583333, + "learning_rate": 0.0003, + "loss": 11.4355, + "loss/aux_loss": 0.048077302612364294, + "loss/crossentropy": 2.7096797108650206, + "loss/logits": 0.8873968094587326, + "step": 30550 + }, + { + "epoch": 0.3056, + "grad_norm": 13.125, + "grad_norm_var": 0.31105143229166665, + "learning_rate": 0.0003, + "loss": 11.4985, + "loss/aux_loss": 0.048081740364432336, + "loss/crossentropy": 2.7807741165161133, + "loss/logits": 0.8777556359767914, + "step": 30560 + }, + { + "epoch": 0.3057, + "grad_norm": 12.75, + "grad_norm_var": 0.27810872395833336, + "learning_rate": 0.0003, + "loss": 11.4167, + "loss/aux_loss": 0.0480744456872344, + "loss/crossentropy": 2.8844709157943726, + "loss/logits": 0.9081658095121383, + "step": 30570 + }, + { + "epoch": 0.3058, + "grad_norm": 12.8125, + "grad_norm_var": 0.5905598958333333, + "learning_rate": 0.0003, + "loss": 11.4081, + "loss/aux_loss": 0.04809431917965412, + "loss/crossentropy": 2.6604327261447906, + "loss/logits": 0.8287265658378601, + "step": 30580 + }, + { + "epoch": 0.3059, + "grad_norm": 13.875, + "grad_norm_var": 0.40545247395833334, + "learning_rate": 0.0003, + "loss": 11.4122, + "loss/aux_loss": 0.048075050488114356, + "loss/crossentropy": 2.70709490776062, + "loss/logits": 0.8873141348361969, + "step": 30590 + }, + { + "epoch": 0.306, + "grad_norm": 13.0, + "grad_norm_var": 0.313134765625, + "learning_rate": 0.0003, + "loss": 11.4442, + "loss/aux_loss": 0.04807507153600454, + "loss/crossentropy": 2.7605147421360017, + "loss/logits": 0.8720807194709778, + "step": 30600 + }, + { + "epoch": 0.3061, + "grad_norm": 13.125, + "grad_norm_var": 0.3611979166666667, + "learning_rate": 0.0003, + "loss": 11.4805, + "loss/aux_loss": 0.04808742217719555, + "loss/crossentropy": 2.649176824092865, + "loss/logits": 0.8440777510404587, + "step": 30610 + }, + { + "epoch": 0.3062, + "grad_norm": 13.875, + "grad_norm_var": 0.20930989583333334, + "learning_rate": 0.0003, + "loss": 11.4971, + "loss/aux_loss": 0.048072243295609954, + "loss/crossentropy": 2.8579718112945556, + "loss/logits": 0.8863743543624878, + "step": 30620 + }, + { + "epoch": 0.3063, + "grad_norm": 12.875, + "grad_norm_var": 0.15870768229166668, + "learning_rate": 0.0003, + "loss": 11.4342, + "loss/aux_loss": 0.04809508193284273, + "loss/crossentropy": 2.7085582673549653, + "loss/logits": 0.855813917517662, + "step": 30630 + }, + { + "epoch": 0.3064, + "grad_norm": 14.625, + "grad_norm_var": 0.4129557291666667, + "learning_rate": 0.0003, + "loss": 11.5658, + "loss/aux_loss": 0.048079350404441355, + "loss/crossentropy": 2.8295456767082214, + "loss/logits": 0.8438192725181579, + "step": 30640 + }, + { + "epoch": 0.3065, + "grad_norm": 13.5625, + "grad_norm_var": 0.9072265625, + "learning_rate": 0.0003, + "loss": 11.2753, + "loss/aux_loss": 0.048086012713611126, + "loss/crossentropy": 2.637072730064392, + "loss/logits": 0.846964082121849, + "step": 30650 + }, + { + "epoch": 0.3066, + "grad_norm": 13.0625, + "grad_norm_var": 1.236962890625, + "learning_rate": 0.0003, + "loss": 11.4979, + "loss/aux_loss": 0.04808483086526394, + "loss/crossentropy": 2.7757628083229067, + "loss/logits": 0.8778376072645188, + "step": 30660 + }, + { + "epoch": 0.3067, + "grad_norm": 12.5625, + "grad_norm_var": 0.5494140625, + "learning_rate": 0.0003, + "loss": 11.4651, + "loss/aux_loss": 0.048084459826350213, + "loss/crossentropy": 2.7548458218574523, + "loss/logits": 0.9005285263061523, + "step": 30670 + }, + { + "epoch": 0.3068, + "grad_norm": 14.5625, + "grad_norm_var": 0.6807291666666667, + "learning_rate": 0.0003, + "loss": 11.6068, + "loss/aux_loss": 0.048079180717468264, + "loss/crossentropy": 2.737465226650238, + "loss/logits": 0.8828152716159821, + "step": 30680 + }, + { + "epoch": 0.3069, + "grad_norm": 13.375, + "grad_norm_var": 0.46484375, + "learning_rate": 0.0003, + "loss": 11.4114, + "loss/aux_loss": 0.04807949494570494, + "loss/crossentropy": 2.727854001522064, + "loss/logits": 0.8459988683462143, + "step": 30690 + }, + { + "epoch": 0.307, + "grad_norm": 12.0625, + "grad_norm_var": 0.50078125, + "learning_rate": 0.0003, + "loss": 11.3173, + "loss/aux_loss": 0.048087647184729576, + "loss/crossentropy": 2.7050256431102753, + "loss/logits": 0.8718531727790833, + "step": 30700 + }, + { + "epoch": 0.3071, + "grad_norm": 12.9375, + "grad_norm_var": 3.343603515625, + "learning_rate": 0.0003, + "loss": 11.4537, + "loss/aux_loss": 0.048082553595304486, + "loss/crossentropy": 2.7552569687366484, + "loss/logits": 0.8774021625518799, + "step": 30710 + }, + { + "epoch": 0.3072, + "grad_norm": 12.4375, + "grad_norm_var": 1.0231770833333333, + "learning_rate": 0.0003, + "loss": 11.3565, + "loss/aux_loss": 0.04808600023388863, + "loss/crossentropy": 2.6642140567302706, + "loss/logits": 0.8275765240192413, + "step": 30720 + }, + { + "epoch": 0.3073, + "grad_norm": 13.5625, + "grad_norm_var": 0.5879557291666667, + "learning_rate": 0.0003, + "loss": 11.5265, + "loss/aux_loss": 0.04808558952063322, + "loss/crossentropy": 2.7769883573055267, + "loss/logits": 0.8872695177793503, + "step": 30730 + }, + { + "epoch": 0.3074, + "grad_norm": 12.5625, + "grad_norm_var": 0.5175618489583333, + "learning_rate": 0.0003, + "loss": 11.4342, + "loss/aux_loss": 0.048081249184906485, + "loss/crossentropy": 2.8268501818180085, + "loss/logits": 0.8817792952060699, + "step": 30740 + }, + { + "epoch": 0.3075, + "grad_norm": 12.8125, + "grad_norm_var": 0.558447265625, + "learning_rate": 0.0003, + "loss": 11.626, + "loss/aux_loss": 0.048086341470479965, + "loss/crossentropy": 2.7402640700340273, + "loss/logits": 0.8773202210664749, + "step": 30750 + }, + { + "epoch": 0.3076, + "grad_norm": 13.875, + "grad_norm_var": 0.16588541666666667, + "learning_rate": 0.0003, + "loss": 11.6524, + "loss/aux_loss": 0.048075375705957414, + "loss/crossentropy": 2.6162120938301086, + "loss/logits": 0.842825299501419, + "step": 30760 + }, + { + "epoch": 0.3077, + "grad_norm": 12.6875, + "grad_norm_var": 0.40911458333333334, + "learning_rate": 0.0003, + "loss": 11.5109, + "loss/aux_loss": 0.04809289965778589, + "loss/crossentropy": 2.719909155368805, + "loss/logits": 0.8441285580396652, + "step": 30770 + }, + { + "epoch": 0.3078, + "grad_norm": 12.875, + "grad_norm_var": 0.6238932291666667, + "learning_rate": 0.0003, + "loss": 11.389, + "loss/aux_loss": 0.048084799014031884, + "loss/crossentropy": 2.7028889536857603, + "loss/logits": 0.8539338052272797, + "step": 30780 + }, + { + "epoch": 0.3079, + "grad_norm": 13.3125, + "grad_norm_var": 0.3238932291666667, + "learning_rate": 0.0003, + "loss": 11.3617, + "loss/aux_loss": 0.04808486420661211, + "loss/crossentropy": 2.5718206644058226, + "loss/logits": 0.8437188386917114, + "step": 30790 + }, + { + "epoch": 0.308, + "grad_norm": 13.5, + "grad_norm_var": 0.26608072916666664, + "learning_rate": 0.0003, + "loss": 11.4156, + "loss/aux_loss": 0.04808387756347656, + "loss/crossentropy": 2.6824039578437806, + "loss/logits": 0.8235249221324921, + "step": 30800 + }, + { + "epoch": 0.3081, + "grad_norm": 13.125, + "grad_norm_var": 0.27005208333333336, + "learning_rate": 0.0003, + "loss": 11.4488, + "loss/aux_loss": 0.048087064921855924, + "loss/crossentropy": 2.7326850056648255, + "loss/logits": 0.8537106692790986, + "step": 30810 + }, + { + "epoch": 0.3082, + "grad_norm": 13.5625, + "grad_norm_var": 0.23951822916666668, + "learning_rate": 0.0003, + "loss": 11.4492, + "loss/aux_loss": 0.048091364093124866, + "loss/crossentropy": 2.7636500120162966, + "loss/logits": 0.8671251088380814, + "step": 30820 + }, + { + "epoch": 0.3083, + "grad_norm": 13.4375, + "grad_norm_var": 1.5900390625, + "learning_rate": 0.0003, + "loss": 11.502, + "loss/aux_loss": 0.04808245934545994, + "loss/crossentropy": 2.856794422864914, + "loss/logits": 0.8604202717542648, + "step": 30830 + }, + { + "epoch": 0.3084, + "grad_norm": 13.5, + "grad_norm_var": 1.5712076822916667, + "learning_rate": 0.0003, + "loss": 11.37, + "loss/aux_loss": 0.04808021280914545, + "loss/crossentropy": 2.831040990352631, + "loss/logits": 0.8481287360191345, + "step": 30840 + }, + { + "epoch": 0.3085, + "grad_norm": 13.75, + "grad_norm_var": 0.31326497395833336, + "learning_rate": 0.0003, + "loss": 11.3802, + "loss/aux_loss": 0.04808556064963341, + "loss/crossentropy": 2.7837505459785463, + "loss/logits": 0.8858030140399933, + "step": 30850 + }, + { + "epoch": 0.3086, + "grad_norm": 13.3125, + "grad_norm_var": 0.6613118489583333, + "learning_rate": 0.0003, + "loss": 11.4733, + "loss/aux_loss": 0.04808393772691488, + "loss/crossentropy": 2.646303951740265, + "loss/logits": 0.8927146643400192, + "step": 30860 + }, + { + "epoch": 0.3087, + "grad_norm": 13.3125, + "grad_norm_var": 1.0082682291666667, + "learning_rate": 0.0003, + "loss": 11.5069, + "loss/aux_loss": 0.04806661605834961, + "loss/crossentropy": 2.6964763700962067, + "loss/logits": 0.8716022908687592, + "step": 30870 + }, + { + "epoch": 0.3088, + "grad_norm": 13.625, + "grad_norm_var": 1.622509765625, + "learning_rate": 0.0003, + "loss": 11.4855, + "loss/aux_loss": 0.0480882540345192, + "loss/crossentropy": 2.744866168498993, + "loss/logits": 0.8746917128562928, + "step": 30880 + }, + { + "epoch": 0.3089, + "grad_norm": 12.375, + "grad_norm_var": 0.5468098958333333, + "learning_rate": 0.0003, + "loss": 11.4911, + "loss/aux_loss": 0.04807100631296635, + "loss/crossentropy": 2.820644873380661, + "loss/logits": 0.8648158997297287, + "step": 30890 + }, + { + "epoch": 0.309, + "grad_norm": 12.875, + "grad_norm_var": 0.3555826822916667, + "learning_rate": 0.0003, + "loss": 11.3729, + "loss/aux_loss": 0.048082031309604645, + "loss/crossentropy": 2.7035711348056792, + "loss/logits": 0.8440298497676849, + "step": 30900 + }, + { + "epoch": 0.3091, + "grad_norm": 13.375, + "grad_norm_var": 0.3395182291666667, + "learning_rate": 0.0003, + "loss": 11.5239, + "loss/aux_loss": 0.048081082105636594, + "loss/crossentropy": 2.6476317226886747, + "loss/logits": 0.8694226413965225, + "step": 30910 + }, + { + "epoch": 0.3092, + "grad_norm": 14.0625, + "grad_norm_var": 0.27805989583333335, + "learning_rate": 0.0003, + "loss": 11.557, + "loss/aux_loss": 0.04807517770677805, + "loss/crossentropy": 2.789444291591644, + "loss/logits": 0.8987109959125519, + "step": 30920 + }, + { + "epoch": 0.3093, + "grad_norm": 13.0, + "grad_norm_var": 0.448291015625, + "learning_rate": 0.0003, + "loss": 11.4581, + "loss/aux_loss": 0.048085184581577775, + "loss/crossentropy": 2.8388954520225527, + "loss/logits": 0.8571620523929596, + "step": 30930 + }, + { + "epoch": 0.3094, + "grad_norm": 13.5625, + "grad_norm_var": 1.1176432291666667, + "learning_rate": 0.0003, + "loss": 11.4472, + "loss/aux_loss": 0.04808578956872225, + "loss/crossentropy": 2.7803068816661836, + "loss/logits": 0.8517871230840683, + "step": 30940 + }, + { + "epoch": 0.3095, + "grad_norm": 13.6875, + "grad_norm_var": 0.879931640625, + "learning_rate": 0.0003, + "loss": 11.2953, + "loss/aux_loss": 0.048075218498706815, + "loss/crossentropy": 2.6304258346557616, + "loss/logits": 0.8701532393693924, + "step": 30950 + }, + { + "epoch": 0.3096, + "grad_norm": 14.1875, + "grad_norm_var": 0.5738932291666666, + "learning_rate": 0.0003, + "loss": 11.4005, + "loss/aux_loss": 0.04808662962168455, + "loss/crossentropy": 2.7041739583015443, + "loss/logits": 0.8431258827447892, + "step": 30960 + }, + { + "epoch": 0.3097, + "grad_norm": 14.625, + "grad_norm_var": 0.693212890625, + "learning_rate": 0.0003, + "loss": 11.1962, + "loss/aux_loss": 0.04808198884129524, + "loss/crossentropy": 2.5106098532676695, + "loss/logits": 0.8334185928106308, + "step": 30970 + }, + { + "epoch": 0.3098, + "grad_norm": 13.5625, + "grad_norm_var": 0.5868326822916666, + "learning_rate": 0.0003, + "loss": 11.3845, + "loss/aux_loss": 0.04808704257011413, + "loss/crossentropy": 2.824685072898865, + "loss/logits": 0.8856196343898773, + "step": 30980 + }, + { + "epoch": 0.3099, + "grad_norm": 13.8125, + "grad_norm_var": 0.5233723958333333, + "learning_rate": 0.0003, + "loss": 11.2844, + "loss/aux_loss": 0.04808703400194645, + "loss/crossentropy": 2.70513573884964, + "loss/logits": 0.8594135075807572, + "step": 30990 + }, + { + "epoch": 0.31, + "grad_norm": 12.8125, + "grad_norm_var": 0.32537434895833334, + "learning_rate": 0.0003, + "loss": 11.431, + "loss/aux_loss": 0.048076413199305536, + "loss/crossentropy": 2.790517818927765, + "loss/logits": 0.8762030184268952, + "step": 31000 + }, + { + "epoch": 0.3101, + "grad_norm": 12.3125, + "grad_norm_var": 0.309375, + "learning_rate": 0.0003, + "loss": 11.3837, + "loss/aux_loss": 0.048087396286427976, + "loss/crossentropy": 2.823656415939331, + "loss/logits": 0.8393119305372239, + "step": 31010 + }, + { + "epoch": 0.3102, + "grad_norm": 13.3125, + "grad_norm_var": 38.86300455729167, + "learning_rate": 0.0003, + "loss": 11.455, + "loss/aux_loss": 0.04807733986526728, + "loss/crossentropy": 2.8026981115341187, + "loss/logits": 0.8650432884693146, + "step": 31020 + }, + { + "epoch": 0.3103, + "grad_norm": 17.125, + "grad_norm_var": 1.2098958333333334, + "learning_rate": 0.0003, + "loss": 11.4485, + "loss/aux_loss": 0.048109233193099496, + "loss/crossentropy": 2.7331307351589205, + "loss/logits": 0.8930859625339508, + "step": 31030 + }, + { + "epoch": 0.3104, + "grad_norm": 14.125, + "grad_norm_var": 1.227587890625, + "learning_rate": 0.0003, + "loss": 11.5582, + "loss/aux_loss": 0.04805864728987217, + "loss/crossentropy": 2.7618947505950926, + "loss/logits": 0.858463802933693, + "step": 31040 + }, + { + "epoch": 0.3105, + "grad_norm": 12.5625, + "grad_norm_var": 0.5054524739583334, + "learning_rate": 0.0003, + "loss": 11.2948, + "loss/aux_loss": 0.048091983795166014, + "loss/crossentropy": 2.8342044055461884, + "loss/logits": 0.8382025718688965, + "step": 31050 + }, + { + "epoch": 0.3106, + "grad_norm": 13.3125, + "grad_norm_var": 0.2530598958333333, + "learning_rate": 0.0003, + "loss": 11.3819, + "loss/aux_loss": 0.04808224979788065, + "loss/crossentropy": 2.667121487855911, + "loss/logits": 0.8481329113245011, + "step": 31060 + }, + { + "epoch": 0.3107, + "grad_norm": 13.5, + "grad_norm_var": 0.192431640625, + "learning_rate": 0.0003, + "loss": 11.4812, + "loss/aux_loss": 0.04808955937623978, + "loss/crossentropy": 2.6511776447296143, + "loss/logits": 0.8379587948322296, + "step": 31070 + }, + { + "epoch": 0.3108, + "grad_norm": 12.375, + "grad_norm_var": 0.3859375, + "learning_rate": 0.0003, + "loss": 11.4201, + "loss/aux_loss": 0.048076062090694906, + "loss/crossentropy": 2.909103608131409, + "loss/logits": 0.9112180799245835, + "step": 31080 + }, + { + "epoch": 0.3109, + "grad_norm": 12.3125, + "grad_norm_var": 0.23605143229166667, + "learning_rate": 0.0003, + "loss": 11.5163, + "loss/aux_loss": 0.04807505179196596, + "loss/crossentropy": 2.720409429073334, + "loss/logits": 0.8697555780410766, + "step": 31090 + }, + { + "epoch": 0.311, + "grad_norm": 12.625, + "grad_norm_var": 0.35149739583333334, + "learning_rate": 0.0003, + "loss": 11.3274, + "loss/aux_loss": 0.04808955602347851, + "loss/crossentropy": 2.6798146247863768, + "loss/logits": 0.8508224755525589, + "step": 31100 + }, + { + "epoch": 0.3111, + "grad_norm": 12.9375, + "grad_norm_var": 0.9098307291666666, + "learning_rate": 0.0003, + "loss": 11.2321, + "loss/aux_loss": 0.048081112653017045, + "loss/crossentropy": 2.8907525897026063, + "loss/logits": 0.8622247904539109, + "step": 31110 + }, + { + "epoch": 0.3112, + "grad_norm": 14.0, + "grad_norm_var": 0.6708333333333333, + "learning_rate": 0.0003, + "loss": 11.3221, + "loss/aux_loss": 0.048092206753790376, + "loss/crossentropy": 2.5958180367946624, + "loss/logits": 0.8239250183105469, + "step": 31120 + }, + { + "epoch": 0.3113, + "grad_norm": 13.625, + "grad_norm_var": 0.5369791666666667, + "learning_rate": 0.0003, + "loss": 11.3484, + "loss/aux_loss": 0.04808017313480377, + "loss/crossentropy": 2.730751097202301, + "loss/logits": 0.8445838242769241, + "step": 31130 + }, + { + "epoch": 0.3114, + "grad_norm": 14.0625, + "grad_norm_var": 1.043212890625, + "learning_rate": 0.0003, + "loss": 11.2755, + "loss/aux_loss": 0.0480805704370141, + "loss/crossentropy": 2.760500192642212, + "loss/logits": 0.8478647708892822, + "step": 31140 + }, + { + "epoch": 0.3115, + "grad_norm": 13.625, + "grad_norm_var": 0.799462890625, + "learning_rate": 0.0003, + "loss": 11.5213, + "loss/aux_loss": 0.048074069805443286, + "loss/crossentropy": 2.8183942079544066, + "loss/logits": 0.8743588626384735, + "step": 31150 + }, + { + "epoch": 0.3116, + "grad_norm": 13.6875, + "grad_norm_var": 0.36053059895833334, + "learning_rate": 0.0003, + "loss": 11.3418, + "loss/aux_loss": 0.048081109300255775, + "loss/crossentropy": 2.8368311285972596, + "loss/logits": 0.8868528872728347, + "step": 31160 + }, + { + "epoch": 0.3117, + "grad_norm": 13.75, + "grad_norm_var": 0.474072265625, + "learning_rate": 0.0003, + "loss": 11.3342, + "loss/aux_loss": 0.048071020655333994, + "loss/crossentropy": 2.630265325307846, + "loss/logits": 0.8686846703290939, + "step": 31170 + }, + { + "epoch": 0.3118, + "grad_norm": 13.625, + "grad_norm_var": 0.22420247395833334, + "learning_rate": 0.0003, + "loss": 11.6492, + "loss/aux_loss": 0.04808335732668638, + "loss/crossentropy": 2.9368000745773317, + "loss/logits": 0.8898035645484924, + "step": 31180 + }, + { + "epoch": 0.3119, + "grad_norm": 13.1875, + "grad_norm_var": 0.237353515625, + "learning_rate": 0.0003, + "loss": 11.5728, + "loss/aux_loss": 0.04807795882225037, + "loss/crossentropy": 2.8425419092178346, + "loss/logits": 0.8769685357809067, + "step": 31190 + }, + { + "epoch": 0.312, + "grad_norm": 13.125, + "grad_norm_var": 0.21145833333333333, + "learning_rate": 0.0003, + "loss": 11.5003, + "loss/aux_loss": 0.04807908125221729, + "loss/crossentropy": 2.8865275263786314, + "loss/logits": 0.89976706802845, + "step": 31200 + }, + { + "epoch": 0.3121, + "grad_norm": 13.0625, + "grad_norm_var": 0.4032389322916667, + "learning_rate": 0.0003, + "loss": 11.448, + "loss/aux_loss": 0.048086687363684176, + "loss/crossentropy": 2.8929324388504027, + "loss/logits": 0.8646660923957825, + "step": 31210 + }, + { + "epoch": 0.3122, + "grad_norm": 13.625, + "grad_norm_var": 3.517041015625, + "learning_rate": 0.0003, + "loss": 11.3664, + "loss/aux_loss": 0.04807643294334411, + "loss/crossentropy": 2.6457720398902893, + "loss/logits": 0.8991645514965058, + "step": 31220 + }, + { + "epoch": 0.3123, + "grad_norm": 13.5, + "grad_norm_var": 3.8999348958333333, + "learning_rate": 0.0003, + "loss": 11.4684, + "loss/aux_loss": 0.04808459933847189, + "loss/crossentropy": 2.634989720582962, + "loss/logits": 0.8654608964920044, + "step": 31230 + }, + { + "epoch": 0.3124, + "grad_norm": 14.3125, + "grad_norm_var": 0.4212890625, + "learning_rate": 0.0003, + "loss": 11.719, + "loss/aux_loss": 0.04808775335550308, + "loss/crossentropy": 2.8814554154872893, + "loss/logits": 0.8489186823368072, + "step": 31240 + }, + { + "epoch": 0.3125, + "grad_norm": 12.8125, + "grad_norm_var": 0.6421223958333333, + "learning_rate": 0.0003, + "loss": 11.5017, + "loss/aux_loss": 0.04807210192084312, + "loss/crossentropy": 2.8812507152557374, + "loss/logits": 0.8861448734998703, + "step": 31250 + }, + { + "epoch": 0.3126, + "grad_norm": 13.8125, + "grad_norm_var": 1.1890625, + "learning_rate": 0.0003, + "loss": 11.3459, + "loss/aux_loss": 0.048083325289189814, + "loss/crossentropy": 2.6456491708755494, + "loss/logits": 0.8774872869253159, + "step": 31260 + }, + { + "epoch": 0.3127, + "grad_norm": 14.5625, + "grad_norm_var": 0.7696451822916667, + "learning_rate": 0.0003, + "loss": 11.4124, + "loss/aux_loss": 0.048082329146564004, + "loss/crossentropy": 2.855871230363846, + "loss/logits": 0.8438941597938537, + "step": 31270 + }, + { + "epoch": 0.3128, + "grad_norm": 12.6875, + "grad_norm_var": 0.7869140625, + "learning_rate": 0.0003, + "loss": 11.2841, + "loss/aux_loss": 0.048089153692126275, + "loss/crossentropy": 2.7080669164657594, + "loss/logits": 0.8581030815839767, + "step": 31280 + }, + { + "epoch": 0.3129, + "grad_norm": 13.375, + "grad_norm_var": 0.5102701822916667, + "learning_rate": 0.0003, + "loss": 11.5182, + "loss/aux_loss": 0.048074636980891226, + "loss/crossentropy": 2.7201138913631437, + "loss/logits": 0.8733494013547898, + "step": 31290 + }, + { + "epoch": 0.313, + "grad_norm": 12.4375, + "grad_norm_var": 0.6463541666666667, + "learning_rate": 0.0003, + "loss": 11.5523, + "loss/aux_loss": 0.04808723703026772, + "loss/crossentropy": 2.778682363033295, + "loss/logits": 0.8577165812253952, + "step": 31300 + }, + { + "epoch": 0.3131, + "grad_norm": 13.5, + "grad_norm_var": 0.2984375, + "learning_rate": 0.0003, + "loss": 11.4788, + "loss/aux_loss": 0.048080189153552055, + "loss/crossentropy": 2.9088799715042115, + "loss/logits": 0.8727356672286988, + "step": 31310 + }, + { + "epoch": 0.3132, + "grad_norm": 12.9375, + "grad_norm_var": 0.17076822916666667, + "learning_rate": 0.0003, + "loss": 11.3828, + "loss/aux_loss": 0.04808860644698143, + "loss/crossentropy": 2.718260443210602, + "loss/logits": 0.8305355608463287, + "step": 31320 + }, + { + "epoch": 0.3133, + "grad_norm": 14.0625, + "grad_norm_var": 0.379541015625, + "learning_rate": 0.0003, + "loss": 11.5571, + "loss/aux_loss": 0.04808090459555388, + "loss/crossentropy": 2.7178287625312807, + "loss/logits": 0.8788865208625793, + "step": 31330 + }, + { + "epoch": 0.3134, + "grad_norm": 15.75, + "grad_norm_var": 1.0238932291666667, + "learning_rate": 0.0003, + "loss": 11.3747, + "loss/aux_loss": 0.04807234313338995, + "loss/crossentropy": 2.6713554739952086, + "loss/logits": 0.8787687391042709, + "step": 31340 + }, + { + "epoch": 0.3135, + "grad_norm": 12.8125, + "grad_norm_var": 1.5390625, + "learning_rate": 0.0003, + "loss": 11.3459, + "loss/aux_loss": 0.04808554369956255, + "loss/crossentropy": 2.652766835689545, + "loss/logits": 0.8742602497339249, + "step": 31350 + }, + { + "epoch": 0.3136, + "grad_norm": 13.1875, + "grad_norm_var": 1.2830729166666666, + "learning_rate": 0.0003, + "loss": 11.4199, + "loss/aux_loss": 0.04807996340095997, + "loss/crossentropy": 2.7023261964321135, + "loss/logits": 0.8666334301233292, + "step": 31360 + }, + { + "epoch": 0.3137, + "grad_norm": 11.9375, + "grad_norm_var": 1.11953125, + "learning_rate": 0.0003, + "loss": 11.4297, + "loss/aux_loss": 0.048081773333251476, + "loss/crossentropy": 2.786700093746185, + "loss/logits": 0.902243122458458, + "step": 31370 + }, + { + "epoch": 0.3138, + "grad_norm": 13.0625, + "grad_norm_var": 0.4800618489583333, + "learning_rate": 0.0003, + "loss": 11.4868, + "loss/aux_loss": 0.048089880496263504, + "loss/crossentropy": 2.808494824171066, + "loss/logits": 0.8556656092405319, + "step": 31380 + }, + { + "epoch": 0.3139, + "grad_norm": 14.1875, + "grad_norm_var": 0.21484375, + "learning_rate": 0.0003, + "loss": 11.4171, + "loss/aux_loss": 0.04808277599513531, + "loss/crossentropy": 2.771555906534195, + "loss/logits": 0.8765601277351379, + "step": 31390 + }, + { + "epoch": 0.314, + "grad_norm": 12.6875, + "grad_norm_var": 1.2710774739583333, + "learning_rate": 0.0003, + "loss": 11.6018, + "loss/aux_loss": 0.048083768039941785, + "loss/crossentropy": 2.888131785392761, + "loss/logits": 0.8636642038822174, + "step": 31400 + }, + { + "epoch": 0.3141, + "grad_norm": 13.3125, + "grad_norm_var": 0.29375, + "learning_rate": 0.0003, + "loss": 11.3992, + "loss/aux_loss": 0.048076036386191845, + "loss/crossentropy": 2.5779885232448576, + "loss/logits": 0.8284551709890365, + "step": 31410 + }, + { + "epoch": 0.3142, + "grad_norm": 15.1875, + "grad_norm_var": 0.45675455729166664, + "learning_rate": 0.0003, + "loss": 11.239, + "loss/aux_loss": 0.0480815326794982, + "loss/crossentropy": 2.857342076301575, + "loss/logits": 0.866741943359375, + "step": 31420 + }, + { + "epoch": 0.3143, + "grad_norm": 13.8125, + "grad_norm_var": 0.4875, + "learning_rate": 0.0003, + "loss": 11.5045, + "loss/aux_loss": 0.04808294028043747, + "loss/crossentropy": 2.730056095123291, + "loss/logits": 0.8772373676300049, + "step": 31430 + }, + { + "epoch": 0.3144, + "grad_norm": 12.75, + "grad_norm_var": 0.6671223958333333, + "learning_rate": 0.0003, + "loss": 11.4201, + "loss/aux_loss": 0.0480757225304842, + "loss/crossentropy": 2.8834362506866453, + "loss/logits": 0.8785695016384125, + "step": 31440 + }, + { + "epoch": 0.3145, + "grad_norm": 13.125, + "grad_norm_var": 0.709375, + "learning_rate": 0.0003, + "loss": 11.2471, + "loss/aux_loss": 0.048089978471398356, + "loss/crossentropy": 2.6122034907341005, + "loss/logits": 0.8332406580448151, + "step": 31450 + }, + { + "epoch": 0.3146, + "grad_norm": 13.5625, + "grad_norm_var": 0.6348307291666667, + "learning_rate": 0.0003, + "loss": 11.4156, + "loss/aux_loss": 0.04808205291628838, + "loss/crossentropy": 2.7532804131507875, + "loss/logits": 0.8511408418416977, + "step": 31460 + }, + { + "epoch": 0.3147, + "grad_norm": 12.5, + "grad_norm_var": 1.6400390625, + "learning_rate": 0.0003, + "loss": 11.4242, + "loss/aux_loss": 0.04809170886874199, + "loss/crossentropy": 2.6764910399913786, + "loss/logits": 0.8635757118463516, + "step": 31470 + }, + { + "epoch": 0.3148, + "grad_norm": 13.0625, + "grad_norm_var": 0.8389973958333333, + "learning_rate": 0.0003, + "loss": 11.2935, + "loss/aux_loss": 0.04808271527290344, + "loss/crossentropy": 2.6128583550453186, + "loss/logits": 0.8351830154657364, + "step": 31480 + }, + { + "epoch": 0.3149, + "grad_norm": 13.9375, + "grad_norm_var": 3.76484375, + "learning_rate": 0.0003, + "loss": 11.6416, + "loss/aux_loss": 0.048077508620917794, + "loss/crossentropy": 2.8005987286567686, + "loss/logits": 0.9104048877954483, + "step": 31490 + }, + { + "epoch": 0.315, + "grad_norm": 14.75, + "grad_norm_var": 7.100244140625, + "learning_rate": 0.0003, + "loss": 11.474, + "loss/aux_loss": 0.04809277784079313, + "loss/crossentropy": 2.739937108755112, + "loss/logits": 0.8828804194927216, + "step": 31500 + }, + { + "epoch": 0.3151, + "grad_norm": 13.75, + "grad_norm_var": 5.307926432291667, + "learning_rate": 0.0003, + "loss": 11.4576, + "loss/aux_loss": 0.04807978179305792, + "loss/crossentropy": 2.912484383583069, + "loss/logits": 0.9298472136259079, + "step": 31510 + }, + { + "epoch": 0.3152, + "grad_norm": 13.3125, + "grad_norm_var": 1.0136555989583333, + "learning_rate": 0.0003, + "loss": 11.4793, + "loss/aux_loss": 0.04807396475225687, + "loss/crossentropy": 2.864921712875366, + "loss/logits": 0.8761105090379715, + "step": 31520 + }, + { + "epoch": 0.3153, + "grad_norm": 13.0, + "grad_norm_var": 0.35670572916666665, + "learning_rate": 0.0003, + "loss": 11.2979, + "loss/aux_loss": 0.04808592237532139, + "loss/crossentropy": 2.7697804093360903, + "loss/logits": 0.8579605609178543, + "step": 31530 + }, + { + "epoch": 0.3154, + "grad_norm": 14.5, + "grad_norm_var": 0.5884765625, + "learning_rate": 0.0003, + "loss": 11.364, + "loss/aux_loss": 0.048074756562709806, + "loss/crossentropy": 2.720014047622681, + "loss/logits": 0.8563840836286545, + "step": 31540 + }, + { + "epoch": 0.3155, + "grad_norm": 15.3125, + "grad_norm_var": 0.7206868489583333, + "learning_rate": 0.0003, + "loss": 11.3455, + "loss/aux_loss": 0.048072342202067374, + "loss/crossentropy": 2.643301236629486, + "loss/logits": 0.8437513649463654, + "step": 31550 + }, + { + "epoch": 0.3156, + "grad_norm": 38.5, + "grad_norm_var": 39.195247395833334, + "learning_rate": 0.0003, + "loss": 11.5141, + "loss/aux_loss": 0.04808412864804268, + "loss/crossentropy": 2.726420682668686, + "loss/logits": 0.8565041303634644, + "step": 31560 + }, + { + "epoch": 0.3157, + "grad_norm": 13.3125, + "grad_norm_var": 39.03489583333333, + "learning_rate": 0.0003, + "loss": 11.5841, + "loss/aux_loss": 0.048086312040686606, + "loss/crossentropy": 2.7990392088890075, + "loss/logits": 0.866115254163742, + "step": 31570 + }, + { + "epoch": 0.3158, + "grad_norm": 13.375, + "grad_norm_var": 3.559830729166667, + "learning_rate": 0.0003, + "loss": 11.2449, + "loss/aux_loss": 0.04807570930570364, + "loss/crossentropy": 2.7598312139511108, + "loss/logits": 0.848942118883133, + "step": 31580 + }, + { + "epoch": 0.3159, + "grad_norm": 13.9375, + "grad_norm_var": 0.4830729166666667, + "learning_rate": 0.0003, + "loss": 11.4392, + "loss/aux_loss": 0.04808403309434652, + "loss/crossentropy": 2.7267268300056458, + "loss/logits": 0.8826425462961197, + "step": 31590 + }, + { + "epoch": 0.316, + "grad_norm": 13.6875, + "grad_norm_var": 0.37550455729166665, + "learning_rate": 0.0003, + "loss": 11.3998, + "loss/aux_loss": 0.04807063583284617, + "loss/crossentropy": 2.776882898807526, + "loss/logits": 0.8546944618225097, + "step": 31600 + }, + { + "epoch": 0.3161, + "grad_norm": 12.9375, + "grad_norm_var": 0.8063639322916667, + "learning_rate": 0.0003, + "loss": 11.5461, + "loss/aux_loss": 0.04809814915060997, + "loss/crossentropy": 2.643150007724762, + "loss/logits": 0.8487259536981583, + "step": 31610 + }, + { + "epoch": 0.3162, + "grad_norm": 12.1875, + "grad_norm_var": 0.8200358072916667, + "learning_rate": 0.0003, + "loss": 11.357, + "loss/aux_loss": 0.0480756500735879, + "loss/crossentropy": 2.7450030565261843, + "loss/logits": 0.858446741104126, + "step": 31620 + }, + { + "epoch": 0.3163, + "grad_norm": 12.5625, + "grad_norm_var": 0.8395182291666666, + "learning_rate": 0.0003, + "loss": 11.4617, + "loss/aux_loss": 0.04808495547622442, + "loss/crossentropy": 2.8632676005363464, + "loss/logits": 0.8507615506649018, + "step": 31630 + }, + { + "epoch": 0.3164, + "grad_norm": 13.625, + "grad_norm_var": 0.8075520833333333, + "learning_rate": 0.0003, + "loss": 11.3846, + "loss/aux_loss": 0.048081559129059315, + "loss/crossentropy": 2.681163477897644, + "loss/logits": 0.8584868460893631, + "step": 31640 + }, + { + "epoch": 0.3165, + "grad_norm": 14.0625, + "grad_norm_var": 0.461572265625, + "learning_rate": 0.0003, + "loss": 11.3037, + "loss/aux_loss": 0.04807944130152464, + "loss/crossentropy": 2.757179379463196, + "loss/logits": 0.8438855946063996, + "step": 31650 + }, + { + "epoch": 0.3166, + "grad_norm": 13.125, + "grad_norm_var": 0.30983072916666665, + "learning_rate": 0.0003, + "loss": 11.4289, + "loss/aux_loss": 0.048075127974152566, + "loss/crossentropy": 2.775540769100189, + "loss/logits": 0.8543334901332855, + "step": 31660 + }, + { + "epoch": 0.3167, + "grad_norm": 12.8125, + "grad_norm_var": 0.24178059895833334, + "learning_rate": 0.0003, + "loss": 11.3283, + "loss/aux_loss": 0.04808103535324335, + "loss/crossentropy": 2.771780288219452, + "loss/logits": 0.8589775919914245, + "step": 31670 + }, + { + "epoch": 0.3168, + "grad_norm": 12.75, + "grad_norm_var": 0.22146809895833333, + "learning_rate": 0.0003, + "loss": 11.5683, + "loss/aux_loss": 0.04807479549199343, + "loss/crossentropy": 2.889441192150116, + "loss/logits": 0.8831599056720734, + "step": 31680 + }, + { + "epoch": 0.3169, + "grad_norm": 14.5, + "grad_norm_var": 0.4813639322916667, + "learning_rate": 0.0003, + "loss": 11.5118, + "loss/aux_loss": 0.048088008724153045, + "loss/crossentropy": 2.794116795063019, + "loss/logits": 0.8760564774274826, + "step": 31690 + }, + { + "epoch": 0.317, + "grad_norm": 17.875, + "grad_norm_var": 60.6546875, + "learning_rate": 0.0003, + "loss": 11.4563, + "loss/aux_loss": 0.04808449987322092, + "loss/crossentropy": 2.7889586448669434, + "loss/logits": 0.8560769230127334, + "step": 31700 + }, + { + "epoch": 0.3171, + "grad_norm": 13.0, + "grad_norm_var": 61.812434895833334, + "learning_rate": 0.0003, + "loss": 11.4129, + "loss/aux_loss": 0.048075599037110804, + "loss/crossentropy": 2.764506447315216, + "loss/logits": 0.8688730716705322, + "step": 31710 + }, + { + "epoch": 0.3172, + "grad_norm": 12.3125, + "grad_norm_var": 1.903125, + "learning_rate": 0.0003, + "loss": 11.401, + "loss/aux_loss": 0.04808210451155901, + "loss/crossentropy": 2.6953455746173858, + "loss/logits": 0.8976037502288818, + "step": 31720 + }, + { + "epoch": 0.3173, + "grad_norm": 17.75, + "grad_norm_var": 4.958968098958334, + "learning_rate": 0.0003, + "loss": 11.3902, + "loss/aux_loss": 0.04808040298521519, + "loss/crossentropy": 2.7925826787948607, + "loss/logits": 0.8728821247816085, + "step": 31730 + }, + { + "epoch": 0.3174, + "grad_norm": 12.4375, + "grad_norm_var": 1.7452473958333334, + "learning_rate": 0.0003, + "loss": 11.5282, + "loss/aux_loss": 0.048076373524963856, + "loss/crossentropy": 2.8562302708625795, + "loss/logits": 0.8475510686635971, + "step": 31740 + }, + { + "epoch": 0.3175, + "grad_norm": 12.625, + "grad_norm_var": 0.6619791666666667, + "learning_rate": 0.0003, + "loss": 11.366, + "loss/aux_loss": 0.048085829429328444, + "loss/crossentropy": 2.649816393852234, + "loss/logits": 0.8313356369733811, + "step": 31750 + }, + { + "epoch": 0.3176, + "grad_norm": 14.3125, + "grad_norm_var": 0.8421223958333334, + "learning_rate": 0.0003, + "loss": 11.4549, + "loss/aux_loss": 0.0480808213353157, + "loss/crossentropy": 2.7275112867355347, + "loss/logits": 0.8897455483675003, + "step": 31760 + }, + { + "epoch": 0.3177, + "grad_norm": 13.5, + "grad_norm_var": 0.5714680989583333, + "learning_rate": 0.0003, + "loss": 11.3229, + "loss/aux_loss": 0.048082269914448264, + "loss/crossentropy": 2.7526735663414, + "loss/logits": 0.8474861830472946, + "step": 31770 + }, + { + "epoch": 0.3178, + "grad_norm": 13.3125, + "grad_norm_var": 0.47394205729166666, + "learning_rate": 0.0003, + "loss": 11.5803, + "loss/aux_loss": 0.04808472171425819, + "loss/crossentropy": 2.834140819311142, + "loss/logits": 0.8541903674602509, + "step": 31780 + }, + { + "epoch": 0.3179, + "grad_norm": 12.5, + "grad_norm_var": 0.7118326822916666, + "learning_rate": 0.0003, + "loss": 11.3839, + "loss/aux_loss": 0.04806725513190031, + "loss/crossentropy": 2.5748830080032348, + "loss/logits": 0.8210257202386856, + "step": 31790 + }, + { + "epoch": 0.318, + "grad_norm": 13.25, + "grad_norm_var": 0.6691243489583333, + "learning_rate": 0.0003, + "loss": 11.4991, + "loss/aux_loss": 0.0480900140479207, + "loss/crossentropy": 2.720593547821045, + "loss/logits": 0.8570217370986939, + "step": 31800 + }, + { + "epoch": 0.3181, + "grad_norm": 16.375, + "grad_norm_var": 7.630208333333333, + "learning_rate": 0.0003, + "loss": 11.3505, + "loss/aux_loss": 0.048079617135226724, + "loss/crossentropy": 2.76233834028244, + "loss/logits": 0.8391731053590774, + "step": 31810 + }, + { + "epoch": 0.3182, + "grad_norm": 12.125, + "grad_norm_var": 8.1328125, + "learning_rate": 0.0003, + "loss": 11.4925, + "loss/aux_loss": 0.04807399399578571, + "loss/crossentropy": 2.793236219882965, + "loss/logits": 0.8872128367424011, + "step": 31820 + }, + { + "epoch": 0.3183, + "grad_norm": 14.125, + "grad_norm_var": 0.43527018229166664, + "learning_rate": 0.0003, + "loss": 11.4887, + "loss/aux_loss": 0.048083842545747754, + "loss/crossentropy": 2.860837161540985, + "loss/logits": 0.8602665454149246, + "step": 31830 + }, + { + "epoch": 0.3184, + "grad_norm": 14.125, + "grad_norm_var": 0.341650390625, + "learning_rate": 0.0003, + "loss": 11.3578, + "loss/aux_loss": 0.04807821288704872, + "loss/crossentropy": 2.7253613471984863, + "loss/logits": 0.8533663004636765, + "step": 31840 + }, + { + "epoch": 0.3185, + "grad_norm": 15.75, + "grad_norm_var": 625.615625, + "learning_rate": 0.0003, + "loss": 11.595, + "loss/aux_loss": 0.04808409884572029, + "loss/crossentropy": 2.833714520931244, + "loss/logits": 0.8993043005466461, + "step": 31850 + }, + { + "epoch": 0.3186, + "grad_norm": 13.5625, + "grad_norm_var": 608.6546223958334, + "learning_rate": 0.0003, + "loss": 11.3319, + "loss/aux_loss": 0.048097670264542106, + "loss/crossentropy": 2.7307827293872835, + "loss/logits": 0.8190259605646133, + "step": 31860 + }, + { + "epoch": 0.3187, + "grad_norm": 14.375, + "grad_norm_var": 4.886051432291667, + "learning_rate": 0.0003, + "loss": 11.449, + "loss/aux_loss": 0.04808141943067312, + "loss/crossentropy": 2.7856150209903716, + "loss/logits": 0.851562550663948, + "step": 31870 + }, + { + "epoch": 0.3188, + "grad_norm": 13.4375, + "grad_norm_var": 0.31378580729166666, + "learning_rate": 0.0003, + "loss": 11.2744, + "loss/aux_loss": 0.048080786131322384, + "loss/crossentropy": 2.759511637687683, + "loss/logits": 0.8260854959487915, + "step": 31880 + }, + { + "epoch": 0.3189, + "grad_norm": 14.5, + "grad_norm_var": 0.337353515625, + "learning_rate": 0.0003, + "loss": 11.4055, + "loss/aux_loss": 0.048088025860488416, + "loss/crossentropy": 2.7933689653873444, + "loss/logits": 0.8317734956741333, + "step": 31890 + }, + { + "epoch": 0.319, + "grad_norm": 13.8125, + "grad_norm_var": 0.4176432291666667, + "learning_rate": 0.0003, + "loss": 11.4489, + "loss/aux_loss": 0.04807045944035053, + "loss/crossentropy": 2.671738988161087, + "loss/logits": 0.8746799319982529, + "step": 31900 + }, + { + "epoch": 0.3191, + "grad_norm": 13.125, + "grad_norm_var": 0.5280598958333333, + "learning_rate": 0.0003, + "loss": 11.4261, + "loss/aux_loss": 0.04807942863553762, + "loss/crossentropy": 2.8266174256801606, + "loss/logits": 0.9010621100664139, + "step": 31910 + }, + { + "epoch": 0.3192, + "grad_norm": 14.1875, + "grad_norm_var": 0.42823893229166665, + "learning_rate": 0.0003, + "loss": 11.3704, + "loss/aux_loss": 0.04807835109531879, + "loss/crossentropy": 2.8471252858638763, + "loss/logits": 0.8496732413768768, + "step": 31920 + }, + { + "epoch": 0.3193, + "grad_norm": 15.25, + "grad_norm_var": 0.5960774739583333, + "learning_rate": 0.0003, + "loss": 11.3404, + "loss/aux_loss": 0.04808904957026243, + "loss/crossentropy": 2.7125583946704865, + "loss/logits": 0.8533723443746567, + "step": 31930 + }, + { + "epoch": 0.3194, + "grad_norm": 12.1875, + "grad_norm_var": 0.6400390625, + "learning_rate": 0.0003, + "loss": 11.526, + "loss/aux_loss": 0.048076958023011686, + "loss/crossentropy": 2.847039544582367, + "loss/logits": 0.8665654867887497, + "step": 31940 + }, + { + "epoch": 0.3195, + "grad_norm": 12.6875, + "grad_norm_var": 1.1645182291666667, + "learning_rate": 0.0003, + "loss": 11.3755, + "loss/aux_loss": 0.04807892180979252, + "loss/crossentropy": 2.6123467087745667, + "loss/logits": 0.8488023519515991, + "step": 31950 + }, + { + "epoch": 0.3196, + "grad_norm": 13.8125, + "grad_norm_var": 0.5695149739583333, + "learning_rate": 0.0003, + "loss": 11.3545, + "loss/aux_loss": 0.048081924021244046, + "loss/crossentropy": 2.756511354446411, + "loss/logits": 0.8485715210437774, + "step": 31960 + }, + { + "epoch": 0.3197, + "grad_norm": 11.9375, + "grad_norm_var": 0.6130045572916667, + "learning_rate": 0.0003, + "loss": 11.5651, + "loss/aux_loss": 0.04808203242719174, + "loss/crossentropy": 2.696575939655304, + "loss/logits": 0.8660462826490403, + "step": 31970 + }, + { + "epoch": 0.3198, + "grad_norm": 13.625, + "grad_norm_var": 0.5766764322916667, + "learning_rate": 0.0003, + "loss": 11.5154, + "loss/aux_loss": 0.048074822314083575, + "loss/crossentropy": 2.7823503494262694, + "loss/logits": 0.8631897240877151, + "step": 31980 + }, + { + "epoch": 0.3199, + "grad_norm": 13.0625, + "grad_norm_var": 0.6207682291666666, + "learning_rate": 0.0003, + "loss": 11.4817, + "loss/aux_loss": 0.048085047490894796, + "loss/crossentropy": 2.765849894285202, + "loss/logits": 0.8870153099298477, + "step": 31990 + }, + { + "epoch": 0.32, + "grad_norm": 13.3125, + "grad_norm_var": 0.352197265625, + "learning_rate": 0.0003, + "loss": 11.4848, + "loss/aux_loss": 0.048076517321169375, + "loss/crossentropy": 2.818051886558533, + "loss/logits": 0.8712541669607162, + "step": 32000 + }, + { + "epoch": 0.3201, + "grad_norm": 14.0625, + "grad_norm_var": 0.294384765625, + "learning_rate": 0.0003, + "loss": 11.4565, + "loss/aux_loss": 0.0480777308344841, + "loss/crossentropy": 2.7592976331710815, + "loss/logits": 0.8439573287963867, + "step": 32010 + }, + { + "epoch": 0.3202, + "grad_norm": 15.3125, + "grad_norm_var": 759.9216145833333, + "learning_rate": 0.0003, + "loss": 11.679, + "loss/aux_loss": 0.04811829086393118, + "loss/crossentropy": 2.7429904997348786, + "loss/logits": 0.87835733294487, + "step": 32020 + }, + { + "epoch": 0.3203, + "grad_norm": 12.5625, + "grad_norm_var": 14.346598307291666, + "learning_rate": 0.0003, + "loss": 11.4291, + "loss/aux_loss": 0.04807977806776762, + "loss/crossentropy": 2.733081966638565, + "loss/logits": 0.8281262069940567, + "step": 32030 + }, + { + "epoch": 0.3204, + "grad_norm": 13.0, + "grad_norm_var": 0.744775390625, + "learning_rate": 0.0003, + "loss": 11.2668, + "loss/aux_loss": 0.04808316174894571, + "loss/crossentropy": 2.7543557405471804, + "loss/logits": 0.8591379880905151, + "step": 32040 + }, + { + "epoch": 0.3205, + "grad_norm": 13.8125, + "grad_norm_var": 0.4337076822916667, + "learning_rate": 0.0003, + "loss": 11.4739, + "loss/aux_loss": 0.04808379802852869, + "loss/crossentropy": 2.8462088823318483, + "loss/logits": 0.8858730256557464, + "step": 32050 + }, + { + "epoch": 0.3206, + "grad_norm": 12.4375, + "grad_norm_var": 0.350244140625, + "learning_rate": 0.0003, + "loss": 11.5575, + "loss/aux_loss": 0.04808332268148661, + "loss/crossentropy": 2.804873597621918, + "loss/logits": 0.8518771290779114, + "step": 32060 + }, + { + "epoch": 0.3207, + "grad_norm": 12.9375, + "grad_norm_var": 0.7770182291666666, + "learning_rate": 0.0003, + "loss": 11.508, + "loss/aux_loss": 0.0480858214199543, + "loss/crossentropy": 2.7543485045433043, + "loss/logits": 0.8625475555658341, + "step": 32070 + }, + { + "epoch": 0.3208, + "grad_norm": 14.5625, + "grad_norm_var": 0.5645182291666667, + "learning_rate": 0.0003, + "loss": 11.3667, + "loss/aux_loss": 0.04808385856449604, + "loss/crossentropy": 2.7623300909996034, + "loss/logits": 0.838151478767395, + "step": 32080 + }, + { + "epoch": 0.3209, + "grad_norm": 12.0625, + "grad_norm_var": 0.557275390625, + "learning_rate": 0.0003, + "loss": 11.4308, + "loss/aux_loss": 0.048071561940014364, + "loss/crossentropy": 2.7351042151451113, + "loss/logits": 0.8292459070682525, + "step": 32090 + }, + { + "epoch": 0.321, + "grad_norm": 13.1875, + "grad_norm_var": 0.52890625, + "learning_rate": 0.0003, + "loss": 11.4485, + "loss/aux_loss": 0.048081991448998454, + "loss/crossentropy": 2.742439067363739, + "loss/logits": 0.8626155495643616, + "step": 32100 + }, + { + "epoch": 0.3211, + "grad_norm": 13.4375, + "grad_norm_var": 0.3035807291666667, + "learning_rate": 0.0003, + "loss": 11.2873, + "loss/aux_loss": 0.048089691810309886, + "loss/crossentropy": 2.7030389070510865, + "loss/logits": 0.8663504242897033, + "step": 32110 + }, + { + "epoch": 0.3212, + "grad_norm": 12.8125, + "grad_norm_var": 2.847916666666667, + "learning_rate": 0.0003, + "loss": 11.4217, + "loss/aux_loss": 0.04807127509266138, + "loss/crossentropy": 2.885518616437912, + "loss/logits": 0.8689032286405564, + "step": 32120 + }, + { + "epoch": 0.3213, + "grad_norm": 13.0625, + "grad_norm_var": 2.738655598958333, + "learning_rate": 0.0003, + "loss": 11.3219, + "loss/aux_loss": 0.04809402395039797, + "loss/crossentropy": 2.8164079904556276, + "loss/logits": 0.8654607564210892, + "step": 32130 + }, + { + "epoch": 0.3214, + "grad_norm": 16.0, + "grad_norm_var": 0.8416015625, + "learning_rate": 0.0003, + "loss": 11.4046, + "loss/aux_loss": 0.04806608557701111, + "loss/crossentropy": 2.7131950318813325, + "loss/logits": 0.8581384032964706, + "step": 32140 + }, + { + "epoch": 0.3215, + "grad_norm": 13.375, + "grad_norm_var": 0.9400390625, + "learning_rate": 0.0003, + "loss": 11.4208, + "loss/aux_loss": 0.04809866081923246, + "loss/crossentropy": 2.64786559343338, + "loss/logits": 0.8615017741918564, + "step": 32150 + }, + { + "epoch": 0.3216, + "grad_norm": 13.1875, + "grad_norm_var": 1.1765625, + "learning_rate": 0.0003, + "loss": 11.3239, + "loss/aux_loss": 0.04808507617563009, + "loss/crossentropy": 2.799293911457062, + "loss/logits": 0.8632352501153946, + "step": 32160 + }, + { + "epoch": 0.3217, + "grad_norm": 13.0625, + "grad_norm_var": 0.6638020833333333, + "learning_rate": 0.0003, + "loss": 11.6439, + "loss/aux_loss": 0.048080069571733476, + "loss/crossentropy": 2.813327169418335, + "loss/logits": 0.8687012135982514, + "step": 32170 + }, + { + "epoch": 0.3218, + "grad_norm": 14.125, + "grad_norm_var": 0.4744140625, + "learning_rate": 0.0003, + "loss": 11.4474, + "loss/aux_loss": 0.048083030991256236, + "loss/crossentropy": 2.8561726570129395, + "loss/logits": 0.848215913772583, + "step": 32180 + }, + { + "epoch": 0.3219, + "grad_norm": 13.3125, + "grad_norm_var": 20.351041666666667, + "learning_rate": 0.0003, + "loss": 11.3414, + "loss/aux_loss": 0.04807794373482466, + "loss/crossentropy": 2.7342350482940674, + "loss/logits": 0.8707915544509888, + "step": 32190 + }, + { + "epoch": 0.322, + "grad_norm": 12.875, + "grad_norm_var": 19.792952473958334, + "learning_rate": 0.0003, + "loss": 11.4378, + "loss/aux_loss": 0.04808533620089293, + "loss/crossentropy": 2.751993161439896, + "loss/logits": 0.8459922909736634, + "step": 32200 + }, + { + "epoch": 0.3221, + "grad_norm": 14.0, + "grad_norm_var": 178.95364583333333, + "learning_rate": 0.0003, + "loss": 11.4808, + "loss/aux_loss": 0.048082702048122886, + "loss/crossentropy": 2.855089473724365, + "loss/logits": 0.8649806082248688, + "step": 32210 + }, + { + "epoch": 0.3222, + "grad_norm": 14.9375, + "grad_norm_var": 1.689306640625, + "learning_rate": 0.0003, + "loss": 11.4922, + "loss/aux_loss": 0.04807514175772667, + "loss/crossentropy": 2.5906433165073395, + "loss/logits": 0.843293958902359, + "step": 32220 + }, + { + "epoch": 0.3223, + "grad_norm": 12.9375, + "grad_norm_var": 1.0794108072916666, + "learning_rate": 0.0003, + "loss": 11.2442, + "loss/aux_loss": 0.048078662157058714, + "loss/crossentropy": 2.815950345993042, + "loss/logits": 0.8553645879030227, + "step": 32230 + }, + { + "epoch": 0.3224, + "grad_norm": 13.875, + "grad_norm_var": 0.636962890625, + "learning_rate": 0.0003, + "loss": 11.4003, + "loss/aux_loss": 0.04807271771132946, + "loss/crossentropy": 2.759478431940079, + "loss/logits": 0.8785617917776107, + "step": 32240 + }, + { + "epoch": 0.3225, + "grad_norm": 14.9375, + "grad_norm_var": 0.6570149739583333, + "learning_rate": 0.0003, + "loss": 11.34, + "loss/aux_loss": 0.048083205707371235, + "loss/crossentropy": 2.76770259141922, + "loss/logits": 0.8515851318836212, + "step": 32250 + }, + { + "epoch": 0.3226, + "grad_norm": 14.4375, + "grad_norm_var": 0.5434733072916667, + "learning_rate": 0.0003, + "loss": 11.3623, + "loss/aux_loss": 0.04807707685977221, + "loss/crossentropy": 2.7345820903778075, + "loss/logits": 0.8831362873315811, + "step": 32260 + }, + { + "epoch": 0.3227, + "grad_norm": 13.0, + "grad_norm_var": 0.4853515625, + "learning_rate": 0.0003, + "loss": 11.4747, + "loss/aux_loss": 0.04807059448212385, + "loss/crossentropy": 2.7904665589332582, + "loss/logits": 0.8971100717782974, + "step": 32270 + }, + { + "epoch": 0.3228, + "grad_norm": 13.25, + "grad_norm_var": 0.49114583333333334, + "learning_rate": 0.0003, + "loss": 11.4368, + "loss/aux_loss": 0.04808034915477037, + "loss/crossentropy": 2.7924468517303467, + "loss/logits": 0.8524171829223632, + "step": 32280 + }, + { + "epoch": 0.3229, + "grad_norm": 13.4375, + "grad_norm_var": 3.166145833333333, + "learning_rate": 0.0003, + "loss": 11.3999, + "loss/aux_loss": 0.04808408003300428, + "loss/crossentropy": 2.6971071362495422, + "loss/logits": 0.8586209654808045, + "step": 32290 + }, + { + "epoch": 0.323, + "grad_norm": 14.3125, + "grad_norm_var": 0.5126139322916666, + "learning_rate": 0.0003, + "loss": 11.4116, + "loss/aux_loss": 0.048079677298665045, + "loss/crossentropy": 2.803563630580902, + "loss/logits": 0.8560720324516297, + "step": 32300 + }, + { + "epoch": 0.3231, + "grad_norm": 12.0625, + "grad_norm_var": 0.36712239583333334, + "learning_rate": 0.0003, + "loss": 11.3175, + "loss/aux_loss": 0.04807909969240427, + "loss/crossentropy": 2.779414027929306, + "loss/logits": 0.8658244550228119, + "step": 32310 + }, + { + "epoch": 0.3232, + "grad_norm": 12.9375, + "grad_norm_var": 0.3101399739583333, + "learning_rate": 0.0003, + "loss": 11.2228, + "loss/aux_loss": 0.048088131844997405, + "loss/crossentropy": 2.6179952681064607, + "loss/logits": 0.81593057513237, + "step": 32320 + }, + { + "epoch": 0.3233, + "grad_norm": 13.3125, + "grad_norm_var": 0.11521809895833333, + "learning_rate": 0.0003, + "loss": 11.397, + "loss/aux_loss": 0.04808655325323343, + "loss/crossentropy": 2.8150524377822874, + "loss/logits": 0.8867930352687836, + "step": 32330 + }, + { + "epoch": 0.3234, + "grad_norm": 13.125, + "grad_norm_var": 0.21145833333333333, + "learning_rate": 0.0003, + "loss": 11.2842, + "loss/aux_loss": 0.04808272738009691, + "loss/crossentropy": 2.7270405888557434, + "loss/logits": 0.8832971513271332, + "step": 32340 + }, + { + "epoch": 0.3235, + "grad_norm": 13.625, + "grad_norm_var": 1.1784993489583333, + "learning_rate": 0.0003, + "loss": 11.5892, + "loss/aux_loss": 0.0480838356539607, + "loss/crossentropy": 2.7016100168228148, + "loss/logits": 0.8885600864887238, + "step": 32350 + }, + { + "epoch": 0.3236, + "grad_norm": 12.9375, + "grad_norm_var": 0.38605143229166666, + "learning_rate": 0.0003, + "loss": 11.2312, + "loss/aux_loss": 0.0480771854519844, + "loss/crossentropy": 2.814963674545288, + "loss/logits": 0.8497733741998672, + "step": 32360 + }, + { + "epoch": 0.3237, + "grad_norm": 13.5, + "grad_norm_var": 0.35618489583333335, + "learning_rate": 0.0003, + "loss": 11.263, + "loss/aux_loss": 0.048084933497011664, + "loss/crossentropy": 2.7236967265605925, + "loss/logits": 0.8645591795444488, + "step": 32370 + }, + { + "epoch": 0.3238, + "grad_norm": 13.8125, + "grad_norm_var": 0.5859212239583333, + "learning_rate": 0.0003, + "loss": 11.5835, + "loss/aux_loss": 0.04807772319763899, + "loss/crossentropy": 2.7548747062683105, + "loss/logits": 0.8677338659763336, + "step": 32380 + }, + { + "epoch": 0.3239, + "grad_norm": 14.375, + "grad_norm_var": 0.41399739583333334, + "learning_rate": 0.0003, + "loss": 11.2919, + "loss/aux_loss": 0.0480786906555295, + "loss/crossentropy": 2.8514682233333586, + "loss/logits": 0.850545859336853, + "step": 32390 + }, + { + "epoch": 0.324, + "grad_norm": 12.875, + "grad_norm_var": 0.6752604166666667, + "learning_rate": 0.0003, + "loss": 11.3948, + "loss/aux_loss": 0.048079483583569525, + "loss/crossentropy": 2.9048630833625793, + "loss/logits": 0.8999389052391052, + "step": 32400 + }, + { + "epoch": 0.3241, + "grad_norm": 12.9375, + "grad_norm_var": 43.757747395833334, + "learning_rate": 0.0003, + "loss": 11.3345, + "loss/aux_loss": 0.04807599056512117, + "loss/crossentropy": 2.835081601142883, + "loss/logits": 0.8689920961856842, + "step": 32410 + }, + { + "epoch": 0.3242, + "grad_norm": 13.375, + "grad_norm_var": 41.23292643229167, + "learning_rate": 0.0003, + "loss": 11.4935, + "loss/aux_loss": 0.04808198846876621, + "loss/crossentropy": 2.7409429788589477, + "loss/logits": 0.852924308180809, + "step": 32420 + }, + { + "epoch": 0.3243, + "grad_norm": 14.0625, + "grad_norm_var": 0.7806640625, + "learning_rate": 0.0003, + "loss": 11.3053, + "loss/aux_loss": 0.04807913806289434, + "loss/crossentropy": 2.697390305995941, + "loss/logits": 0.8261424988508225, + "step": 32430 + }, + { + "epoch": 0.3244, + "grad_norm": 13.1875, + "grad_norm_var": 0.3082682291666667, + "learning_rate": 0.0003, + "loss": 11.4026, + "loss/aux_loss": 0.04807761088013649, + "loss/crossentropy": 2.8476333379745484, + "loss/logits": 0.8590798646211624, + "step": 32440 + }, + { + "epoch": 0.3245, + "grad_norm": 13.0625, + "grad_norm_var": 0.6376139322916666, + "learning_rate": 0.0003, + "loss": 11.3744, + "loss/aux_loss": 0.0480758473277092, + "loss/crossentropy": 2.7274765491485597, + "loss/logits": 0.8521047949790954, + "step": 32450 + }, + { + "epoch": 0.3246, + "grad_norm": 12.5, + "grad_norm_var": 0.79140625, + "learning_rate": 0.0003, + "loss": 11.4257, + "loss/aux_loss": 0.048076083324849604, + "loss/crossentropy": 2.7871260046958923, + "loss/logits": 0.8288904428482056, + "step": 32460 + }, + { + "epoch": 0.3247, + "grad_norm": 12.9375, + "grad_norm_var": 0.2708333333333333, + "learning_rate": 0.0003, + "loss": 11.2386, + "loss/aux_loss": 0.048092946968972684, + "loss/crossentropy": 2.6454286336898805, + "loss/logits": 0.8405205219984054, + "step": 32470 + }, + { + "epoch": 0.3248, + "grad_norm": 14.125, + "grad_norm_var": 0.6605305989583333, + "learning_rate": 0.0003, + "loss": 11.4729, + "loss/aux_loss": 0.048083856143057345, + "loss/crossentropy": 2.8707290291786194, + "loss/logits": 0.8646343678236008, + "step": 32480 + }, + { + "epoch": 0.3249, + "grad_norm": 13.125, + "grad_norm_var": 1.0945149739583333, + "learning_rate": 0.0003, + "loss": 11.4695, + "loss/aux_loss": 0.04808031674474478, + "loss/crossentropy": 2.734501177072525, + "loss/logits": 0.8618703633546829, + "step": 32490 + }, + { + "epoch": 0.325, + "grad_norm": 13.625, + "grad_norm_var": 0.342822265625, + "learning_rate": 0.0003, + "loss": 11.3474, + "loss/aux_loss": 0.0480826249346137, + "loss/crossentropy": 2.4993535339832307, + "loss/logits": 0.8169450134038925, + "step": 32500 + }, + { + "epoch": 0.3251, + "grad_norm": 13.0625, + "grad_norm_var": 0.24256184895833333, + "learning_rate": 0.0003, + "loss": 11.1817, + "loss/aux_loss": 0.04808486234396696, + "loss/crossentropy": 2.7927271008491514, + "loss/logits": 0.8319184005260467, + "step": 32510 + }, + { + "epoch": 0.3252, + "grad_norm": 13.4375, + "grad_norm_var": 0.43333333333333335, + "learning_rate": 0.0003, + "loss": 11.4204, + "loss/aux_loss": 0.048070120811462405, + "loss/crossentropy": 2.7226893484592436, + "loss/logits": 0.842848926782608, + "step": 32520 + }, + { + "epoch": 0.3253, + "grad_norm": 14.25, + "grad_norm_var": 2.196875, + "learning_rate": 0.0003, + "loss": 11.338, + "loss/aux_loss": 0.04808647688478231, + "loss/crossentropy": 2.754758191108704, + "loss/logits": 0.8788342326879501, + "step": 32530 + }, + { + "epoch": 0.3254, + "grad_norm": 13.3125, + "grad_norm_var": 2.3486979166666666, + "learning_rate": 0.0003, + "loss": 11.4313, + "loss/aux_loss": 0.04807833395898342, + "loss/crossentropy": 2.6338735044002535, + "loss/logits": 0.8634207069873809, + "step": 32540 + }, + { + "epoch": 0.3255, + "grad_norm": 12.875, + "grad_norm_var": 0.24036458333333333, + "learning_rate": 0.0003, + "loss": 11.3306, + "loss/aux_loss": 0.04807445779442787, + "loss/crossentropy": 2.74268923997879, + "loss/logits": 0.8131024420261384, + "step": 32550 + }, + { + "epoch": 0.3256, + "grad_norm": 13.5, + "grad_norm_var": 0.3675618489583333, + "learning_rate": 0.0003, + "loss": 11.5403, + "loss/aux_loss": 0.04809182155877352, + "loss/crossentropy": 2.729319167137146, + "loss/logits": 0.8604931205511093, + "step": 32560 + }, + { + "epoch": 0.3257, + "grad_norm": 13.8125, + "grad_norm_var": 0.37159830729166665, + "learning_rate": 0.0003, + "loss": 11.2922, + "loss/aux_loss": 0.04806775096803904, + "loss/crossentropy": 2.7165225446224213, + "loss/logits": 0.8697218716144561, + "step": 32570 + }, + { + "epoch": 0.3258, + "grad_norm": 13.0625, + "grad_norm_var": 0.6143229166666667, + "learning_rate": 0.0003, + "loss": 11.5015, + "loss/aux_loss": 0.04808111321181059, + "loss/crossentropy": 2.6520249009132386, + "loss/logits": 0.8677373945713043, + "step": 32580 + }, + { + "epoch": 0.3259, + "grad_norm": 13.5, + "grad_norm_var": 0.3104166666666667, + "learning_rate": 0.0003, + "loss": 11.2081, + "loss/aux_loss": 0.04808580782264471, + "loss/crossentropy": 2.626668655872345, + "loss/logits": 0.8411953181028367, + "step": 32590 + }, + { + "epoch": 0.326, + "grad_norm": 13.375, + "grad_norm_var": 0.19869791666666667, + "learning_rate": 0.0003, + "loss": 11.4345, + "loss/aux_loss": 0.04807950202375651, + "loss/crossentropy": 2.6047201275825502, + "loss/logits": 0.8363840937614441, + "step": 32600 + }, + { + "epoch": 0.3261, + "grad_norm": 13.3125, + "grad_norm_var": 0.225, + "learning_rate": 0.0003, + "loss": 11.4163, + "loss/aux_loss": 0.04808168914169073, + "loss/crossentropy": 2.745485466718674, + "loss/logits": 0.855038857460022, + "step": 32610 + }, + { + "epoch": 0.3262, + "grad_norm": 13.9375, + "grad_norm_var": 66.59993489583333, + "learning_rate": 0.0003, + "loss": 11.5196, + "loss/aux_loss": 0.048089150339365005, + "loss/crossentropy": 2.6984958589076995, + "loss/logits": 0.8720762193202972, + "step": 32620 + }, + { + "epoch": 0.3263, + "grad_norm": 15.25, + "grad_norm_var": 64.37161458333334, + "learning_rate": 0.0003, + "loss": 11.5008, + "loss/aux_loss": 0.04807731341570616, + "loss/crossentropy": 2.6679943084716795, + "loss/logits": 0.859524542093277, + "step": 32630 + }, + { + "epoch": 0.3264, + "grad_norm": 14.0, + "grad_norm_var": 0.6309733072916667, + "learning_rate": 0.0003, + "loss": 11.4637, + "loss/aux_loss": 0.04808408729732037, + "loss/crossentropy": 2.699479818344116, + "loss/logits": 0.84074946641922, + "step": 32640 + }, + { + "epoch": 0.3265, + "grad_norm": 15.9375, + "grad_norm_var": 0.9561848958333333, + "learning_rate": 0.0003, + "loss": 11.3639, + "loss/aux_loss": 0.04807748533785343, + "loss/crossentropy": 2.654457098245621, + "loss/logits": 0.8376824676990509, + "step": 32650 + }, + { + "epoch": 0.3266, + "grad_norm": 13.125, + "grad_norm_var": 2.091259765625, + "learning_rate": 0.0003, + "loss": 11.4042, + "loss/aux_loss": 0.04807919319719076, + "loss/crossentropy": 2.6513519108295442, + "loss/logits": 0.8539661675691604, + "step": 32660 + }, + { + "epoch": 0.3267, + "grad_norm": 13.25, + "grad_norm_var": 0.7520670572916667, + "learning_rate": 0.0003, + "loss": 11.331, + "loss/aux_loss": 0.04807972591370344, + "loss/crossentropy": 2.807789134979248, + "loss/logits": 0.8730264127254486, + "step": 32670 + }, + { + "epoch": 0.3268, + "grad_norm": 15.5, + "grad_norm_var": 0.9223307291666667, + "learning_rate": 0.0003, + "loss": 11.4216, + "loss/aux_loss": 0.048074715211987494, + "loss/crossentropy": 2.852227210998535, + "loss/logits": 0.8674792051315308, + "step": 32680 + }, + { + "epoch": 0.3269, + "grad_norm": 12.5625, + "grad_norm_var": 0.9957682291666666, + "learning_rate": 0.0003, + "loss": 11.4985, + "loss/aux_loss": 0.04807224553078413, + "loss/crossentropy": 2.6428285241127014, + "loss/logits": 0.8319959819316864, + "step": 32690 + }, + { + "epoch": 0.327, + "grad_norm": 12.8125, + "grad_norm_var": 1.8821451822916666, + "learning_rate": 0.0003, + "loss": 11.3336, + "loss/aux_loss": 0.048086940124630925, + "loss/crossentropy": 2.7363753497600554, + "loss/logits": 0.8253339737653732, + "step": 32700 + }, + { + "epoch": 0.3271, + "grad_norm": 15.5625, + "grad_norm_var": 1.7759765625, + "learning_rate": 0.0003, + "loss": 11.5912, + "loss/aux_loss": 0.04807717055082321, + "loss/crossentropy": 2.7939674854278564, + "loss/logits": 0.8734579056501388, + "step": 32710 + }, + { + "epoch": 0.3272, + "grad_norm": 13.8125, + "grad_norm_var": 1.1382649739583333, + "learning_rate": 0.0003, + "loss": 11.3646, + "loss/aux_loss": 0.04808651357889175, + "loss/crossentropy": 2.657242488861084, + "loss/logits": 0.8325454264879226, + "step": 32720 + }, + { + "epoch": 0.3273, + "grad_norm": 13.3125, + "grad_norm_var": 1.162353515625, + "learning_rate": 0.0003, + "loss": 11.3746, + "loss/aux_loss": 0.048074505664408206, + "loss/crossentropy": 2.7361601114273073, + "loss/logits": 0.8744795680046081, + "step": 32730 + }, + { + "epoch": 0.3274, + "grad_norm": 12.9375, + "grad_norm_var": 0.37237955729166666, + "learning_rate": 0.0003, + "loss": 11.2877, + "loss/aux_loss": 0.04808551985770464, + "loss/crossentropy": 2.480491054058075, + "loss/logits": 0.8157087236642837, + "step": 32740 + }, + { + "epoch": 0.3275, + "grad_norm": 13.1875, + "grad_norm_var": 0.95078125, + "learning_rate": 0.0003, + "loss": 11.2748, + "loss/aux_loss": 0.04807721842080355, + "loss/crossentropy": 2.7901974260807036, + "loss/logits": 0.8725056558847427, + "step": 32750 + }, + { + "epoch": 0.3276, + "grad_norm": 13.5625, + "grad_norm_var": 0.8020670572916667, + "learning_rate": 0.0003, + "loss": 11.4644, + "loss/aux_loss": 0.04809094499796629, + "loss/crossentropy": 2.6982684254646303, + "loss/logits": 0.8750581175088883, + "step": 32760 + }, + { + "epoch": 0.3277, + "grad_norm": 14.125, + "grad_norm_var": 0.9791015625, + "learning_rate": 0.0003, + "loss": 11.3499, + "loss/aux_loss": 0.04807215426117182, + "loss/crossentropy": 2.56101336479187, + "loss/logits": 0.859020522236824, + "step": 32770 + }, + { + "epoch": 0.3278, + "grad_norm": 13.4375, + "grad_norm_var": 1.1011555989583333, + "learning_rate": 0.0003, + "loss": 11.3663, + "loss/aux_loss": 0.048077127523720266, + "loss/crossentropy": 2.7586211442947386, + "loss/logits": 0.8794794708490372, + "step": 32780 + }, + { + "epoch": 0.3279, + "grad_norm": 14.625, + "grad_norm_var": 0.3056640625, + "learning_rate": 0.0003, + "loss": 11.575, + "loss/aux_loss": 0.04807180892676115, + "loss/crossentropy": 2.8552963852882387, + "loss/logits": 0.9014603316783905, + "step": 32790 + }, + { + "epoch": 0.328, + "grad_norm": 13.0625, + "grad_norm_var": 0.448681640625, + "learning_rate": 0.0003, + "loss": 11.659, + "loss/aux_loss": 0.048085894994437696, + "loss/crossentropy": 2.7301037549972533, + "loss/logits": 0.8675024837255478, + "step": 32800 + }, + { + "epoch": 0.3281, + "grad_norm": 13.6875, + "grad_norm_var": 0.2535807291666667, + "learning_rate": 0.0003, + "loss": 11.422, + "loss/aux_loss": 0.04807611275464296, + "loss/crossentropy": 2.61910994052887, + "loss/logits": 0.8257275193929672, + "step": 32810 + }, + { + "epoch": 0.3282, + "grad_norm": 13.0625, + "grad_norm_var": 0.219384765625, + "learning_rate": 0.0003, + "loss": 11.3728, + "loss/aux_loss": 0.04807970225811005, + "loss/crossentropy": 2.5538667261600496, + "loss/logits": 0.8033603578805923, + "step": 32820 + }, + { + "epoch": 0.3283, + "grad_norm": 13.3125, + "grad_norm_var": 0.544384765625, + "learning_rate": 0.0003, + "loss": 11.449, + "loss/aux_loss": 0.048089250549674036, + "loss/crossentropy": 2.8291757106781006, + "loss/logits": 0.86662557721138, + "step": 32830 + }, + { + "epoch": 0.3284, + "grad_norm": 13.3125, + "grad_norm_var": 0.4356608072916667, + "learning_rate": 0.0003, + "loss": 11.3986, + "loss/aux_loss": 0.048079658299684525, + "loss/crossentropy": 2.6797396779060363, + "loss/logits": 0.8524430304765701, + "step": 32840 + }, + { + "epoch": 0.3285, + "grad_norm": 13.1875, + "grad_norm_var": 0.3113932291666667, + "learning_rate": 0.0003, + "loss": 11.5521, + "loss/aux_loss": 0.048092910647392274, + "loss/crossentropy": 2.8970932602882384, + "loss/logits": 0.8721674889326095, + "step": 32850 + }, + { + "epoch": 0.3286, + "grad_norm": 13.3125, + "grad_norm_var": 0.2041015625, + "learning_rate": 0.0003, + "loss": 11.3052, + "loss/aux_loss": 0.04807873163372278, + "loss/crossentropy": 2.6216281414031983, + "loss/logits": 0.8397267490625382, + "step": 32860 + }, + { + "epoch": 0.3287, + "grad_norm": 12.5, + "grad_norm_var": 0.36404622395833336, + "learning_rate": 0.0003, + "loss": 11.4758, + "loss/aux_loss": 0.048077932186424734, + "loss/crossentropy": 2.8529594242572784, + "loss/logits": 0.9037439674139023, + "step": 32870 + }, + { + "epoch": 0.3288, + "grad_norm": 12.5, + "grad_norm_var": 0.39088541666666665, + "learning_rate": 0.0003, + "loss": 11.2377, + "loss/aux_loss": 0.04808652587234974, + "loss/crossentropy": 2.665499210357666, + "loss/logits": 0.8454530268907547, + "step": 32880 + }, + { + "epoch": 0.3289, + "grad_norm": 12.9375, + "grad_norm_var": 0.3072265625, + "learning_rate": 0.0003, + "loss": 11.3039, + "loss/aux_loss": 0.04807148966938257, + "loss/crossentropy": 2.602234035730362, + "loss/logits": 0.8243951052427292, + "step": 32890 + }, + { + "epoch": 0.329, + "grad_norm": 12.9375, + "grad_norm_var": 0.2581868489583333, + "learning_rate": 0.0003, + "loss": 11.2624, + "loss/aux_loss": 0.04808493070304394, + "loss/crossentropy": 2.7614921808242796, + "loss/logits": 0.8612099617719651, + "step": 32900 + }, + { + "epoch": 0.3291, + "grad_norm": 14.5, + "grad_norm_var": 0.5416666666666666, + "learning_rate": 0.0003, + "loss": 11.3729, + "loss/aux_loss": 0.04808545112609863, + "loss/crossentropy": 2.7853208422660827, + "loss/logits": 0.8972157269716263, + "step": 32910 + }, + { + "epoch": 0.3292, + "grad_norm": 14.0625, + "grad_norm_var": 0.808837890625, + "learning_rate": 0.0003, + "loss": 11.4716, + "loss/aux_loss": 0.04807188231498003, + "loss/crossentropy": 2.709330898523331, + "loss/logits": 0.8775971084833145, + "step": 32920 + }, + { + "epoch": 0.3293, + "grad_norm": 12.5, + "grad_norm_var": 0.28411458333333334, + "learning_rate": 0.0003, + "loss": 11.3775, + "loss/aux_loss": 0.04807582814246416, + "loss/crossentropy": 2.63877277970314, + "loss/logits": 0.8797126650810242, + "step": 32930 + }, + { + "epoch": 0.3294, + "grad_norm": 12.9375, + "grad_norm_var": 0.2796875, + "learning_rate": 0.0003, + "loss": 11.3529, + "loss/aux_loss": 0.04808872751891613, + "loss/crossentropy": 2.681774044036865, + "loss/logits": 0.862132015824318, + "step": 32940 + }, + { + "epoch": 0.3295, + "grad_norm": 13.625, + "grad_norm_var": 136.1453125, + "learning_rate": 0.0003, + "loss": 11.4708, + "loss/aux_loss": 0.04807477127760649, + "loss/crossentropy": 2.7079889833927155, + "loss/logits": 0.8643805146217346, + "step": 32950 + }, + { + "epoch": 0.3296, + "grad_norm": 14.5, + "grad_norm_var": 133.14055989583332, + "learning_rate": 0.0003, + "loss": 11.4029, + "loss/aux_loss": 0.04808342196047306, + "loss/crossentropy": 2.7009809732437136, + "loss/logits": 0.8897195219993591, + "step": 32960 + }, + { + "epoch": 0.3297, + "grad_norm": 12.9375, + "grad_norm_var": 0.55703125, + "learning_rate": 0.0003, + "loss": 11.4977, + "loss/aux_loss": 0.04807784240692854, + "loss/crossentropy": 2.888811504840851, + "loss/logits": 0.9022614181041717, + "step": 32970 + }, + { + "epoch": 0.3298, + "grad_norm": 13.25, + "grad_norm_var": 0.7114420572916667, + "learning_rate": 0.0003, + "loss": 11.4451, + "loss/aux_loss": 0.04807289652526379, + "loss/crossentropy": 2.7488197565078734, + "loss/logits": 0.8802861243486404, + "step": 32980 + }, + { + "epoch": 0.3299, + "grad_norm": 14.875, + "grad_norm_var": 1.1337890625, + "learning_rate": 0.0003, + "loss": 11.5285, + "loss/aux_loss": 0.048081624880433084, + "loss/crossentropy": 2.7710135102272035, + "loss/logits": 0.8831702828407287, + "step": 32990 + }, + { + "epoch": 0.33, + "grad_norm": 14.0, + "grad_norm_var": 0.39334309895833336, + "learning_rate": 0.0003, + "loss": 11.5182, + "loss/aux_loss": 0.048084603250026704, + "loss/crossentropy": 2.920530825853348, + "loss/logits": 0.8848973125219345, + "step": 33000 + }, + { + "epoch": 0.3301, + "grad_norm": 14.125, + "grad_norm_var": 0.30911458333333336, + "learning_rate": 0.0003, + "loss": 11.2578, + "loss/aux_loss": 0.04807808380573988, + "loss/crossentropy": 2.7349561214447022, + "loss/logits": 0.8306137710809708, + "step": 33010 + }, + { + "epoch": 0.3302, + "grad_norm": 12.8125, + "grad_norm_var": 0.21013997395833334, + "learning_rate": 0.0003, + "loss": 11.3987, + "loss/aux_loss": 0.04808585941791534, + "loss/crossentropy": 2.7925686955451967, + "loss/logits": 0.8563876241445542, + "step": 33020 + }, + { + "epoch": 0.3303, + "grad_norm": 12.8125, + "grad_norm_var": 0.4200520833333333, + "learning_rate": 0.0003, + "loss": 11.4525, + "loss/aux_loss": 0.048077084310352805, + "loss/crossentropy": 2.829834222793579, + "loss/logits": 0.8569678455591202, + "step": 33030 + }, + { + "epoch": 0.3304, + "grad_norm": 13.0625, + "grad_norm_var": 2.3742024739583334, + "learning_rate": 0.0003, + "loss": 11.4428, + "loss/aux_loss": 0.04808685947209597, + "loss/crossentropy": 2.6605750918388367, + "loss/logits": 0.8512579023838043, + "step": 33040 + }, + { + "epoch": 0.3305, + "grad_norm": 13.25, + "grad_norm_var": 2.8878743489583334, + "learning_rate": 0.0003, + "loss": 11.4905, + "loss/aux_loss": 0.04808191433548927, + "loss/crossentropy": 2.821686065196991, + "loss/logits": 0.8521415889263153, + "step": 33050 + }, + { + "epoch": 0.3306, + "grad_norm": 14.0625, + "grad_norm_var": 1.6442545572916667, + "learning_rate": 0.0003, + "loss": 11.448, + "loss/aux_loss": 0.048077190294861794, + "loss/crossentropy": 2.784515953063965, + "loss/logits": 0.8502372175455093, + "step": 33060 + }, + { + "epoch": 0.3307, + "grad_norm": 14.625, + "grad_norm_var": 1.2645182291666666, + "learning_rate": 0.0003, + "loss": 11.298, + "loss/aux_loss": 0.04808208737522364, + "loss/crossentropy": 2.7311050057411195, + "loss/logits": 0.8578761130571365, + "step": 33070 + }, + { + "epoch": 0.3308, + "grad_norm": 13.5625, + "grad_norm_var": 0.60703125, + "learning_rate": 0.0003, + "loss": 11.5828, + "loss/aux_loss": 0.0480745630338788, + "loss/crossentropy": 2.7229528069496154, + "loss/logits": 0.8834622859954834, + "step": 33080 + }, + { + "epoch": 0.3309, + "grad_norm": 12.3125, + "grad_norm_var": 0.2822265625, + "learning_rate": 0.0003, + "loss": 11.3736, + "loss/aux_loss": 0.048079187795519826, + "loss/crossentropy": 2.596200668811798, + "loss/logits": 0.8260492444038391, + "step": 33090 + }, + { + "epoch": 0.331, + "grad_norm": 13.25, + "grad_norm_var": 0.3492024739583333, + "learning_rate": 0.0003, + "loss": 11.4958, + "loss/aux_loss": 0.04808150306344032, + "loss/crossentropy": 2.6033548295497893, + "loss/logits": 0.8426315069198609, + "step": 33100 + }, + { + "epoch": 0.3311, + "grad_norm": 13.3125, + "grad_norm_var": 0.43019205729166665, + "learning_rate": 0.0003, + "loss": 11.1088, + "loss/aux_loss": 0.048071819357573986, + "loss/crossentropy": 2.667391049861908, + "loss/logits": 0.8707249313592911, + "step": 33110 + }, + { + "epoch": 0.3312, + "grad_norm": 13.625, + "grad_norm_var": 0.5379557291666667, + "learning_rate": 0.0003, + "loss": 11.4546, + "loss/aux_loss": 0.04808641467243433, + "loss/crossentropy": 2.759235656261444, + "loss/logits": 0.8834040969610214, + "step": 33120 + }, + { + "epoch": 0.3313, + "grad_norm": 13.8125, + "grad_norm_var": 0.4339680989583333, + "learning_rate": 0.0003, + "loss": 11.4916, + "loss/aux_loss": 0.04807813689112663, + "loss/crossentropy": 2.7543640404939653, + "loss/logits": 0.8365322396159172, + "step": 33130 + }, + { + "epoch": 0.3314, + "grad_norm": 14.125, + "grad_norm_var": 15.084488932291666, + "learning_rate": 0.0003, + "loss": 11.4041, + "loss/aux_loss": 0.048085601069033146, + "loss/crossentropy": 2.870689940452576, + "loss/logits": 0.850860208272934, + "step": 33140 + }, + { + "epoch": 0.3315, + "grad_norm": 13.5, + "grad_norm_var": 15.106705729166666, + "learning_rate": 0.0003, + "loss": 11.2983, + "loss/aux_loss": 0.04808761551976204, + "loss/crossentropy": 2.716045266389847, + "loss/logits": 0.8434042870998383, + "step": 33150 + }, + { + "epoch": 0.3316, + "grad_norm": 14.125, + "grad_norm_var": 0.5707682291666667, + "learning_rate": 0.0003, + "loss": 11.5516, + "loss/aux_loss": 0.04807737655937672, + "loss/crossentropy": 2.8646560847759246, + "loss/logits": 0.8435910433530808, + "step": 33160 + }, + { + "epoch": 0.3317, + "grad_norm": 13.4375, + "grad_norm_var": 1.0468098958333334, + "learning_rate": 0.0003, + "loss": 11.2852, + "loss/aux_loss": 0.04808404687792063, + "loss/crossentropy": 2.6018213868141173, + "loss/logits": 0.8262585073709487, + "step": 33170 + }, + { + "epoch": 0.3318, + "grad_norm": 12.75, + "grad_norm_var": 0.6083333333333333, + "learning_rate": 0.0003, + "loss": 11.3755, + "loss/aux_loss": 0.04808040447533131, + "loss/crossentropy": 2.752035117149353, + "loss/logits": 0.8619087725877762, + "step": 33180 + }, + { + "epoch": 0.3319, + "grad_norm": 13.1875, + "grad_norm_var": 0.5118326822916667, + "learning_rate": 0.0003, + "loss": 11.3865, + "loss/aux_loss": 0.048074712976813316, + "loss/crossentropy": 2.859244775772095, + "loss/logits": 0.8745595574378967, + "step": 33190 + }, + { + "epoch": 0.332, + "grad_norm": 14.125, + "grad_norm_var": 0.625244140625, + "learning_rate": 0.0003, + "loss": 11.3773, + "loss/aux_loss": 0.04807355534285307, + "loss/crossentropy": 2.8394315361976625, + "loss/logits": 0.8535150647163391, + "step": 33200 + }, + { + "epoch": 0.3321, + "grad_norm": 14.5, + "grad_norm_var": 0.6541015625, + "learning_rate": 0.0003, + "loss": 11.3814, + "loss/aux_loss": 0.04808756597340107, + "loss/crossentropy": 2.8331064164638518, + "loss/logits": 0.8748747378587722, + "step": 33210 + }, + { + "epoch": 0.3322, + "grad_norm": 14.8125, + "grad_norm_var": 1.4541015625, + "learning_rate": 0.0003, + "loss": 11.6009, + "loss/aux_loss": 0.048076229728758337, + "loss/crossentropy": 2.8229152381420137, + "loss/logits": 0.8776537507772446, + "step": 33220 + }, + { + "epoch": 0.3323, + "grad_norm": 13.1875, + "grad_norm_var": 0.5733723958333333, + "learning_rate": 0.0003, + "loss": 11.453, + "loss/aux_loss": 0.04807769935578108, + "loss/crossentropy": 2.744255816936493, + "loss/logits": 0.8375910878181457, + "step": 33230 + }, + { + "epoch": 0.3324, + "grad_norm": 15.25, + "grad_norm_var": 1.1174479166666667, + "learning_rate": 0.0003, + "loss": 11.5114, + "loss/aux_loss": 0.048086580075323584, + "loss/crossentropy": 2.7748912930488587, + "loss/logits": 0.8750550776720047, + "step": 33240 + }, + { + "epoch": 0.3325, + "grad_norm": 12.75, + "grad_norm_var": 1.4270182291666667, + "learning_rate": 0.0003, + "loss": 11.4482, + "loss/aux_loss": 0.04807029981166124, + "loss/crossentropy": 2.785758376121521, + "loss/logits": 0.8546015530824661, + "step": 33250 + }, + { + "epoch": 0.3326, + "grad_norm": 13.0625, + "grad_norm_var": 0.21302083333333333, + "learning_rate": 0.0003, + "loss": 11.4774, + "loss/aux_loss": 0.048079310730099675, + "loss/crossentropy": 2.7421350955963133, + "loss/logits": 0.858822014927864, + "step": 33260 + }, + { + "epoch": 0.3327, + "grad_norm": 13.625, + "grad_norm_var": 0.18229166666666666, + "learning_rate": 0.0003, + "loss": 11.3066, + "loss/aux_loss": 0.04808879122138023, + "loss/crossentropy": 2.7908874809741975, + "loss/logits": 0.877889646589756, + "step": 33270 + }, + { + "epoch": 0.3328, + "grad_norm": 13.625, + "grad_norm_var": 0.5369791666666667, + "learning_rate": 0.0003, + "loss": 11.4317, + "loss/aux_loss": 0.048072985000908376, + "loss/crossentropy": 2.6791608691215516, + "loss/logits": 0.8541211634874344, + "step": 33280 + }, + { + "epoch": 0.3329, + "grad_norm": 56.0, + "grad_norm_var": 114.448681640625, + "learning_rate": 0.0003, + "loss": 11.4201, + "loss/aux_loss": 0.0480899965390563, + "loss/crossentropy": 2.7793687105178835, + "loss/logits": 0.8533193141222, + "step": 33290 + }, + { + "epoch": 0.333, + "grad_norm": 13.375, + "grad_norm_var": 113.77381184895833, + "learning_rate": 0.0003, + "loss": 11.5771, + "loss/aux_loss": 0.04808140806853771, + "loss/crossentropy": 2.759211188554764, + "loss/logits": 0.847288829088211, + "step": 33300 + }, + { + "epoch": 0.3331, + "grad_norm": 13.4375, + "grad_norm_var": 0.36139322916666666, + "learning_rate": 0.0003, + "loss": 11.5233, + "loss/aux_loss": 0.04808169547468424, + "loss/crossentropy": 2.6220255315303804, + "loss/logits": 0.8704487830400467, + "step": 33310 + }, + { + "epoch": 0.3332, + "grad_norm": 13.4375, + "grad_norm_var": 0.392431640625, + "learning_rate": 0.0003, + "loss": 11.3581, + "loss/aux_loss": 0.0480834748595953, + "loss/crossentropy": 2.7258807718753815, + "loss/logits": 0.8605304449796677, + "step": 33320 + }, + { + "epoch": 0.3333, + "grad_norm": 12.5625, + "grad_norm_var": 0.46990559895833334, + "learning_rate": 0.0003, + "loss": 11.3325, + "loss/aux_loss": 0.048091739602386954, + "loss/crossentropy": 2.7206650257110594, + "loss/logits": 0.8408534616231919, + "step": 33330 + }, + { + "epoch": 0.3334, + "grad_norm": 13.5, + "grad_norm_var": 0.253759765625, + "learning_rate": 0.0003, + "loss": 11.4986, + "loss/aux_loss": 0.04806930366903543, + "loss/crossentropy": 2.7812957525253297, + "loss/logits": 0.855445483326912, + "step": 33340 + }, + { + "epoch": 0.3335, + "grad_norm": 13.625, + "grad_norm_var": 0.506494140625, + "learning_rate": 0.0003, + "loss": 11.4043, + "loss/aux_loss": 0.04807830974459648, + "loss/crossentropy": 2.7645578622817992, + "loss/logits": 0.8752608805894851, + "step": 33350 + }, + { + "epoch": 0.3336, + "grad_norm": 13.8125, + "grad_norm_var": 0.8958333333333334, + "learning_rate": 0.0003, + "loss": 11.386, + "loss/aux_loss": 0.04809688944369554, + "loss/crossentropy": 2.757777750492096, + "loss/logits": 0.8512715846300125, + "step": 33360 + }, + { + "epoch": 0.3337, + "grad_norm": 15.5, + "grad_norm_var": 0.689306640625, + "learning_rate": 0.0003, + "loss": 11.4614, + "loss/aux_loss": 0.048079153336584565, + "loss/crossentropy": 2.7252914190292357, + "loss/logits": 0.8573682248592377, + "step": 33370 + }, + { + "epoch": 0.3338, + "grad_norm": 13.625, + "grad_norm_var": 1085.8733723958333, + "learning_rate": 0.0003, + "loss": 11.4558, + "loss/aux_loss": 0.04808936920017004, + "loss/crossentropy": 2.648952716588974, + "loss/logits": 0.8360880434513092, + "step": 33380 + }, + { + "epoch": 0.3339, + "grad_norm": 14.0, + "grad_norm_var": 1096.3770182291667, + "learning_rate": 0.0003, + "loss": 11.2424, + "loss/aux_loss": 0.04808183200657368, + "loss/crossentropy": 2.7101231694221495, + "loss/logits": 0.8388356804847718, + "step": 33390 + }, + { + "epoch": 0.334, + "grad_norm": 14.0, + "grad_norm_var": 46.952457682291666, + "learning_rate": 0.0003, + "loss": 11.4011, + "loss/aux_loss": 0.048093979991972444, + "loss/crossentropy": 2.7197480618953707, + "loss/logits": 0.8777379095554352, + "step": 33400 + }, + { + "epoch": 0.3341, + "grad_norm": 12.75, + "grad_norm_var": 0.5402180989583333, + "learning_rate": 0.0003, + "loss": 11.3751, + "loss/aux_loss": 0.04807923678308725, + "loss/crossentropy": 2.874398422241211, + "loss/logits": 0.908473339676857, + "step": 33410 + }, + { + "epoch": 0.3342, + "grad_norm": 14.75, + "grad_norm_var": 0.653759765625, + "learning_rate": 0.0003, + "loss": 11.3637, + "loss/aux_loss": 0.04807525873184204, + "loss/crossentropy": 2.8315866231918334, + "loss/logits": 0.8731096774339676, + "step": 33420 + }, + { + "epoch": 0.3343, + "grad_norm": 13.0, + "grad_norm_var": 0.6478515625, + "learning_rate": 0.0003, + "loss": 11.4268, + "loss/aux_loss": 0.04808218479156494, + "loss/crossentropy": 2.957446539402008, + "loss/logits": 0.869893753528595, + "step": 33430 + }, + { + "epoch": 0.3344, + "grad_norm": 12.5, + "grad_norm_var": 0.5835774739583334, + "learning_rate": 0.0003, + "loss": 11.4248, + "loss/aux_loss": 0.04807921797037125, + "loss/crossentropy": 2.8220800697803496, + "loss/logits": 0.8761366009712219, + "step": 33440 + }, + { + "epoch": 0.3345, + "grad_norm": 15.75, + "grad_norm_var": 0.9004557291666667, + "learning_rate": 0.0003, + "loss": 11.4527, + "loss/aux_loss": 0.04809036403894425, + "loss/crossentropy": 2.7487100541591643, + "loss/logits": 0.863110476732254, + "step": 33450 + }, + { + "epoch": 0.3346, + "grad_norm": 14.9375, + "grad_norm_var": 0.5707682291666667, + "learning_rate": 0.0003, + "loss": 11.388, + "loss/aux_loss": 0.04808029551059008, + "loss/crossentropy": 2.7007455945014955, + "loss/logits": 0.8433271735906601, + "step": 33460 + }, + { + "epoch": 0.3347, + "grad_norm": 15.0625, + "grad_norm_var": 0.30911458333333336, + "learning_rate": 0.0003, + "loss": 11.2174, + "loss/aux_loss": 0.04807520732283592, + "loss/crossentropy": 2.7608300507068635, + "loss/logits": 0.8681216955184936, + "step": 33470 + }, + { + "epoch": 0.3348, + "grad_norm": 51.25, + "grad_norm_var": 86.63619791666666, + "learning_rate": 0.0003, + "loss": 11.4039, + "loss/aux_loss": 0.0480832202360034, + "loss/crossentropy": 2.7710861444473265, + "loss/logits": 0.8367562472820282, + "step": 33480 + }, + { + "epoch": 0.3349, + "grad_norm": 12.25, + "grad_norm_var": 88.547119140625, + "learning_rate": 0.0003, + "loss": 11.3695, + "loss/aux_loss": 0.04809240307658911, + "loss/crossentropy": 2.803847813606262, + "loss/logits": 0.862313050031662, + "step": 33490 + }, + { + "epoch": 0.335, + "grad_norm": 12.875, + "grad_norm_var": 0.628369140625, + "learning_rate": 0.0003, + "loss": 11.3663, + "loss/aux_loss": 0.04807665664702654, + "loss/crossentropy": 2.6767329633235932, + "loss/logits": 0.8725397795438766, + "step": 33500 + }, + { + "epoch": 0.3351, + "grad_norm": 13.5625, + "grad_norm_var": 0.30130208333333336, + "learning_rate": 0.0003, + "loss": 11.5062, + "loss/aux_loss": 0.04808883797377348, + "loss/crossentropy": 2.7930223047733307, + "loss/logits": 0.9104775667190552, + "step": 33510 + }, + { + "epoch": 0.3352, + "grad_norm": 13.5, + "grad_norm_var": 0.264697265625, + "learning_rate": 0.0003, + "loss": 11.3642, + "loss/aux_loss": 0.04807222187519074, + "loss/crossentropy": 2.809762644767761, + "loss/logits": 0.858039128780365, + "step": 33520 + }, + { + "epoch": 0.3353, + "grad_norm": 13.3125, + "grad_norm_var": 0.15755208333333334, + "learning_rate": 0.0003, + "loss": 11.3144, + "loss/aux_loss": 0.04808267746120691, + "loss/crossentropy": 2.903217875957489, + "loss/logits": 0.8900675117969513, + "step": 33530 + }, + { + "epoch": 0.3354, + "grad_norm": 13.375, + "grad_norm_var": 0.2572265625, + "learning_rate": 0.0003, + "loss": 11.4675, + "loss/aux_loss": 0.048075296357274054, + "loss/crossentropy": 2.8689566016197205, + "loss/logits": 0.8549737244844436, + "step": 33540 + }, + { + "epoch": 0.3355, + "grad_norm": 13.1875, + "grad_norm_var": 0.5383951822916667, + "learning_rate": 0.0003, + "loss": 11.259, + "loss/aux_loss": 0.04808174092322588, + "loss/crossentropy": 2.632620471715927, + "loss/logits": 0.8442192494869232, + "step": 33550 + }, + { + "epoch": 0.3356, + "grad_norm": 13.5, + "grad_norm_var": 0.679150390625, + "learning_rate": 0.0003, + "loss": 11.506, + "loss/aux_loss": 0.048078482039272785, + "loss/crossentropy": 2.8332688093185423, + "loss/logits": 0.8708831310272217, + "step": 33560 + }, + { + "epoch": 0.3357, + "grad_norm": 13.75, + "grad_norm_var": 0.7718098958333334, + "learning_rate": 0.0003, + "loss": 11.5518, + "loss/aux_loss": 0.048078897222876546, + "loss/crossentropy": 2.7509835004806518, + "loss/logits": 0.8593619883060455, + "step": 33570 + }, + { + "epoch": 0.3358, + "grad_norm": 14.0, + "grad_norm_var": 1.4572265625, + "learning_rate": 0.0003, + "loss": 11.3659, + "loss/aux_loss": 0.04807315096259117, + "loss/crossentropy": 2.7531749844551086, + "loss/logits": 0.8511229604482651, + "step": 33580 + }, + { + "epoch": 0.3359, + "grad_norm": 12.875, + "grad_norm_var": 0.23118489583333332, + "learning_rate": 0.0003, + "loss": 11.5134, + "loss/aux_loss": 0.04808947648853064, + "loss/crossentropy": 2.7343482613563537, + "loss/logits": 0.8925404042005539, + "step": 33590 + }, + { + "epoch": 0.336, + "grad_norm": 12.75, + "grad_norm_var": 0.47708333333333336, + "learning_rate": 0.0003, + "loss": 11.3391, + "loss/aux_loss": 0.04807898830622435, + "loss/crossentropy": 2.814681512117386, + "loss/logits": 0.8659243017435074, + "step": 33600 + }, + { + "epoch": 0.3361, + "grad_norm": 15.0, + "grad_norm_var": 0.6880208333333333, + "learning_rate": 0.0003, + "loss": 11.575, + "loss/aux_loss": 0.04807465691119432, + "loss/crossentropy": 2.740473783016205, + "loss/logits": 0.8824987977743148, + "step": 33610 + }, + { + "epoch": 0.3362, + "grad_norm": 14.5625, + "grad_norm_var": 0.4078125, + "learning_rate": 0.0003, + "loss": 11.3833, + "loss/aux_loss": 0.04808358158916235, + "loss/crossentropy": 2.7260211586952208, + "loss/logits": 0.8684123188257218, + "step": 33620 + }, + { + "epoch": 0.3363, + "grad_norm": 14.125, + "grad_norm_var": 0.2, + "learning_rate": 0.0003, + "loss": 11.3365, + "loss/aux_loss": 0.048076849430799484, + "loss/crossentropy": 2.7123505532741548, + "loss/logits": 0.8516561061143875, + "step": 33630 + }, + { + "epoch": 0.3364, + "grad_norm": 13.9375, + "grad_norm_var": 0.17185872395833332, + "learning_rate": 0.0003, + "loss": 11.2675, + "loss/aux_loss": 0.048073905520141125, + "loss/crossentropy": 2.894002687931061, + "loss/logits": 0.8935140758752823, + "step": 33640 + }, + { + "epoch": 0.3365, + "grad_norm": 14.0, + "grad_norm_var": 0.661962890625, + "learning_rate": 0.0003, + "loss": 11.4262, + "loss/aux_loss": 0.04808433558791876, + "loss/crossentropy": 2.725138372182846, + "loss/logits": 0.8834821820259094, + "step": 33650 + }, + { + "epoch": 0.3366, + "grad_norm": 13.75, + "grad_norm_var": 0.515625, + "learning_rate": 0.0003, + "loss": 11.3247, + "loss/aux_loss": 0.048078181222081184, + "loss/crossentropy": 2.6882384717464447, + "loss/logits": 0.8374345928430558, + "step": 33660 + }, + { + "epoch": 0.3367, + "grad_norm": 12.6875, + "grad_norm_var": 0.9149576822916666, + "learning_rate": 0.0003, + "loss": 11.3953, + "loss/aux_loss": 0.048090609908103946, + "loss/crossentropy": 2.7942125260829926, + "loss/logits": 0.8645864456892014, + "step": 33670 + }, + { + "epoch": 0.3368, + "grad_norm": 13.6875, + "grad_norm_var": 0.50625, + "learning_rate": 0.0003, + "loss": 11.61, + "loss/aux_loss": 0.04808273129165173, + "loss/crossentropy": 2.7299344420433043, + "loss/logits": 0.8819968163967132, + "step": 33680 + }, + { + "epoch": 0.3369, + "grad_norm": 14.0625, + "grad_norm_var": 4.205843098958334, + "learning_rate": 0.0003, + "loss": 11.5101, + "loss/aux_loss": 0.048078842274844644, + "loss/crossentropy": 2.6255062937736513, + "loss/logits": 0.8747300773859024, + "step": 33690 + }, + { + "epoch": 0.337, + "grad_norm": 13.1875, + "grad_norm_var": 0.42916666666666664, + "learning_rate": 0.0003, + "loss": 11.4736, + "loss/aux_loss": 0.04807350095361471, + "loss/crossentropy": 2.7771036982536317, + "loss/logits": 0.8716427236795425, + "step": 33700 + }, + { + "epoch": 0.3371, + "grad_norm": 13.8125, + "grad_norm_var": 0.4578125, + "learning_rate": 0.0003, + "loss": 11.5287, + "loss/aux_loss": 0.04808547291904688, + "loss/crossentropy": 2.8936782777309418, + "loss/logits": 0.8637538403272629, + "step": 33710 + }, + { + "epoch": 0.3372, + "grad_norm": 14.9375, + "grad_norm_var": 0.39817708333333335, + "learning_rate": 0.0003, + "loss": 11.4074, + "loss/aux_loss": 0.04807719625532627, + "loss/crossentropy": 2.781727874279022, + "loss/logits": 0.8768081456422806, + "step": 33720 + }, + { + "epoch": 0.3373, + "grad_norm": 14.0625, + "grad_norm_var": 6.6869140625, + "learning_rate": 0.0003, + "loss": 11.4137, + "loss/aux_loss": 0.04808393493294716, + "loss/crossentropy": 2.816925013065338, + "loss/logits": 0.857237920165062, + "step": 33730 + }, + { + "epoch": 0.3374, + "grad_norm": 12.625, + "grad_norm_var": 0.618603515625, + "learning_rate": 0.0003, + "loss": 11.3942, + "loss/aux_loss": 0.04807567745447159, + "loss/crossentropy": 2.733104008436203, + "loss/logits": 0.8575152397155762, + "step": 33740 + }, + { + "epoch": 0.3375, + "grad_norm": 13.4375, + "grad_norm_var": 0.35130208333333335, + "learning_rate": 0.0003, + "loss": 11.4799, + "loss/aux_loss": 0.04807877670973539, + "loss/crossentropy": 2.8148476839065553, + "loss/logits": 0.8529479697346687, + "step": 33750 + }, + { + "epoch": 0.3376, + "grad_norm": 14.5625, + "grad_norm_var": 0.7265625, + "learning_rate": 0.0003, + "loss": 11.327, + "loss/aux_loss": 0.0480769969522953, + "loss/crossentropy": 2.7635378301143647, + "loss/logits": 0.8825518250465393, + "step": 33760 + }, + { + "epoch": 0.3377, + "grad_norm": 13.125, + "grad_norm_var": 0.34479166666666666, + "learning_rate": 0.0003, + "loss": 11.2382, + "loss/aux_loss": 0.04808934032917023, + "loss/crossentropy": 2.5587519288063048, + "loss/logits": 0.803919005393982, + "step": 33770 + }, + { + "epoch": 0.3378, + "grad_norm": 14.9375, + "grad_norm_var": 3.247900390625, + "learning_rate": 0.0003, + "loss": 11.5012, + "loss/aux_loss": 0.04807521179318428, + "loss/crossentropy": 2.668776106834412, + "loss/logits": 0.8375656992197037, + "step": 33780 + }, + { + "epoch": 0.3379, + "grad_norm": 13.75, + "grad_norm_var": 3.1749837239583334, + "learning_rate": 0.0003, + "loss": 11.2613, + "loss/aux_loss": 0.04808988273143768, + "loss/crossentropy": 2.650000900030136, + "loss/logits": 0.8299892216920852, + "step": 33790 + }, + { + "epoch": 0.338, + "grad_norm": 14.75, + "grad_norm_var": 7.923030598958333, + "learning_rate": 0.0003, + "loss": 11.405, + "loss/aux_loss": 0.04808232057839632, + "loss/crossentropy": 2.8221355438232423, + "loss/logits": 0.8894154459238053, + "step": 33800 + }, + { + "epoch": 0.3381, + "grad_norm": 12.625, + "grad_norm_var": 0.47980143229166666, + "learning_rate": 0.0003, + "loss": 11.503, + "loss/aux_loss": 0.04806968811899424, + "loss/crossentropy": 2.8208815813064576, + "loss/logits": 0.896739274263382, + "step": 33810 + }, + { + "epoch": 0.3382, + "grad_norm": 13.375, + "grad_norm_var": 0.46087239583333334, + "learning_rate": 0.0003, + "loss": 11.3597, + "loss/aux_loss": 0.048082143254578116, + "loss/crossentropy": 2.549110287427902, + "loss/logits": 0.8261379420757293, + "step": 33820 + }, + { + "epoch": 0.3383, + "grad_norm": 14.3125, + "grad_norm_var": 0.6005208333333333, + "learning_rate": 0.0003, + "loss": 11.6718, + "loss/aux_loss": 0.048074525967240336, + "loss/crossentropy": 2.8008286237716673, + "loss/logits": 0.9250722289085388, + "step": 33830 + }, + { + "epoch": 0.3384, + "grad_norm": 13.6875, + "grad_norm_var": 0.25128580729166666, + "learning_rate": 0.0003, + "loss": 11.343, + "loss/aux_loss": 0.04808689635246992, + "loss/crossentropy": 2.663837468624115, + "loss/logits": 0.8424749076366425, + "step": 33840 + }, + { + "epoch": 0.3385, + "grad_norm": 13.75, + "grad_norm_var": 1.8880208333333333, + "learning_rate": 0.0003, + "loss": 11.5091, + "loss/aux_loss": 0.04809404145926237, + "loss/crossentropy": 2.8522875905036926, + "loss/logits": 0.8729503244161606, + "step": 33850 + }, + { + "epoch": 0.3386, + "grad_norm": 16.625, + "grad_norm_var": 0.809228515625, + "learning_rate": 0.0003, + "loss": 11.449, + "loss/aux_loss": 0.04807314351201057, + "loss/crossentropy": 2.9294650077819826, + "loss/logits": 0.8873317569494248, + "step": 33860 + }, + { + "epoch": 0.3387, + "grad_norm": 15.0, + "grad_norm_var": 1.1075358072916666, + "learning_rate": 0.0003, + "loss": 11.4419, + "loss/aux_loss": 0.048078053072094916, + "loss/crossentropy": 2.893061339855194, + "loss/logits": 0.9111079752445221, + "step": 33870 + }, + { + "epoch": 0.3388, + "grad_norm": 14.125, + "grad_norm_var": 2.179541015625, + "learning_rate": 0.0003, + "loss": 11.3273, + "loss/aux_loss": 0.04808342736214399, + "loss/crossentropy": 2.602770173549652, + "loss/logits": 0.81967893242836, + "step": 33880 + }, + { + "epoch": 0.3389, + "grad_norm": 13.3125, + "grad_norm_var": 0.9304524739583333, + "learning_rate": 0.0003, + "loss": 11.321, + "loss/aux_loss": 0.048087199591100215, + "loss/crossentropy": 2.5976479768753054, + "loss/logits": 0.8418799489736557, + "step": 33890 + }, + { + "epoch": 0.339, + "grad_norm": 13.5625, + "grad_norm_var": 0.8169108072916667, + "learning_rate": 0.0003, + "loss": 11.3486, + "loss/aux_loss": 0.04807840995490551, + "loss/crossentropy": 2.9478099584579467, + "loss/logits": 0.8819531232118607, + "step": 33900 + }, + { + "epoch": 0.3391, + "grad_norm": 13.1875, + "grad_norm_var": 0.41326497395833334, + "learning_rate": 0.0003, + "loss": 11.2091, + "loss/aux_loss": 0.04807400442659855, + "loss/crossentropy": 2.6751762211322783, + "loss/logits": 0.8382753849029541, + "step": 33910 + }, + { + "epoch": 0.3392, + "grad_norm": 12.5, + "grad_norm_var": 0.4864420572916667, + "learning_rate": 0.0003, + "loss": 11.3863, + "loss/aux_loss": 0.048082627542316916, + "loss/crossentropy": 2.67808051109314, + "loss/logits": 0.8579610645771026, + "step": 33920 + }, + { + "epoch": 0.3393, + "grad_norm": 13.25, + "grad_norm_var": 1.636572265625, + "learning_rate": 0.0003, + "loss": 11.5376, + "loss/aux_loss": 0.04807865135371685, + "loss/crossentropy": 2.8202176868915556, + "loss/logits": 0.8423859208822251, + "step": 33930 + }, + { + "epoch": 0.3394, + "grad_norm": 13.5625, + "grad_norm_var": 0.4158854166666667, + "learning_rate": 0.0003, + "loss": 11.3614, + "loss/aux_loss": 0.04808816146105528, + "loss/crossentropy": 2.5449154317378997, + "loss/logits": 0.8428879886865616, + "step": 33940 + }, + { + "epoch": 0.3395, + "grad_norm": 13.25, + "grad_norm_var": 0.5692057291666667, + "learning_rate": 0.0003, + "loss": 11.22, + "loss/aux_loss": 0.04808717239648104, + "loss/crossentropy": 2.7124799370765684, + "loss/logits": 0.8479482620954514, + "step": 33950 + }, + { + "epoch": 0.3396, + "grad_norm": 12.9375, + "grad_norm_var": 0.35833333333333334, + "learning_rate": 0.0003, + "loss": 11.4288, + "loss/aux_loss": 0.04808026142418385, + "loss/crossentropy": 2.769565761089325, + "loss/logits": 0.8855602651834488, + "step": 33960 + }, + { + "epoch": 0.3397, + "grad_norm": 12.875, + "grad_norm_var": 0.22849934895833332, + "learning_rate": 0.0003, + "loss": 11.3179, + "loss/aux_loss": 0.04807202909141779, + "loss/crossentropy": 2.7668771505355836, + "loss/logits": 0.8770667523145675, + "step": 33970 + }, + { + "epoch": 0.3398, + "grad_norm": 12.625, + "grad_norm_var": 0.37303059895833335, + "learning_rate": 0.0003, + "loss": 11.3418, + "loss/aux_loss": 0.04808063618838787, + "loss/crossentropy": 2.774995541572571, + "loss/logits": 0.8627175658941268, + "step": 33980 + }, + { + "epoch": 0.3399, + "grad_norm": 12.625, + "grad_norm_var": 0.19869791666666667, + "learning_rate": 0.0003, + "loss": 11.2925, + "loss/aux_loss": 0.04808147568255663, + "loss/crossentropy": 2.731263720989227, + "loss/logits": 0.8523558408021927, + "step": 33990 + }, + { + "epoch": 0.34, + "grad_norm": 12.6875, + "grad_norm_var": 0.6572916666666667, + "learning_rate": 0.0003, + "loss": 11.4506, + "loss/aux_loss": 0.048083949461579324, + "loss/crossentropy": 2.6820335149765016, + "loss/logits": 0.8677924752235413, + "step": 34000 + }, + { + "epoch": 0.3401, + "grad_norm": 14.0, + "grad_norm_var": 0.484619140625, + "learning_rate": 0.0003, + "loss": 11.49, + "loss/aux_loss": 0.04807724487036467, + "loss/crossentropy": 2.8576271653175356, + "loss/logits": 0.886279183626175, + "step": 34010 + }, + { + "epoch": 0.3402, + "grad_norm": 13.125, + "grad_norm_var": 0.33203125, + "learning_rate": 0.0003, + "loss": 11.2666, + "loss/aux_loss": 0.048078724555671214, + "loss/crossentropy": 2.695602595806122, + "loss/logits": 0.8482803136110306, + "step": 34020 + }, + { + "epoch": 0.3403, + "grad_norm": 13.1875, + "grad_norm_var": 0.272900390625, + "learning_rate": 0.0003, + "loss": 11.3761, + "loss/aux_loss": 0.04807147961109877, + "loss/crossentropy": 2.681200659275055, + "loss/logits": 0.8647918730974198, + "step": 34030 + }, + { + "epoch": 0.3404, + "grad_norm": 13.1875, + "grad_norm_var": 21.075764973958332, + "learning_rate": 0.0003, + "loss": 11.387, + "loss/aux_loss": 0.04808759596198797, + "loss/crossentropy": 2.7159022450447083, + "loss/logits": 0.8712036728858947, + "step": 34040 + }, + { + "epoch": 0.3405, + "grad_norm": 13.5625, + "grad_norm_var": 21.0384765625, + "learning_rate": 0.0003, + "loss": 11.5856, + "loss/aux_loss": 0.048080033622682095, + "loss/crossentropy": 2.847675824165344, + "loss/logits": 0.8919312745332718, + "step": 34050 + }, + { + "epoch": 0.3406, + "grad_norm": 13.75, + "grad_norm_var": 0.22180989583333333, + "learning_rate": 0.0003, + "loss": 11.4877, + "loss/aux_loss": 0.048076984100043775, + "loss/crossentropy": 2.7297983527183534, + "loss/logits": 0.8955170571804046, + "step": 34060 + }, + { + "epoch": 0.3407, + "grad_norm": 12.6875, + "grad_norm_var": 0.26712239583333336, + "learning_rate": 0.0003, + "loss": 11.4626, + "loss/aux_loss": 0.048071731068193915, + "loss/crossentropy": 2.777487003803253, + "loss/logits": 0.8421605467796326, + "step": 34070 + }, + { + "epoch": 0.3408, + "grad_norm": 16.75, + "grad_norm_var": 1.576025390625, + "learning_rate": 0.0003, + "loss": 11.5644, + "loss/aux_loss": 0.048079511150717735, + "loss/crossentropy": 2.791468983888626, + "loss/logits": 0.8447980105876922, + "step": 34080 + }, + { + "epoch": 0.3409, + "grad_norm": 14.5625, + "grad_norm_var": 113.454541015625, + "learning_rate": 0.0003, + "loss": 11.3856, + "loss/aux_loss": 0.04809570461511612, + "loss/crossentropy": 2.746097815036774, + "loss/logits": 0.8729157716035842, + "step": 34090 + }, + { + "epoch": 0.341, + "grad_norm": 13.125, + "grad_norm_var": 114.9181640625, + "learning_rate": 0.0003, + "loss": 11.2094, + "loss/aux_loss": 0.04807548206299543, + "loss/crossentropy": 2.744466412067413, + "loss/logits": 0.8620479941368103, + "step": 34100 + }, + { + "epoch": 0.3411, + "grad_norm": 13.375, + "grad_norm_var": 0.9307291666666667, + "learning_rate": 0.0003, + "loss": 11.2762, + "loss/aux_loss": 0.04807662982493639, + "loss/crossentropy": 2.7420936226844788, + "loss/logits": 0.8486543864011764, + "step": 34110 + }, + { + "epoch": 0.3412, + "grad_norm": 14.375, + "grad_norm_var": 0.5926920572916666, + "learning_rate": 0.0003, + "loss": 11.3183, + "loss/aux_loss": 0.04808496292680502, + "loss/crossentropy": 2.7532592713832855, + "loss/logits": 0.8521647185087204, + "step": 34120 + }, + { + "epoch": 0.3413, + "grad_norm": 13.8125, + "grad_norm_var": 0.19479166666666667, + "learning_rate": 0.0003, + "loss": 11.4643, + "loss/aux_loss": 0.04806942287832498, + "loss/crossentropy": 2.971810203790665, + "loss/logits": 0.9140418171882629, + "step": 34130 + }, + { + "epoch": 0.3414, + "grad_norm": 13.5, + "grad_norm_var": 0.6016764322916667, + "learning_rate": 0.0003, + "loss": 11.4657, + "loss/aux_loss": 0.048084068857133386, + "loss/crossentropy": 2.714846724271774, + "loss/logits": 0.8961813569068908, + "step": 34140 + }, + { + "epoch": 0.3415, + "grad_norm": 13.125, + "grad_norm_var": 0.86015625, + "learning_rate": 0.0003, + "loss": 11.4352, + "loss/aux_loss": 0.048082141764461996, + "loss/crossentropy": 2.7342415273189546, + "loss/logits": 0.8455929309129715, + "step": 34150 + }, + { + "epoch": 0.3416, + "grad_norm": 13.5625, + "grad_norm_var": 0.6512858072916666, + "learning_rate": 0.0003, + "loss": 11.2724, + "loss/aux_loss": 0.048071014508605, + "loss/crossentropy": 2.6547606706619264, + "loss/logits": 0.8492685943841934, + "step": 34160 + }, + { + "epoch": 0.3417, + "grad_norm": 14.75, + "grad_norm_var": 1.117431640625, + "learning_rate": 0.0003, + "loss": 11.502, + "loss/aux_loss": 0.04807817898690701, + "loss/crossentropy": 2.7856763303279877, + "loss/logits": 0.8842839747667313, + "step": 34170 + }, + { + "epoch": 0.3418, + "grad_norm": 13.375, + "grad_norm_var": 0.5221354166666666, + "learning_rate": 0.0003, + "loss": 11.3981, + "loss/aux_loss": 0.04807715006172657, + "loss/crossentropy": 2.708691877126694, + "loss/logits": 0.8723496258258819, + "step": 34180 + }, + { + "epoch": 0.3419, + "grad_norm": 13.375, + "grad_norm_var": 0.45362955729166665, + "learning_rate": 0.0003, + "loss": 11.3953, + "loss/aux_loss": 0.04807659070938826, + "loss/crossentropy": 2.7593763947486876, + "loss/logits": 0.8531175792217255, + "step": 34190 + }, + { + "epoch": 0.342, + "grad_norm": 13.875, + "grad_norm_var": 0.21066080729166667, + "learning_rate": 0.0003, + "loss": 11.2668, + "loss/aux_loss": 0.04807346910238266, + "loss/crossentropy": 2.825400298833847, + "loss/logits": 0.8504022687673569, + "step": 34200 + }, + { + "epoch": 0.3421, + "grad_norm": 13.6875, + "grad_norm_var": 0.6419108072916667, + "learning_rate": 0.0003, + "loss": 11.3772, + "loss/aux_loss": 0.048073016293346885, + "loss/crossentropy": 2.614541435241699, + "loss/logits": 0.8537357658147812, + "step": 34210 + }, + { + "epoch": 0.3422, + "grad_norm": 16.75, + "grad_norm_var": 1.0940104166666667, + "learning_rate": 0.0003, + "loss": 11.2334, + "loss/aux_loss": 0.04808624424040318, + "loss/crossentropy": 2.588237798213959, + "loss/logits": 0.8261738806962967, + "step": 34220 + }, + { + "epoch": 0.3423, + "grad_norm": 13.6875, + "grad_norm_var": 1.0035807291666667, + "learning_rate": 0.0003, + "loss": 11.3177, + "loss/aux_loss": 0.04808156695216894, + "loss/crossentropy": 2.766414910554886, + "loss/logits": 0.8738790214061737, + "step": 34230 + }, + { + "epoch": 0.3424, + "grad_norm": 13.6875, + "grad_norm_var": 0.32024739583333334, + "learning_rate": 0.0003, + "loss": 11.1505, + "loss/aux_loss": 0.04807690214365721, + "loss/crossentropy": 2.6533170878887176, + "loss/logits": 0.8381021320819855, + "step": 34240 + }, + { + "epoch": 0.3425, + "grad_norm": 13.125, + "grad_norm_var": 0.4025390625, + "learning_rate": 0.0003, + "loss": 11.3866, + "loss/aux_loss": 0.048076437786221504, + "loss/crossentropy": 2.852277064323425, + "loss/logits": 0.8767792642116546, + "step": 34250 + }, + { + "epoch": 0.3426, + "grad_norm": 12.875, + "grad_norm_var": 0.3776041666666667, + "learning_rate": 0.0003, + "loss": 11.3834, + "loss/aux_loss": 0.04808211978524923, + "loss/crossentropy": 2.8015721797943116, + "loss/logits": 0.8706277936697007, + "step": 34260 + }, + { + "epoch": 0.3427, + "grad_norm": 13.125, + "grad_norm_var": 2.2030598958333334, + "learning_rate": 0.0003, + "loss": 11.3862, + "loss/aux_loss": 0.048078938759863375, + "loss/crossentropy": 2.790551495552063, + "loss/logits": 0.8879824995994567, + "step": 34270 + }, + { + "epoch": 0.3428, + "grad_norm": 13.3125, + "grad_norm_var": 2.0182291666666665, + "learning_rate": 0.0003, + "loss": 11.4202, + "loss/aux_loss": 0.048073595948517324, + "loss/crossentropy": 2.634516406059265, + "loss/logits": 0.8375965476036071, + "step": 34280 + }, + { + "epoch": 0.3429, + "grad_norm": 14.5, + "grad_norm_var": 0.3700358072916667, + "learning_rate": 0.0003, + "loss": 11.1732, + "loss/aux_loss": 0.048077768087387084, + "loss/crossentropy": 2.8296406984329225, + "loss/logits": 0.8416439831256867, + "step": 34290 + }, + { + "epoch": 0.343, + "grad_norm": 14.8125, + "grad_norm_var": 8.235139973958333, + "learning_rate": 0.0003, + "loss": 11.4395, + "loss/aux_loss": 0.04807805363088846, + "loss/crossentropy": 2.6985132932662963, + "loss/logits": 0.8508864104747772, + "step": 34300 + }, + { + "epoch": 0.3431, + "grad_norm": 12.8125, + "grad_norm_var": 8.224934895833334, + "learning_rate": 0.0003, + "loss": 11.3689, + "loss/aux_loss": 0.04808331541717052, + "loss/crossentropy": 2.7489245235919952, + "loss/logits": 0.8492704391479492, + "step": 34310 + }, + { + "epoch": 0.3432, + "grad_norm": 15.5, + "grad_norm_var": 0.6957682291666667, + "learning_rate": 0.0003, + "loss": 11.3776, + "loss/aux_loss": 0.048074118047952655, + "loss/crossentropy": 2.6718755304813384, + "loss/logits": 0.8244736731052399, + "step": 34320 + }, + { + "epoch": 0.3433, + "grad_norm": 14.0, + "grad_norm_var": 0.699853515625, + "learning_rate": 0.0003, + "loss": 11.3257, + "loss/aux_loss": 0.04808854255825281, + "loss/crossentropy": 2.5519628286361695, + "loss/logits": 0.8258508026599884, + "step": 34330 + }, + { + "epoch": 0.3434, + "grad_norm": 13.5625, + "grad_norm_var": 0.9551920572916667, + "learning_rate": 0.0003, + "loss": 11.2696, + "loss/aux_loss": 0.048077429085969924, + "loss/crossentropy": 2.7307616233825684, + "loss/logits": 0.8781674951314926, + "step": 34340 + }, + { + "epoch": 0.3435, + "grad_norm": 12.875, + "grad_norm_var": 0.2400390625, + "learning_rate": 0.0003, + "loss": 11.5434, + "loss/aux_loss": 0.0480849402025342, + "loss/crossentropy": 2.7033145487308503, + "loss/logits": 0.8667132765054703, + "step": 34350 + }, + { + "epoch": 0.3436, + "grad_norm": 13.625, + "grad_norm_var": 0.24348958333333334, + "learning_rate": 0.0003, + "loss": 11.2203, + "loss/aux_loss": 0.048083757422864436, + "loss/crossentropy": 2.7016442120075226, + "loss/logits": 0.8575159192085267, + "step": 34360 + }, + { + "epoch": 0.3437, + "grad_norm": 13.25, + "grad_norm_var": 0.5801432291666667, + "learning_rate": 0.0003, + "loss": 11.4771, + "loss/aux_loss": 0.04808876011520624, + "loss/crossentropy": 2.7391174018383024, + "loss/logits": 0.864966481924057, + "step": 34370 + }, + { + "epoch": 0.3438, + "grad_norm": 13.0625, + "grad_norm_var": 0.8673014322916667, + "learning_rate": 0.0003, + "loss": 11.489, + "loss/aux_loss": 0.04807764030992985, + "loss/crossentropy": 2.7683672428131105, + "loss/logits": 0.8739204913377762, + "step": 34380 + }, + { + "epoch": 0.3439, + "grad_norm": 13.1875, + "grad_norm_var": 0.6416015625, + "learning_rate": 0.0003, + "loss": 11.2858, + "loss/aux_loss": 0.04807857647538185, + "loss/crossentropy": 2.7284740686416624, + "loss/logits": 0.8791959375143051, + "step": 34390 + }, + { + "epoch": 0.344, + "grad_norm": 13.9375, + "grad_norm_var": 0.7230305989583333, + "learning_rate": 0.0003, + "loss": 11.4192, + "loss/aux_loss": 0.04808525741100311, + "loss/crossentropy": 2.701452487707138, + "loss/logits": 0.8486621975898743, + "step": 34400 + }, + { + "epoch": 0.3441, + "grad_norm": 14.0, + "grad_norm_var": 0.49420572916666666, + "learning_rate": 0.0003, + "loss": 11.4331, + "loss/aux_loss": 0.04808118660002947, + "loss/crossentropy": 2.7283570528030396, + "loss/logits": 0.8572055399417877, + "step": 34410 + }, + { + "epoch": 0.3442, + "grad_norm": 15.4375, + "grad_norm_var": 67.07941080729167, + "learning_rate": 0.0003, + "loss": 11.4184, + "loss/aux_loss": 0.04808388836681843, + "loss/crossentropy": 2.7710089802742006, + "loss/logits": 0.9001825273036956, + "step": 34420 + }, + { + "epoch": 0.3443, + "grad_norm": 14.375, + "grad_norm_var": 66.71573893229167, + "learning_rate": 0.0003, + "loss": 11.4897, + "loss/aux_loss": 0.04807852674275637, + "loss/crossentropy": 2.9153838396072387, + "loss/logits": 0.9058898121118546, + "step": 34430 + }, + { + "epoch": 0.3444, + "grad_norm": 13.4375, + "grad_norm_var": 0.4254557291666667, + "learning_rate": 0.0003, + "loss": 11.3548, + "loss/aux_loss": 0.048092870600521566, + "loss/crossentropy": 2.798048400878906, + "loss/logits": 0.8726730048656464, + "step": 34440 + }, + { + "epoch": 0.3445, + "grad_norm": 13.125, + "grad_norm_var": 2.915478515625, + "learning_rate": 0.0003, + "loss": 11.3393, + "loss/aux_loss": 0.04807271007448435, + "loss/crossentropy": 2.8184443950653075, + "loss/logits": 0.8590573251247406, + "step": 34450 + }, + { + "epoch": 0.3446, + "grad_norm": 12.625, + "grad_norm_var": 2.8291015625, + "learning_rate": 0.0003, + "loss": 11.3975, + "loss/aux_loss": 0.048090783134102824, + "loss/crossentropy": 2.57768235206604, + "loss/logits": 0.8573962718248367, + "step": 34460 + }, + { + "epoch": 0.3447, + "grad_norm": 15.3125, + "grad_norm_var": 3.8590983072916667, + "learning_rate": 0.0003, + "loss": 11.4101, + "loss/aux_loss": 0.0480766186490655, + "loss/crossentropy": 2.7841560423374174, + "loss/logits": 0.8446702927350997, + "step": 34470 + }, + { + "epoch": 0.3448, + "grad_norm": 13.9375, + "grad_norm_var": 497.44138997395834, + "learning_rate": 0.0003, + "loss": 11.3165, + "loss/aux_loss": 0.04808448310941458, + "loss/crossentropy": 2.6943975150585175, + "loss/logits": 0.8504249632358551, + "step": 34480 + }, + { + "epoch": 0.3449, + "grad_norm": 13.8125, + "grad_norm_var": 500.03435872395835, + "learning_rate": 0.0003, + "loss": 11.4362, + "loss/aux_loss": 0.04807149842381477, + "loss/crossentropy": 2.736327660083771, + "loss/logits": 0.900990754365921, + "step": 34490 + }, + { + "epoch": 0.345, + "grad_norm": 14.0, + "grad_norm_var": 0.38357747395833336, + "learning_rate": 0.0003, + "loss": 11.5657, + "loss/aux_loss": 0.04809287562966347, + "loss/crossentropy": 2.7342132091522218, + "loss/logits": 0.864747279882431, + "step": 34500 + }, + { + "epoch": 0.3451, + "grad_norm": 13.8125, + "grad_norm_var": 0.5598958333333334, + "learning_rate": 0.0003, + "loss": 11.3414, + "loss/aux_loss": 0.04807418640702963, + "loss/crossentropy": 2.7356625139713286, + "loss/logits": 0.8603394240140915, + "step": 34510 + }, + { + "epoch": 0.3452, + "grad_norm": 14.75, + "grad_norm_var": 15.267643229166667, + "learning_rate": 0.0003, + "loss": 11.451, + "loss/aux_loss": 0.048086957447230814, + "loss/crossentropy": 2.6491969525814056, + "loss/logits": 0.8844427525997162, + "step": 34520 + }, + { + "epoch": 0.3453, + "grad_norm": 13.125, + "grad_norm_var": 16.376546223958332, + "learning_rate": 0.0003, + "loss": 11.39, + "loss/aux_loss": 0.048073998652398586, + "loss/crossentropy": 2.768836522102356, + "loss/logits": 0.8581522196531296, + "step": 34530 + }, + { + "epoch": 0.3454, + "grad_norm": 14.5625, + "grad_norm_var": 0.8609212239583334, + "learning_rate": 0.0003, + "loss": 11.3267, + "loss/aux_loss": 0.04808975532650948, + "loss/crossentropy": 2.5816560626029967, + "loss/logits": 0.8240507543087006, + "step": 34540 + }, + { + "epoch": 0.3455, + "grad_norm": 13.3125, + "grad_norm_var": 0.42967122395833335, + "learning_rate": 0.0003, + "loss": 11.3352, + "loss/aux_loss": 0.048081275261938575, + "loss/crossentropy": 2.775824022293091, + "loss/logits": 0.8478355586528779, + "step": 34550 + }, + { + "epoch": 0.3456, + "grad_norm": 52.75, + "grad_norm_var": 95.89368489583333, + "learning_rate": 0.0003, + "loss": 11.4611, + "loss/aux_loss": 0.04807286318391561, + "loss/crossentropy": 2.7201479375362396, + "loss/logits": 0.8709542602300644, + "step": 34560 + }, + { + "epoch": 0.3457, + "grad_norm": 13.5625, + "grad_norm_var": 94.72369791666667, + "learning_rate": 0.0003, + "loss": 11.3639, + "loss/aux_loss": 0.04808838125318289, + "loss/crossentropy": 2.6683280885219576, + "loss/logits": 0.8334614604711532, + "step": 34570 + }, + { + "epoch": 0.3458, + "grad_norm": 13.125, + "grad_norm_var": 0.28828125, + "learning_rate": 0.0003, + "loss": 11.2168, + "loss/aux_loss": 0.048079443350434306, + "loss/crossentropy": 2.8578147292137146, + "loss/logits": 0.851711419224739, + "step": 34580 + }, + { + "epoch": 0.3459, + "grad_norm": 14.0, + "grad_norm_var": 1.5541015625, + "learning_rate": 0.0003, + "loss": 11.5112, + "loss/aux_loss": 0.048081373795866966, + "loss/crossentropy": 2.717966139316559, + "loss/logits": 0.8493025034666062, + "step": 34590 + }, + { + "epoch": 0.346, + "grad_norm": 13.1875, + "grad_norm_var": 1.81953125, + "learning_rate": 0.0003, + "loss": 11.344, + "loss/aux_loss": 0.04807697702199221, + "loss/crossentropy": 2.6692949771881103, + "loss/logits": 0.8717973381280899, + "step": 34600 + }, + { + "epoch": 0.3461, + "grad_norm": 12.6875, + "grad_norm_var": 0.6989583333333333, + "learning_rate": 0.0003, + "loss": 11.3426, + "loss/aux_loss": 0.048083077929913995, + "loss/crossentropy": 2.7653361916542054, + "loss/logits": 0.8600286096334457, + "step": 34610 + }, + { + "epoch": 0.3462, + "grad_norm": 15.5, + "grad_norm_var": 0.632275390625, + "learning_rate": 0.0003, + "loss": 11.5035, + "loss/aux_loss": 0.04807848259806633, + "loss/crossentropy": 2.6988938450813293, + "loss/logits": 0.8438379615545273, + "step": 34620 + }, + { + "epoch": 0.3463, + "grad_norm": 13.1875, + "grad_norm_var": 0.7317057291666667, + "learning_rate": 0.0003, + "loss": 11.3984, + "loss/aux_loss": 0.048082325235009195, + "loss/crossentropy": 2.8675466597080232, + "loss/logits": 0.8446835935115814, + "step": 34630 + }, + { + "epoch": 0.3464, + "grad_norm": 13.625, + "grad_norm_var": 0.482666015625, + "learning_rate": 0.0003, + "loss": 11.5275, + "loss/aux_loss": 0.04808323420584202, + "loss/crossentropy": 2.6933222889900206, + "loss/logits": 0.8721669852733612, + "step": 34640 + }, + { + "epoch": 0.3465, + "grad_norm": 13.6875, + "grad_norm_var": 4.624983723958334, + "learning_rate": 0.0003, + "loss": 11.2929, + "loss/aux_loss": 0.04808050952851772, + "loss/crossentropy": 2.932732379436493, + "loss/logits": 0.8866453051567078, + "step": 34650 + }, + { + "epoch": 0.3466, + "grad_norm": 14.0, + "grad_norm_var": 4.804801432291667, + "learning_rate": 0.0003, + "loss": 11.5469, + "loss/aux_loss": 0.04808491580188275, + "loss/crossentropy": 2.6847081184387207, + "loss/logits": 0.8588018774986267, + "step": 34660 + }, + { + "epoch": 0.3467, + "grad_norm": 12.375, + "grad_norm_var": 0.226025390625, + "learning_rate": 0.0003, + "loss": 11.3049, + "loss/aux_loss": 0.04807180892676115, + "loss/crossentropy": 2.8481385111808777, + "loss/logits": 0.8246441930532455, + "step": 34670 + }, + { + "epoch": 0.3468, + "grad_norm": 13.5625, + "grad_norm_var": 0.1884765625, + "learning_rate": 0.0003, + "loss": 11.4136, + "loss/aux_loss": 0.04807733632624149, + "loss/crossentropy": 2.7871821761131286, + "loss/logits": 0.8697138547897338, + "step": 34680 + }, + { + "epoch": 0.3469, + "grad_norm": 13.3125, + "grad_norm_var": 0.16243489583333334, + "learning_rate": 0.0003, + "loss": 11.2759, + "loss/aux_loss": 0.04808486551046372, + "loss/crossentropy": 2.7706130504608155, + "loss/logits": 0.8613585025072098, + "step": 34690 + }, + { + "epoch": 0.347, + "grad_norm": 13.875, + "grad_norm_var": 0.6391764322916667, + "learning_rate": 0.0003, + "loss": 11.2369, + "loss/aux_loss": 0.048074539937078956, + "loss/crossentropy": 2.725221812725067, + "loss/logits": 0.8312882751226425, + "step": 34700 + }, + { + "epoch": 0.3471, + "grad_norm": 14.75, + "grad_norm_var": 1.1984375, + "learning_rate": 0.0003, + "loss": 11.3719, + "loss/aux_loss": 0.048076304234564306, + "loss/crossentropy": 2.8168802559375763, + "loss/logits": 0.8541012018918991, + "step": 34710 + }, + { + "epoch": 0.3472, + "grad_norm": 14.9375, + "grad_norm_var": 0.8786295572916667, + "learning_rate": 0.0003, + "loss": 11.2599, + "loss/aux_loss": 0.048082977347075936, + "loss/crossentropy": 2.701399064064026, + "loss/logits": 0.8401564180850982, + "step": 34720 + }, + { + "epoch": 0.3473, + "grad_norm": 13.5625, + "grad_norm_var": 0.941259765625, + "learning_rate": 0.0003, + "loss": 11.4601, + "loss/aux_loss": 0.048081225156784056, + "loss/crossentropy": 2.8854560017585755, + "loss/logits": 0.8480129152536392, + "step": 34730 + }, + { + "epoch": 0.3474, + "grad_norm": 14.375, + "grad_norm_var": 0.24881184895833333, + "learning_rate": 0.0003, + "loss": 11.4022, + "loss/aux_loss": 0.048076600581407544, + "loss/crossentropy": 2.7241687536239625, + "loss/logits": 0.8581605464220047, + "step": 34740 + }, + { + "epoch": 0.3475, + "grad_norm": 13.3125, + "grad_norm_var": 0.37472330729166664, + "learning_rate": 0.0003, + "loss": 11.3392, + "loss/aux_loss": 0.04808560237288475, + "loss/crossentropy": 2.6501355826854707, + "loss/logits": 0.843683734536171, + "step": 34750 + }, + { + "epoch": 0.3476, + "grad_norm": 14.0625, + "grad_norm_var": 0.20792643229166666, + "learning_rate": 0.0003, + "loss": 11.4091, + "loss/aux_loss": 0.04806531127542257, + "loss/crossentropy": 2.753983849287033, + "loss/logits": 0.8834265947341919, + "step": 34760 + }, + { + "epoch": 0.3477, + "grad_norm": 14.0, + "grad_norm_var": 0.3792805989583333, + "learning_rate": 0.0003, + "loss": 11.5056, + "loss/aux_loss": 0.04808344487100839, + "loss/crossentropy": 2.8421459555625916, + "loss/logits": 0.9221995055675507, + "step": 34770 + }, + { + "epoch": 0.3478, + "grad_norm": 14.0625, + "grad_norm_var": 0.41119791666666666, + "learning_rate": 0.0003, + "loss": 11.3332, + "loss/aux_loss": 0.04807017482817173, + "loss/crossentropy": 2.630517715215683, + "loss/logits": 0.8022065937519074, + "step": 34780 + }, + { + "epoch": 0.3479, + "grad_norm": 13.25, + "grad_norm_var": 0.37161458333333336, + "learning_rate": 0.0003, + "loss": 11.4773, + "loss/aux_loss": 0.04808255229145288, + "loss/crossentropy": 2.7792518377304076, + "loss/logits": 0.8739649176597595, + "step": 34790 + }, + { + "epoch": 0.348, + "grad_norm": 13.25, + "grad_norm_var": 0.24998372395833332, + "learning_rate": 0.0003, + "loss": 11.2592, + "loss/aux_loss": 0.048081311769783495, + "loss/crossentropy": 2.8258360862731933, + "loss/logits": 0.8601921498775482, + "step": 34800 + }, + { + "epoch": 0.3481, + "grad_norm": 12.1875, + "grad_norm_var": 0.3882649739583333, + "learning_rate": 0.0003, + "loss": 11.2427, + "loss/aux_loss": 0.048077326826751234, + "loss/crossentropy": 2.804766833782196, + "loss/logits": 0.8551715075969696, + "step": 34810 + }, + { + "epoch": 0.3482, + "grad_norm": 13.0625, + "grad_norm_var": 0.44212239583333335, + "learning_rate": 0.0003, + "loss": 11.3048, + "loss/aux_loss": 0.048078553192317484, + "loss/crossentropy": 2.6608037412166596, + "loss/logits": 0.8357015043497086, + "step": 34820 + }, + { + "epoch": 0.3483, + "grad_norm": 13.9375, + "grad_norm_var": 0.230712890625, + "learning_rate": 0.0003, + "loss": 11.3251, + "loss/aux_loss": 0.04808176066726446, + "loss/crossentropy": 2.824368530511856, + "loss/logits": 0.8423573106527329, + "step": 34830 + }, + { + "epoch": 0.3484, + "grad_norm": 13.1875, + "grad_norm_var": 0.2775390625, + "learning_rate": 0.0003, + "loss": 11.5143, + "loss/aux_loss": 0.04807648658752441, + "loss/crossentropy": 2.8121955931186675, + "loss/logits": 0.8731589168310165, + "step": 34840 + }, + { + "epoch": 0.3485, + "grad_norm": 15.0625, + "grad_norm_var": 1.0973958333333333, + "learning_rate": 0.0003, + "loss": 11.3939, + "loss/aux_loss": 0.04807734619826078, + "loss/crossentropy": 2.754815798997879, + "loss/logits": 0.8677790522575378, + "step": 34850 + }, + { + "epoch": 0.3486, + "grad_norm": 13.75, + "grad_norm_var": 1.135791015625, + "learning_rate": 0.0003, + "loss": 11.3472, + "loss/aux_loss": 0.04807490929961204, + "loss/crossentropy": 2.728018116950989, + "loss/logits": 0.8459836810827255, + "step": 34860 + }, + { + "epoch": 0.3487, + "grad_norm": 13.75, + "grad_norm_var": 0.6589680989583333, + "learning_rate": 0.0003, + "loss": 11.2925, + "loss/aux_loss": 0.0480719706043601, + "loss/crossentropy": 2.681806039810181, + "loss/logits": 0.8287836849689484, + "step": 34870 + }, + { + "epoch": 0.3488, + "grad_norm": 12.625, + "grad_norm_var": 0.313916015625, + "learning_rate": 0.0003, + "loss": 11.2936, + "loss/aux_loss": 0.04808789901435375, + "loss/crossentropy": 2.5996453762054443, + "loss/logits": 0.8564824372529983, + "step": 34880 + }, + { + "epoch": 0.3489, + "grad_norm": 13.125, + "grad_norm_var": 0.4479166666666667, + "learning_rate": 0.0003, + "loss": 11.417, + "loss/aux_loss": 0.04807357657700777, + "loss/crossentropy": 2.754497063159943, + "loss/logits": 0.8746155887842179, + "step": 34890 + }, + { + "epoch": 0.349, + "grad_norm": 13.9375, + "grad_norm_var": 1.4731770833333333, + "learning_rate": 0.0003, + "loss": 11.3581, + "loss/aux_loss": 0.04808343891054392, + "loss/crossentropy": 2.722722589969635, + "loss/logits": 0.8574351370334625, + "step": 34900 + }, + { + "epoch": 0.3491, + "grad_norm": 13.4375, + "grad_norm_var": 0.468994140625, + "learning_rate": 0.0003, + "loss": 11.2342, + "loss/aux_loss": 0.04807738587260246, + "loss/crossentropy": 2.85051429271698, + "loss/logits": 0.8694952636957168, + "step": 34910 + }, + { + "epoch": 0.3492, + "grad_norm": 13.8125, + "grad_norm_var": 0.30935872395833336, + "learning_rate": 0.0003, + "loss": 11.2306, + "loss/aux_loss": 0.04808169659227133, + "loss/crossentropy": 2.581733113527298, + "loss/logits": 0.8042867451906204, + "step": 34920 + }, + { + "epoch": 0.3493, + "grad_norm": 13.8125, + "grad_norm_var": 0.36380208333333336, + "learning_rate": 0.0003, + "loss": 11.3718, + "loss/aux_loss": 0.04807847626507282, + "loss/crossentropy": 2.7856172263622283, + "loss/logits": 0.8864524632692337, + "step": 34930 + }, + { + "epoch": 0.3494, + "grad_norm": 12.5625, + "grad_norm_var": 0.5591145833333333, + "learning_rate": 0.0003, + "loss": 11.4262, + "loss/aux_loss": 0.04807476550340652, + "loss/crossentropy": 2.842118561267853, + "loss/logits": 0.8961553603410721, + "step": 34940 + }, + { + "epoch": 0.3495, + "grad_norm": 13.125, + "grad_norm_var": 0.6594889322916667, + "learning_rate": 0.0003, + "loss": 11.2777, + "loss/aux_loss": 0.04807927329093218, + "loss/crossentropy": 2.732057309150696, + "loss/logits": 0.8318034172058105, + "step": 34950 + }, + { + "epoch": 0.3496, + "grad_norm": 13.5625, + "grad_norm_var": 0.6219889322916666, + "learning_rate": 0.0003, + "loss": 11.2521, + "loss/aux_loss": 0.04807781353592873, + "loss/crossentropy": 2.712051713466644, + "loss/logits": 0.8508867889642715, + "step": 34960 + }, + { + "epoch": 0.3497, + "grad_norm": 14.5625, + "grad_norm_var": 0.6476399739583333, + "learning_rate": 0.0003, + "loss": 11.2176, + "loss/aux_loss": 0.04807377476245165, + "loss/crossentropy": 2.7046292066574096, + "loss/logits": 0.8390609532594681, + "step": 34970 + }, + { + "epoch": 0.3498, + "grad_norm": 13.1875, + "grad_norm_var": 0.25201822916666666, + "learning_rate": 0.0003, + "loss": 11.3613, + "loss/aux_loss": 0.04808987118303776, + "loss/crossentropy": 2.7671579003334044, + "loss/logits": 0.8521817743778228, + "step": 34980 + }, + { + "epoch": 0.3499, + "grad_norm": 13.125, + "grad_norm_var": 0.21834309895833334, + "learning_rate": 0.0003, + "loss": 11.2822, + "loss/aux_loss": 0.048079888336360455, + "loss/crossentropy": 2.6444417238235474, + "loss/logits": 0.842853182554245, + "step": 34990 + }, + { + "epoch": 0.35, + "grad_norm": 13.125, + "grad_norm_var": 0.212353515625, + "learning_rate": 0.0003, + "loss": 11.2254, + "loss/aux_loss": 0.04808303378522396, + "loss/crossentropy": 2.692433053255081, + "loss/logits": 0.8417773574590683, + "step": 35000 + }, + { + "epoch": 0.3501, + "grad_norm": 14.0, + "grad_norm_var": 0.2947265625, + "learning_rate": 0.0003, + "loss": 11.4454, + "loss/aux_loss": 0.04807858187705279, + "loss/crossentropy": 2.7733002305030823, + "loss/logits": 0.8693195581436157, + "step": 35010 + }, + { + "epoch": 0.3502, + "grad_norm": 14.5625, + "grad_norm_var": 0.8526041666666667, + "learning_rate": 0.0003, + "loss": 11.3941, + "loss/aux_loss": 0.04808044396340847, + "loss/crossentropy": 2.615969657897949, + "loss/logits": 0.8622830808162689, + "step": 35020 + }, + { + "epoch": 0.3503, + "grad_norm": 12.625, + "grad_norm_var": 0.931494140625, + "learning_rate": 0.0003, + "loss": 11.2957, + "loss/aux_loss": 0.04807266090065241, + "loss/crossentropy": 2.801264774799347, + "loss/logits": 0.8435007154941558, + "step": 35030 + }, + { + "epoch": 0.3504, + "grad_norm": 13.3125, + "grad_norm_var": 0.484619140625, + "learning_rate": 0.0003, + "loss": 11.3, + "loss/aux_loss": 0.04808126632124186, + "loss/crossentropy": 2.7451845824718477, + "loss/logits": 0.867804229259491, + "step": 35040 + }, + { + "epoch": 0.3505, + "grad_norm": 14.1875, + "grad_norm_var": 0.151806640625, + "learning_rate": 0.0003, + "loss": 11.346, + "loss/aux_loss": 0.04807517919689417, + "loss/crossentropy": 2.8747010231018066, + "loss/logits": 0.8618029087781907, + "step": 35050 + }, + { + "epoch": 0.3506, + "grad_norm": 14.0, + "grad_norm_var": 0.3374837239583333, + "learning_rate": 0.0003, + "loss": 11.4672, + "loss/aux_loss": 0.048079409264028074, + "loss/crossentropy": 2.8339676380157472, + "loss/logits": 0.856550145149231, + "step": 35060 + }, + { + "epoch": 0.3507, + "grad_norm": 13.5625, + "grad_norm_var": 1.504931640625, + "learning_rate": 0.0003, + "loss": 11.4003, + "loss/aux_loss": 0.04808185379952192, + "loss/crossentropy": 2.8066389322280885, + "loss/logits": 0.897555747628212, + "step": 35070 + }, + { + "epoch": 0.3508, + "grad_norm": 14.875, + "grad_norm_var": 0.5884765625, + "learning_rate": 0.0003, + "loss": 11.546, + "loss/aux_loss": 0.04807468615472317, + "loss/crossentropy": 2.8240845441818236, + "loss/logits": 0.8684464514255523, + "step": 35080 + }, + { + "epoch": 0.3509, + "grad_norm": 15.9375, + "grad_norm_var": 5.7009765625, + "learning_rate": 0.0003, + "loss": 11.3058, + "loss/aux_loss": 0.04807514958083629, + "loss/crossentropy": 2.7188303232192994, + "loss/logits": 0.8454073309898377, + "step": 35090 + }, + { + "epoch": 0.351, + "grad_norm": 13.625, + "grad_norm_var": 5.8384765625, + "learning_rate": 0.0003, + "loss": 11.3441, + "loss/aux_loss": 0.04809205364435911, + "loss/crossentropy": 2.6878881573677065, + "loss/logits": 0.8072956264019012, + "step": 35100 + }, + { + "epoch": 0.3511, + "grad_norm": 12.1875, + "grad_norm_var": 1.4634765625, + "learning_rate": 0.0003, + "loss": 11.3553, + "loss/aux_loss": 0.048081132024526595, + "loss/crossentropy": 2.7471259951591493, + "loss/logits": 0.871997344493866, + "step": 35110 + }, + { + "epoch": 0.3512, + "grad_norm": 14.625, + "grad_norm_var": 0.869775390625, + "learning_rate": 0.0003, + "loss": 11.4906, + "loss/aux_loss": 0.04808983094990253, + "loss/crossentropy": 2.767477738857269, + "loss/logits": 0.8434695929288865, + "step": 35120 + }, + { + "epoch": 0.3513, + "grad_norm": 13.125, + "grad_norm_var": 0.463916015625, + "learning_rate": 0.0003, + "loss": 11.3584, + "loss/aux_loss": 0.048065770603716376, + "loss/crossentropy": 2.678995144367218, + "loss/logits": 0.8505305916070938, + "step": 35130 + }, + { + "epoch": 0.3514, + "grad_norm": 14.625, + "grad_norm_var": 0.4921875, + "learning_rate": 0.0003, + "loss": 11.253, + "loss/aux_loss": 0.04808024074882269, + "loss/crossentropy": 2.7573612451553347, + "loss/logits": 0.8538782745599747, + "step": 35140 + }, + { + "epoch": 0.3515, + "grad_norm": 13.0625, + "grad_norm_var": 0.5835774739583334, + "learning_rate": 0.0003, + "loss": 11.1857, + "loss/aux_loss": 0.04808320011943579, + "loss/crossentropy": 2.6276727855205535, + "loss/logits": 0.8079499483108521, + "step": 35150 + }, + { + "epoch": 0.3516, + "grad_norm": 14.3125, + "grad_norm_var": 1.0212890625, + "learning_rate": 0.0003, + "loss": 11.4222, + "loss/aux_loss": 0.0480812968686223, + "loss/crossentropy": 2.6966384649276733, + "loss/logits": 0.8357253611087799, + "step": 35160 + }, + { + "epoch": 0.3517, + "grad_norm": 13.0, + "grad_norm_var": 0.9255045572916667, + "learning_rate": 0.0003, + "loss": 11.3839, + "loss/aux_loss": 0.04808384496718645, + "loss/crossentropy": 2.7424313902854918, + "loss/logits": 0.8915427207946778, + "step": 35170 + }, + { + "epoch": 0.3518, + "grad_norm": 14.0625, + "grad_norm_var": 10.929020182291667, + "learning_rate": 0.0003, + "loss": 11.3673, + "loss/aux_loss": 0.048090758919715884, + "loss/crossentropy": 2.70193572640419, + "loss/logits": 0.8317285031080246, + "step": 35180 + }, + { + "epoch": 0.3519, + "grad_norm": 13.6875, + "grad_norm_var": 0.20338541666666668, + "learning_rate": 0.0003, + "loss": 11.3432, + "loss/aux_loss": 0.048075008764863016, + "loss/crossentropy": 2.724704682826996, + "loss/logits": 0.8616194367408753, + "step": 35190 + }, + { + "epoch": 0.352, + "grad_norm": 13.25, + "grad_norm_var": 0.42706705729166666, + "learning_rate": 0.0003, + "loss": 11.3289, + "loss/aux_loss": 0.04808625653386116, + "loss/crossentropy": 2.6437718331813813, + "loss/logits": 0.829070645570755, + "step": 35200 + }, + { + "epoch": 0.3521, + "grad_norm": 14.625, + "grad_norm_var": 0.65625, + "learning_rate": 0.0003, + "loss": 11.3784, + "loss/aux_loss": 0.04808688312768936, + "loss/crossentropy": 2.8204615235328676, + "loss/logits": 0.9107112646102905, + "step": 35210 + }, + { + "epoch": 0.3522, + "grad_norm": 15.0, + "grad_norm_var": 0.6231770833333333, + "learning_rate": 0.0003, + "loss": 11.3746, + "loss/aux_loss": 0.048073959164321424, + "loss/crossentropy": 2.9584303498268127, + "loss/logits": 0.8751641631126403, + "step": 35220 + }, + { + "epoch": 0.3523, + "grad_norm": 12.6875, + "grad_norm_var": 0.447900390625, + "learning_rate": 0.0003, + "loss": 11.3529, + "loss/aux_loss": 0.048080052994191644, + "loss/crossentropy": 2.8173577427864074, + "loss/logits": 0.8873382836580277, + "step": 35230 + }, + { + "epoch": 0.3524, + "grad_norm": 13.6875, + "grad_norm_var": 0.5597493489583333, + "learning_rate": 0.0003, + "loss": 11.543, + "loss/aux_loss": 0.048076377063989637, + "loss/crossentropy": 2.825025236606598, + "loss/logits": 0.8857592105865478, + "step": 35240 + }, + { + "epoch": 0.3525, + "grad_norm": 13.3125, + "grad_norm_var": 0.6442057291666666, + "learning_rate": 0.0003, + "loss": 11.3836, + "loss/aux_loss": 0.048085327818989755, + "loss/crossentropy": 2.6969442307949065, + "loss/logits": 0.8094238936901093, + "step": 35250 + }, + { + "epoch": 0.3526, + "grad_norm": 13.875, + "grad_norm_var": 0.645556640625, + "learning_rate": 0.0003, + "loss": 11.3612, + "loss/aux_loss": 0.04808722659945488, + "loss/crossentropy": 2.6123400807380674, + "loss/logits": 0.8106647431850433, + "step": 35260 + }, + { + "epoch": 0.3527, + "grad_norm": 14.125, + "grad_norm_var": 55.956624348958336, + "learning_rate": 0.0003, + "loss": 11.3009, + "loss/aux_loss": 0.048074896819889544, + "loss/crossentropy": 2.810643696784973, + "loss/logits": 0.8637238830327988, + "step": 35270 + }, + { + "epoch": 0.3528, + "grad_norm": 22.625, + "grad_norm_var": 68.66764322916667, + "learning_rate": 0.0003, + "loss": 11.3654, + "loss/aux_loss": 0.04808924626559019, + "loss/crossentropy": 2.6293312191963194, + "loss/logits": 0.8716479748487472, + "step": 35280 + }, + { + "epoch": 0.3529, + "grad_norm": 13.875, + "grad_norm_var": 7.793082682291667, + "learning_rate": 0.0003, + "loss": 11.2959, + "loss/aux_loss": 0.04808361511677504, + "loss/crossentropy": 2.6466811537742614, + "loss/logits": 0.8509993731975556, + "step": 35290 + }, + { + "epoch": 0.353, + "grad_norm": 14.5625, + "grad_norm_var": 0.49114583333333334, + "learning_rate": 0.0003, + "loss": 11.2404, + "loss/aux_loss": 0.048073071613907815, + "loss/crossentropy": 2.7615082263946533, + "loss/logits": 0.8562895059585571, + "step": 35300 + }, + { + "epoch": 0.3531, + "grad_norm": 13.25, + "grad_norm_var": 0.2009765625, + "learning_rate": 0.0003, + "loss": 11.2599, + "loss/aux_loss": 0.04808885268867016, + "loss/crossentropy": 2.7079634070396423, + "loss/logits": 0.8336867898702621, + "step": 35310 + }, + { + "epoch": 0.3532, + "grad_norm": 13.3125, + "grad_norm_var": 0.14503580729166668, + "learning_rate": 0.0003, + "loss": 11.3066, + "loss/aux_loss": 0.04807536732405424, + "loss/crossentropy": 2.9801666378974914, + "loss/logits": 0.8767634421586991, + "step": 35320 + }, + { + "epoch": 0.3533, + "grad_norm": 12.625, + "grad_norm_var": 0.30245768229166664, + "learning_rate": 0.0003, + "loss": 11.538, + "loss/aux_loss": 0.048068761453032495, + "loss/crossentropy": 2.752707290649414, + "loss/logits": 0.878770825266838, + "step": 35330 + }, + { + "epoch": 0.3534, + "grad_norm": 13.8125, + "grad_norm_var": 0.4618326822916667, + "learning_rate": 0.0003, + "loss": 11.3234, + "loss/aux_loss": 0.04809120837599039, + "loss/crossentropy": 2.751829779148102, + "loss/logits": 0.8367206364870071, + "step": 35340 + }, + { + "epoch": 0.3535, + "grad_norm": 13.1875, + "grad_norm_var": 0.6983723958333333, + "learning_rate": 0.0003, + "loss": 11.3473, + "loss/aux_loss": 0.04807098787277937, + "loss/crossentropy": 2.903922712802887, + "loss/logits": 0.885872820019722, + "step": 35350 + }, + { + "epoch": 0.3536, + "grad_norm": 13.4375, + "grad_norm_var": 0.747900390625, + "learning_rate": 0.0003, + "loss": 11.2302, + "loss/aux_loss": 0.04807676579803229, + "loss/crossentropy": 2.7807978630065917, + "loss/logits": 0.8654327929019928, + "step": 35360 + }, + { + "epoch": 0.3537, + "grad_norm": 13.75, + "grad_norm_var": 0.4561848958333333, + "learning_rate": 0.0003, + "loss": 11.4295, + "loss/aux_loss": 0.04808081611990929, + "loss/crossentropy": 2.8749902486801147, + "loss/logits": 0.8648800730705262, + "step": 35370 + }, + { + "epoch": 0.3538, + "grad_norm": 14.3125, + "grad_norm_var": 10.038997395833333, + "learning_rate": 0.0003, + "loss": 11.2158, + "loss/aux_loss": 0.04807778876274824, + "loss/crossentropy": 2.7491287708282472, + "loss/logits": 0.8350054025650024, + "step": 35380 + }, + { + "epoch": 0.3539, + "grad_norm": 13.125, + "grad_norm_var": 0.4544270833333333, + "learning_rate": 0.0003, + "loss": 11.3964, + "loss/aux_loss": 0.048072314076125625, + "loss/crossentropy": 2.757090598344803, + "loss/logits": 0.8464928805828095, + "step": 35390 + }, + { + "epoch": 0.354, + "grad_norm": 12.9375, + "grad_norm_var": 0.3385416666666667, + "learning_rate": 0.0003, + "loss": 11.2412, + "loss/aux_loss": 0.04808321315795183, + "loss/crossentropy": 2.7094544529914857, + "loss/logits": 0.8820542007684707, + "step": 35400 + }, + { + "epoch": 0.3541, + "grad_norm": 15.0625, + "grad_norm_var": 11.601416015625, + "learning_rate": 0.0003, + "loss": 11.3299, + "loss/aux_loss": 0.04808483738452196, + "loss/crossentropy": 2.7222808599472046, + "loss/logits": 0.835541981458664, + "step": 35410 + }, + { + "epoch": 0.3542, + "grad_norm": 13.0625, + "grad_norm_var": 0.7833333333333333, + "learning_rate": 0.0003, + "loss": 11.3284, + "loss/aux_loss": 0.04806767236441374, + "loss/crossentropy": 2.756817102432251, + "loss/logits": 0.8438375443220139, + "step": 35420 + }, + { + "epoch": 0.3543, + "grad_norm": 13.6875, + "grad_norm_var": 0.32076822916666664, + "learning_rate": 0.0003, + "loss": 11.1812, + "loss/aux_loss": 0.04807917140424252, + "loss/crossentropy": 2.6759197235107424, + "loss/logits": 0.809447106719017, + "step": 35430 + }, + { + "epoch": 0.3544, + "grad_norm": 13.625, + "grad_norm_var": 0.32526041666666666, + "learning_rate": 0.0003, + "loss": 11.3935, + "loss/aux_loss": 0.048069739155471324, + "loss/crossentropy": 2.920281636714935, + "loss/logits": 0.8890509903430939, + "step": 35440 + }, + { + "epoch": 0.3545, + "grad_norm": 12.75, + "grad_norm_var": 0.43748372395833335, + "learning_rate": 0.0003, + "loss": 11.2958, + "loss/aux_loss": 0.048088513500988486, + "loss/crossentropy": 2.5987202882766725, + "loss/logits": 0.8323431223630905, + "step": 35450 + }, + { + "epoch": 0.3546, + "grad_norm": 20.0, + "grad_norm_var": 367.611181640625, + "learning_rate": 0.0003, + "loss": 11.345, + "loss/aux_loss": 0.04808755200356245, + "loss/crossentropy": 2.6423967361450194, + "loss/logits": 0.849226924777031, + "step": 35460 + }, + { + "epoch": 0.3547, + "grad_norm": 13.125, + "grad_norm_var": 2.9330729166666667, + "learning_rate": 0.0003, + "loss": 11.3632, + "loss/aux_loss": 0.04807810019701719, + "loss/crossentropy": 2.7577707767486572, + "loss/logits": 0.8662696242332458, + "step": 35470 + }, + { + "epoch": 0.3548, + "grad_norm": 13.875, + "grad_norm_var": 0.524853515625, + "learning_rate": 0.0003, + "loss": 11.4028, + "loss/aux_loss": 0.048086337931454184, + "loss/crossentropy": 2.615101230144501, + "loss/logits": 0.836988553404808, + "step": 35480 + }, + { + "epoch": 0.3549, + "grad_norm": 15.125, + "grad_norm_var": 1.8457682291666666, + "learning_rate": 0.0003, + "loss": 11.2295, + "loss/aux_loss": 0.04806696325540542, + "loss/crossentropy": 2.6185821652412415, + "loss/logits": 0.8607719987630844, + "step": 35490 + }, + { + "epoch": 0.355, + "grad_norm": 14.875, + "grad_norm_var": 1.1585774739583334, + "learning_rate": 0.0003, + "loss": 11.3416, + "loss/aux_loss": 0.04807763248682022, + "loss/crossentropy": 2.7731160163879394, + "loss/logits": 0.8407220751047134, + "step": 35500 + }, + { + "epoch": 0.3551, + "grad_norm": 13.1875, + "grad_norm_var": 0.5738932291666666, + "learning_rate": 0.0003, + "loss": 11.3636, + "loss/aux_loss": 0.04808692019432783, + "loss/crossentropy": 2.7722087264060975, + "loss/logits": 0.8629306703805923, + "step": 35510 + }, + { + "epoch": 0.3552, + "grad_norm": 13.875, + "grad_norm_var": 0.29322916666666665, + "learning_rate": 0.0003, + "loss": 11.4107, + "loss/aux_loss": 0.04807148296386003, + "loss/crossentropy": 2.8003060460090636, + "loss/logits": 0.8875281304121018, + "step": 35520 + }, + { + "epoch": 0.3553, + "grad_norm": 12.4375, + "grad_norm_var": 0.35323893229166664, + "learning_rate": 0.0003, + "loss": 11.4067, + "loss/aux_loss": 0.04808224029839039, + "loss/crossentropy": 2.701129513978958, + "loss/logits": 0.8584021329879761, + "step": 35530 + }, + { + "epoch": 0.3554, + "grad_norm": 13.875, + "grad_norm_var": 0.42888997395833334, + "learning_rate": 0.0003, + "loss": 11.4743, + "loss/aux_loss": 0.048079540766775605, + "loss/crossentropy": 2.85890337228775, + "loss/logits": 0.8821686983108521, + "step": 35540 + }, + { + "epoch": 0.3555, + "grad_norm": 13.5625, + "grad_norm_var": 0.37473958333333335, + "learning_rate": 0.0003, + "loss": 11.3944, + "loss/aux_loss": 0.04807381983846426, + "loss/crossentropy": 2.7811784029006956, + "loss/logits": 0.8604174524545669, + "step": 35550 + }, + { + "epoch": 0.3556, + "grad_norm": 14.125, + "grad_norm_var": 0.4749348958333333, + "learning_rate": 0.0003, + "loss": 11.2323, + "loss/aux_loss": 0.04808265678584576, + "loss/crossentropy": 2.76348534822464, + "loss/logits": 0.8644289672374725, + "step": 35560 + }, + { + "epoch": 0.3557, + "grad_norm": 13.0625, + "grad_norm_var": 0.3317057291666667, + "learning_rate": 0.0003, + "loss": 11.3451, + "loss/aux_loss": 0.048073519952595234, + "loss/crossentropy": 2.821522521972656, + "loss/logits": 0.8591786533594131, + "step": 35570 + }, + { + "epoch": 0.3558, + "grad_norm": 16.875, + "grad_norm_var": 1.5110514322916666, + "learning_rate": 0.0003, + "loss": 11.1857, + "loss/aux_loss": 0.048086483217775824, + "loss/crossentropy": 2.685242211818695, + "loss/logits": 0.807622566819191, + "step": 35580 + }, + { + "epoch": 0.3559, + "grad_norm": 11.8125, + "grad_norm_var": 2.0515625, + "learning_rate": 0.0003, + "loss": 11.3026, + "loss/aux_loss": 0.04807799514383078, + "loss/crossentropy": 2.5974143624305723, + "loss/logits": 0.8406628459692002, + "step": 35590 + }, + { + "epoch": 0.356, + "grad_norm": 14.5, + "grad_norm_var": 0.5382649739583333, + "learning_rate": 0.0003, + "loss": 11.3024, + "loss/aux_loss": 0.04808833636343479, + "loss/crossentropy": 2.5895915746688845, + "loss/logits": 0.8398721873760223, + "step": 35600 + }, + { + "epoch": 0.3561, + "grad_norm": 13.9375, + "grad_norm_var": 0.4663899739583333, + "learning_rate": 0.0003, + "loss": 11.2555, + "loss/aux_loss": 0.04808176904916763, + "loss/crossentropy": 2.6462597012519837, + "loss/logits": 0.8560863435268402, + "step": 35610 + }, + { + "epoch": 0.3562, + "grad_norm": 13.4375, + "grad_norm_var": 0.3993326822916667, + "learning_rate": 0.0003, + "loss": 11.3956, + "loss/aux_loss": 0.04806818459182978, + "loss/crossentropy": 2.619718074798584, + "loss/logits": 0.8831523567438125, + "step": 35620 + }, + { + "epoch": 0.3563, + "grad_norm": 14.5625, + "grad_norm_var": 0.44733072916666666, + "learning_rate": 0.0003, + "loss": 11.4719, + "loss/aux_loss": 0.04808085449039936, + "loss/crossentropy": 2.7037489295005797, + "loss/logits": 0.8706602722406387, + "step": 35630 + }, + { + "epoch": 0.3564, + "grad_norm": 14.25, + "grad_norm_var": 0.613134765625, + "learning_rate": 0.0003, + "loss": 11.3119, + "loss/aux_loss": 0.04807729236781597, + "loss/crossentropy": 2.698741543292999, + "loss/logits": 0.877493503689766, + "step": 35640 + }, + { + "epoch": 0.3565, + "grad_norm": 13.5, + "grad_norm_var": 0.6702962239583333, + "learning_rate": 0.0003, + "loss": 11.2573, + "loss/aux_loss": 0.04807835165411234, + "loss/crossentropy": 2.6732171416282653, + "loss/logits": 0.8277383238077164, + "step": 35650 + }, + { + "epoch": 0.3566, + "grad_norm": 13.125, + "grad_norm_var": 0.20701497395833332, + "learning_rate": 0.0003, + "loss": 11.3619, + "loss/aux_loss": 0.04807893112301827, + "loss/crossentropy": 2.7110289692878724, + "loss/logits": 0.8260849803686142, + "step": 35660 + }, + { + "epoch": 0.3567, + "grad_norm": 13.0625, + "grad_norm_var": 0.23671875, + "learning_rate": 0.0003, + "loss": 11.1931, + "loss/aux_loss": 0.0480891864746809, + "loss/crossentropy": 2.6819321513175964, + "loss/logits": 0.8022189557552337, + "step": 35670 + }, + { + "epoch": 0.3568, + "grad_norm": 12.8125, + "grad_norm_var": 0.42706705729166666, + "learning_rate": 0.0003, + "loss": 11.4875, + "loss/aux_loss": 0.04807867780327797, + "loss/crossentropy": 2.8046223700046538, + "loss/logits": 0.8751489996910096, + "step": 35680 + }, + { + "epoch": 0.3569, + "grad_norm": 14.3125, + "grad_norm_var": 0.2921223958333333, + "learning_rate": 0.0003, + "loss": 11.6221, + "loss/aux_loss": 0.04808136597275734, + "loss/crossentropy": 2.7549788117408753, + "loss/logits": 0.8519262999296189, + "step": 35690 + }, + { + "epoch": 0.357, + "grad_norm": 13.6875, + "grad_norm_var": 0.12630208333333334, + "learning_rate": 0.0003, + "loss": 11.265, + "loss/aux_loss": 0.04808007068932056, + "loss/crossentropy": 2.647283446788788, + "loss/logits": 0.8637819319963456, + "step": 35700 + }, + { + "epoch": 0.3571, + "grad_norm": 12.5625, + "grad_norm_var": 0.23984375, + "learning_rate": 0.0003, + "loss": 11.3778, + "loss/aux_loss": 0.048076775297522543, + "loss/crossentropy": 2.6670687079429625, + "loss/logits": 0.8395314335823059, + "step": 35710 + }, + { + "epoch": 0.3572, + "grad_norm": 12.8125, + "grad_norm_var": 0.4837890625, + "learning_rate": 0.0003, + "loss": 11.3448, + "loss/aux_loss": 0.0480807974934578, + "loss/crossentropy": 2.8054326295852663, + "loss/logits": 0.8705591022968292, + "step": 35720 + }, + { + "epoch": 0.3573, + "grad_norm": 13.125, + "grad_norm_var": 0.4630208333333333, + "learning_rate": 0.0003, + "loss": 11.4336, + "loss/aux_loss": 0.048079059645533564, + "loss/crossentropy": 2.64428573846817, + "loss/logits": 0.8815708011388779, + "step": 35730 + }, + { + "epoch": 0.3574, + "grad_norm": 14.125, + "grad_norm_var": 0.5306640625, + "learning_rate": 0.0003, + "loss": 11.3834, + "loss/aux_loss": 0.048077475652098654, + "loss/crossentropy": 2.6877181112766264, + "loss/logits": 0.8534661501646041, + "step": 35740 + }, + { + "epoch": 0.3575, + "grad_norm": 12.4375, + "grad_norm_var": 0.5773274739583333, + "learning_rate": 0.0003, + "loss": 11.1881, + "loss/aux_loss": 0.04806953519582748, + "loss/crossentropy": 2.6530270755290983, + "loss/logits": 0.8448708355426788, + "step": 35750 + }, + { + "epoch": 0.3576, + "grad_norm": 14.5, + "grad_norm_var": 0.7054524739583333, + "learning_rate": 0.0003, + "loss": 11.1465, + "loss/aux_loss": 0.048080852068960664, + "loss/crossentropy": 2.6025227308273315, + "loss/logits": 0.8209551721811295, + "step": 35760 + }, + { + "epoch": 0.3577, + "grad_norm": 13.6875, + "grad_norm_var": 0.5111979166666667, + "learning_rate": 0.0003, + "loss": 11.354, + "loss/aux_loss": 0.04807685352861881, + "loss/crossentropy": 2.7108654737472535, + "loss/logits": 0.8405659079551697, + "step": 35770 + }, + { + "epoch": 0.3578, + "grad_norm": 14.0, + "grad_norm_var": 0.7598795572916667, + "learning_rate": 0.0003, + "loss": 11.3892, + "loss/aux_loss": 0.048074676841497424, + "loss/crossentropy": 2.8448933243751524, + "loss/logits": 0.8365987449884414, + "step": 35780 + }, + { + "epoch": 0.3579, + "grad_norm": 14.4375, + "grad_norm_var": 0.4903645833333333, + "learning_rate": 0.0003, + "loss": 11.2817, + "loss/aux_loss": 0.04807745218276978, + "loss/crossentropy": 2.8002660870552063, + "loss/logits": 0.8614138662815094, + "step": 35790 + }, + { + "epoch": 0.358, + "grad_norm": 12.75, + "grad_norm_var": 0.8332682291666667, + "learning_rate": 0.0003, + "loss": 11.3633, + "loss/aux_loss": 0.048078560084104535, + "loss/crossentropy": 2.7766624689102173, + "loss/logits": 0.8491496801376343, + "step": 35800 + }, + { + "epoch": 0.3581, + "grad_norm": 13.375, + "grad_norm_var": 3.879541015625, + "learning_rate": 0.0003, + "loss": 11.3036, + "loss/aux_loss": 0.04808343965560198, + "loss/crossentropy": 2.7654669165611265, + "loss/logits": 0.8556907385587692, + "step": 35810 + }, + { + "epoch": 0.3582, + "grad_norm": 13.375, + "grad_norm_var": 0.29099934895833335, + "learning_rate": 0.0003, + "loss": 11.3808, + "loss/aux_loss": 0.04808389656245708, + "loss/crossentropy": 2.712459546327591, + "loss/logits": 0.854012405872345, + "step": 35820 + }, + { + "epoch": 0.3583, + "grad_norm": 13.0, + "grad_norm_var": 0.18097330729166666, + "learning_rate": 0.0003, + "loss": 11.3502, + "loss/aux_loss": 0.04807974956929684, + "loss/crossentropy": 2.6321381747722628, + "loss/logits": 0.850381875038147, + "step": 35830 + }, + { + "epoch": 0.3584, + "grad_norm": 12.75, + "grad_norm_var": 0.41451822916666664, + "learning_rate": 0.0003, + "loss": 11.1751, + "loss/aux_loss": 0.04808203168213367, + "loss/crossentropy": 2.6016912758350372, + "loss/logits": 0.8554532587528229, + "step": 35840 + }, + { + "epoch": 0.3585, + "grad_norm": 12.5, + "grad_norm_var": 0.7824055989583333, + "learning_rate": 0.0003, + "loss": 11.4128, + "loss/aux_loss": 0.04807893205434084, + "loss/crossentropy": 2.8680613577365874, + "loss/logits": 0.8510574102401733, + "step": 35850 + }, + { + "epoch": 0.3586, + "grad_norm": 12.8125, + "grad_norm_var": 0.679541015625, + "learning_rate": 0.0003, + "loss": 11.2393, + "loss/aux_loss": 0.048077529110014436, + "loss/crossentropy": 2.755670565366745, + "loss/logits": 0.8600349962711334, + "step": 35860 + }, + { + "epoch": 0.3587, + "grad_norm": 12.4375, + "grad_norm_var": 0.9173014322916667, + "learning_rate": 0.0003, + "loss": 11.3892, + "loss/aux_loss": 0.048092295043170454, + "loss/crossentropy": 2.7275496542453768, + "loss/logits": 0.8351588726043702, + "step": 35870 + }, + { + "epoch": 0.3588, + "grad_norm": 13.0, + "grad_norm_var": 0.9158854166666667, + "learning_rate": 0.0003, + "loss": 11.2862, + "loss/aux_loss": 0.0480747377499938, + "loss/crossentropy": 2.7512252271175384, + "loss/logits": 0.8427632987499237, + "step": 35880 + }, + { + "epoch": 0.3589, + "grad_norm": 13.5625, + "grad_norm_var": 0.29322916666666665, + "learning_rate": 0.0003, + "loss": 11.3548, + "loss/aux_loss": 0.04807990454137325, + "loss/crossentropy": 2.873078280687332, + "loss/logits": 0.8730248123407364, + "step": 35890 + }, + { + "epoch": 0.359, + "grad_norm": 13.5, + "grad_norm_var": 0.3624348958333333, + "learning_rate": 0.0003, + "loss": 11.2026, + "loss/aux_loss": 0.048082062415778636, + "loss/crossentropy": 2.740494179725647, + "loss/logits": 0.8442809909582139, + "step": 35900 + }, + { + "epoch": 0.3591, + "grad_norm": 14.375, + "grad_norm_var": 0.512744140625, + "learning_rate": 0.0003, + "loss": 11.23, + "loss/aux_loss": 0.048080322705209254, + "loss/crossentropy": 2.7258496403694155, + "loss/logits": 0.8964686661958694, + "step": 35910 + }, + { + "epoch": 0.3592, + "grad_norm": 14.0625, + "grad_norm_var": 0.6156087239583333, + "learning_rate": 0.0003, + "loss": 11.332, + "loss/aux_loss": 0.048071503080427645, + "loss/crossentropy": 2.664508581161499, + "loss/logits": 0.8316588670015335, + "step": 35920 + }, + { + "epoch": 0.3593, + "grad_norm": 13.6875, + "grad_norm_var": 0.14178059895833334, + "learning_rate": 0.0003, + "loss": 11.3462, + "loss/aux_loss": 0.0480843897908926, + "loss/crossentropy": 2.8054317951202394, + "loss/logits": 0.8416423499584198, + "step": 35930 + }, + { + "epoch": 0.3594, + "grad_norm": 13.5625, + "grad_norm_var": 0.1541015625, + "learning_rate": 0.0003, + "loss": 11.4479, + "loss/aux_loss": 0.04807868916541338, + "loss/crossentropy": 2.6694052278995515, + "loss/logits": 0.8517626136541366, + "step": 35940 + }, + { + "epoch": 0.3595, + "grad_norm": 17.375, + "grad_norm_var": 312.1473795572917, + "learning_rate": 0.0003, + "loss": 11.422, + "loss/aux_loss": 0.04807343017309904, + "loss/crossentropy": 2.7886245131492613, + "loss/logits": 0.8560446441173554, + "step": 35950 + }, + { + "epoch": 0.3596, + "grad_norm": 15.3125, + "grad_norm_var": 306.50983072916665, + "learning_rate": 0.0003, + "loss": 11.528, + "loss/aux_loss": 0.0480889055877924, + "loss/crossentropy": 2.7681180238723755, + "loss/logits": 0.8745385199785233, + "step": 35960 + }, + { + "epoch": 0.3597, + "grad_norm": 15.0625, + "grad_norm_var": 0.3260416666666667, + "learning_rate": 0.0003, + "loss": 11.4073, + "loss/aux_loss": 0.048074235394597056, + "loss/crossentropy": 2.7186142563819886, + "loss/logits": 0.8708992570638656, + "step": 35970 + }, + { + "epoch": 0.3598, + "grad_norm": 13.1875, + "grad_norm_var": 0.390087890625, + "learning_rate": 0.0003, + "loss": 11.256, + "loss/aux_loss": 0.048084134608507155, + "loss/crossentropy": 2.6606498062610626, + "loss/logits": 0.8529165148735046, + "step": 35980 + }, + { + "epoch": 0.3599, + "grad_norm": 13.8125, + "grad_norm_var": 0.6249837239583333, + "learning_rate": 0.0003, + "loss": 11.4559, + "loss/aux_loss": 0.04807859268039465, + "loss/crossentropy": 2.862497079372406, + "loss/logits": 0.8499436527490616, + "step": 35990 + }, + { + "epoch": 0.36, + "grad_norm": 14.125, + "grad_norm_var": 0.48899739583333335, + "learning_rate": 0.0003, + "loss": 11.4658, + "loss/aux_loss": 0.04807100892066955, + "loss/crossentropy": 2.797253680229187, + "loss/logits": 0.8588318228721619, + "step": 36000 + }, + { + "epoch": 0.3601, + "grad_norm": 13.6875, + "grad_norm_var": 0.3251139322916667, + "learning_rate": 0.0003, + "loss": 11.2969, + "loss/aux_loss": 0.04807574506849051, + "loss/crossentropy": 2.5939278662204743, + "loss/logits": 0.8101026326417923, + "step": 36010 + }, + { + "epoch": 0.3602, + "grad_norm": 14.3125, + "grad_norm_var": 0.73828125, + "learning_rate": 0.0003, + "loss": 11.3092, + "loss/aux_loss": 0.048069524206221105, + "loss/crossentropy": 2.8990219116210936, + "loss/logits": 0.8483186364173889, + "step": 36020 + }, + { + "epoch": 0.3603, + "grad_norm": 13.25, + "grad_norm_var": 0.36599934895833336, + "learning_rate": 0.0003, + "loss": 11.3561, + "loss/aux_loss": 0.04808562994003296, + "loss/crossentropy": 2.6316673278808596, + "loss/logits": 0.8744110763072968, + "step": 36030 + }, + { + "epoch": 0.3604, + "grad_norm": 13.3125, + "grad_norm_var": 0.48385416666666664, + "learning_rate": 0.0003, + "loss": 11.2467, + "loss/aux_loss": 0.04806984197348356, + "loss/crossentropy": 2.8506676077842714, + "loss/logits": 0.8674321442842483, + "step": 36040 + }, + { + "epoch": 0.3605, + "grad_norm": 13.625, + "grad_norm_var": 0.3028645833333333, + "learning_rate": 0.0003, + "loss": 11.374, + "loss/aux_loss": 0.048074014112353326, + "loss/crossentropy": 2.6722546577453614, + "loss/logits": 0.8741536557674408, + "step": 36050 + }, + { + "epoch": 0.3606, + "grad_norm": 14.75, + "grad_norm_var": 0.3251139322916667, + "learning_rate": 0.0003, + "loss": 11.1968, + "loss/aux_loss": 0.04807903002947569, + "loss/crossentropy": 2.777270722389221, + "loss/logits": 0.8541721493005753, + "step": 36060 + }, + { + "epoch": 0.3607, + "grad_norm": 14.0625, + "grad_norm_var": 0.480322265625, + "learning_rate": 0.0003, + "loss": 11.3552, + "loss/aux_loss": 0.04808016233146191, + "loss/crossentropy": 2.692441987991333, + "loss/logits": 0.8523607522249221, + "step": 36070 + }, + { + "epoch": 0.3608, + "grad_norm": 12.8125, + "grad_norm_var": 0.40896809895833336, + "learning_rate": 0.0003, + "loss": 11.2746, + "loss/aux_loss": 0.048079789616167545, + "loss/crossentropy": 2.746452260017395, + "loss/logits": 0.863274747133255, + "step": 36080 + }, + { + "epoch": 0.3609, + "grad_norm": 12.9375, + "grad_norm_var": 0.583837890625, + "learning_rate": 0.0003, + "loss": 11.4223, + "loss/aux_loss": 0.048079486936330795, + "loss/crossentropy": 2.662570732831955, + "loss/logits": 0.8952972948551178, + "step": 36090 + }, + { + "epoch": 0.361, + "grad_norm": 14.875, + "grad_norm_var": 1.0411295572916666, + "learning_rate": 0.0003, + "loss": 11.3986, + "loss/aux_loss": 0.048073232360184195, + "loss/crossentropy": 2.687173879146576, + "loss/logits": 0.8554467290639878, + "step": 36100 + }, + { + "epoch": 0.3611, + "grad_norm": 14.0, + "grad_norm_var": 0.997509765625, + "learning_rate": 0.0003, + "loss": 11.2722, + "loss/aux_loss": 0.048092739656567574, + "loss/crossentropy": 2.521233695745468, + "loss/logits": 0.7955525845289231, + "step": 36110 + }, + { + "epoch": 0.3612, + "grad_norm": 15.125, + "grad_norm_var": 0.9139973958333333, + "learning_rate": 0.0003, + "loss": 11.4461, + "loss/aux_loss": 0.04807361625134945, + "loss/crossentropy": 2.970944273471832, + "loss/logits": 0.8794440478086472, + "step": 36120 + }, + { + "epoch": 0.3613, + "grad_norm": 12.4375, + "grad_norm_var": 1.252587890625, + "learning_rate": 0.0003, + "loss": 11.2798, + "loss/aux_loss": 0.04807492271065712, + "loss/crossentropy": 2.717639720439911, + "loss/logits": 0.8793477922677994, + "step": 36130 + }, + { + "epoch": 0.3614, + "grad_norm": 13.625, + "grad_norm_var": 0.8056640625, + "learning_rate": 0.0003, + "loss": 11.2061, + "loss/aux_loss": 0.04807124081999063, + "loss/crossentropy": 2.7465671420097353, + "loss/logits": 0.8570981532335281, + "step": 36140 + }, + { + "epoch": 0.3615, + "grad_norm": 14.0, + "grad_norm_var": 0.7410807291666667, + "learning_rate": 0.0003, + "loss": 11.4171, + "loss/aux_loss": 0.04809042625129223, + "loss/crossentropy": 2.874330496788025, + "loss/logits": 0.8536765873432159, + "step": 36150 + }, + { + "epoch": 0.3616, + "grad_norm": 15.1875, + "grad_norm_var": 0.9301920572916667, + "learning_rate": 0.0003, + "loss": 11.3739, + "loss/aux_loss": 0.048077494464814664, + "loss/crossentropy": 2.586655741930008, + "loss/logits": 0.8711060285568237, + "step": 36160 + }, + { + "epoch": 0.3617, + "grad_norm": 14.4375, + "grad_norm_var": 1.490869140625, + "learning_rate": 0.0003, + "loss": 11.3623, + "loss/aux_loss": 0.04806613698601723, + "loss/crossentropy": 2.5908707082271576, + "loss/logits": 0.8134723126888275, + "step": 36170 + }, + { + "epoch": 0.3618, + "grad_norm": 13.9375, + "grad_norm_var": 1.3150390625, + "learning_rate": 0.0003, + "loss": 11.4247, + "loss/aux_loss": 0.048084022291004655, + "loss/crossentropy": 2.795676851272583, + "loss/logits": 0.8587139397859573, + "step": 36180 + }, + { + "epoch": 0.3619, + "grad_norm": 13.6875, + "grad_norm_var": 0.7551920572916667, + "learning_rate": 0.0003, + "loss": 11.2165, + "loss/aux_loss": 0.048082707822322844, + "loss/crossentropy": 2.8844053208827973, + "loss/logits": 0.8785182237625122, + "step": 36190 + }, + { + "epoch": 0.362, + "grad_norm": 13.0625, + "grad_norm_var": 0.3738932291666667, + "learning_rate": 0.0003, + "loss": 11.2659, + "loss/aux_loss": 0.04806621167808771, + "loss/crossentropy": 2.760159510374069, + "loss/logits": 0.8577248483896256, + "step": 36200 + }, + { + "epoch": 0.3621, + "grad_norm": 13.375, + "grad_norm_var": 0.31808268229166664, + "learning_rate": 0.0003, + "loss": 11.2366, + "loss/aux_loss": 0.04807536099106073, + "loss/crossentropy": 2.608304864168167, + "loss/logits": 0.829086622595787, + "step": 36210 + }, + { + "epoch": 0.3622, + "grad_norm": 13.875, + "grad_norm_var": 0.2072265625, + "learning_rate": 0.0003, + "loss": 11.4158, + "loss/aux_loss": 0.04807885363698006, + "loss/crossentropy": 2.6055088222026823, + "loss/logits": 0.825168663263321, + "step": 36220 + }, + { + "epoch": 0.3623, + "grad_norm": 13.4375, + "grad_norm_var": 0.39334309895833336, + "learning_rate": 0.0003, + "loss": 11.553, + "loss/aux_loss": 0.0480809198692441, + "loss/crossentropy": 2.9127083659172057, + "loss/logits": 0.8658655256032943, + "step": 36230 + }, + { + "epoch": 0.3624, + "grad_norm": 14.25, + "grad_norm_var": 0.37213541666666666, + "learning_rate": 0.0003, + "loss": 11.2686, + "loss/aux_loss": 0.048089105263352395, + "loss/crossentropy": 2.8726187229156492, + "loss/logits": 0.8607639342546463, + "step": 36240 + }, + { + "epoch": 0.3625, + "grad_norm": 12.9375, + "grad_norm_var": 1.0445149739583333, + "learning_rate": 0.0003, + "loss": 11.2932, + "loss/aux_loss": 0.04807120338082314, + "loss/crossentropy": 2.898715019226074, + "loss/logits": 0.8681466579437256, + "step": 36250 + }, + { + "epoch": 0.3626, + "grad_norm": 12.9375, + "grad_norm_var": 0.8946451822916667, + "learning_rate": 0.0003, + "loss": 11.373, + "loss/aux_loss": 0.0480785708874464, + "loss/crossentropy": 2.6757899284362794, + "loss/logits": 0.8399546831846237, + "step": 36260 + }, + { + "epoch": 0.3627, + "grad_norm": 12.625, + "grad_norm_var": 0.471728515625, + "learning_rate": 0.0003, + "loss": 11.2656, + "loss/aux_loss": 0.048072914406657216, + "loss/crossentropy": 2.7273074328899383, + "loss/logits": 0.834012359380722, + "step": 36270 + }, + { + "epoch": 0.3628, + "grad_norm": 15.625, + "grad_norm_var": 0.5853515625, + "learning_rate": 0.0003, + "loss": 11.3074, + "loss/aux_loss": 0.04808451887220144, + "loss/crossentropy": 2.661761927604675, + "loss/logits": 0.8267664194107056, + "step": 36280 + }, + { + "epoch": 0.3629, + "grad_norm": 13.875, + "grad_norm_var": 7.678645833333333, + "learning_rate": 0.0003, + "loss": 11.1024, + "loss/aux_loss": 0.04808289129287004, + "loss/crossentropy": 2.8848253428936004, + "loss/logits": 0.8689317673444747, + "step": 36290 + }, + { + "epoch": 0.363, + "grad_norm": 13.875, + "grad_norm_var": 7.61328125, + "learning_rate": 0.0003, + "loss": 11.3154, + "loss/aux_loss": 0.048081953264772895, + "loss/crossentropy": 2.703933322429657, + "loss/logits": 0.860775688290596, + "step": 36300 + }, + { + "epoch": 0.3631, + "grad_norm": 12.875, + "grad_norm_var": 0.6884765625, + "learning_rate": 0.0003, + "loss": 11.2708, + "loss/aux_loss": 0.04807724803686142, + "loss/crossentropy": 2.715017533302307, + "loss/logits": 0.8319126725196838, + "step": 36310 + }, + { + "epoch": 0.3632, + "grad_norm": 22.625, + "grad_norm_var": 5.701936848958334, + "learning_rate": 0.0003, + "loss": 11.2363, + "loss/aux_loss": 0.04807513263076544, + "loss/crossentropy": 2.829657733440399, + "loss/logits": 0.8587293684482574, + "step": 36320 + }, + { + "epoch": 0.3633, + "grad_norm": 14.25, + "grad_norm_var": 4.881184895833333, + "learning_rate": 0.0003, + "loss": 11.408, + "loss/aux_loss": 0.04807958342134953, + "loss/crossentropy": 2.7696733355522154, + "loss/logits": 0.8509972572326661, + "step": 36330 + }, + { + "epoch": 0.3634, + "grad_norm": 14.625, + "grad_norm_var": 0.25388997395833335, + "learning_rate": 0.0003, + "loss": 11.3764, + "loss/aux_loss": 0.04807343930006027, + "loss/crossentropy": 2.824759781360626, + "loss/logits": 0.8681064277887345, + "step": 36340 + }, + { + "epoch": 0.3635, + "grad_norm": 13.375, + "grad_norm_var": 0.49386393229166664, + "learning_rate": 0.0003, + "loss": 11.3524, + "loss/aux_loss": 0.04807184562087059, + "loss/crossentropy": 2.8086614489555357, + "loss/logits": 0.8541026085615158, + "step": 36350 + }, + { + "epoch": 0.3636, + "grad_norm": 14.4375, + "grad_norm_var": 0.5608723958333334, + "learning_rate": 0.0003, + "loss": 11.0975, + "loss/aux_loss": 0.04808126986026764, + "loss/crossentropy": 2.7107265830039977, + "loss/logits": 0.8230546474456787, + "step": 36360 + }, + { + "epoch": 0.3637, + "grad_norm": 15.5, + "grad_norm_var": 0.6462076822916667, + "learning_rate": 0.0003, + "loss": 11.2573, + "loss/aux_loss": 0.048082560300827026, + "loss/crossentropy": 2.660491919517517, + "loss/logits": 0.8427970826625824, + "step": 36370 + }, + { + "epoch": 0.3638, + "grad_norm": 12.6875, + "grad_norm_var": 0.799072265625, + "learning_rate": 0.0003, + "loss": 11.3318, + "loss/aux_loss": 0.048079566471278666, + "loss/crossentropy": 2.788001722097397, + "loss/logits": 0.8650757223367691, + "step": 36380 + }, + { + "epoch": 0.3639, + "grad_norm": 12.9375, + "grad_norm_var": 0.584375, + "learning_rate": 0.0003, + "loss": 11.3163, + "loss/aux_loss": 0.048077472113072874, + "loss/crossentropy": 2.7954628705978393, + "loss/logits": 0.8578163594007492, + "step": 36390 + }, + { + "epoch": 0.364, + "grad_norm": 14.0, + "grad_norm_var": 0.38743489583333335, + "learning_rate": 0.0003, + "loss": 11.3133, + "loss/aux_loss": 0.04807708989828825, + "loss/crossentropy": 2.665116882324219, + "loss/logits": 0.8234895557165146, + "step": 36400 + }, + { + "epoch": 0.3641, + "grad_norm": 14.875, + "grad_norm_var": 0.7328125, + "learning_rate": 0.0003, + "loss": 11.5322, + "loss/aux_loss": 0.04808671064674854, + "loss/crossentropy": 2.858789348602295, + "loss/logits": 0.8716346949338913, + "step": 36410 + }, + { + "epoch": 0.3642, + "grad_norm": 14.0625, + "grad_norm_var": 0.6462890625, + "learning_rate": 0.0003, + "loss": 11.1745, + "loss/aux_loss": 0.048071319982409474, + "loss/crossentropy": 2.877179265022278, + "loss/logits": 0.8559047758579255, + "step": 36420 + }, + { + "epoch": 0.3643, + "grad_norm": 13.3125, + "grad_norm_var": 0.7098958333333333, + "learning_rate": 0.0003, + "loss": 11.3668, + "loss/aux_loss": 0.04808102864772081, + "loss/crossentropy": 2.699104994535446, + "loss/logits": 0.8286954373121261, + "step": 36430 + }, + { + "epoch": 0.3644, + "grad_norm": 13.75, + "grad_norm_var": 0.3348795572916667, + "learning_rate": 0.0003, + "loss": 11.3764, + "loss/aux_loss": 0.04807946030050516, + "loss/crossentropy": 2.7624664068222047, + "loss/logits": 0.8778378039598465, + "step": 36440 + }, + { + "epoch": 0.3645, + "grad_norm": 13.5625, + "grad_norm_var": 0.21614583333333334, + "learning_rate": 0.0003, + "loss": 11.4262, + "loss/aux_loss": 0.04808296486735344, + "loss/crossentropy": 2.7352758646011353, + "loss/logits": 0.8656487733125686, + "step": 36450 + }, + { + "epoch": 0.3646, + "grad_norm": 14.3125, + "grad_norm_var": 0.4103515625, + "learning_rate": 0.0003, + "loss": 11.4924, + "loss/aux_loss": 0.04807944241911173, + "loss/crossentropy": 2.774025857448578, + "loss/logits": 0.8716156959533692, + "step": 36460 + }, + { + "epoch": 0.3647, + "grad_norm": 14.125, + "grad_norm_var": 0.39791666666666664, + "learning_rate": 0.0003, + "loss": 11.2264, + "loss/aux_loss": 0.04807451739907265, + "loss/crossentropy": 2.8603923201560972, + "loss/logits": 0.8951089948415756, + "step": 36470 + }, + { + "epoch": 0.3648, + "grad_norm": 13.5625, + "grad_norm_var": 0.35201822916666664, + "learning_rate": 0.0003, + "loss": 11.4953, + "loss/aux_loss": 0.04807769488543272, + "loss/crossentropy": 2.6446187674999235, + "loss/logits": 0.8907380670309066, + "step": 36480 + }, + { + "epoch": 0.3649, + "grad_norm": 12.8125, + "grad_norm_var": 6.081363932291667, + "learning_rate": 0.0003, + "loss": 11.185, + "loss/aux_loss": 0.048073522932827475, + "loss/crossentropy": 2.7823033690452577, + "loss/logits": 0.8572315156459809, + "step": 36490 + }, + { + "epoch": 0.365, + "grad_norm": 14.125, + "grad_norm_var": 0.465087890625, + "learning_rate": 0.0003, + "loss": 11.3351, + "loss/aux_loss": 0.048076865077018735, + "loss/crossentropy": 2.7182795643806457, + "loss/logits": 0.8255507349967957, + "step": 36500 + }, + { + "epoch": 0.3651, + "grad_norm": 13.4375, + "grad_norm_var": 0.5874348958333333, + "learning_rate": 0.0003, + "loss": 11.275, + "loss/aux_loss": 0.048083111830055716, + "loss/crossentropy": 2.7402828454971315, + "loss/logits": 0.8623090296983719, + "step": 36510 + }, + { + "epoch": 0.3652, + "grad_norm": 14.0, + "grad_norm_var": 0.3395182291666667, + "learning_rate": 0.0003, + "loss": 11.3483, + "loss/aux_loss": 0.04808528944849968, + "loss/crossentropy": 2.7780889511108398, + "loss/logits": 0.8970998287200928, + "step": 36520 + }, + { + "epoch": 0.3653, + "grad_norm": 13.75, + "grad_norm_var": 912.176416015625, + "learning_rate": 0.0003, + "loss": 11.3774, + "loss/aux_loss": 0.04809322264045477, + "loss/crossentropy": 2.5786903738975524, + "loss/logits": 0.8551579564809799, + "step": 36530 + }, + { + "epoch": 0.3654, + "grad_norm": 13.5625, + "grad_norm_var": 0.26666666666666666, + "learning_rate": 0.0003, + "loss": 11.3992, + "loss/aux_loss": 0.04808025564998388, + "loss/crossentropy": 2.6024239301681518, + "loss/logits": 0.8405012160539627, + "step": 36540 + }, + { + "epoch": 0.3655, + "grad_norm": 13.3125, + "grad_norm_var": 0.9311848958333333, + "learning_rate": 0.0003, + "loss": 11.1519, + "loss/aux_loss": 0.04808431137353182, + "loss/crossentropy": 2.628155159950256, + "loss/logits": 0.8049672454595566, + "step": 36550 + }, + { + "epoch": 0.3656, + "grad_norm": 15.0, + "grad_norm_var": 0.49420572916666666, + "learning_rate": 0.0003, + "loss": 11.3941, + "loss/aux_loss": 0.048075790517032146, + "loss/crossentropy": 2.7668872237205506, + "loss/logits": 0.838652953505516, + "step": 36560 + }, + { + "epoch": 0.3657, + "grad_norm": 12.625, + "grad_norm_var": 0.6994140625, + "learning_rate": 0.0003, + "loss": 11.3062, + "loss/aux_loss": 0.04808170460164547, + "loss/crossentropy": 2.7823184549808504, + "loss/logits": 0.9078426092863083, + "step": 36570 + }, + { + "epoch": 0.3658, + "grad_norm": 13.0, + "grad_norm_var": 0.8878743489583333, + "learning_rate": 0.0003, + "loss": 11.0968, + "loss/aux_loss": 0.048066372610628606, + "loss/crossentropy": 2.6771502017974855, + "loss/logits": 0.8519851267337799, + "step": 36580 + }, + { + "epoch": 0.3659, + "grad_norm": 13.625, + "grad_norm_var": 0.6288899739583333, + "learning_rate": 0.0003, + "loss": 11.4529, + "loss/aux_loss": 0.048079500906169415, + "loss/crossentropy": 2.7219568133354186, + "loss/logits": 0.8512856423854828, + "step": 36590 + }, + { + "epoch": 0.366, + "grad_norm": 13.375, + "grad_norm_var": 0.48587239583333336, + "learning_rate": 0.0003, + "loss": 11.3979, + "loss/aux_loss": 0.048080523125827314, + "loss/crossentropy": 2.677553081512451, + "loss/logits": 0.8542275846004486, + "step": 36600 + }, + { + "epoch": 0.3661, + "grad_norm": 13.0625, + "grad_norm_var": 0.35442708333333334, + "learning_rate": 0.0003, + "loss": 11.2516, + "loss/aux_loss": 0.04807436354458332, + "loss/crossentropy": 2.7150216817855837, + "loss/logits": 0.8340162307024002, + "step": 36610 + }, + { + "epoch": 0.3662, + "grad_norm": 13.5, + "grad_norm_var": 13.445686848958333, + "learning_rate": 0.0003, + "loss": 11.2813, + "loss/aux_loss": 0.04808293953537941, + "loss/crossentropy": 2.7328949213027953, + "loss/logits": 0.8716106861829758, + "step": 36620 + }, + { + "epoch": 0.3663, + "grad_norm": 13.6875, + "grad_norm_var": 0.9638020833333333, + "learning_rate": 0.0003, + "loss": 11.2869, + "loss/aux_loss": 0.04808443430811167, + "loss/crossentropy": 2.728735291957855, + "loss/logits": 0.8548354119062423, + "step": 36630 + }, + { + "epoch": 0.3664, + "grad_norm": 13.5625, + "grad_norm_var": 4.690087890625, + "learning_rate": 0.0003, + "loss": 11.3094, + "loss/aux_loss": 0.048085299693048, + "loss/crossentropy": 2.6517282664775848, + "loss/logits": 0.8119051426649093, + "step": 36640 + }, + { + "epoch": 0.3665, + "grad_norm": 14.1875, + "grad_norm_var": 4.356705729166666, + "learning_rate": 0.0003, + "loss": 11.3516, + "loss/aux_loss": 0.04806904401630163, + "loss/crossentropy": 2.6928380608558653, + "loss/logits": 0.8894807904958725, + "step": 36650 + }, + { + "epoch": 0.3666, + "grad_norm": 13.25, + "grad_norm_var": 0.4025390625, + "learning_rate": 0.0003, + "loss": 11.1876, + "loss/aux_loss": 0.04807957727462053, + "loss/crossentropy": 2.6671720802783967, + "loss/logits": 0.8575594484806061, + "step": 36660 + }, + { + "epoch": 0.3667, + "grad_norm": 13.625, + "grad_norm_var": 0.6534993489583333, + "learning_rate": 0.0003, + "loss": 11.3217, + "loss/aux_loss": 0.04807212818413973, + "loss/crossentropy": 2.920051896572113, + "loss/logits": 0.9000935316085815, + "step": 36670 + }, + { + "epoch": 0.3668, + "grad_norm": 13.0625, + "grad_norm_var": 0.578759765625, + "learning_rate": 0.0003, + "loss": 11.2754, + "loss/aux_loss": 0.04808471836149693, + "loss/crossentropy": 2.7112753033638, + "loss/logits": 0.8980684787034988, + "step": 36680 + }, + { + "epoch": 0.3669, + "grad_norm": 14.5625, + "grad_norm_var": 0.46339518229166665, + "learning_rate": 0.0003, + "loss": 11.175, + "loss/aux_loss": 0.048070876859128475, + "loss/crossentropy": 2.8545451045036314, + "loss/logits": 0.8624769806861877, + "step": 36690 + }, + { + "epoch": 0.367, + "grad_norm": 14.25, + "grad_norm_var": 0.3614583333333333, + "learning_rate": 0.0003, + "loss": 11.419, + "loss/aux_loss": 0.048080637119710445, + "loss/crossentropy": 2.584076887369156, + "loss/logits": 0.8492877304553985, + "step": 36700 + }, + { + "epoch": 0.3671, + "grad_norm": 14.5, + "grad_norm_var": 0.46901041666666665, + "learning_rate": 0.0003, + "loss": 11.2005, + "loss/aux_loss": 0.04807239808142185, + "loss/crossentropy": 2.8296147108078005, + "loss/logits": 0.8446451902389527, + "step": 36710 + }, + { + "epoch": 0.3672, + "grad_norm": 13.625, + "grad_norm_var": 0.3153483072916667, + "learning_rate": 0.0003, + "loss": 11.2948, + "loss/aux_loss": 0.04807722382247448, + "loss/crossentropy": 2.8139419972896578, + "loss/logits": 0.8666492760181427, + "step": 36720 + }, + { + "epoch": 0.3673, + "grad_norm": 13.0, + "grad_norm_var": 0.2140625, + "learning_rate": 0.0003, + "loss": 11.2456, + "loss/aux_loss": 0.04808667413890362, + "loss/crossentropy": 2.7662573993206023, + "loss/logits": 0.8440918147563934, + "step": 36730 + }, + { + "epoch": 0.3674, + "grad_norm": 13.3125, + "grad_norm_var": 0.48162434895833334, + "learning_rate": 0.0003, + "loss": 11.1766, + "loss/aux_loss": 0.048073191195726395, + "loss/crossentropy": 2.733067828416824, + "loss/logits": 0.8450499773025513, + "step": 36740 + }, + { + "epoch": 0.3675, + "grad_norm": 13.25, + "grad_norm_var": 0.408837890625, + "learning_rate": 0.0003, + "loss": 11.1612, + "loss/aux_loss": 0.04808583315461874, + "loss/crossentropy": 2.661664068698883, + "loss/logits": 0.8497480273246765, + "step": 36750 + }, + { + "epoch": 0.3676, + "grad_norm": 15.3125, + "grad_norm_var": 1.2481608072916666, + "learning_rate": 0.0003, + "loss": 11.3239, + "loss/aux_loss": 0.048081538453698155, + "loss/crossentropy": 2.836627209186554, + "loss/logits": 0.8409482598304748, + "step": 36760 + }, + { + "epoch": 0.3677, + "grad_norm": 14.125, + "grad_norm_var": 0.6759765625, + "learning_rate": 0.0003, + "loss": 11.2629, + "loss/aux_loss": 0.048070738464593886, + "loss/crossentropy": 2.7837388277053834, + "loss/logits": 0.8499901384115219, + "step": 36770 + }, + { + "epoch": 0.3678, + "grad_norm": 13.75, + "grad_norm_var": 0.29713541666666665, + "learning_rate": 0.0003, + "loss": 11.4489, + "loss/aux_loss": 0.04807408787310123, + "loss/crossentropy": 2.8329702377319337, + "loss/logits": 0.8994654446840287, + "step": 36780 + }, + { + "epoch": 0.3679, + "grad_norm": 13.1875, + "grad_norm_var": 0.3837076822916667, + "learning_rate": 0.0003, + "loss": 11.1841, + "loss/aux_loss": 0.04807285293936729, + "loss/crossentropy": 2.7574662566184998, + "loss/logits": 0.8559128046035767, + "step": 36790 + }, + { + "epoch": 0.368, + "grad_norm": 14.6875, + "grad_norm_var": 0.5015462239583334, + "learning_rate": 0.0003, + "loss": 11.2499, + "loss/aux_loss": 0.04807376656681299, + "loss/crossentropy": 2.684080684185028, + "loss/logits": 0.8515265494585037, + "step": 36800 + }, + { + "epoch": 0.3681, + "grad_norm": 13.9375, + "grad_norm_var": 0.609228515625, + "learning_rate": 0.0003, + "loss": 11.3147, + "loss/aux_loss": 0.04808369651436806, + "loss/crossentropy": 2.618355232477188, + "loss/logits": 0.8441348135471344, + "step": 36810 + }, + { + "epoch": 0.3682, + "grad_norm": 13.4375, + "grad_norm_var": 5.31953125, + "learning_rate": 0.0003, + "loss": 11.2577, + "loss/aux_loss": 0.04807754773646593, + "loss/crossentropy": 2.5288033723831176, + "loss/logits": 0.8119451552629471, + "step": 36820 + }, + { + "epoch": 0.3683, + "grad_norm": 13.4375, + "grad_norm_var": 8.267708333333333, + "learning_rate": 0.0003, + "loss": 11.2319, + "loss/aux_loss": 0.048084663413465026, + "loss/crossentropy": 2.753344786167145, + "loss/logits": 0.8751234143972397, + "step": 36830 + }, + { + "epoch": 0.3684, + "grad_norm": 14.8125, + "grad_norm_var": 0.308447265625, + "learning_rate": 0.0003, + "loss": 11.4226, + "loss/aux_loss": 0.04807217866182327, + "loss/crossentropy": 2.67869313955307, + "loss/logits": 0.8474486947059632, + "step": 36840 + }, + { + "epoch": 0.3685, + "grad_norm": 13.9375, + "grad_norm_var": 3.936051432291667, + "learning_rate": 0.0003, + "loss": 11.1881, + "loss/aux_loss": 0.048085474967956544, + "loss/crossentropy": 2.6841680705547333, + "loss/logits": 0.8220482736825943, + "step": 36850 + }, + { + "epoch": 0.3686, + "grad_norm": 13.75, + "grad_norm_var": 4.113997395833334, + "learning_rate": 0.0003, + "loss": 11.2414, + "loss/aux_loss": 0.04807091634720564, + "loss/crossentropy": 2.6126440107822417, + "loss/logits": 0.8671426773071289, + "step": 36860 + }, + { + "epoch": 0.3687, + "grad_norm": 13.3125, + "grad_norm_var": 0.36432291666666666, + "learning_rate": 0.0003, + "loss": 11.4133, + "loss/aux_loss": 0.048081329092383385, + "loss/crossentropy": 2.675434243679047, + "loss/logits": 0.8265480697154999, + "step": 36870 + }, + { + "epoch": 0.3688, + "grad_norm": 13.1875, + "grad_norm_var": 0.42902018229166666, + "learning_rate": 0.0003, + "loss": 11.2284, + "loss/aux_loss": 0.048071084544062614, + "loss/crossentropy": 2.649240803718567, + "loss/logits": 0.858475786447525, + "step": 36880 + }, + { + "epoch": 0.3689, + "grad_norm": 14.0, + "grad_norm_var": 0.5214680989583333, + "learning_rate": 0.0003, + "loss": 11.453, + "loss/aux_loss": 0.04807976335287094, + "loss/crossentropy": 2.740285503864288, + "loss/logits": 0.8892779976129532, + "step": 36890 + }, + { + "epoch": 0.369, + "grad_norm": 14.1875, + "grad_norm_var": 0.5254557291666667, + "learning_rate": 0.0003, + "loss": 11.4388, + "loss/aux_loss": 0.048085974715650084, + "loss/crossentropy": 2.597354656457901, + "loss/logits": 0.852543905377388, + "step": 36900 + }, + { + "epoch": 0.3691, + "grad_norm": 13.6875, + "grad_norm_var": 1.0181640625, + "learning_rate": 0.0003, + "loss": 11.34, + "loss/aux_loss": 0.04807802941650152, + "loss/crossentropy": 2.7012298822402956, + "loss/logits": 0.8767822653055191, + "step": 36910 + }, + { + "epoch": 0.3692, + "grad_norm": 13.6875, + "grad_norm_var": 0.36171875, + "learning_rate": 0.0003, + "loss": 11.251, + "loss/aux_loss": 0.048073366098105905, + "loss/crossentropy": 2.560819482803345, + "loss/logits": 0.8467923909425735, + "step": 36920 + }, + { + "epoch": 0.3693, + "grad_norm": 15.0, + "grad_norm_var": 0.342041015625, + "learning_rate": 0.0003, + "loss": 11.148, + "loss/aux_loss": 0.04808372184634209, + "loss/crossentropy": 2.713025426864624, + "loss/logits": 0.8554750919342041, + "step": 36930 + }, + { + "epoch": 0.3694, + "grad_norm": 14.3125, + "grad_norm_var": 0.5695149739583333, + "learning_rate": 0.0003, + "loss": 11.1105, + "loss/aux_loss": 0.048074273765087126, + "loss/crossentropy": 2.560827577114105, + "loss/logits": 0.837596133351326, + "step": 36940 + }, + { + "epoch": 0.3695, + "grad_norm": 13.8125, + "grad_norm_var": 0.4853515625, + "learning_rate": 0.0003, + "loss": 11.237, + "loss/aux_loss": 0.04807106014341116, + "loss/crossentropy": 2.7133314967155457, + "loss/logits": 0.8286193758249283, + "step": 36950 + }, + { + "epoch": 0.3696, + "grad_norm": 13.9375, + "grad_norm_var": 0.28683268229166664, + "learning_rate": 0.0003, + "loss": 11.3376, + "loss/aux_loss": 0.04807455725967884, + "loss/crossentropy": 2.6688225150108336, + "loss/logits": 0.8450548857450485, + "step": 36960 + }, + { + "epoch": 0.3697, + "grad_norm": 14.375, + "grad_norm_var": 0.458056640625, + "learning_rate": 0.0003, + "loss": 11.2668, + "loss/aux_loss": 0.048079893365502356, + "loss/crossentropy": 2.7559533953666686, + "loss/logits": 0.8815567016601562, + "step": 36970 + }, + { + "epoch": 0.3698, + "grad_norm": 14.5625, + "grad_norm_var": 0.4317708333333333, + "learning_rate": 0.0003, + "loss": 11.4765, + "loss/aux_loss": 0.04807979427278042, + "loss/crossentropy": 2.837592136859894, + "loss/logits": 0.855880606174469, + "step": 36980 + }, + { + "epoch": 0.3699, + "grad_norm": 13.4375, + "grad_norm_var": 0.477197265625, + "learning_rate": 0.0003, + "loss": 11.3164, + "loss/aux_loss": 0.048071004822850226, + "loss/crossentropy": 2.743358498811722, + "loss/logits": 0.8194127559661866, + "step": 36990 + }, + { + "epoch": 0.37, + "grad_norm": 14.625, + "grad_norm_var": 0.527587890625, + "learning_rate": 0.0003, + "loss": 11.5576, + "loss/aux_loss": 0.04808164816349745, + "loss/crossentropy": 2.8244762778282166, + "loss/logits": 0.8719862341880799, + "step": 37000 + }, + { + "epoch": 0.3701, + "grad_norm": 13.8125, + "grad_norm_var": 0.5921712239583333, + "learning_rate": 0.0003, + "loss": 11.2832, + "loss/aux_loss": 0.048074906878173354, + "loss/crossentropy": 2.8368687868118285, + "loss/logits": 0.8303968459367752, + "step": 37010 + }, + { + "epoch": 0.3702, + "grad_norm": 12.625, + "grad_norm_var": 59.25792643229167, + "learning_rate": 0.0003, + "loss": 11.1296, + "loss/aux_loss": 0.048082617297768596, + "loss/crossentropy": 2.7732748210430147, + "loss/logits": 0.8518129020929337, + "step": 37020 + }, + { + "epoch": 0.3703, + "grad_norm": 13.5625, + "grad_norm_var": 157.23645833333333, + "learning_rate": 0.0003, + "loss": 11.4101, + "loss/aux_loss": 0.04808836504817009, + "loss/crossentropy": 2.71566726565361, + "loss/logits": 0.8372407227754592, + "step": 37030 + }, + { + "epoch": 0.3704, + "grad_norm": 14.5, + "grad_norm_var": 115.4890625, + "learning_rate": 0.0003, + "loss": 11.4471, + "loss/aux_loss": 0.04808253161609173, + "loss/crossentropy": 2.8112912774086, + "loss/logits": 0.8763896584510803, + "step": 37040 + }, + { + "epoch": 0.3705, + "grad_norm": 13.5625, + "grad_norm_var": 0.4981770833333333, + "learning_rate": 0.0003, + "loss": 11.2863, + "loss/aux_loss": 0.048083126358687875, + "loss/crossentropy": 2.740783101320267, + "loss/logits": 0.8048513799905777, + "step": 37050 + }, + { + "epoch": 0.3706, + "grad_norm": 13.9375, + "grad_norm_var": 0.5832682291666667, + "learning_rate": 0.0003, + "loss": 11.2289, + "loss/aux_loss": 0.04807225782424211, + "loss/crossentropy": 2.809919023513794, + "loss/logits": 0.8596052765846253, + "step": 37060 + }, + { + "epoch": 0.3707, + "grad_norm": 13.625, + "grad_norm_var": 0.32667643229166665, + "learning_rate": 0.0003, + "loss": 11.2248, + "loss/aux_loss": 0.04808387588709593, + "loss/crossentropy": 2.8138983845710754, + "loss/logits": 0.8710784047842026, + "step": 37070 + }, + { + "epoch": 0.3708, + "grad_norm": 13.1875, + "grad_norm_var": 0.35989583333333336, + "learning_rate": 0.0003, + "loss": 11.4458, + "loss/aux_loss": 0.04807316083461046, + "loss/crossentropy": 2.647478461265564, + "loss/logits": 0.8652923613786697, + "step": 37080 + }, + { + "epoch": 0.3709, + "grad_norm": 14.125, + "grad_norm_var": 90.54837239583334, + "learning_rate": 0.0003, + "loss": 11.3368, + "loss/aux_loss": 0.04807783383876085, + "loss/crossentropy": 2.7728021681308745, + "loss/logits": 0.8556511580944062, + "step": 37090 + }, + { + "epoch": 0.371, + "grad_norm": 13.6875, + "grad_norm_var": 0.5078125, + "learning_rate": 0.0003, + "loss": 11.2261, + "loss/aux_loss": 0.04807612337172031, + "loss/crossentropy": 2.7843292593955993, + "loss/logits": 0.8542019307613373, + "step": 37100 + }, + { + "epoch": 0.3711, + "grad_norm": 13.5625, + "grad_norm_var": 0.3551432291666667, + "learning_rate": 0.0003, + "loss": 11.4133, + "loss/aux_loss": 0.048083677515387536, + "loss/crossentropy": 2.768745648860931, + "loss/logits": 0.8667060941457748, + "step": 37110 + }, + { + "epoch": 0.3712, + "grad_norm": 12.9375, + "grad_norm_var": 0.38748372395833336, + "learning_rate": 0.0003, + "loss": 11.2222, + "loss/aux_loss": 0.04808000139892101, + "loss/crossentropy": 2.6551915645599364, + "loss/logits": 0.8254688054323196, + "step": 37120 + }, + { + "epoch": 0.3713, + "grad_norm": 13.3125, + "grad_norm_var": 0.41534830729166666, + "learning_rate": 0.0003, + "loss": 11.1694, + "loss/aux_loss": 0.048072263970971106, + "loss/crossentropy": 2.6525086402893066, + "loss/logits": 0.8595420539379119, + "step": 37130 + }, + { + "epoch": 0.3714, + "grad_norm": 12.8125, + "grad_norm_var": 0.739306640625, + "learning_rate": 0.0003, + "loss": 11.2952, + "loss/aux_loss": 0.04807467870414257, + "loss/crossentropy": 2.77539883852005, + "loss/logits": 0.8893805712461471, + "step": 37140 + }, + { + "epoch": 0.3715, + "grad_norm": 13.8125, + "grad_norm_var": 0.8153645833333333, + "learning_rate": 0.0003, + "loss": 11.3194, + "loss/aux_loss": 0.04808569923043251, + "loss/crossentropy": 2.7440546989440917, + "loss/logits": 0.855570039153099, + "step": 37150 + }, + { + "epoch": 0.3716, + "grad_norm": 13.5625, + "grad_norm_var": 0.5109212239583333, + "learning_rate": 0.0003, + "loss": 11.1757, + "loss/aux_loss": 0.04810119271278381, + "loss/crossentropy": 2.6668431758880615, + "loss/logits": 0.8302851766347885, + "step": 37160 + }, + { + "epoch": 0.3717, + "grad_norm": 14.0, + "grad_norm_var": 1.3648274739583333, + "learning_rate": 0.0003, + "loss": 11.3603, + "loss/aux_loss": 0.048070420511066914, + "loss/crossentropy": 2.657299679517746, + "loss/logits": 0.8717033207416535, + "step": 37170 + }, + { + "epoch": 0.3718, + "grad_norm": 13.0625, + "grad_norm_var": 1.5200358072916667, + "learning_rate": 0.0003, + "loss": 11.2216, + "loss/aux_loss": 0.048081215284764764, + "loss/crossentropy": 2.9752244472503664, + "loss/logits": 0.8586607486009598, + "step": 37180 + }, + { + "epoch": 0.3719, + "grad_norm": 12.75, + "grad_norm_var": 0.29108072916666666, + "learning_rate": 0.0003, + "loss": 11.3228, + "loss/aux_loss": 0.04807878099381924, + "loss/crossentropy": 2.8252045154571532, + "loss/logits": 0.8735492646694183, + "step": 37190 + }, + { + "epoch": 0.372, + "grad_norm": 16.375, + "grad_norm_var": 0.9677083333333333, + "learning_rate": 0.0003, + "loss": 11.4145, + "loss/aux_loss": 0.04807718005031347, + "loss/crossentropy": 2.825979804992676, + "loss/logits": 0.8712354183197022, + "step": 37200 + }, + { + "epoch": 0.3721, + "grad_norm": 12.8125, + "grad_norm_var": 1.1072916666666666, + "learning_rate": 0.0003, + "loss": 11.1937, + "loss/aux_loss": 0.04808374121785164, + "loss/crossentropy": 2.5838040828704836, + "loss/logits": 0.8319817185401917, + "step": 37210 + }, + { + "epoch": 0.3722, + "grad_norm": 14.0, + "grad_norm_var": 0.42120768229166666, + "learning_rate": 0.0003, + "loss": 11.3134, + "loss/aux_loss": 0.04806477259844542, + "loss/crossentropy": 2.7815585494041444, + "loss/logits": 0.8544807106256485, + "step": 37220 + }, + { + "epoch": 0.3723, + "grad_norm": 13.125, + "grad_norm_var": 0.433056640625, + "learning_rate": 0.0003, + "loss": 11.2287, + "loss/aux_loss": 0.04808630477637053, + "loss/crossentropy": 2.6713213086128236, + "loss/logits": 0.8477931290864944, + "step": 37230 + }, + { + "epoch": 0.3724, + "grad_norm": 13.0625, + "grad_norm_var": 0.4471354166666667, + "learning_rate": 0.0003, + "loss": 11.3342, + "loss/aux_loss": 0.04807670786976814, + "loss/crossentropy": 2.6989696443080904, + "loss/logits": 0.8390361964702606, + "step": 37240 + }, + { + "epoch": 0.3725, + "grad_norm": 14.25, + "grad_norm_var": 0.36354166666666665, + "learning_rate": 0.0003, + "loss": 11.1887, + "loss/aux_loss": 0.048072361201047895, + "loss/crossentropy": 2.6670637369155883, + "loss/logits": 0.8481419175863266, + "step": 37250 + }, + { + "epoch": 0.3726, + "grad_norm": 14.3125, + "grad_norm_var": 0.30670572916666666, + "learning_rate": 0.0003, + "loss": 11.2535, + "loss/aux_loss": 0.04808654896914959, + "loss/crossentropy": 2.7662817001342774, + "loss/logits": 0.8623171299695969, + "step": 37260 + }, + { + "epoch": 0.3727, + "grad_norm": 14.625, + "grad_norm_var": 0.3473307291666667, + "learning_rate": 0.0003, + "loss": 11.313, + "loss/aux_loss": 0.04807217847555876, + "loss/crossentropy": 2.655113381147385, + "loss/logits": 0.8379829883575439, + "step": 37270 + }, + { + "epoch": 0.3728, + "grad_norm": 13.625, + "grad_norm_var": 0.364306640625, + "learning_rate": 0.0003, + "loss": 11.3619, + "loss/aux_loss": 0.04807236138731241, + "loss/crossentropy": 2.7911306262016295, + "loss/logits": 0.8533078819513321, + "step": 37280 + }, + { + "epoch": 0.3729, + "grad_norm": 13.6875, + "grad_norm_var": 0.4390462239583333, + "learning_rate": 0.0003, + "loss": 11.2455, + "loss/aux_loss": 0.048087391443550585, + "loss/crossentropy": 2.675192391872406, + "loss/logits": 0.8323038935661315, + "step": 37290 + }, + { + "epoch": 0.373, + "grad_norm": 13.125, + "grad_norm_var": 0.30909830729166665, + "learning_rate": 0.0003, + "loss": 11.2423, + "loss/aux_loss": 0.048072326742112634, + "loss/crossentropy": 2.807281959056854, + "loss/logits": 0.8477059155702591, + "step": 37300 + }, + { + "epoch": 0.3731, + "grad_norm": 13.625, + "grad_norm_var": 0.7629557291666667, + "learning_rate": 0.0003, + "loss": 11.139, + "loss/aux_loss": 0.04808030799031258, + "loss/crossentropy": 2.8326464533805846, + "loss/logits": 0.8708844691514969, + "step": 37310 + }, + { + "epoch": 0.3732, + "grad_norm": 14.3125, + "grad_norm_var": 0.43430989583333335, + "learning_rate": 0.0003, + "loss": 11.3773, + "loss/aux_loss": 0.04808178097009659, + "loss/crossentropy": 2.7206430673599242, + "loss/logits": 0.9048705369234085, + "step": 37320 + }, + { + "epoch": 0.3733, + "grad_norm": 14.0, + "grad_norm_var": 0.294775390625, + "learning_rate": 0.0003, + "loss": 11.1414, + "loss/aux_loss": 0.04806458819657564, + "loss/crossentropy": 2.6413770437240602, + "loss/logits": 0.8203712821006774, + "step": 37330 + }, + { + "epoch": 0.3734, + "grad_norm": 14.125, + "grad_norm_var": 0.38515625, + "learning_rate": 0.0003, + "loss": 11.193, + "loss/aux_loss": 0.04807915184646845, + "loss/crossentropy": 2.800459563732147, + "loss/logits": 0.8518052160739898, + "step": 37340 + }, + { + "epoch": 0.3735, + "grad_norm": 14.3125, + "grad_norm_var": 0.5583170572916667, + "learning_rate": 0.0003, + "loss": 11.2843, + "loss/aux_loss": 0.04809236507862806, + "loss/crossentropy": 2.648477429151535, + "loss/logits": 0.8216162532567978, + "step": 37350 + }, + { + "epoch": 0.3736, + "grad_norm": 17.125, + "grad_norm_var": 1.143212890625, + "learning_rate": 0.0003, + "loss": 11.4077, + "loss/aux_loss": 0.048064970411360264, + "loss/crossentropy": 2.7521123051643372, + "loss/logits": 0.8703978210687637, + "step": 37360 + }, + { + "epoch": 0.3737, + "grad_norm": 14.25, + "grad_norm_var": 1.049462890625, + "learning_rate": 0.0003, + "loss": 11.4275, + "loss/aux_loss": 0.04807972647249699, + "loss/crossentropy": 2.737633216381073, + "loss/logits": 0.8671592533588409, + "step": 37370 + }, + { + "epoch": 0.3738, + "grad_norm": 13.0625, + "grad_norm_var": 0.2950520833333333, + "learning_rate": 0.0003, + "loss": 11.1968, + "loss/aux_loss": 0.04807094354182482, + "loss/crossentropy": 2.7768781900405886, + "loss/logits": 0.8548236817121506, + "step": 37380 + }, + { + "epoch": 0.3739, + "grad_norm": 14.0625, + "grad_norm_var": 0.4281087239583333, + "learning_rate": 0.0003, + "loss": 11.4145, + "loss/aux_loss": 0.048079249635338786, + "loss/crossentropy": 2.779543364048004, + "loss/logits": 0.82299225628376, + "step": 37390 + }, + { + "epoch": 0.374, + "grad_norm": 14.5625, + "grad_norm_var": 0.34524739583333336, + "learning_rate": 0.0003, + "loss": 11.385, + "loss/aux_loss": 0.04807200077921152, + "loss/crossentropy": 2.843193084001541, + "loss/logits": 0.8840054035186767, + "step": 37400 + }, + { + "epoch": 0.3741, + "grad_norm": 15.125, + "grad_norm_var": 0.2822265625, + "learning_rate": 0.0003, + "loss": 11.3311, + "loss/aux_loss": 0.04806772284209728, + "loss/crossentropy": 2.692880618572235, + "loss/logits": 0.827517831325531, + "step": 37410 + }, + { + "epoch": 0.3742, + "grad_norm": 13.0, + "grad_norm_var": 0.3374837239583333, + "learning_rate": 0.0003, + "loss": 11.3374, + "loss/aux_loss": 0.04807442184537649, + "loss/crossentropy": 2.8666168451309204, + "loss/logits": 0.8572170734405518, + "step": 37420 + }, + { + "epoch": 0.3743, + "grad_norm": 13.875, + "grad_norm_var": 0.5565104166666667, + "learning_rate": 0.0003, + "loss": 11.4877, + "loss/aux_loss": 0.04808829519897699, + "loss/crossentropy": 2.800356590747833, + "loss/logits": 0.8869089663028717, + "step": 37430 + }, + { + "epoch": 0.3744, + "grad_norm": 14.125, + "grad_norm_var": 0.56875, + "learning_rate": 0.0003, + "loss": 11.1438, + "loss/aux_loss": 0.04806892778724432, + "loss/crossentropy": 2.55942959189415, + "loss/logits": 0.8205576926469803, + "step": 37440 + }, + { + "epoch": 0.3745, + "grad_norm": 14.375, + "grad_norm_var": 3.4124348958333335, + "learning_rate": 0.0003, + "loss": 11.219, + "loss/aux_loss": 0.04807998221367597, + "loss/crossentropy": 2.665431547164917, + "loss/logits": 0.8188263595104217, + "step": 37450 + }, + { + "epoch": 0.3746, + "grad_norm": 13.3125, + "grad_norm_var": 0.7863932291666667, + "learning_rate": 0.0003, + "loss": 11.2692, + "loss/aux_loss": 0.04808599669486284, + "loss/crossentropy": 2.5414306223392487, + "loss/logits": 0.8146803647279739, + "step": 37460 + }, + { + "epoch": 0.3747, + "grad_norm": 13.5625, + "grad_norm_var": 0.5702473958333333, + "learning_rate": 0.0003, + "loss": 11.1549, + "loss/aux_loss": 0.04807412289083004, + "loss/crossentropy": 2.700755310058594, + "loss/logits": 0.8417092651128769, + "step": 37470 + }, + { + "epoch": 0.3748, + "grad_norm": 13.4375, + "grad_norm_var": 0.292041015625, + "learning_rate": 0.0003, + "loss": 11.2612, + "loss/aux_loss": 0.04807847496122122, + "loss/crossentropy": 2.6125539779663085, + "loss/logits": 0.8809157848358155, + "step": 37480 + }, + { + "epoch": 0.3749, + "grad_norm": 13.8125, + "grad_norm_var": 0.49264322916666664, + "learning_rate": 0.0003, + "loss": 11.1776, + "loss/aux_loss": 0.0480781901627779, + "loss/crossentropy": 2.731597048044205, + "loss/logits": 0.8342852920293808, + "step": 37490 + }, + { + "epoch": 0.375, + "grad_norm": 13.375, + "grad_norm_var": 0.9214680989583334, + "learning_rate": 0.0003, + "loss": 11.1111, + "loss/aux_loss": 0.04808135274797678, + "loss/crossentropy": 2.7630858182907105, + "loss/logits": 0.8406100690364837, + "step": 37500 + }, + { + "epoch": 0.3751, + "grad_norm": 13.5625, + "grad_norm_var": 0.7954264322916667, + "learning_rate": 0.0003, + "loss": 11.2801, + "loss/aux_loss": 0.04807340987026691, + "loss/crossentropy": 2.808059513568878, + "loss/logits": 0.8589271575212478, + "step": 37510 + }, + { + "epoch": 0.3752, + "grad_norm": 14.0625, + "grad_norm_var": 0.18409830729166668, + "learning_rate": 0.0003, + "loss": 11.3453, + "loss/aux_loss": 0.04808524418622255, + "loss/crossentropy": 2.832816928625107, + "loss/logits": 0.87020343542099, + "step": 37520 + }, + { + "epoch": 0.3753, + "grad_norm": 12.6875, + "grad_norm_var": 0.19895833333333332, + "learning_rate": 0.0003, + "loss": 11.2984, + "loss/aux_loss": 0.0480803806334734, + "loss/crossentropy": 2.7448639810085296, + "loss/logits": 0.8650804668664932, + "step": 37530 + }, + { + "epoch": 0.3754, + "grad_norm": 13.0625, + "grad_norm_var": 0.29713541666666665, + "learning_rate": 0.0003, + "loss": 11.2711, + "loss/aux_loss": 0.04808024540543556, + "loss/crossentropy": 2.723136955499649, + "loss/logits": 0.8429204732179642, + "step": 37540 + }, + { + "epoch": 0.3755, + "grad_norm": 13.125, + "grad_norm_var": 0.4122233072916667, + "learning_rate": 0.0003, + "loss": 11.3523, + "loss/aux_loss": 0.04806947018951178, + "loss/crossentropy": 2.7606225490570067, + "loss/logits": 0.8683151304721832, + "step": 37550 + }, + { + "epoch": 0.3756, + "grad_norm": 12.8125, + "grad_norm_var": 2.58828125, + "learning_rate": 0.0003, + "loss": 11.3345, + "loss/aux_loss": 0.048083030991256236, + "loss/crossentropy": 2.5333042323589323, + "loss/logits": 0.8388000845909118, + "step": 37560 + }, + { + "epoch": 0.3757, + "grad_norm": 18.375, + "grad_norm_var": 295.15208333333334, + "learning_rate": 0.0003, + "loss": 11.4007, + "loss/aux_loss": 0.04808737169951201, + "loss/crossentropy": 2.724211460351944, + "loss/logits": 0.8297827035188675, + "step": 37570 + }, + { + "epoch": 0.3758, + "grad_norm": 13.9375, + "grad_norm_var": 292.0377604166667, + "learning_rate": 0.0003, + "loss": 11.2395, + "loss/aux_loss": 0.048087266832590105, + "loss/crossentropy": 2.724383169412613, + "loss/logits": 0.8737955868244172, + "step": 37580 + }, + { + "epoch": 0.3759, + "grad_norm": 14.0, + "grad_norm_var": 0.43865559895833334, + "learning_rate": 0.0003, + "loss": 11.2987, + "loss/aux_loss": 0.04807658027857542, + "loss/crossentropy": 2.810201585292816, + "loss/logits": 0.8624020755290985, + "step": 37590 + }, + { + "epoch": 0.376, + "grad_norm": 13.0625, + "grad_norm_var": 0.5176920572916667, + "learning_rate": 0.0003, + "loss": 11.3521, + "loss/aux_loss": 0.048078496009111404, + "loss/crossentropy": 2.75020290017128, + "loss/logits": 0.8415679961442948, + "step": 37600 + }, + { + "epoch": 0.3761, + "grad_norm": 14.875, + "grad_norm_var": 0.5782389322916667, + "learning_rate": 0.0003, + "loss": 11.1843, + "loss/aux_loss": 0.04807473961263895, + "loss/crossentropy": 2.7164148449897767, + "loss/logits": 0.8833770871162414, + "step": 37610 + }, + { + "epoch": 0.3762, + "grad_norm": 15.375, + "grad_norm_var": 0.7687337239583333, + "learning_rate": 0.0003, + "loss": 11.2037, + "loss/aux_loss": 0.04808453526347876, + "loss/crossentropy": 2.697547745704651, + "loss/logits": 0.8366124957799912, + "step": 37620 + }, + { + "epoch": 0.3763, + "grad_norm": 14.0625, + "grad_norm_var": 0.7102701822916667, + "learning_rate": 0.0003, + "loss": 11.4474, + "loss/aux_loss": 0.04807381797581911, + "loss/crossentropy": 2.698218834400177, + "loss/logits": 0.8631418794393539, + "step": 37630 + }, + { + "epoch": 0.3764, + "grad_norm": 13.625, + "grad_norm_var": 0.265087890625, + "learning_rate": 0.0003, + "loss": 11.2557, + "loss/aux_loss": 0.04807877913117409, + "loss/crossentropy": 2.5806887984275817, + "loss/logits": 0.8300289899110794, + "step": 37640 + }, + { + "epoch": 0.3765, + "grad_norm": 13.625, + "grad_norm_var": 0.4354166666666667, + "learning_rate": 0.0003, + "loss": 11.2533, + "loss/aux_loss": 0.048092398792505264, + "loss/crossentropy": 2.6587139785289766, + "loss/logits": 0.8522565513849258, + "step": 37650 + }, + { + "epoch": 0.3766, + "grad_norm": 14.6875, + "grad_norm_var": 23.364957682291667, + "learning_rate": 0.0003, + "loss": 11.3313, + "loss/aux_loss": 0.048093941807746884, + "loss/crossentropy": 2.806702709197998, + "loss/logits": 0.8423727869987487, + "step": 37660 + }, + { + "epoch": 0.3767, + "grad_norm": 17.75, + "grad_norm_var": 442.132275390625, + "learning_rate": 0.0003, + "loss": 11.307, + "loss/aux_loss": 0.04808268621563912, + "loss/crossentropy": 2.6064475953578947, + "loss/logits": 0.8261544018983841, + "step": 37670 + }, + { + "epoch": 0.3768, + "grad_norm": 15.875, + "grad_norm_var": 14.4119140625, + "learning_rate": 0.0003, + "loss": 11.3099, + "loss/aux_loss": 0.04807278923690319, + "loss/crossentropy": 2.6616617262363436, + "loss/logits": 0.8287598133087158, + "step": 37680 + }, + { + "epoch": 0.3769, + "grad_norm": 15.1875, + "grad_norm_var": 2.6393229166666665, + "learning_rate": 0.0003, + "loss": 11.2747, + "loss/aux_loss": 0.04807351883500814, + "loss/crossentropy": 2.8991053104400635, + "loss/logits": 0.8774980515241623, + "step": 37690 + }, + { + "epoch": 0.377, + "grad_norm": 13.5, + "grad_norm_var": 1.1721354166666667, + "learning_rate": 0.0003, + "loss": 11.3018, + "loss/aux_loss": 0.04807792901992798, + "loss/crossentropy": 2.793347454071045, + "loss/logits": 0.8624837636947632, + "step": 37700 + }, + { + "epoch": 0.3771, + "grad_norm": 13.5625, + "grad_norm_var": 0.9067057291666667, + "learning_rate": 0.0003, + "loss": 11.1449, + "loss/aux_loss": 0.04807579685002565, + "loss/crossentropy": 2.564465194940567, + "loss/logits": 0.8291731148958206, + "step": 37710 + }, + { + "epoch": 0.3772, + "grad_norm": 12.1875, + "grad_norm_var": 0.49733072916666665, + "learning_rate": 0.0003, + "loss": 11.1825, + "loss/aux_loss": 0.0480663126334548, + "loss/crossentropy": 2.8131748914718626, + "loss/logits": 0.8564931780099869, + "step": 37720 + }, + { + "epoch": 0.3773, + "grad_norm": 14.3125, + "grad_norm_var": 1.3179524739583333, + "learning_rate": 0.0003, + "loss": 11.1927, + "loss/aux_loss": 0.04809189885854721, + "loss/crossentropy": 2.8035511016845702, + "loss/logits": 0.8402773588895798, + "step": 37730 + }, + { + "epoch": 0.3774, + "grad_norm": 13.375, + "grad_norm_var": 1.3494140625, + "learning_rate": 0.0003, + "loss": 11.2049, + "loss/aux_loss": 0.048071368038654326, + "loss/crossentropy": 2.788879954814911, + "loss/logits": 0.8656217336654664, + "step": 37740 + }, + { + "epoch": 0.3775, + "grad_norm": 13.1875, + "grad_norm_var": 0.861962890625, + "learning_rate": 0.0003, + "loss": 11.0996, + "loss/aux_loss": 0.04808103609830141, + "loss/crossentropy": 2.627894651889801, + "loss/logits": 0.8183565735816956, + "step": 37750 + }, + { + "epoch": 0.3776, + "grad_norm": 14.4375, + "grad_norm_var": 1.5488932291666666, + "learning_rate": 0.0003, + "loss": 11.2273, + "loss/aux_loss": 0.048079018481075764, + "loss/crossentropy": 2.7429580628871917, + "loss/logits": 0.8102349221706391, + "step": 37760 + }, + { + "epoch": 0.3777, + "grad_norm": 14.0, + "grad_norm_var": 1.4503743489583334, + "learning_rate": 0.0003, + "loss": 11.3375, + "loss/aux_loss": 0.0480774000287056, + "loss/crossentropy": 2.7857055068016052, + "loss/logits": 0.8557232707738877, + "step": 37770 + }, + { + "epoch": 0.3778, + "grad_norm": 14.4375, + "grad_norm_var": 0.51875, + "learning_rate": 0.0003, + "loss": 11.3197, + "loss/aux_loss": 0.04807243067771196, + "loss/crossentropy": 2.889014649391174, + "loss/logits": 0.8674527406692505, + "step": 37780 + }, + { + "epoch": 0.3779, + "grad_norm": 14.0, + "grad_norm_var": 0.2916015625, + "learning_rate": 0.0003, + "loss": 11.1796, + "loss/aux_loss": 0.04807484410703182, + "loss/crossentropy": 2.7089039623737334, + "loss/logits": 0.8426949769258499, + "step": 37790 + }, + { + "epoch": 0.378, + "grad_norm": 13.5625, + "grad_norm_var": 0.39576822916666665, + "learning_rate": 0.0003, + "loss": 11.2755, + "loss/aux_loss": 0.048075446113944056, + "loss/crossentropy": 2.6602770924568175, + "loss/logits": 0.8283806025981904, + "step": 37800 + }, + { + "epoch": 0.3781, + "grad_norm": 12.75, + "grad_norm_var": 0.42962239583333334, + "learning_rate": 0.0003, + "loss": 11.205, + "loss/aux_loss": 0.04808063935488462, + "loss/crossentropy": 2.594762307405472, + "loss/logits": 0.8218467265367508, + "step": 37810 + }, + { + "epoch": 0.3782, + "grad_norm": 13.6875, + "grad_norm_var": 0.49933268229166666, + "learning_rate": 0.0003, + "loss": 11.1313, + "loss/aux_loss": 0.048066642694175245, + "loss/crossentropy": 2.5958735227584837, + "loss/logits": 0.8756007015705108, + "step": 37820 + }, + { + "epoch": 0.3783, + "grad_norm": 17.625, + "grad_norm_var": 1.3032389322916667, + "learning_rate": 0.0003, + "loss": 11.283, + "loss/aux_loss": 0.048081421107053754, + "loss/crossentropy": 2.6480916321277617, + "loss/logits": 0.8332589745521546, + "step": 37830 + }, + { + "epoch": 0.3784, + "grad_norm": 15.125, + "grad_norm_var": 1.1004557291666666, + "learning_rate": 0.0003, + "loss": 11.3192, + "loss/aux_loss": 0.048070864751935005, + "loss/crossentropy": 2.741181659698486, + "loss/logits": 0.8588764518499374, + "step": 37840 + }, + { + "epoch": 0.3785, + "grad_norm": 16.25, + "grad_norm_var": 0.9374348958333333, + "learning_rate": 0.0003, + "loss": 11.4032, + "loss/aux_loss": 0.048078233189880845, + "loss/crossentropy": 2.9572018921375274, + "loss/logits": 0.8699509769678115, + "step": 37850 + }, + { + "epoch": 0.3786, + "grad_norm": 13.5625, + "grad_norm_var": 0.9024576822916667, + "learning_rate": 0.0003, + "loss": 11.2661, + "loss/aux_loss": 0.048085720464587214, + "loss/crossentropy": 2.580213463306427, + "loss/logits": 0.7983238309621811, + "step": 37860 + }, + { + "epoch": 0.3787, + "grad_norm": 14.125, + "grad_norm_var": 0.342041015625, + "learning_rate": 0.0003, + "loss": 11.4754, + "loss/aux_loss": 0.04807271733880043, + "loss/crossentropy": 2.709409844875336, + "loss/logits": 0.8600565820932389, + "step": 37870 + }, + { + "epoch": 0.3788, + "grad_norm": 13.9375, + "grad_norm_var": 0.4671223958333333, + "learning_rate": 0.0003, + "loss": 11.281, + "loss/aux_loss": 0.048080405406653884, + "loss/crossentropy": 2.82181898355484, + "loss/logits": 0.8688194662332535, + "step": 37880 + }, + { + "epoch": 0.3789, + "grad_norm": 14.25, + "grad_norm_var": 1.0020182291666666, + "learning_rate": 0.0003, + "loss": 11.2233, + "loss/aux_loss": 0.04807996470481157, + "loss/crossentropy": 2.773501121997833, + "loss/logits": 0.8502937823534011, + "step": 37890 + }, + { + "epoch": 0.379, + "grad_norm": 13.875, + "grad_norm_var": 0.25467122395833336, + "learning_rate": 0.0003, + "loss": 11.2839, + "loss/aux_loss": 0.048080760054290295, + "loss/crossentropy": 2.6579030215740205, + "loss/logits": 0.8489834278821945, + "step": 37900 + }, + { + "epoch": 0.3791, + "grad_norm": 13.9375, + "grad_norm_var": 1.21328125, + "learning_rate": 0.0003, + "loss": 11.2377, + "loss/aux_loss": 0.04807602297514677, + "loss/crossentropy": 2.6808858156204223, + "loss/logits": 0.85300872027874, + "step": 37910 + }, + { + "epoch": 0.3792, + "grad_norm": 13.125, + "grad_norm_var": 1.8402180989583334, + "learning_rate": 0.0003, + "loss": 11.2778, + "loss/aux_loss": 0.04807765781879425, + "loss/crossentropy": 2.6116097033023835, + "loss/logits": 0.8364583939313889, + "step": 37920 + }, + { + "epoch": 0.3793, + "grad_norm": 13.0, + "grad_norm_var": 1.376025390625, + "learning_rate": 0.0003, + "loss": 11.3479, + "loss/aux_loss": 0.04807660095393658, + "loss/crossentropy": 2.7459771037101746, + "loss/logits": 0.8774673551321029, + "step": 37930 + }, + { + "epoch": 0.3794, + "grad_norm": 14.3125, + "grad_norm_var": 1.4880208333333333, + "learning_rate": 0.0003, + "loss": 11.1755, + "loss/aux_loss": 0.0480880094692111, + "loss/crossentropy": 2.7383559942245483, + "loss/logits": 0.8331858664751053, + "step": 37940 + }, + { + "epoch": 0.3795, + "grad_norm": 14.6875, + "grad_norm_var": 1.3161458333333333, + "learning_rate": 0.0003, + "loss": 11.2232, + "loss/aux_loss": 0.04807455353438854, + "loss/crossentropy": 2.7862467050552366, + "loss/logits": 0.8778936117887497, + "step": 37950 + }, + { + "epoch": 0.3796, + "grad_norm": 14.0625, + "grad_norm_var": 0.651025390625, + "learning_rate": 0.0003, + "loss": 11.2377, + "loss/aux_loss": 0.04807548765093088, + "loss/crossentropy": 2.6477761268615723, + "loss/logits": 0.8515429794788361, + "step": 37960 + }, + { + "epoch": 0.3797, + "grad_norm": 14.875, + "grad_norm_var": 0.5752604166666667, + "learning_rate": 0.0003, + "loss": 11.5044, + "loss/aux_loss": 0.04807682652026415, + "loss/crossentropy": 2.8267282128334044, + "loss/logits": 0.8833474934101104, + "step": 37970 + }, + { + "epoch": 0.3798, + "grad_norm": 13.625, + "grad_norm_var": 0.4791666666666667, + "learning_rate": 0.0003, + "loss": 11.1736, + "loss/aux_loss": 0.048078049533069135, + "loss/crossentropy": 2.788653367757797, + "loss/logits": 0.8569782227277756, + "step": 37980 + }, + { + "epoch": 0.3799, + "grad_norm": 12.8125, + "grad_norm_var": 0.4383951822916667, + "learning_rate": 0.0003, + "loss": 11.267, + "loss/aux_loss": 0.04807330220937729, + "loss/crossentropy": 2.745168626308441, + "loss/logits": 0.8513006120920181, + "step": 37990 + }, + { + "epoch": 0.38, + "grad_norm": 14.25, + "grad_norm_var": 0.7040201822916666, + "learning_rate": 0.0003, + "loss": 11.312, + "loss/aux_loss": 0.048071262612938884, + "loss/crossentropy": 2.828562021255493, + "loss/logits": 0.8815089613199234, + "step": 38000 + }, + { + "epoch": 0.3801, + "grad_norm": 13.5625, + "grad_norm_var": 0.5681640625, + "learning_rate": 0.0003, + "loss": 11.2651, + "loss/aux_loss": 0.04807847216725349, + "loss/crossentropy": 2.6953054130077363, + "loss/logits": 0.841254535317421, + "step": 38010 + }, + { + "epoch": 0.3802, + "grad_norm": 13.75, + "grad_norm_var": 0.350634765625, + "learning_rate": 0.0003, + "loss": 11.2093, + "loss/aux_loss": 0.04808027595281601, + "loss/crossentropy": 2.7304549276828767, + "loss/logits": 0.8305320262908935, + "step": 38020 + }, + { + "epoch": 0.3803, + "grad_norm": 12.4375, + "grad_norm_var": 0.2353515625, + "learning_rate": 0.0003, + "loss": 11.2175, + "loss/aux_loss": 0.04807380642741919, + "loss/crossentropy": 2.672012412548065, + "loss/logits": 0.8408935517072678, + "step": 38030 + }, + { + "epoch": 0.3804, + "grad_norm": 14.125, + "grad_norm_var": 0.23748372395833334, + "learning_rate": 0.0003, + "loss": 11.3549, + "loss/aux_loss": 0.04807410296052694, + "loss/crossentropy": 2.8609830141067505, + "loss/logits": 0.8738586813211441, + "step": 38040 + }, + { + "epoch": 0.3805, + "grad_norm": 14.5, + "grad_norm_var": 1.24921875, + "learning_rate": 0.0003, + "loss": 11.2491, + "loss/aux_loss": 0.048076963238418105, + "loss/crossentropy": 2.8962836384773256, + "loss/logits": 0.8688966006040573, + "step": 38050 + }, + { + "epoch": 0.3806, + "grad_norm": 14.0625, + "grad_norm_var": 0.5122395833333333, + "learning_rate": 0.0003, + "loss": 11.361, + "loss/aux_loss": 0.04807176198810339, + "loss/crossentropy": 2.8302778005599976, + "loss/logits": 0.8801500231027604, + "step": 38060 + }, + { + "epoch": 0.3807, + "grad_norm": 14.125, + "grad_norm_var": 0.35480143229166666, + "learning_rate": 0.0003, + "loss": 11.2694, + "loss/aux_loss": 0.048076769523322585, + "loss/crossentropy": 2.6821465611457826, + "loss/logits": 0.8561849266290664, + "step": 38070 + }, + { + "epoch": 0.3808, + "grad_norm": 14.9375, + "grad_norm_var": 0.4009765625, + "learning_rate": 0.0003, + "loss": 11.2842, + "loss/aux_loss": 0.04807623084634542, + "loss/crossentropy": 2.868058133125305, + "loss/logits": 0.8749160617589951, + "step": 38080 + }, + { + "epoch": 0.3809, + "grad_norm": 14.625, + "grad_norm_var": 0.6916015625, + "learning_rate": 0.0003, + "loss": 11.2862, + "loss/aux_loss": 0.0480802733451128, + "loss/crossentropy": 2.762987458705902, + "loss/logits": 0.8203055411577225, + "step": 38090 + }, + { + "epoch": 0.381, + "grad_norm": 14.75, + "grad_norm_var": 0.3472493489583333, + "learning_rate": 0.0003, + "loss": 11.267, + "loss/aux_loss": 0.0480813367292285, + "loss/crossentropy": 2.5167156994342803, + "loss/logits": 0.8391987591981888, + "step": 38100 + }, + { + "epoch": 0.3811, + "grad_norm": 14.5, + "grad_norm_var": 0.3890462239583333, + "learning_rate": 0.0003, + "loss": 11.3582, + "loss/aux_loss": 0.04806756749749184, + "loss/crossentropy": 2.9812386274337768, + "loss/logits": 0.8828330308198928, + "step": 38110 + }, + { + "epoch": 0.3812, + "grad_norm": 15.375, + "grad_norm_var": 0.619775390625, + "learning_rate": 0.0003, + "loss": 11.2353, + "loss/aux_loss": 0.048090960085392, + "loss/crossentropy": 2.782704734802246, + "loss/logits": 0.8270899027585983, + "step": 38120 + }, + { + "epoch": 0.3813, + "grad_norm": 12.9375, + "grad_norm_var": 0.73515625, + "learning_rate": 0.0003, + "loss": 11.2608, + "loss/aux_loss": 0.048075029626488686, + "loss/crossentropy": 2.669895362854004, + "loss/logits": 0.8388209640979767, + "step": 38130 + }, + { + "epoch": 0.3814, + "grad_norm": 13.3125, + "grad_norm_var": 0.5979166666666667, + "learning_rate": 0.0003, + "loss": 11.3802, + "loss/aux_loss": 0.048083779774606226, + "loss/crossentropy": 2.775761139392853, + "loss/logits": 0.8643197298049927, + "step": 38140 + }, + { + "epoch": 0.3815, + "grad_norm": 12.9375, + "grad_norm_var": 0.6613932291666667, + "learning_rate": 0.0003, + "loss": 11.0835, + "loss/aux_loss": 0.04807844534516335, + "loss/crossentropy": 2.769335401058197, + "loss/logits": 0.822916254401207, + "step": 38150 + }, + { + "epoch": 0.3816, + "grad_norm": 12.8125, + "grad_norm_var": 0.9258951822916667, + "learning_rate": 0.0003, + "loss": 11.2522, + "loss/aux_loss": 0.0480781301856041, + "loss/crossentropy": 2.5931221723556517, + "loss/logits": 0.8241954296827316, + "step": 38160 + }, + { + "epoch": 0.3817, + "grad_norm": 13.5, + "grad_norm_var": 0.6152180989583333, + "learning_rate": 0.0003, + "loss": 11.2332, + "loss/aux_loss": 0.048073183931410315, + "loss/crossentropy": 2.7164094507694245, + "loss/logits": 0.8017847687005997, + "step": 38170 + }, + { + "epoch": 0.3818, + "grad_norm": 13.9375, + "grad_norm_var": 0.450244140625, + "learning_rate": 0.0003, + "loss": 11.2502, + "loss/aux_loss": 0.04807348102331162, + "loss/crossentropy": 2.7940221190452577, + "loss/logits": 0.8343671351671219, + "step": 38180 + }, + { + "epoch": 0.3819, + "grad_norm": 13.0625, + "grad_norm_var": 0.46608072916666665, + "learning_rate": 0.0003, + "loss": 11.1612, + "loss/aux_loss": 0.04808283261954784, + "loss/crossentropy": 2.659856015443802, + "loss/logits": 0.80497907102108, + "step": 38190 + }, + { + "epoch": 0.382, + "grad_norm": 12.875, + "grad_norm_var": 0.581884765625, + "learning_rate": 0.0003, + "loss": 11.3037, + "loss/aux_loss": 0.04806670006364584, + "loss/crossentropy": 2.79791459441185, + "loss/logits": 0.8519628554582596, + "step": 38200 + }, + { + "epoch": 0.3821, + "grad_norm": 13.25, + "grad_norm_var": 1.0557291666666666, + "learning_rate": 0.0003, + "loss": 11.0594, + "loss/aux_loss": 0.04808285720646381, + "loss/crossentropy": 2.640339195728302, + "loss/logits": 0.7928971499204636, + "step": 38210 + }, + { + "epoch": 0.3822, + "grad_norm": 12.9375, + "grad_norm_var": 0.9680826822916667, + "learning_rate": 0.0003, + "loss": 11.2579, + "loss/aux_loss": 0.048081761412322524, + "loss/crossentropy": 2.681269496679306, + "loss/logits": 0.8393559873104095, + "step": 38220 + }, + { + "epoch": 0.3823, + "grad_norm": 14.0625, + "grad_norm_var": 0.4390625, + "learning_rate": 0.0003, + "loss": 11.2653, + "loss/aux_loss": 0.04807662703096867, + "loss/crossentropy": 2.575548267364502, + "loss/logits": 0.8309787482023239, + "step": 38230 + }, + { + "epoch": 0.3824, + "grad_norm": 14.5625, + "grad_norm_var": 0.42967122395833335, + "learning_rate": 0.0003, + "loss": 11.2071, + "loss/aux_loss": 0.04807767011225224, + "loss/crossentropy": 2.598121851682663, + "loss/logits": 0.8631124287843704, + "step": 38240 + }, + { + "epoch": 0.3825, + "grad_norm": 14.0, + "grad_norm_var": 0.17550455729166667, + "learning_rate": 0.0003, + "loss": 11.1865, + "loss/aux_loss": 0.04808439090847969, + "loss/crossentropy": 2.6111572325229644, + "loss/logits": 0.7978465467691421, + "step": 38250 + }, + { + "epoch": 0.3826, + "grad_norm": 13.6875, + "grad_norm_var": 0.5291015625, + "learning_rate": 0.0003, + "loss": 11.3335, + "loss/aux_loss": 0.048076186701655386, + "loss/crossentropy": 2.824894219636917, + "loss/logits": 0.8665509730577469, + "step": 38260 + }, + { + "epoch": 0.3827, + "grad_norm": 14.0625, + "grad_norm_var": 0.43826497395833336, + "learning_rate": 0.0003, + "loss": 11.3737, + "loss/aux_loss": 0.04807574283331632, + "loss/crossentropy": 2.7032552480697634, + "loss/logits": 0.8640910536050797, + "step": 38270 + }, + { + "epoch": 0.3828, + "grad_norm": 14.0625, + "grad_norm_var": 1.1030598958333333, + "learning_rate": 0.0003, + "loss": 11.3598, + "loss/aux_loss": 0.04807320572435856, + "loss/crossentropy": 2.7154432415962217, + "loss/logits": 0.868793374300003, + "step": 38280 + }, + { + "epoch": 0.3829, + "grad_norm": 14.3125, + "grad_norm_var": 0.3275390625, + "learning_rate": 0.0003, + "loss": 11.1598, + "loss/aux_loss": 0.04809410870075226, + "loss/crossentropy": 2.817612624168396, + "loss/logits": 0.863050663471222, + "step": 38290 + }, + { + "epoch": 0.383, + "grad_norm": 14.9375, + "grad_norm_var": 0.40675455729166665, + "learning_rate": 0.0003, + "loss": 11.2505, + "loss/aux_loss": 0.04806933347135782, + "loss/crossentropy": 2.6598378300666807, + "loss/logits": 0.8606748700141906, + "step": 38300 + }, + { + "epoch": 0.3831, + "grad_norm": 13.25, + "grad_norm_var": 13.406705729166667, + "learning_rate": 0.0003, + "loss": 11.2905, + "loss/aux_loss": 0.04808672070503235, + "loss/crossentropy": 2.607471966743469, + "loss/logits": 0.8459422647953033, + "step": 38310 + }, + { + "epoch": 0.3832, + "grad_norm": 13.375, + "grad_norm_var": 13.0916015625, + "learning_rate": 0.0003, + "loss": 11.3664, + "loss/aux_loss": 0.04807449951767921, + "loss/crossentropy": 2.73186194896698, + "loss/logits": 0.8628045409917832, + "step": 38320 + }, + { + "epoch": 0.3833, + "grad_norm": 15.3125, + "grad_norm_var": 3.707145182291667, + "learning_rate": 0.0003, + "loss": 11.5162, + "loss/aux_loss": 0.04808128159493208, + "loss/crossentropy": 2.7736764550209045, + "loss/logits": 0.8548513650894165, + "step": 38330 + }, + { + "epoch": 0.3834, + "grad_norm": 14.625, + "grad_norm_var": 0.7597493489583333, + "learning_rate": 0.0003, + "loss": 11.3973, + "loss/aux_loss": 0.0480838356539607, + "loss/crossentropy": 2.8341826438903808, + "loss/logits": 0.8437968879938126, + "step": 38340 + }, + { + "epoch": 0.3835, + "grad_norm": 16.625, + "grad_norm_var": 0.7883951822916667, + "learning_rate": 0.0003, + "loss": 11.2896, + "loss/aux_loss": 0.04808208886533975, + "loss/crossentropy": 2.6580508768558504, + "loss/logits": 0.8282568514347076, + "step": 38350 + }, + { + "epoch": 0.3836, + "grad_norm": 13.125, + "grad_norm_var": 0.77578125, + "learning_rate": 0.0003, + "loss": 11.4887, + "loss/aux_loss": 0.04807929620146752, + "loss/crossentropy": 2.7363623082637787, + "loss/logits": 0.8577351301908493, + "step": 38360 + }, + { + "epoch": 0.3837, + "grad_norm": 13.6875, + "grad_norm_var": 0.44244791666666666, + "learning_rate": 0.0003, + "loss": 11.4202, + "loss/aux_loss": 0.04807289559394121, + "loss/crossentropy": 2.8419145464897158, + "loss/logits": 0.8924077719449997, + "step": 38370 + }, + { + "epoch": 0.3838, + "grad_norm": 13.9375, + "grad_norm_var": 1.4704264322916667, + "learning_rate": 0.0003, + "loss": 11.2463, + "loss/aux_loss": 0.0480803145095706, + "loss/crossentropy": 2.823737806081772, + "loss/logits": 0.8809779584407806, + "step": 38380 + }, + { + "epoch": 0.3839, + "grad_norm": 13.9375, + "grad_norm_var": 1.5296875, + "learning_rate": 0.0003, + "loss": 11.3235, + "loss/aux_loss": 0.04806930013000965, + "loss/crossentropy": 2.7565455436706543, + "loss/logits": 0.8279327541589737, + "step": 38390 + }, + { + "epoch": 0.384, + "grad_norm": 13.6875, + "grad_norm_var": 0.4903483072916667, + "learning_rate": 0.0003, + "loss": 11.0918, + "loss/aux_loss": 0.04807766154408455, + "loss/crossentropy": 2.6135290563106537, + "loss/logits": 0.8393938690423965, + "step": 38400 + }, + { + "epoch": 0.3841, + "grad_norm": 33.0, + "grad_norm_var": 24.82421875, + "learning_rate": 0.0003, + "loss": 11.1982, + "loss/aux_loss": 0.04807633478194475, + "loss/crossentropy": 2.6710329234600065, + "loss/logits": 0.8311236262321472, + "step": 38410 + }, + { + "epoch": 0.3842, + "grad_norm": 14.9375, + "grad_norm_var": 22.696875, + "learning_rate": 0.0003, + "loss": 11.3847, + "loss/aux_loss": 0.048079009726643564, + "loss/crossentropy": 2.6408372461795806, + "loss/logits": 0.8508718222379684, + "step": 38420 + }, + { + "epoch": 0.3843, + "grad_norm": 14.5, + "grad_norm_var": 0.7051432291666667, + "learning_rate": 0.0003, + "loss": 11.3346, + "loss/aux_loss": 0.04808001890778542, + "loss/crossentropy": 2.7175046026706697, + "loss/logits": 0.8537416934967041, + "step": 38430 + }, + { + "epoch": 0.3844, + "grad_norm": 13.4375, + "grad_norm_var": 0.5770833333333333, + "learning_rate": 0.0003, + "loss": 11.2761, + "loss/aux_loss": 0.048079208470880985, + "loss/crossentropy": 2.7895686745643617, + "loss/logits": 0.8444579422473908, + "step": 38440 + }, + { + "epoch": 0.3845, + "grad_norm": 14.4375, + "grad_norm_var": 0.8260416666666667, + "learning_rate": 0.0003, + "loss": 11.2037, + "loss/aux_loss": 0.04807562418282032, + "loss/crossentropy": 2.6479432761669157, + "loss/logits": 0.839043453335762, + "step": 38450 + }, + { + "epoch": 0.3846, + "grad_norm": 14.125, + "grad_norm_var": 0.5632649739583333, + "learning_rate": 0.0003, + "loss": 11.2972, + "loss/aux_loss": 0.04808216225355864, + "loss/crossentropy": 2.7740365862846375, + "loss/logits": 0.8312047332525253, + "step": 38460 + }, + { + "epoch": 0.3847, + "grad_norm": 14.0, + "grad_norm_var": 0.5236979166666667, + "learning_rate": 0.0003, + "loss": 11.2348, + "loss/aux_loss": 0.04807409662753344, + "loss/crossentropy": 2.7867905139923095, + "loss/logits": 0.8606914162635804, + "step": 38470 + }, + { + "epoch": 0.3848, + "grad_norm": 13.625, + "grad_norm_var": 0.8204264322916667, + "learning_rate": 0.0003, + "loss": 11.1963, + "loss/aux_loss": 0.04807875119149685, + "loss/crossentropy": 2.7122581124305727, + "loss/logits": 0.8468606352806092, + "step": 38480 + }, + { + "epoch": 0.3849, + "grad_norm": 14.25, + "grad_norm_var": 0.9686848958333333, + "learning_rate": 0.0003, + "loss": 11.2863, + "loss/aux_loss": 0.04807748645544052, + "loss/crossentropy": 2.779924100637436, + "loss/logits": 0.8382695466279984, + "step": 38490 + }, + { + "epoch": 0.385, + "grad_norm": 12.8125, + "grad_norm_var": 0.420556640625, + "learning_rate": 0.0003, + "loss": 11.3427, + "loss/aux_loss": 0.0480777844786644, + "loss/crossentropy": 2.725245749950409, + "loss/logits": 0.8357455193996429, + "step": 38500 + }, + { + "epoch": 0.3851, + "grad_norm": 14.9375, + "grad_norm_var": 0.518994140625, + "learning_rate": 0.0003, + "loss": 11.3015, + "loss/aux_loss": 0.04808611571788788, + "loss/crossentropy": 2.803503179550171, + "loss/logits": 0.8664654195308685, + "step": 38510 + }, + { + "epoch": 0.3852, + "grad_norm": 12.9375, + "grad_norm_var": 0.5083170572916667, + "learning_rate": 0.0003, + "loss": 11.0804, + "loss/aux_loss": 0.048075488209724425, + "loss/crossentropy": 2.8052771151065827, + "loss/logits": 0.8602871984243393, + "step": 38520 + }, + { + "epoch": 0.3853, + "grad_norm": 15.0625, + "grad_norm_var": 0.361181640625, + "learning_rate": 0.0003, + "loss": 11.2198, + "loss/aux_loss": 0.04808064680546522, + "loss/crossentropy": 2.808874398469925, + "loss/logits": 0.8875895857810974, + "step": 38530 + }, + { + "epoch": 0.3854, + "grad_norm": 13.25, + "grad_norm_var": 0.572900390625, + "learning_rate": 0.0003, + "loss": 11.2937, + "loss/aux_loss": 0.048069310747087, + "loss/crossentropy": 2.7475598096847533, + "loss/logits": 0.8482136219739914, + "step": 38540 + }, + { + "epoch": 0.3855, + "grad_norm": 14.1875, + "grad_norm_var": 0.7632649739583334, + "learning_rate": 0.0003, + "loss": 11.2942, + "loss/aux_loss": 0.04807958360761404, + "loss/crossentropy": 2.6134680569171906, + "loss/logits": 0.8251173198223114, + "step": 38550 + }, + { + "epoch": 0.3856, + "grad_norm": 13.6875, + "grad_norm_var": 0.48587239583333336, + "learning_rate": 0.0003, + "loss": 11.2606, + "loss/aux_loss": 0.048083121702075, + "loss/crossentropy": 2.7685590624809264, + "loss/logits": 0.852242037653923, + "step": 38560 + }, + { + "epoch": 0.3857, + "grad_norm": 13.8125, + "grad_norm_var": 0.13802083333333334, + "learning_rate": 0.0003, + "loss": 11.3004, + "loss/aux_loss": 0.04806161895394325, + "loss/crossentropy": 2.70892972946167, + "loss/logits": 0.8436130315065384, + "step": 38570 + }, + { + "epoch": 0.3858, + "grad_norm": 15.0, + "grad_norm_var": 0.8285807291666667, + "learning_rate": 0.0003, + "loss": 11.36, + "loss/aux_loss": 0.04808020200580358, + "loss/crossentropy": 2.798497807979584, + "loss/logits": 0.8555373579263688, + "step": 38580 + }, + { + "epoch": 0.3859, + "grad_norm": 14.625, + "grad_norm_var": 0.5480305989583333, + "learning_rate": 0.0003, + "loss": 11.1952, + "loss/aux_loss": 0.04807691927999258, + "loss/crossentropy": 2.627361184358597, + "loss/logits": 0.854537034034729, + "step": 38590 + }, + { + "epoch": 0.386, + "grad_norm": 13.75, + "grad_norm_var": 0.245556640625, + "learning_rate": 0.0003, + "loss": 11.1977, + "loss/aux_loss": 0.04807463120669127, + "loss/crossentropy": 2.816139954328537, + "loss/logits": 0.8892535716295242, + "step": 38600 + }, + { + "epoch": 0.3861, + "grad_norm": 13.9375, + "grad_norm_var": 0.33670247395833336, + "learning_rate": 0.0003, + "loss": 11.2819, + "loss/aux_loss": 0.04806935954838991, + "loss/crossentropy": 2.7068843841552734, + "loss/logits": 0.8351662307977676, + "step": 38610 + }, + { + "epoch": 0.3862, + "grad_norm": 15.3125, + "grad_norm_var": 0.5954264322916667, + "learning_rate": 0.0003, + "loss": 11.3311, + "loss/aux_loss": 0.048080881126224995, + "loss/crossentropy": 2.768476206064224, + "loss/logits": 0.8442646831274032, + "step": 38620 + }, + { + "epoch": 0.3863, + "grad_norm": 13.625, + "grad_norm_var": 0.31756184895833334, + "learning_rate": 0.0003, + "loss": 11.1192, + "loss/aux_loss": 0.04808139074593783, + "loss/crossentropy": 2.7400481700897217, + "loss/logits": 0.8542584419250489, + "step": 38630 + }, + { + "epoch": 0.3864, + "grad_norm": 12.9375, + "grad_norm_var": 13.647395833333333, + "learning_rate": 0.0003, + "loss": 11.0828, + "loss/aux_loss": 0.04807969853281975, + "loss/crossentropy": 2.630267012119293, + "loss/logits": 0.8385035455226898, + "step": 38640 + }, + { + "epoch": 0.3865, + "grad_norm": 14.25, + "grad_norm_var": 0.44921875, + "learning_rate": 0.0003, + "loss": 11.1954, + "loss/aux_loss": 0.048078606836497784, + "loss/crossentropy": 2.6999772429466247, + "loss/logits": 0.8357056826353073, + "step": 38650 + }, + { + "epoch": 0.3866, + "grad_norm": 15.375, + "grad_norm_var": 0.5769368489583333, + "learning_rate": 0.0003, + "loss": 11.3088, + "loss/aux_loss": 0.0480708010494709, + "loss/crossentropy": 2.7469111561775206, + "loss/logits": 0.8259833127260208, + "step": 38660 + }, + { + "epoch": 0.3867, + "grad_norm": 13.0625, + "grad_norm_var": 0.7369140625, + "learning_rate": 0.0003, + "loss": 11.2538, + "loss/aux_loss": 0.04808108452707529, + "loss/crossentropy": 2.8682199835777284, + "loss/logits": 0.8337242752313614, + "step": 38670 + }, + { + "epoch": 0.3868, + "grad_norm": 13.8125, + "grad_norm_var": 0.902197265625, + "learning_rate": 0.0003, + "loss": 11.2038, + "loss/aux_loss": 0.0480692382901907, + "loss/crossentropy": 2.748265969753265, + "loss/logits": 0.8487885296344757, + "step": 38680 + }, + { + "epoch": 0.3869, + "grad_norm": 13.25, + "grad_norm_var": 0.3020833333333333, + "learning_rate": 0.0003, + "loss": 11.2714, + "loss/aux_loss": 0.04808627497404814, + "loss/crossentropy": 2.6985159516334534, + "loss/logits": 0.8388892740011216, + "step": 38690 + }, + { + "epoch": 0.387, + "grad_norm": 18.25, + "grad_norm_var": 1.5707682291666667, + "learning_rate": 0.0003, + "loss": 11.1927, + "loss/aux_loss": 0.048079338297247885, + "loss/crossentropy": 2.7121275901794433, + "loss/logits": 0.8318122088909149, + "step": 38700 + }, + { + "epoch": 0.3871, + "grad_norm": 14.1875, + "grad_norm_var": 1.6813639322916667, + "learning_rate": 0.0003, + "loss": 11.3357, + "loss/aux_loss": 0.04807351864874363, + "loss/crossentropy": 2.8397586047649384, + "loss/logits": 0.8729697972536087, + "step": 38710 + }, + { + "epoch": 0.3872, + "grad_norm": 14.125, + "grad_norm_var": 0.5071451822916667, + "learning_rate": 0.0003, + "loss": 11.2126, + "loss/aux_loss": 0.04808269124478102, + "loss/crossentropy": 2.6068269073963166, + "loss/logits": 0.8011586487293243, + "step": 38720 + }, + { + "epoch": 0.3873, + "grad_norm": 13.875, + "grad_norm_var": 0.4791015625, + "learning_rate": 0.0003, + "loss": 11.1212, + "loss/aux_loss": 0.04808108098804951, + "loss/crossentropy": 2.8975651144981383, + "loss/logits": 0.847677406668663, + "step": 38730 + }, + { + "epoch": 0.3874, + "grad_norm": 13.5, + "grad_norm_var": 0.2749837239583333, + "learning_rate": 0.0003, + "loss": 11.2524, + "loss/aux_loss": 0.04808564744889736, + "loss/crossentropy": 2.6788780450820924, + "loss/logits": 0.8401986241340638, + "step": 38740 + }, + { + "epoch": 0.3875, + "grad_norm": 14.1875, + "grad_norm_var": 0.49073893229166665, + "learning_rate": 0.0003, + "loss": 11.335, + "loss/aux_loss": 0.048062538541853425, + "loss/crossentropy": 2.7721718668937685, + "loss/logits": 0.8494657784700393, + "step": 38750 + }, + { + "epoch": 0.3876, + "grad_norm": 14.3125, + "grad_norm_var": 0.7044270833333334, + "learning_rate": 0.0003, + "loss": 11.243, + "loss/aux_loss": 0.04808050319552422, + "loss/crossentropy": 2.6818510770797728, + "loss/logits": 0.8431717932224274, + "step": 38760 + }, + { + "epoch": 0.3877, + "grad_norm": 13.875, + "grad_norm_var": 0.43828125, + "learning_rate": 0.0003, + "loss": 11.1333, + "loss/aux_loss": 0.04807931166142225, + "loss/crossentropy": 2.8285969376564024, + "loss/logits": 0.8329048067331314, + "step": 38770 + }, + { + "epoch": 0.3878, + "grad_norm": 13.5, + "grad_norm_var": 0.30390625, + "learning_rate": 0.0003, + "loss": 11.207, + "loss/aux_loss": 0.04807636775076389, + "loss/crossentropy": 2.7491527557373048, + "loss/logits": 0.8234784305095673, + "step": 38780 + }, + { + "epoch": 0.3879, + "grad_norm": 13.25, + "grad_norm_var": 0.6166666666666667, + "learning_rate": 0.0003, + "loss": 11.101, + "loss/aux_loss": 0.048082555457949636, + "loss/crossentropy": 2.4799464106559754, + "loss/logits": 0.8477719098329544, + "step": 38790 + }, + { + "epoch": 0.388, + "grad_norm": 13.3125, + "grad_norm_var": 0.5782389322916667, + "learning_rate": 0.0003, + "loss": 11.3399, + "loss/aux_loss": 0.0480673098936677, + "loss/crossentropy": 2.77501580119133, + "loss/logits": 0.8746285647153854, + "step": 38800 + }, + { + "epoch": 0.3881, + "grad_norm": 13.5625, + "grad_norm_var": 0.7794270833333333, + "learning_rate": 0.0003, + "loss": 11.1978, + "loss/aux_loss": 0.04808091875165701, + "loss/crossentropy": 2.731118106842041, + "loss/logits": 0.860284361243248, + "step": 38810 + }, + { + "epoch": 0.3882, + "grad_norm": 13.375, + "grad_norm_var": 0.436962890625, + "learning_rate": 0.0003, + "loss": 11.2145, + "loss/aux_loss": 0.04807815104722977, + "loss/crossentropy": 2.6639424443244932, + "loss/logits": 0.8353655904531478, + "step": 38820 + }, + { + "epoch": 0.3883, + "grad_norm": 13.6875, + "grad_norm_var": 0.42511393229166666, + "learning_rate": 0.0003, + "loss": 11.3399, + "loss/aux_loss": 0.04808487202972174, + "loss/crossentropy": 2.8214931964874266, + "loss/logits": 0.8728219717741013, + "step": 38830 + }, + { + "epoch": 0.3884, + "grad_norm": 14.1875, + "grad_norm_var": 0.315869140625, + "learning_rate": 0.0003, + "loss": 11.3803, + "loss/aux_loss": 0.048068196326494214, + "loss/crossentropy": 2.951774549484253, + "loss/logits": 0.8737129330635071, + "step": 38840 + }, + { + "epoch": 0.3885, + "grad_norm": 13.625, + "grad_norm_var": 0.19036458333333334, + "learning_rate": 0.0003, + "loss": 11.3304, + "loss/aux_loss": 0.048076307959854604, + "loss/crossentropy": 2.614348477125168, + "loss/logits": 0.8102818191051483, + "step": 38850 + }, + { + "epoch": 0.3886, + "grad_norm": 14.5, + "grad_norm_var": 0.199853515625, + "learning_rate": 0.0003, + "loss": 11.1863, + "loss/aux_loss": 0.04808367285877466, + "loss/crossentropy": 2.670736050605774, + "loss/logits": 0.8290715306997299, + "step": 38860 + }, + { + "epoch": 0.3887, + "grad_norm": 13.0625, + "grad_norm_var": 0.3790201822916667, + "learning_rate": 0.0003, + "loss": 11.4597, + "loss/aux_loss": 0.04807464815676212, + "loss/crossentropy": 2.7976260662078856, + "loss/logits": 0.8721549570560455, + "step": 38870 + }, + { + "epoch": 0.3888, + "grad_norm": 14.0625, + "grad_norm_var": 4.417822265625, + "learning_rate": 0.0003, + "loss": 11.4346, + "loss/aux_loss": 0.048079947382211684, + "loss/crossentropy": 2.7102751970291137, + "loss/logits": 0.8924416452646255, + "step": 38880 + }, + { + "epoch": 0.3889, + "grad_norm": 14.6875, + "grad_norm_var": 3.782275390625, + "learning_rate": 0.0003, + "loss": 11.3206, + "loss/aux_loss": 0.048079301975667475, + "loss/crossentropy": 2.900500977039337, + "loss/logits": 0.8390416592359543, + "step": 38890 + }, + { + "epoch": 0.389, + "grad_norm": 14.125, + "grad_norm_var": 0.547119140625, + "learning_rate": 0.0003, + "loss": 11.1748, + "loss/aux_loss": 0.04807066544890404, + "loss/crossentropy": 2.6859599113464356, + "loss/logits": 0.8484610259532929, + "step": 38900 + }, + { + "epoch": 0.3891, + "grad_norm": 12.9375, + "grad_norm_var": 0.5132649739583334, + "learning_rate": 0.0003, + "loss": 11.1113, + "loss/aux_loss": 0.04807322956621647, + "loss/crossentropy": 2.678993618488312, + "loss/logits": 0.8367465615272522, + "step": 38910 + }, + { + "epoch": 0.3892, + "grad_norm": 13.9375, + "grad_norm_var": 0.694775390625, + "learning_rate": 0.0003, + "loss": 11.2919, + "loss/aux_loss": 0.04807643033564091, + "loss/crossentropy": 2.71058109998703, + "loss/logits": 0.8457289397716522, + "step": 38920 + }, + { + "epoch": 0.3893, + "grad_norm": 13.9375, + "grad_norm_var": 0.654541015625, + "learning_rate": 0.0003, + "loss": 11.2503, + "loss/aux_loss": 0.048072323016822335, + "loss/crossentropy": 2.544221270084381, + "loss/logits": 0.8313421994447708, + "step": 38930 + }, + { + "epoch": 0.3894, + "grad_norm": 14.1875, + "grad_norm_var": 0.22862955729166667, + "learning_rate": 0.0003, + "loss": 11.3605, + "loss/aux_loss": 0.04807420931756497, + "loss/crossentropy": 2.8579561948776244, + "loss/logits": 0.8661619156599045, + "step": 38940 + }, + { + "epoch": 0.3895, + "grad_norm": 14.625, + "grad_norm_var": 0.7454264322916667, + "learning_rate": 0.0003, + "loss": 11.3917, + "loss/aux_loss": 0.048077373020350936, + "loss/crossentropy": 2.728842890262604, + "loss/logits": 0.8730567246675491, + "step": 38950 + }, + { + "epoch": 0.3896, + "grad_norm": 13.5625, + "grad_norm_var": 0.541259765625, + "learning_rate": 0.0003, + "loss": 11.3586, + "loss/aux_loss": 0.04807371459901333, + "loss/crossentropy": 2.8443053007125854, + "loss/logits": 0.8811523258686066, + "step": 38960 + }, + { + "epoch": 0.3897, + "grad_norm": 14.25, + "grad_norm_var": 0.5179524739583333, + "learning_rate": 0.0003, + "loss": 11.3437, + "loss/aux_loss": 0.04806583281606436, + "loss/crossentropy": 2.693267875909805, + "loss/logits": 0.8548869907855987, + "step": 38970 + }, + { + "epoch": 0.3898, + "grad_norm": 15.375, + "grad_norm_var": 0.6398274739583333, + "learning_rate": 0.0003, + "loss": 11.2203, + "loss/aux_loss": 0.04806962329894304, + "loss/crossentropy": 2.8249629139900208, + "loss/logits": 0.8568162739276886, + "step": 38980 + }, + { + "epoch": 0.3899, + "grad_norm": 13.75, + "grad_norm_var": 0.39724934895833336, + "learning_rate": 0.0003, + "loss": 11.2541, + "loss/aux_loss": 0.04808248896151781, + "loss/crossentropy": 2.7104422807693482, + "loss/logits": 0.8193158626556396, + "step": 38990 + }, + { + "epoch": 0.39, + "grad_norm": 13.9375, + "grad_norm_var": 0.6528645833333333, + "learning_rate": 0.0003, + "loss": 11.336, + "loss/aux_loss": 0.04807611163705587, + "loss/crossentropy": 2.734510087966919, + "loss/logits": 0.880244129896164, + "step": 39000 + }, + { + "epoch": 0.3901, + "grad_norm": 13.8125, + "grad_norm_var": 0.8196451822916667, + "learning_rate": 0.0003, + "loss": 11.2298, + "loss/aux_loss": 0.048070698976516724, + "loss/crossentropy": 2.7943927884101867, + "loss/logits": 0.8071790516376496, + "step": 39010 + }, + { + "epoch": 0.3902, + "grad_norm": 14.0, + "grad_norm_var": 0.4534993489583333, + "learning_rate": 0.0003, + "loss": 11.3262, + "loss/aux_loss": 0.048071658983826634, + "loss/crossentropy": 2.7959813237190247, + "loss/logits": 0.8696642935276031, + "step": 39020 + }, + { + "epoch": 0.3903, + "grad_norm": 13.1875, + "grad_norm_var": 0.39264322916666666, + "learning_rate": 0.0003, + "loss": 11.2013, + "loss/aux_loss": 0.04807510618120432, + "loss/crossentropy": 2.625903457403183, + "loss/logits": 0.8251208335161209, + "step": 39030 + }, + { + "epoch": 0.3904, + "grad_norm": 13.6875, + "grad_norm_var": 0.2674479166666667, + "learning_rate": 0.0003, + "loss": 11.2806, + "loss/aux_loss": 0.04807391464710235, + "loss/crossentropy": 2.740411990880966, + "loss/logits": 0.8526921212673187, + "step": 39040 + }, + { + "epoch": 0.3905, + "grad_norm": 14.0, + "grad_norm_var": 2.7739583333333333, + "learning_rate": 0.0003, + "loss": 11.1987, + "loss/aux_loss": 0.048082204163074495, + "loss/crossentropy": 2.744875466823578, + "loss/logits": 0.8375120222568512, + "step": 39050 + }, + { + "epoch": 0.3906, + "grad_norm": 15.5625, + "grad_norm_var": 3.3329264322916665, + "learning_rate": 0.0003, + "loss": 11.092, + "loss/aux_loss": 0.04808939266949892, + "loss/crossentropy": 2.5393874824047087, + "loss/logits": 0.8006115674972534, + "step": 39060 + }, + { + "epoch": 0.3907, + "grad_norm": 13.9375, + "grad_norm_var": 0.78125, + "learning_rate": 0.0003, + "loss": 11.3496, + "loss/aux_loss": 0.048069536313414575, + "loss/crossentropy": 2.8732733964920043, + "loss/logits": 0.8897728711366654, + "step": 39070 + }, + { + "epoch": 0.3908, + "grad_norm": 14.9375, + "grad_norm_var": 0.6104166666666667, + "learning_rate": 0.0003, + "loss": 11.2304, + "loss/aux_loss": 0.048077582754194735, + "loss/crossentropy": 2.779140567779541, + "loss/logits": 0.8601418375968933, + "step": 39080 + }, + { + "epoch": 0.3909, + "grad_norm": 13.3125, + "grad_norm_var": 0.6387858072916667, + "learning_rate": 0.0003, + "loss": 11.2849, + "loss/aux_loss": 0.04808152187615633, + "loss/crossentropy": 2.901885849237442, + "loss/logits": 0.8603871166706085, + "step": 39090 + }, + { + "epoch": 0.391, + "grad_norm": 14.3125, + "grad_norm_var": 1.2986979166666666, + "learning_rate": 0.0003, + "loss": 11.1638, + "loss/aux_loss": 0.048083712719380854, + "loss/crossentropy": 2.6146180272102355, + "loss/logits": 0.8246536731719971, + "step": 39100 + }, + { + "epoch": 0.3911, + "grad_norm": 13.3125, + "grad_norm_var": 0.5108723958333333, + "learning_rate": 0.0003, + "loss": 11.2838, + "loss/aux_loss": 0.04807220734655857, + "loss/crossentropy": 2.606696993112564, + "loss/logits": 0.8235841602087021, + "step": 39110 + }, + { + "epoch": 0.3912, + "grad_norm": 14.1875, + "grad_norm_var": 0.5723307291666667, + "learning_rate": 0.0003, + "loss": 11.1847, + "loss/aux_loss": 0.04808733835816383, + "loss/crossentropy": 2.591922175884247, + "loss/logits": 0.7960670560598373, + "step": 39120 + }, + { + "epoch": 0.3913, + "grad_norm": 13.5625, + "grad_norm_var": 1.135791015625, + "learning_rate": 0.0003, + "loss": 11.1785, + "loss/aux_loss": 0.048079108074307444, + "loss/crossentropy": 2.8134935319423677, + "loss/logits": 0.8420876532793045, + "step": 39130 + }, + { + "epoch": 0.3914, + "grad_norm": 14.0625, + "grad_norm_var": 0.5067708333333333, + "learning_rate": 0.0003, + "loss": 11.2383, + "loss/aux_loss": 0.048073142766952515, + "loss/crossentropy": 2.853075420856476, + "loss/logits": 0.8272636830806732, + "step": 39140 + }, + { + "epoch": 0.3915, + "grad_norm": 14.5, + "grad_norm_var": 0.858837890625, + "learning_rate": 0.0003, + "loss": 11.2376, + "loss/aux_loss": 0.04809119720011949, + "loss/crossentropy": 2.522566032409668, + "loss/logits": 0.8258247703313828, + "step": 39150 + }, + { + "epoch": 0.3916, + "grad_norm": 13.875, + "grad_norm_var": 0.737744140625, + "learning_rate": 0.0003, + "loss": 11.2268, + "loss/aux_loss": 0.048074861988425255, + "loss/crossentropy": 2.750867176055908, + "loss/logits": 0.846402308344841, + "step": 39160 + }, + { + "epoch": 0.3917, + "grad_norm": 14.625, + "grad_norm_var": 1.01171875, + "learning_rate": 0.0003, + "loss": 11.2082, + "loss/aux_loss": 0.048072229884564874, + "loss/crossentropy": 2.7354251742362976, + "loss/logits": 0.8645006984472274, + "step": 39170 + }, + { + "epoch": 0.3918, + "grad_norm": 13.625, + "grad_norm_var": 1.2765462239583334, + "learning_rate": 0.0003, + "loss": 11.2339, + "loss/aux_loss": 0.04807515200227499, + "loss/crossentropy": 2.7877457082271575, + "loss/logits": 0.8666865587234497, + "step": 39180 + }, + { + "epoch": 0.3919, + "grad_norm": 13.875, + "grad_norm_var": 0.6962076822916666, + "learning_rate": 0.0003, + "loss": 11.5099, + "loss/aux_loss": 0.04808229543268681, + "loss/crossentropy": 2.6917248964309692, + "loss/logits": 0.8847535520792007, + "step": 39190 + }, + { + "epoch": 0.392, + "grad_norm": 15.375, + "grad_norm_var": 0.5378743489583333, + "learning_rate": 0.0003, + "loss": 11.3657, + "loss/aux_loss": 0.048071438632905486, + "loss/crossentropy": 2.789354109764099, + "loss/logits": 0.8606502175331116, + "step": 39200 + }, + { + "epoch": 0.3921, + "grad_norm": 13.25, + "grad_norm_var": 1.1359212239583334, + "learning_rate": 0.0003, + "loss": 11.3121, + "loss/aux_loss": 0.048081927560269834, + "loss/crossentropy": 2.780927097797394, + "loss/logits": 0.886367890238762, + "step": 39210 + }, + { + "epoch": 0.3922, + "grad_norm": 13.8125, + "grad_norm_var": 0.908447265625, + "learning_rate": 0.0003, + "loss": 11.2079, + "loss/aux_loss": 0.048087149113416675, + "loss/crossentropy": 2.6847366988658905, + "loss/logits": 0.8129809975624085, + "step": 39220 + }, + { + "epoch": 0.3923, + "grad_norm": 13.6875, + "grad_norm_var": 0.5606608072916667, + "learning_rate": 0.0003, + "loss": 11.3817, + "loss/aux_loss": 0.0480810409411788, + "loss/crossentropy": 2.937886118888855, + "loss/logits": 0.906218609213829, + "step": 39230 + }, + { + "epoch": 0.3924, + "grad_norm": 13.9375, + "grad_norm_var": 0.51640625, + "learning_rate": 0.0003, + "loss": 11.266, + "loss/aux_loss": 0.04808030817657709, + "loss/crossentropy": 2.6783434629440306, + "loss/logits": 0.8466577887535095, + "step": 39240 + }, + { + "epoch": 0.3925, + "grad_norm": 13.625, + "grad_norm_var": 0.551025390625, + "learning_rate": 0.0003, + "loss": 11.2733, + "loss/aux_loss": 0.048078637942671774, + "loss/crossentropy": 2.7514628052711485, + "loss/logits": 0.851484876871109, + "step": 39250 + }, + { + "epoch": 0.3926, + "grad_norm": 14.4375, + "grad_norm_var": 0.8516764322916667, + "learning_rate": 0.0003, + "loss": 11.1715, + "loss/aux_loss": 0.04808551203459501, + "loss/crossentropy": 2.4610378623008726, + "loss/logits": 0.8220183670520782, + "step": 39260 + }, + { + "epoch": 0.3927, + "grad_norm": 14.3125, + "grad_norm_var": 0.5624348958333333, + "learning_rate": 0.0003, + "loss": 11.2718, + "loss/aux_loss": 0.04808190818876028, + "loss/crossentropy": 2.7207518577575684, + "loss/logits": 0.8513909667730332, + "step": 39270 + }, + { + "epoch": 0.3928, + "grad_norm": 13.9375, + "grad_norm_var": 0.5618326822916667, + "learning_rate": 0.0003, + "loss": 11.2583, + "loss/aux_loss": 0.04807283375412226, + "loss/crossentropy": 2.514565271139145, + "loss/logits": 0.8464554220438003, + "step": 39280 + }, + { + "epoch": 0.3929, + "grad_norm": 14.0, + "grad_norm_var": 0.5028645833333333, + "learning_rate": 0.0003, + "loss": 11.1733, + "loss/aux_loss": 0.04806869979947805, + "loss/crossentropy": 2.7700137376785277, + "loss/logits": 0.849734765291214, + "step": 39290 + }, + { + "epoch": 0.393, + "grad_norm": 13.25, + "grad_norm_var": 0.7659993489583333, + "learning_rate": 0.0003, + "loss": 11.274, + "loss/aux_loss": 0.048083323240280154, + "loss/crossentropy": 2.843100357055664, + "loss/logits": 0.8784733712673187, + "step": 39300 + }, + { + "epoch": 0.3931, + "grad_norm": 12.625, + "grad_norm_var": 0.6634765625, + "learning_rate": 0.0003, + "loss": 11.4241, + "loss/aux_loss": 0.04807077012956142, + "loss/crossentropy": 2.799287849664688, + "loss/logits": 0.8680036425590515, + "step": 39310 + }, + { + "epoch": 0.3932, + "grad_norm": 13.5, + "grad_norm_var": 0.7202473958333333, + "learning_rate": 0.0003, + "loss": 11.2232, + "loss/aux_loss": 0.04807591922581196, + "loss/crossentropy": 2.7692100405693054, + "loss/logits": 0.8555681079626083, + "step": 39320 + }, + { + "epoch": 0.3933, + "grad_norm": 14.6875, + "grad_norm_var": 0.97578125, + "learning_rate": 0.0003, + "loss": 11.3526, + "loss/aux_loss": 0.04808627963066101, + "loss/crossentropy": 2.7542243778705595, + "loss/logits": 0.842135438323021, + "step": 39330 + }, + { + "epoch": 0.3934, + "grad_norm": 14.4375, + "grad_norm_var": 0.45358072916666664, + "learning_rate": 0.0003, + "loss": 11.3093, + "loss/aux_loss": 0.048074960522353646, + "loss/crossentropy": 2.8112044095993043, + "loss/logits": 0.8553465873003006, + "step": 39340 + }, + { + "epoch": 0.3935, + "grad_norm": 13.6875, + "grad_norm_var": 0.32545572916666665, + "learning_rate": 0.0003, + "loss": 11.2175, + "loss/aux_loss": 0.048079118691384794, + "loss/crossentropy": 2.6633784532547, + "loss/logits": 0.8331804633140564, + "step": 39350 + }, + { + "epoch": 0.3936, + "grad_norm": 14.1875, + "grad_norm_var": 0.35494791666666664, + "learning_rate": 0.0003, + "loss": 11.2346, + "loss/aux_loss": 0.04808082692325115, + "loss/crossentropy": 2.67893762588501, + "loss/logits": 0.8055594295263291, + "step": 39360 + }, + { + "epoch": 0.3937, + "grad_norm": 13.875, + "grad_norm_var": 0.5494140625, + "learning_rate": 0.0003, + "loss": 11.3957, + "loss/aux_loss": 0.04807113204151392, + "loss/crossentropy": 2.6513688981533052, + "loss/logits": 0.8550222337245941, + "step": 39370 + }, + { + "epoch": 0.3938, + "grad_norm": 13.5, + "grad_norm_var": 0.5113932291666666, + "learning_rate": 0.0003, + "loss": 11.1742, + "loss/aux_loss": 0.04808781389147043, + "loss/crossentropy": 2.5786080420017243, + "loss/logits": 0.8316751003265381, + "step": 39380 + }, + { + "epoch": 0.3939, + "grad_norm": 14.125, + "grad_norm_var": 0.38274739583333334, + "learning_rate": 0.0003, + "loss": 11.3349, + "loss/aux_loss": 0.048061727173626424, + "loss/crossentropy": 2.815303325653076, + "loss/logits": 0.8787429064512253, + "step": 39390 + }, + { + "epoch": 0.394, + "grad_norm": 13.375, + "grad_norm_var": 0.4574055989583333, + "learning_rate": 0.0003, + "loss": 11.1497, + "loss/aux_loss": 0.04808071050792932, + "loss/crossentropy": 2.8457810401916506, + "loss/logits": 0.8681068003177643, + "step": 39400 + }, + { + "epoch": 0.3941, + "grad_norm": 13.875, + "grad_norm_var": 0.637744140625, + "learning_rate": 0.0003, + "loss": 11.1906, + "loss/aux_loss": 0.048074718564748764, + "loss/crossentropy": 2.536262887716293, + "loss/logits": 0.8039017617702484, + "step": 39410 + }, + { + "epoch": 0.3942, + "grad_norm": 13.125, + "grad_norm_var": 0.6919270833333333, + "learning_rate": 0.0003, + "loss": 11.2357, + "loss/aux_loss": 0.04807904493063688, + "loss/crossentropy": 2.7414814889431, + "loss/logits": 0.8549802154302597, + "step": 39420 + }, + { + "epoch": 0.3943, + "grad_norm": 16.375, + "grad_norm_var": 0.8254557291666667, + "learning_rate": 0.0003, + "loss": 11.1561, + "loss/aux_loss": 0.048071026988327506, + "loss/crossentropy": 2.9144181966781617, + "loss/logits": 0.8272378146648407, + "step": 39430 + }, + { + "epoch": 0.3944, + "grad_norm": 13.875, + "grad_norm_var": 0.8355305989583334, + "learning_rate": 0.0003, + "loss": 11.3736, + "loss/aux_loss": 0.048078907653689384, + "loss/crossentropy": 2.664820075035095, + "loss/logits": 0.8304022997617722, + "step": 39440 + }, + { + "epoch": 0.3945, + "grad_norm": 14.875, + "grad_norm_var": 0.34088541666666666, + "learning_rate": 0.0003, + "loss": 11.2582, + "loss/aux_loss": 0.04807480573654175, + "loss/crossentropy": 2.5339000284671784, + "loss/logits": 0.7781210362911224, + "step": 39450 + }, + { + "epoch": 0.3946, + "grad_norm": 15.0625, + "grad_norm_var": 0.42337239583333336, + "learning_rate": 0.0003, + "loss": 11.3047, + "loss/aux_loss": 0.0480910299345851, + "loss/crossentropy": 2.610448843240738, + "loss/logits": 0.8010566890239715, + "step": 39460 + }, + { + "epoch": 0.3947, + "grad_norm": 13.625, + "grad_norm_var": 0.4361979166666667, + "learning_rate": 0.0003, + "loss": 11.2501, + "loss/aux_loss": 0.04807810541242361, + "loss/crossentropy": 2.775594508647919, + "loss/logits": 0.8565292507410049, + "step": 39470 + }, + { + "epoch": 0.3948, + "grad_norm": 13.25, + "grad_norm_var": 0.44073893229166666, + "learning_rate": 0.0003, + "loss": 11.1435, + "loss/aux_loss": 0.048072817362844945, + "loss/crossentropy": 2.6863482356071473, + "loss/logits": 0.8157978534698487, + "step": 39480 + }, + { + "epoch": 0.3949, + "grad_norm": 13.4375, + "grad_norm_var": 0.40305989583333335, + "learning_rate": 0.0003, + "loss": 11.3558, + "loss/aux_loss": 0.04808029588311911, + "loss/crossentropy": 2.8376736283302306, + "loss/logits": 0.8666150987148284, + "step": 39490 + }, + { + "epoch": 0.395, + "grad_norm": 14.5, + "grad_norm_var": 0.4551920572916667, + "learning_rate": 0.0003, + "loss": 11.2658, + "loss/aux_loss": 0.048076699860394, + "loss/crossentropy": 2.607940810918808, + "loss/logits": 0.8273808121681213, + "step": 39500 + }, + { + "epoch": 0.3951, + "grad_norm": 12.8125, + "grad_norm_var": 0.7525390625, + "learning_rate": 0.0003, + "loss": 11.1388, + "loss/aux_loss": 0.04806460794061422, + "loss/crossentropy": 2.6791930377483366, + "loss/logits": 0.8224393516778946, + "step": 39510 + }, + { + "epoch": 0.3952, + "grad_norm": 12.625, + "grad_norm_var": 0.6700358072916667, + "learning_rate": 0.0003, + "loss": 10.9948, + "loss/aux_loss": 0.04807190522551537, + "loss/crossentropy": 2.6685730695724486, + "loss/logits": 0.819244459271431, + "step": 39520 + }, + { + "epoch": 0.3953, + "grad_norm": 13.875, + "grad_norm_var": 0.38014322916666665, + "learning_rate": 0.0003, + "loss": 11.1964, + "loss/aux_loss": 0.048081899993121624, + "loss/crossentropy": 2.6292571663856505, + "loss/logits": 0.8497846484184265, + "step": 39530 + }, + { + "epoch": 0.3954, + "grad_norm": 13.75, + "grad_norm_var": 0.25462239583333335, + "learning_rate": 0.0003, + "loss": 11.359, + "loss/aux_loss": 0.048076131381094456, + "loss/crossentropy": 2.8662326276302337, + "loss/logits": 0.8483193576335907, + "step": 39540 + }, + { + "epoch": 0.3955, + "grad_norm": 13.9375, + "grad_norm_var": 0.47805989583333336, + "learning_rate": 0.0003, + "loss": 11.1232, + "loss/aux_loss": 0.048073952086269855, + "loss/crossentropy": 2.7887901782989504, + "loss/logits": 0.8364946961402893, + "step": 39550 + }, + { + "epoch": 0.3956, + "grad_norm": 13.3125, + "grad_norm_var": 0.3306640625, + "learning_rate": 0.0003, + "loss": 11.4657, + "loss/aux_loss": 0.04808053988963366, + "loss/crossentropy": 2.7247639894485474, + "loss/logits": 0.8865299373865128, + "step": 39560 + }, + { + "epoch": 0.3957, + "grad_norm": 14.3125, + "grad_norm_var": 0.25, + "learning_rate": 0.0003, + "loss": 11.1221, + "loss/aux_loss": 0.048066049627959725, + "loss/crossentropy": 2.7538771450519564, + "loss/logits": 0.8528720825910568, + "step": 39570 + }, + { + "epoch": 0.3958, + "grad_norm": 13.875, + "grad_norm_var": 0.3035807291666667, + "learning_rate": 0.0003, + "loss": 11.1495, + "loss/aux_loss": 0.04807343017309904, + "loss/crossentropy": 2.7472833156585694, + "loss/logits": 0.8405825644731522, + "step": 39580 + }, + { + "epoch": 0.3959, + "grad_norm": 14.1875, + "grad_norm_var": 0.24713541666666666, + "learning_rate": 0.0003, + "loss": 11.1893, + "loss/aux_loss": 0.0480898505076766, + "loss/crossentropy": 2.8660534262657165, + "loss/logits": 0.8727923810482026, + "step": 39590 + }, + { + "epoch": 0.396, + "grad_norm": 13.75, + "grad_norm_var": 0.3712076822916667, + "learning_rate": 0.0003, + "loss": 11.3245, + "loss/aux_loss": 0.04807707834988832, + "loss/crossentropy": 2.6662731945514677, + "loss/logits": 0.8399159997701645, + "step": 39600 + }, + { + "epoch": 0.3961, + "grad_norm": 13.1875, + "grad_norm_var": 0.5163899739583333, + "learning_rate": 0.0003, + "loss": 11.2979, + "loss/aux_loss": 0.04807784650474787, + "loss/crossentropy": 2.767361307144165, + "loss/logits": 0.8635453820228577, + "step": 39610 + }, + { + "epoch": 0.3962, + "grad_norm": 13.8125, + "grad_norm_var": 0.44217122395833336, + "learning_rate": 0.0003, + "loss": 11.3056, + "loss/aux_loss": 0.048077255859971045, + "loss/crossentropy": 2.885894167423248, + "loss/logits": 0.85841805934906, + "step": 39620 + }, + { + "epoch": 0.3963, + "grad_norm": 14.5, + "grad_norm_var": 13.565478515625, + "learning_rate": 0.0003, + "loss": 11.2025, + "loss/aux_loss": 0.04807215016335249, + "loss/crossentropy": 2.854511320590973, + "loss/logits": 0.8788524448871613, + "step": 39630 + }, + { + "epoch": 0.3964, + "grad_norm": 13.4375, + "grad_norm_var": 13.962093098958333, + "learning_rate": 0.0003, + "loss": 11.3555, + "loss/aux_loss": 0.04809546619653702, + "loss/crossentropy": 2.764549750089645, + "loss/logits": 0.8647037327289582, + "step": 39640 + }, + { + "epoch": 0.3965, + "grad_norm": 13.0625, + "grad_norm_var": 0.37578125, + "learning_rate": 0.0003, + "loss": 11.1444, + "loss/aux_loss": 0.04806444570422173, + "loss/crossentropy": 2.7343260645866394, + "loss/logits": 0.8285282194614411, + "step": 39650 + }, + { + "epoch": 0.3966, + "grad_norm": 14.0, + "grad_norm_var": 0.4197265625, + "learning_rate": 0.0003, + "loss": 11.271, + "loss/aux_loss": 0.04809067714959383, + "loss/crossentropy": 2.6628151297569276, + "loss/logits": 0.8381938517093659, + "step": 39660 + }, + { + "epoch": 0.3967, + "grad_norm": 13.1875, + "grad_norm_var": 0.468994140625, + "learning_rate": 0.0003, + "loss": 11.26, + "loss/aux_loss": 0.04807262271642685, + "loss/crossentropy": 2.827323651313782, + "loss/logits": 0.8302334070205688, + "step": 39670 + }, + { + "epoch": 0.3968, + "grad_norm": 13.625, + "grad_norm_var": 0.2494140625, + "learning_rate": 0.0003, + "loss": 11.2455, + "loss/aux_loss": 0.04807976856827736, + "loss/crossentropy": 2.863471567630768, + "loss/logits": 0.8567991226911544, + "step": 39680 + }, + { + "epoch": 0.3969, + "grad_norm": 13.875, + "grad_norm_var": 0.7364583333333333, + "learning_rate": 0.0003, + "loss": 11.3192, + "loss/aux_loss": 0.04808229897171259, + "loss/crossentropy": 2.7523205041885377, + "loss/logits": 0.8334077000617981, + "step": 39690 + }, + { + "epoch": 0.397, + "grad_norm": 14.9375, + "grad_norm_var": 0.7137858072916666, + "learning_rate": 0.0003, + "loss": 11.1529, + "loss/aux_loss": 0.04808085970580578, + "loss/crossentropy": 2.8388813376426696, + "loss/logits": 0.8468579053878784, + "step": 39700 + }, + { + "epoch": 0.3971, + "grad_norm": 13.625, + "grad_norm_var": 0.283056640625, + "learning_rate": 0.0003, + "loss": 11.1723, + "loss/aux_loss": 0.04806781094521284, + "loss/crossentropy": 2.8035045742988585, + "loss/logits": 0.8642447054386139, + "step": 39710 + }, + { + "epoch": 0.3972, + "grad_norm": 12.3125, + "grad_norm_var": 0.39842122395833335, + "learning_rate": 0.0003, + "loss": 11.1913, + "loss/aux_loss": 0.04807733949273825, + "loss/crossentropy": 2.6676317691802978, + "loss/logits": 0.8248802542686462, + "step": 39720 + }, + { + "epoch": 0.3973, + "grad_norm": 14.1875, + "grad_norm_var": 3.1890462239583335, + "learning_rate": 0.0003, + "loss": 11.419, + "loss/aux_loss": 0.048076121136546135, + "loss/crossentropy": 2.783367484807968, + "loss/logits": 0.8631105840206146, + "step": 39730 + }, + { + "epoch": 0.3974, + "grad_norm": 12.6875, + "grad_norm_var": 0.56953125, + "learning_rate": 0.0003, + "loss": 11.2753, + "loss/aux_loss": 0.04807474035769701, + "loss/crossentropy": 2.648731881380081, + "loss/logits": 0.8296503305435181, + "step": 39740 + }, + { + "epoch": 0.3975, + "grad_norm": 13.8125, + "grad_norm_var": 0.4315104166666667, + "learning_rate": 0.0003, + "loss": 11.3218, + "loss/aux_loss": 0.048075980879366396, + "loss/crossentropy": 2.709260368347168, + "loss/logits": 0.8509759098291397, + "step": 39750 + }, + { + "epoch": 0.3976, + "grad_norm": 16.875, + "grad_norm_var": 0.8150390625, + "learning_rate": 0.0003, + "loss": 11.3212, + "loss/aux_loss": 0.048085262067615986, + "loss/crossentropy": 2.561914938688278, + "loss/logits": 0.8712035864591599, + "step": 39760 + }, + { + "epoch": 0.3977, + "grad_norm": 14.25, + "grad_norm_var": 1.005322265625, + "learning_rate": 0.0003, + "loss": 11.353, + "loss/aux_loss": 0.048075600527226925, + "loss/crossentropy": 2.821903848648071, + "loss/logits": 0.8585720628499984, + "step": 39770 + }, + { + "epoch": 0.3978, + "grad_norm": 13.4375, + "grad_norm_var": 0.6304524739583334, + "learning_rate": 0.0003, + "loss": 11.1229, + "loss/aux_loss": 0.04808585401624441, + "loss/crossentropy": 2.5138413667678834, + "loss/logits": 0.7884344816207886, + "step": 39780 + }, + { + "epoch": 0.3979, + "grad_norm": 13.25, + "grad_norm_var": 0.49347330729166666, + "learning_rate": 0.0003, + "loss": 11.1641, + "loss/aux_loss": 0.04808144606649876, + "loss/crossentropy": 2.563122200965881, + "loss/logits": 0.8173367559909821, + "step": 39790 + }, + { + "epoch": 0.398, + "grad_norm": 13.75, + "grad_norm_var": 0.30388997395833334, + "learning_rate": 0.0003, + "loss": 10.9662, + "loss/aux_loss": 0.048083293810486795, + "loss/crossentropy": 2.514444661140442, + "loss/logits": 0.8048440098762513, + "step": 39800 + }, + { + "epoch": 0.3981, + "grad_norm": 14.0625, + "grad_norm_var": 21.170572916666668, + "learning_rate": 0.0003, + "loss": 11.246, + "loss/aux_loss": 0.0480785084888339, + "loss/crossentropy": 2.842600917816162, + "loss/logits": 0.8754805415868759, + "step": 39810 + }, + { + "epoch": 0.3982, + "grad_norm": 13.8125, + "grad_norm_var": 20.811442057291668, + "learning_rate": 0.0003, + "loss": 11.2918, + "loss/aux_loss": 0.048076963610947133, + "loss/crossentropy": 2.7346277594566346, + "loss/logits": 0.8496310234069824, + "step": 39820 + }, + { + "epoch": 0.3983, + "grad_norm": 13.625, + "grad_norm_var": 1.0786458333333333, + "learning_rate": 0.0003, + "loss": 11.1372, + "loss/aux_loss": 0.048078888468444346, + "loss/crossentropy": 2.68115548491478, + "loss/logits": 0.8213476330041886, + "step": 39830 + }, + { + "epoch": 0.3984, + "grad_norm": 14.6875, + "grad_norm_var": 0.44998372395833336, + "learning_rate": 0.0003, + "loss": 11.1956, + "loss/aux_loss": 0.048077587597072126, + "loss/crossentropy": 2.689275288581848, + "loss/logits": 0.8431656301021576, + "step": 39840 + }, + { + "epoch": 0.3985, + "grad_norm": 13.75, + "grad_norm_var": 0.36521809895833335, + "learning_rate": 0.0003, + "loss": 11.3385, + "loss/aux_loss": 0.04807688985019922, + "loss/crossentropy": 2.775956404209137, + "loss/logits": 0.8679609030485154, + "step": 39850 + }, + { + "epoch": 0.3986, + "grad_norm": 13.5625, + "grad_norm_var": 0.36692708333333335, + "learning_rate": 0.0003, + "loss": 11.3861, + "loss/aux_loss": 0.04808139931410551, + "loss/crossentropy": 2.6501555681228637, + "loss/logits": 0.8278191804885864, + "step": 39860 + }, + { + "epoch": 0.3987, + "grad_norm": 13.8125, + "grad_norm_var": 0.327587890625, + "learning_rate": 0.0003, + "loss": 11.3386, + "loss/aux_loss": 0.04808625839650631, + "loss/crossentropy": 2.752862584590912, + "loss/logits": 0.8313428431749343, + "step": 39870 + }, + { + "epoch": 0.3988, + "grad_norm": 15.0, + "grad_norm_var": 0.49973958333333335, + "learning_rate": 0.0003, + "loss": 11.1974, + "loss/aux_loss": 0.04805862847715616, + "loss/crossentropy": 2.766802215576172, + "loss/logits": 0.8339938923716546, + "step": 39880 + }, + { + "epoch": 0.3989, + "grad_norm": 14.9375, + "grad_norm_var": 0.4676432291666667, + "learning_rate": 0.0003, + "loss": 11.4879, + "loss/aux_loss": 0.0480917839333415, + "loss/crossentropy": 2.7454636096954346, + "loss/logits": 0.8500055640935897, + "step": 39890 + }, + { + "epoch": 0.399, + "grad_norm": 14.125, + "grad_norm_var": 0.5358723958333333, + "learning_rate": 0.0003, + "loss": 11.3118, + "loss/aux_loss": 0.04806092549115419, + "loss/crossentropy": 2.7647584557533262, + "loss/logits": 0.8545819491147995, + "step": 39900 + }, + { + "epoch": 0.3991, + "grad_norm": 15.375, + "grad_norm_var": 11.383707682291666, + "learning_rate": 0.0003, + "loss": 11.2725, + "loss/aux_loss": 0.04808534067124128, + "loss/crossentropy": 2.7508405685424804, + "loss/logits": 0.8540914624929428, + "step": 39910 + }, + { + "epoch": 0.3992, + "grad_norm": 15.125, + "grad_norm_var": 9.876676432291667, + "learning_rate": 0.0003, + "loss": 11.3259, + "loss/aux_loss": 0.04807797037065029, + "loss/crossentropy": 2.67775102853775, + "loss/logits": 0.8853228390216827, + "step": 39920 + }, + { + "epoch": 0.3993, + "grad_norm": 13.875, + "grad_norm_var": 1.1197265625, + "learning_rate": 0.0003, + "loss": 11.1964, + "loss/aux_loss": 0.04806930739432573, + "loss/crossentropy": 2.670240956544876, + "loss/logits": 0.8302730619907379, + "step": 39930 + }, + { + "epoch": 0.3994, + "grad_norm": 14.4375, + "grad_norm_var": 1.28984375, + "learning_rate": 0.0003, + "loss": 11.3559, + "loss/aux_loss": 0.04808368775993586, + "loss/crossentropy": 2.7974973797798155, + "loss/logits": 0.8431978434324264, + "step": 39940 + }, + { + "epoch": 0.3995, + "grad_norm": 14.1875, + "grad_norm_var": 0.9925618489583333, + "learning_rate": 0.0003, + "loss": 11.3789, + "loss/aux_loss": 0.04807193577289581, + "loss/crossentropy": 2.8047056078910826, + "loss/logits": 0.8324245274066925, + "step": 39950 + }, + { + "epoch": 0.3996, + "grad_norm": 15.0, + "grad_norm_var": 0.4400390625, + "learning_rate": 0.0003, + "loss": 11.295, + "loss/aux_loss": 0.048078315891325475, + "loss/crossentropy": 2.907421922683716, + "loss/logits": 0.8765753865242004, + "step": 39960 + }, + { + "epoch": 0.3997, + "grad_norm": 14.8125, + "grad_norm_var": 0.3042805989583333, + "learning_rate": 0.0003, + "loss": 11.2693, + "loss/aux_loss": 0.04807358868420124, + "loss/crossentropy": 2.6865237832069395, + "loss/logits": 0.8515175133943558, + "step": 39970 + }, + { + "epoch": 0.3998, + "grad_norm": 13.5625, + "grad_norm_var": 0.33839518229166665, + "learning_rate": 0.0003, + "loss": 11.2747, + "loss/aux_loss": 0.04806916173547506, + "loss/crossentropy": 2.7106878042221068, + "loss/logits": 0.8739930838346481, + "step": 39980 + }, + { + "epoch": 0.3999, + "grad_norm": 13.5625, + "grad_norm_var": 0.31640625, + "learning_rate": 0.0003, + "loss": 11.2064, + "loss/aux_loss": 0.04807684104889631, + "loss/crossentropy": 2.7278328776359557, + "loss/logits": 0.8172307670116424, + "step": 39990 + }, + { + "epoch": 0.4, + "grad_norm": 13.125, + "grad_norm_var": 0.15701497395833333, + "learning_rate": 0.0003, + "loss": 11.2938, + "loss/aux_loss": 0.04807619974017143, + "loss/crossentropy": 2.7106220006942747, + "loss/logits": 0.8441434442996979, + "step": 40000 + }, + { + "epoch": 0.4001, + "grad_norm": 14.4375, + "grad_norm_var": 0.45358072916666664, + "learning_rate": 0.0003, + "loss": 11.172, + "loss/aux_loss": 0.048076309636235236, + "loss/crossentropy": 2.640831911563873, + "loss/logits": 0.8590665191411972, + "step": 40010 + }, + { + "epoch": 0.4002, + "grad_norm": 13.875, + "grad_norm_var": 0.545556640625, + "learning_rate": 0.0003, + "loss": 11.414, + "loss/aux_loss": 0.048071987740695474, + "loss/crossentropy": 2.869738209247589, + "loss/logits": 0.866798147559166, + "step": 40020 + }, + { + "epoch": 0.4003, + "grad_norm": 13.3125, + "grad_norm_var": 0.4275390625, + "learning_rate": 0.0003, + "loss": 11.328, + "loss/aux_loss": 0.048072556219995025, + "loss/crossentropy": 2.6954082608222962, + "loss/logits": 0.831238204240799, + "step": 40030 + }, + { + "epoch": 0.4004, + "grad_norm": 14.5625, + "grad_norm_var": 0.32962239583333336, + "learning_rate": 0.0003, + "loss": 11.3336, + "loss/aux_loss": 0.04808360133320093, + "loss/crossentropy": 2.5370292246341704, + "loss/logits": 0.8065064072608947, + "step": 40040 + }, + { + "epoch": 0.4005, + "grad_norm": 14.25, + "grad_norm_var": 0.6997395833333333, + "learning_rate": 0.0003, + "loss": 11.2486, + "loss/aux_loss": 0.04807292725890875, + "loss/crossentropy": 2.687315058708191, + "loss/logits": 0.8650965690612793, + "step": 40050 + }, + { + "epoch": 0.4006, + "grad_norm": 14.0, + "grad_norm_var": 0.3633951822916667, + "learning_rate": 0.0003, + "loss": 11.0689, + "loss/aux_loss": 0.048067699931561944, + "loss/crossentropy": 2.8071807265281676, + "loss/logits": 0.8397237300872803, + "step": 40060 + }, + { + "epoch": 0.4007, + "grad_norm": 14.25, + "grad_norm_var": 0.4202473958333333, + "learning_rate": 0.0003, + "loss": 11.4838, + "loss/aux_loss": 0.04807199016213417, + "loss/crossentropy": 2.72553288936615, + "loss/logits": 0.8591938436031341, + "step": 40070 + }, + { + "epoch": 0.4008, + "grad_norm": 14.5625, + "grad_norm_var": 0.3973795572916667, + "learning_rate": 0.0003, + "loss": 11.1278, + "loss/aux_loss": 0.048072010092437265, + "loss/crossentropy": 2.725685381889343, + "loss/logits": 0.8271927177906037, + "step": 40080 + }, + { + "epoch": 0.4009, + "grad_norm": 14.6875, + "grad_norm_var": 0.3041015625, + "learning_rate": 0.0003, + "loss": 11.2294, + "loss/aux_loss": 0.048078466951847074, + "loss/crossentropy": 2.692145121097565, + "loss/logits": 0.8675953030586243, + "step": 40090 + }, + { + "epoch": 0.401, + "grad_norm": 15.25, + "grad_norm_var": 0.2764973958333333, + "learning_rate": 0.0003, + "loss": 11.3525, + "loss/aux_loss": 0.048074528202414514, + "loss/crossentropy": 2.8597113609313967, + "loss/logits": 0.8692526042461395, + "step": 40100 + }, + { + "epoch": 0.4011, + "grad_norm": 13.5, + "grad_norm_var": 0.262744140625, + "learning_rate": 0.0003, + "loss": 11.1604, + "loss/aux_loss": 0.048076745681464673, + "loss/crossentropy": 2.693953478336334, + "loss/logits": 0.8202391982078552, + "step": 40110 + }, + { + "epoch": 0.4012, + "grad_norm": 13.125, + "grad_norm_var": 0.21443684895833334, + "learning_rate": 0.0003, + "loss": 11.1184, + "loss/aux_loss": 0.04807984083890915, + "loss/crossentropy": 2.7698384284973145, + "loss/logits": 0.8715376138687134, + "step": 40120 + }, + { + "epoch": 0.4013, + "grad_norm": 14.25, + "grad_norm_var": 0.7333333333333333, + "learning_rate": 0.0003, + "loss": 11.2354, + "loss/aux_loss": 0.04807922802865505, + "loss/crossentropy": 2.58315287232399, + "loss/logits": 0.8681640088558197, + "step": 40130 + }, + { + "epoch": 0.4014, + "grad_norm": 12.9375, + "grad_norm_var": 0.46067708333333335, + "learning_rate": 0.0003, + "loss": 11.2891, + "loss/aux_loss": 0.048069264926016333, + "loss/crossentropy": 2.6770537555217744, + "loss/logits": 0.8575960993766785, + "step": 40140 + }, + { + "epoch": 0.4015, + "grad_norm": 14.8125, + "grad_norm_var": 0.368994140625, + "learning_rate": 0.0003, + "loss": 11.1327, + "loss/aux_loss": 0.048076451011002067, + "loss/crossentropy": 2.8826801657676695, + "loss/logits": 0.8579352647066116, + "step": 40150 + }, + { + "epoch": 0.4016, + "grad_norm": 13.8125, + "grad_norm_var": 0.5153483072916667, + "learning_rate": 0.0003, + "loss": 11.2343, + "loss/aux_loss": 0.04809125438332558, + "loss/crossentropy": 2.618731087446213, + "loss/logits": 0.7966024458408356, + "step": 40160 + }, + { + "epoch": 0.4017, + "grad_norm": 15.0625, + "grad_norm_var": 0.2830729166666667, + "learning_rate": 0.0003, + "loss": 11.1517, + "loss/aux_loss": 0.04808546844869852, + "loss/crossentropy": 2.646185064315796, + "loss/logits": 0.8252136647701264, + "step": 40170 + }, + { + "epoch": 0.4018, + "grad_norm": 14.3125, + "grad_norm_var": 0.3223795572916667, + "learning_rate": 0.0003, + "loss": 11.1737, + "loss/aux_loss": 0.048075793869793416, + "loss/crossentropy": 2.519075998663902, + "loss/logits": 0.8385014414787293, + "step": 40180 + }, + { + "epoch": 0.4019, + "grad_norm": 14.25, + "grad_norm_var": 0.4786295572916667, + "learning_rate": 0.0003, + "loss": 11.3532, + "loss/aux_loss": 0.0480783874168992, + "loss/crossentropy": 2.806821274757385, + "loss/logits": 0.841489189863205, + "step": 40190 + }, + { + "epoch": 0.402, + "grad_norm": 12.6875, + "grad_norm_var": 8.168082682291667, + "learning_rate": 0.0003, + "loss": 11.3093, + "loss/aux_loss": 0.04808771722018719, + "loss/crossentropy": 2.7427878618240356, + "loss/logits": 0.8800740391016006, + "step": 40200 + }, + { + "epoch": 0.4021, + "grad_norm": 13.3125, + "grad_norm_var": 0.27024739583333335, + "learning_rate": 0.0003, + "loss": 11.1774, + "loss/aux_loss": 0.048070460185408594, + "loss/crossentropy": 2.5837554335594177, + "loss/logits": 0.855616545677185, + "step": 40210 + }, + { + "epoch": 0.4022, + "grad_norm": 13.8125, + "grad_norm_var": 0.6893229166666667, + "learning_rate": 0.0003, + "loss": 11.2717, + "loss/aux_loss": 0.04808332584798336, + "loss/crossentropy": 2.7743981003761293, + "loss/logits": 0.8714166820049286, + "step": 40220 + }, + { + "epoch": 0.4023, + "grad_norm": 13.375, + "grad_norm_var": 0.743994140625, + "learning_rate": 0.0003, + "loss": 11.2791, + "loss/aux_loss": 0.04807800035923719, + "loss/crossentropy": 2.73896102309227, + "loss/logits": 0.8382152438163757, + "step": 40230 + }, + { + "epoch": 0.4024, + "grad_norm": 15.5625, + "grad_norm_var": 2.974739583333333, + "learning_rate": 0.0003, + "loss": 11.133, + "loss/aux_loss": 0.0480761282145977, + "loss/crossentropy": 2.6960204541683197, + "loss/logits": 0.8285995244979858, + "step": 40240 + }, + { + "epoch": 0.4025, + "grad_norm": 14.6875, + "grad_norm_var": 3.1540201822916667, + "learning_rate": 0.0003, + "loss": 11.3377, + "loss/aux_loss": 0.048080798238515854, + "loss/crossentropy": 2.774839425086975, + "loss/logits": 0.8541697800159455, + "step": 40250 + }, + { + "epoch": 0.4026, + "grad_norm": 15.9375, + "grad_norm_var": 0.5161458333333333, + "learning_rate": 0.0003, + "loss": 11.2857, + "loss/aux_loss": 0.04807197824120522, + "loss/crossentropy": 2.7606529712677004, + "loss/logits": 0.8577615320682526, + "step": 40260 + }, + { + "epoch": 0.4027, + "grad_norm": 13.8125, + "grad_norm_var": 0.4900390625, + "learning_rate": 0.0003, + "loss": 11.2021, + "loss/aux_loss": 0.04807481914758682, + "loss/crossentropy": 2.809233945608139, + "loss/logits": 0.8515638172626495, + "step": 40270 + }, + { + "epoch": 0.4028, + "grad_norm": 15.4375, + "grad_norm_var": 0.39920247395833336, + "learning_rate": 0.0003, + "loss": 11.1816, + "loss/aux_loss": 0.04807350169867277, + "loss/crossentropy": 2.748124420642853, + "loss/logits": 0.8494113475084305, + "step": 40280 + }, + { + "epoch": 0.4029, + "grad_norm": 15.8125, + "grad_norm_var": 0.5233723958333333, + "learning_rate": 0.0003, + "loss": 11.4207, + "loss/aux_loss": 0.04807033948600292, + "loss/crossentropy": 2.719420325756073, + "loss/logits": 0.8667916238307953, + "step": 40290 + }, + { + "epoch": 0.403, + "grad_norm": 14.6875, + "grad_norm_var": 0.27858072916666665, + "learning_rate": 0.0003, + "loss": 11.3065, + "loss/aux_loss": 0.048076901398599145, + "loss/crossentropy": 2.6295212328433992, + "loss/logits": 0.8683276027441025, + "step": 40300 + }, + { + "epoch": 0.4031, + "grad_norm": 16.625, + "grad_norm_var": 16.3119140625, + "learning_rate": 0.0003, + "loss": 11.1657, + "loss/aux_loss": 0.048084932193160054, + "loss/crossentropy": 2.8191932320594786, + "loss/logits": 0.8459627896547317, + "step": 40310 + }, + { + "epoch": 0.4032, + "grad_norm": 14.0625, + "grad_norm_var": 16.589957682291665, + "learning_rate": 0.0003, + "loss": 11.2229, + "loss/aux_loss": 0.04807272832840681, + "loss/crossentropy": 2.8436803817749023, + "loss/logits": 0.8772078216075897, + "step": 40320 + }, + { + "epoch": 0.4033, + "grad_norm": 13.125, + "grad_norm_var": 0.17962239583333334, + "learning_rate": 0.0003, + "loss": 11.2633, + "loss/aux_loss": 0.04807746745646, + "loss/crossentropy": 2.7854455411434174, + "loss/logits": 0.8320712119340896, + "step": 40330 + }, + { + "epoch": 0.4034, + "grad_norm": 13.8125, + "grad_norm_var": 0.3485514322916667, + "learning_rate": 0.0003, + "loss": 11.2072, + "loss/aux_loss": 0.04806860648095608, + "loss/crossentropy": 2.8128843665122987, + "loss/logits": 0.8971195042133331, + "step": 40340 + }, + { + "epoch": 0.4035, + "grad_norm": 12.9375, + "grad_norm_var": 0.5788899739583333, + "learning_rate": 0.0003, + "loss": 11.2615, + "loss/aux_loss": 0.04807734172791243, + "loss/crossentropy": 2.719151735305786, + "loss/logits": 0.8351715385913849, + "step": 40350 + }, + { + "epoch": 0.4036, + "grad_norm": 13.8125, + "grad_norm_var": 0.6541666666666667, + "learning_rate": 0.0003, + "loss": 11.217, + "loss/aux_loss": 0.04807971119880676, + "loss/crossentropy": 2.65439595580101, + "loss/logits": 0.8389561653137207, + "step": 40360 + }, + { + "epoch": 0.4037, + "grad_norm": 12.875, + "grad_norm_var": 0.7363932291666667, + "learning_rate": 0.0003, + "loss": 11.3419, + "loss/aux_loss": 0.048067951761186126, + "loss/crossentropy": 2.8152174830436705, + "loss/logits": 0.8660207390785217, + "step": 40370 + }, + { + "epoch": 0.4038, + "grad_norm": 14.75, + "grad_norm_var": 0.5778645833333333, + "learning_rate": 0.0003, + "loss": 11.2324, + "loss/aux_loss": 0.04808384161442518, + "loss/crossentropy": 2.682347524166107, + "loss/logits": 0.8513666987419128, + "step": 40380 + }, + { + "epoch": 0.4039, + "grad_norm": 14.0, + "grad_norm_var": 0.4456868489583333, + "learning_rate": 0.0003, + "loss": 11.1544, + "loss/aux_loss": 0.04806650690734386, + "loss/crossentropy": 2.735366094112396, + "loss/logits": 0.861262845993042, + "step": 40390 + }, + { + "epoch": 0.404, + "grad_norm": 13.3125, + "grad_norm_var": 0.6650390625, + "learning_rate": 0.0003, + "loss": 11.442, + "loss/aux_loss": 0.0480755427852273, + "loss/crossentropy": 2.8482566595077516, + "loss/logits": 0.8759390920400619, + "step": 40400 + }, + { + "epoch": 0.4041, + "grad_norm": 13.875, + "grad_norm_var": 0.29620768229166666, + "learning_rate": 0.0003, + "loss": 11.2389, + "loss/aux_loss": 0.04807413425296545, + "loss/crossentropy": 2.877766025066376, + "loss/logits": 0.8527081072330475, + "step": 40410 + }, + { + "epoch": 0.4042, + "grad_norm": 15.6875, + "grad_norm_var": 1855.3328125, + "learning_rate": 0.0003, + "loss": 11.212, + "loss/aux_loss": 0.04808369241654873, + "loss/crossentropy": 2.6940404534339906, + "loss/logits": 0.8131880909204483, + "step": 40420 + }, + { + "epoch": 0.4043, + "grad_norm": 15.8125, + "grad_norm_var": 1844.8794270833334, + "learning_rate": 0.0003, + "loss": 11.2414, + "loss/aux_loss": 0.048076518811285496, + "loss/crossentropy": 2.773360276222229, + "loss/logits": 0.8594042271375656, + "step": 40430 + }, + { + "epoch": 0.4044, + "grad_norm": 13.75, + "grad_norm_var": 3.066650390625, + "learning_rate": 0.0003, + "loss": 11.2978, + "loss/aux_loss": 0.048078188113868235, + "loss/crossentropy": 2.703492206335068, + "loss/logits": 0.8389413356781006, + "step": 40440 + }, + { + "epoch": 0.4045, + "grad_norm": 13.8125, + "grad_norm_var": 0.389306640625, + "learning_rate": 0.0003, + "loss": 11.2174, + "loss/aux_loss": 0.04808528777211905, + "loss/crossentropy": 2.682606953382492, + "loss/logits": 0.8352272599935532, + "step": 40450 + }, + { + "epoch": 0.4046, + "grad_norm": 13.1875, + "grad_norm_var": 56.533056640625, + "learning_rate": 0.0003, + "loss": 11.2459, + "loss/aux_loss": 0.04807485770434141, + "loss/crossentropy": 2.8166627526283263, + "loss/logits": 0.907360565662384, + "step": 40460 + }, + { + "epoch": 0.4047, + "grad_norm": 13.5625, + "grad_norm_var": 1.5634765625, + "learning_rate": 0.0003, + "loss": 11.1853, + "loss/aux_loss": 0.048092107847332956, + "loss/crossentropy": 2.744275617599487, + "loss/logits": 0.8467898726463318, + "step": 40470 + }, + { + "epoch": 0.4048, + "grad_norm": 13.3125, + "grad_norm_var": 0.5330729166666667, + "learning_rate": 0.0003, + "loss": 11.3042, + "loss/aux_loss": 0.04806965496391058, + "loss/crossentropy": 2.959182548522949, + "loss/logits": 0.8552993059158325, + "step": 40480 + }, + { + "epoch": 0.4049, + "grad_norm": 12.75, + "grad_norm_var": 0.3731608072916667, + "learning_rate": 0.0003, + "loss": 11.3252, + "loss/aux_loss": 0.04807450994849205, + "loss/crossentropy": 2.826492565870285, + "loss/logits": 0.8594222873449325, + "step": 40490 + }, + { + "epoch": 0.405, + "grad_norm": 16.25, + "grad_norm_var": 1.2333333333333334, + "learning_rate": 0.0003, + "loss": 11.1622, + "loss/aux_loss": 0.04808164164423943, + "loss/crossentropy": 2.7364363431930543, + "loss/logits": 0.8271835565567016, + "step": 40500 + }, + { + "epoch": 0.4051, + "grad_norm": 12.875, + "grad_norm_var": 0.9535807291666667, + "learning_rate": 0.0003, + "loss": 11.2023, + "loss/aux_loss": 0.048075484670698644, + "loss/crossentropy": 2.6764685451984405, + "loss/logits": 0.8407616734504699, + "step": 40510 + }, + { + "epoch": 0.4052, + "grad_norm": 14.375, + "grad_norm_var": 53.86302083333333, + "learning_rate": 0.0003, + "loss": 11.1894, + "loss/aux_loss": 0.048087388090789315, + "loss/crossentropy": 2.6650672793388366, + "loss/logits": 0.8593515366315841, + "step": 40520 + }, + { + "epoch": 0.4053, + "grad_norm": 19.625, + "grad_norm_var": 38.290478515625, + "learning_rate": 0.0003, + "loss": 11.304, + "loss/aux_loss": 0.04808492045849562, + "loss/crossentropy": 2.5504296123981476, + "loss/logits": 0.8223045408725739, + "step": 40530 + }, + { + "epoch": 0.4054, + "grad_norm": 13.25, + "grad_norm_var": 5.257535807291666, + "learning_rate": 0.0003, + "loss": 11.2125, + "loss/aux_loss": 0.04807108696550131, + "loss/crossentropy": 2.6947197139263155, + "loss/logits": 0.8361944794654846, + "step": 40540 + }, + { + "epoch": 0.4055, + "grad_norm": 14.1875, + "grad_norm_var": 0.5137858072916667, + "learning_rate": 0.0003, + "loss": 11.1638, + "loss/aux_loss": 0.048065942153334616, + "loss/crossentropy": 2.8109546184539793, + "loss/logits": 0.829085710644722, + "step": 40550 + }, + { + "epoch": 0.4056, + "grad_norm": 14.0, + "grad_norm_var": 0.6329264322916667, + "learning_rate": 0.0003, + "loss": 11.1777, + "loss/aux_loss": 0.048072488605976106, + "loss/crossentropy": 2.786225712299347, + "loss/logits": 0.8105708062648773, + "step": 40560 + }, + { + "epoch": 0.4057, + "grad_norm": 13.5, + "grad_norm_var": 6.563395182291667, + "learning_rate": 0.0003, + "loss": 11.3507, + "loss/aux_loss": 0.04808564819395542, + "loss/crossentropy": 2.751372504234314, + "loss/logits": 0.8564148962497711, + "step": 40570 + }, + { + "epoch": 0.4058, + "grad_norm": 13.0625, + "grad_norm_var": 0.8296223958333333, + "learning_rate": 0.0003, + "loss": 11.215, + "loss/aux_loss": 0.04807268865406513, + "loss/crossentropy": 2.8208558201789855, + "loss/logits": 0.8638029783964157, + "step": 40580 + }, + { + "epoch": 0.4059, + "grad_norm": 15.5625, + "grad_norm_var": 0.5239420572916667, + "learning_rate": 0.0003, + "loss": 11.28, + "loss/aux_loss": 0.04807143602520227, + "loss/crossentropy": 2.665737110376358, + "loss/logits": 0.8453109055757523, + "step": 40590 + }, + { + "epoch": 0.406, + "grad_norm": 15.375, + "grad_norm_var": 0.7660807291666667, + "learning_rate": 0.0003, + "loss": 11.1757, + "loss/aux_loss": 0.04807765483856201, + "loss/crossentropy": 2.630817985534668, + "loss/logits": 0.8509970605373383, + "step": 40600 + }, + { + "epoch": 0.4061, + "grad_norm": 15.4375, + "grad_norm_var": 0.9417805989583333, + "learning_rate": 0.0003, + "loss": 11.1785, + "loss/aux_loss": 0.04807794988155365, + "loss/crossentropy": 2.5632822811603546, + "loss/logits": 0.8242575019598007, + "step": 40610 + }, + { + "epoch": 0.4062, + "grad_norm": 16.5, + "grad_norm_var": 0.7395182291666667, + "learning_rate": 0.0003, + "loss": 11.1997, + "loss/aux_loss": 0.04807608798146248, + "loss/crossentropy": 2.808782720565796, + "loss/logits": 0.8660883277654647, + "step": 40620 + }, + { + "epoch": 0.4063, + "grad_norm": 15.6875, + "grad_norm_var": 1.1874348958333334, + "learning_rate": 0.0003, + "loss": 11.2832, + "loss/aux_loss": 0.04806890748441219, + "loss/crossentropy": 2.926540124416351, + "loss/logits": 0.86942158639431, + "step": 40630 + }, + { + "epoch": 0.4064, + "grad_norm": 14.75, + "grad_norm_var": 131.5337890625, + "learning_rate": 0.0003, + "loss": 11.3468, + "loss/aux_loss": 0.04808539636433125, + "loss/crossentropy": 2.8163744449615478, + "loss/logits": 0.8719450891017914, + "step": 40640 + }, + { + "epoch": 0.4065, + "grad_norm": 13.8125, + "grad_norm_var": 1.8114583333333334, + "learning_rate": 0.0003, + "loss": 11.3543, + "loss/aux_loss": 0.04807546809315681, + "loss/crossentropy": 2.686785101890564, + "loss/logits": 0.877001416683197, + "step": 40650 + }, + { + "epoch": 0.4066, + "grad_norm": 13.125, + "grad_norm_var": 0.5839680989583333, + "learning_rate": 0.0003, + "loss": 11.1328, + "loss/aux_loss": 0.04807546567171812, + "loss/crossentropy": 2.637565851211548, + "loss/logits": 0.8497596830129623, + "step": 40660 + }, + { + "epoch": 0.4067, + "grad_norm": 12.6875, + "grad_norm_var": 0.59375, + "learning_rate": 0.0003, + "loss": 11.3324, + "loss/aux_loss": 0.04807331208139658, + "loss/crossentropy": 2.5887813806533813, + "loss/logits": 0.8211749017238616, + "step": 40670 + }, + { + "epoch": 0.4068, + "grad_norm": 14.0, + "grad_norm_var": 0.9645182291666666, + "learning_rate": 0.0003, + "loss": 11.2639, + "loss/aux_loss": 0.0480765325948596, + "loss/crossentropy": 2.6423826932907106, + "loss/logits": 0.852640700340271, + "step": 40680 + }, + { + "epoch": 0.4069, + "grad_norm": 14.375, + "grad_norm_var": 0.35792643229166665, + "learning_rate": 0.0003, + "loss": 11.0373, + "loss/aux_loss": 0.0480792922899127, + "loss/crossentropy": 2.738467514514923, + "loss/logits": 0.8378350138664246, + "step": 40690 + }, + { + "epoch": 0.407, + "grad_norm": 14.0, + "grad_norm_var": 3.504280598958333, + "learning_rate": 0.0003, + "loss": 11.5276, + "loss/aux_loss": 0.0480826161801815, + "loss/crossentropy": 2.7524060189723967, + "loss/logits": 0.8455175578594207, + "step": 40700 + }, + { + "epoch": 0.4071, + "grad_norm": 13.625, + "grad_norm_var": 0.8868326822916667, + "learning_rate": 0.0003, + "loss": 11.1276, + "loss/aux_loss": 0.04806787483394146, + "loss/crossentropy": 2.55394446849823, + "loss/logits": 0.8222862929105759, + "step": 40710 + }, + { + "epoch": 0.4072, + "grad_norm": 14.625, + "grad_norm_var": 0.6627604166666666, + "learning_rate": 0.0003, + "loss": 11.2755, + "loss/aux_loss": 0.04808096699416638, + "loss/crossentropy": 2.7554591298103333, + "loss/logits": 0.8580325931310654, + "step": 40720 + }, + { + "epoch": 0.4073, + "grad_norm": 15.0, + "grad_norm_var": 1.2700358072916667, + "learning_rate": 0.0003, + "loss": 11.2255, + "loss/aux_loss": 0.04807104617357254, + "loss/crossentropy": 2.79397691488266, + "loss/logits": 0.8811484813690186, + "step": 40730 + }, + { + "epoch": 0.4074, + "grad_norm": 15.25, + "grad_norm_var": 1.4535807291666667, + "learning_rate": 0.0003, + "loss": 11.267, + "loss/aux_loss": 0.04806910958141088, + "loss/crossentropy": 2.837631583213806, + "loss/logits": 0.8389610022306442, + "step": 40740 + }, + { + "epoch": 0.4075, + "grad_norm": 14.25, + "grad_norm_var": 1.6921712239583333, + "learning_rate": 0.0003, + "loss": 11.1731, + "loss/aux_loss": 0.04807141162455082, + "loss/crossentropy": 2.658374536037445, + "loss/logits": 0.834993302822113, + "step": 40750 + }, + { + "epoch": 0.4076, + "grad_norm": 14.6875, + "grad_norm_var": 1.2516764322916667, + "learning_rate": 0.0003, + "loss": 11.1559, + "loss/aux_loss": 0.04808028191328049, + "loss/crossentropy": 2.8203831791877745, + "loss/logits": 0.849945318698883, + "step": 40760 + }, + { + "epoch": 0.4077, + "grad_norm": 14.8125, + "grad_norm_var": 0.5962076822916667, + "learning_rate": 0.0003, + "loss": 11.2235, + "loss/aux_loss": 0.04808716755360365, + "loss/crossentropy": 2.6665258586406706, + "loss/logits": 0.8106503874063492, + "step": 40770 + }, + { + "epoch": 0.4078, + "grad_norm": 14.375, + "grad_norm_var": 0.47994791666666664, + "learning_rate": 0.0003, + "loss": 11.1902, + "loss/aux_loss": 0.048067199811339376, + "loss/crossentropy": 2.8551108717918394, + "loss/logits": 0.8341933101415634, + "step": 40780 + }, + { + "epoch": 0.4079, + "grad_norm": 13.6875, + "grad_norm_var": 0.46404622395833334, + "learning_rate": 0.0003, + "loss": 11.3637, + "loss/aux_loss": 0.048095259629189965, + "loss/crossentropy": 2.789784300327301, + "loss/logits": 0.8456574827432632, + "step": 40790 + }, + { + "epoch": 0.408, + "grad_norm": 14.625, + "grad_norm_var": 0.36692708333333335, + "learning_rate": 0.0003, + "loss": 11.353, + "loss/aux_loss": 0.04808163102716208, + "loss/crossentropy": 2.769635444879532, + "loss/logits": 0.8221473515033721, + "step": 40800 + }, + { + "epoch": 0.4081, + "grad_norm": 14.4375, + "grad_norm_var": 0.235791015625, + "learning_rate": 0.0003, + "loss": 11.0647, + "loss/aux_loss": 0.04807380121201277, + "loss/crossentropy": 2.565002143383026, + "loss/logits": 0.80843525826931, + "step": 40810 + }, + { + "epoch": 0.4082, + "grad_norm": 13.4375, + "grad_norm_var": 0.17630208333333333, + "learning_rate": 0.0003, + "loss": 11.2184, + "loss/aux_loss": 0.04807892981916666, + "loss/crossentropy": 2.784299910068512, + "loss/logits": 0.8424362123012543, + "step": 40820 + }, + { + "epoch": 0.4083, + "grad_norm": 15.8125, + "grad_norm_var": 0.6523274739583333, + "learning_rate": 0.0003, + "loss": 11.1706, + "loss/aux_loss": 0.048079443722963335, + "loss/crossentropy": 2.6999199271202086, + "loss/logits": 0.8214786738157273, + "step": 40830 + }, + { + "epoch": 0.4084, + "grad_norm": 14.25, + "grad_norm_var": 0.57265625, + "learning_rate": 0.0003, + "loss": 11.2877, + "loss/aux_loss": 0.04807170610874891, + "loss/crossentropy": 2.830400151014328, + "loss/logits": 0.8450867384672165, + "step": 40840 + }, + { + "epoch": 0.4085, + "grad_norm": 14.125, + "grad_norm_var": 0.26555989583333334, + "learning_rate": 0.0003, + "loss": 11.3547, + "loss/aux_loss": 0.048065755516290665, + "loss/crossentropy": 2.723574197292328, + "loss/logits": 0.8342130482196808, + "step": 40850 + }, + { + "epoch": 0.4086, + "grad_norm": 13.875, + "grad_norm_var": 0.4051432291666667, + "learning_rate": 0.0003, + "loss": 11.1298, + "loss/aux_loss": 0.0480728205293417, + "loss/crossentropy": 2.7340495467185972, + "loss/logits": 0.8313911110162735, + "step": 40860 + }, + { + "epoch": 0.4087, + "grad_norm": 15.4375, + "grad_norm_var": 0.48880208333333336, + "learning_rate": 0.0003, + "loss": 11.1538, + "loss/aux_loss": 0.04808170907199383, + "loss/crossentropy": 2.6556981980800627, + "loss/logits": 0.8474443554878235, + "step": 40870 + }, + { + "epoch": 0.4088, + "grad_norm": 14.25, + "grad_norm_var": 0.42962239583333334, + "learning_rate": 0.0003, + "loss": 11.0847, + "loss/aux_loss": 0.04807398784905672, + "loss/crossentropy": 2.612995356321335, + "loss/logits": 0.8423753798007965, + "step": 40880 + }, + { + "epoch": 0.4089, + "grad_norm": 16.375, + "grad_norm_var": 1.2555826822916667, + "learning_rate": 0.0003, + "loss": 11.2143, + "loss/aux_loss": 0.04807846713811159, + "loss/crossentropy": 2.7332702219486236, + "loss/logits": 0.864795908331871, + "step": 40890 + }, + { + "epoch": 0.409, + "grad_norm": 13.6875, + "grad_norm_var": 10.117692057291666, + "learning_rate": 0.0003, + "loss": 11.2962, + "loss/aux_loss": 0.0480788690969348, + "loss/crossentropy": 2.662673217058182, + "loss/logits": 0.8612865924835205, + "step": 40900 + }, + { + "epoch": 0.4091, + "grad_norm": 14.5, + "grad_norm_var": 0.31131184895833336, + "learning_rate": 0.0003, + "loss": 11.1592, + "loss/aux_loss": 0.04808458536863327, + "loss/crossentropy": 2.793060463666916, + "loss/logits": 0.8244423866271973, + "step": 40910 + }, + { + "epoch": 0.4092, + "grad_norm": 18.0, + "grad_norm_var": 1.2817545572916667, + "learning_rate": 0.0003, + "loss": 11.3325, + "loss/aux_loss": 0.048066679015755655, + "loss/crossentropy": 2.822656285762787, + "loss/logits": 0.8820368677377701, + "step": 40920 + }, + { + "epoch": 0.4093, + "grad_norm": 14.1875, + "grad_norm_var": 1.397900390625, + "learning_rate": 0.0003, + "loss": 11.1258, + "loss/aux_loss": 0.04807949531823397, + "loss/crossentropy": 2.837810254096985, + "loss/logits": 0.8587910264730454, + "step": 40930 + }, + { + "epoch": 0.4094, + "grad_norm": 15.5625, + "grad_norm_var": 0.8061848958333333, + "learning_rate": 0.0003, + "loss": 11.2212, + "loss/aux_loss": 0.04807252325117588, + "loss/crossentropy": 2.607957309484482, + "loss/logits": 0.8224194586277008, + "step": 40940 + }, + { + "epoch": 0.4095, + "grad_norm": 13.6875, + "grad_norm_var": 0.6403483072916667, + "learning_rate": 0.0003, + "loss": 11.1002, + "loss/aux_loss": 0.048077072761952874, + "loss/crossentropy": 2.6987807989120483, + "loss/logits": 0.8149879366159439, + "step": 40950 + }, + { + "epoch": 0.4096, + "grad_norm": 14.125, + "grad_norm_var": 1.686962890625, + "learning_rate": 0.0003, + "loss": 11.3598, + "loss/aux_loss": 0.048078333213925364, + "loss/crossentropy": 2.849722057580948, + "loss/logits": 0.8561419308185577, + "step": 40960 + }, + { + "epoch": 0.4097, + "grad_norm": 13.5, + "grad_norm_var": 1.6792805989583333, + "learning_rate": 0.0003, + "loss": 11.2066, + "loss/aux_loss": 0.04807558581233025, + "loss/crossentropy": 2.783593249320984, + "loss/logits": 0.8805976897478104, + "step": 40970 + }, + { + "epoch": 0.4098, + "grad_norm": 16.75, + "grad_norm_var": 0.8942057291666666, + "learning_rate": 0.0003, + "loss": 11.2792, + "loss/aux_loss": 0.04807433895766735, + "loss/crossentropy": 2.619139677286148, + "loss/logits": 0.8347267210483551, + "step": 40980 + }, + { + "epoch": 0.4099, + "grad_norm": 16.125, + "grad_norm_var": 1.1270833333333334, + "learning_rate": 0.0003, + "loss": 11.2101, + "loss/aux_loss": 0.04807988088577986, + "loss/crossentropy": 2.6135978281497954, + "loss/logits": 0.8248639732599259, + "step": 40990 + }, + { + "epoch": 0.41, + "grad_norm": 15.125, + "grad_norm_var": 1.1157389322916667, + "learning_rate": 0.0003, + "loss": 11.1235, + "loss/aux_loss": 0.04808098264038563, + "loss/crossentropy": 2.711561453342438, + "loss/logits": 0.8340432167053222, + "step": 41000 + }, + { + "epoch": 0.4101, + "grad_norm": 15.0, + "grad_norm_var": 0.7129557291666667, + "learning_rate": 0.0003, + "loss": 11.141, + "loss/aux_loss": 0.04807994924485683, + "loss/crossentropy": 2.672397243976593, + "loss/logits": 0.8090786308050155, + "step": 41010 + }, + { + "epoch": 0.4102, + "grad_norm": 13.3125, + "grad_norm_var": 0.5226399739583333, + "learning_rate": 0.0003, + "loss": 11.2172, + "loss/aux_loss": 0.04806915018707514, + "loss/crossentropy": 2.7911486864089965, + "loss/logits": 0.8176318496465683, + "step": 41020 + }, + { + "epoch": 0.4103, + "grad_norm": 13.8125, + "grad_norm_var": 0.27029622395833336, + "learning_rate": 0.0003, + "loss": 11.0946, + "loss/aux_loss": 0.04807211291044951, + "loss/crossentropy": 2.7278398156166075, + "loss/logits": 0.837305772304535, + "step": 41030 + }, + { + "epoch": 0.4104, + "grad_norm": 13.75, + "grad_norm_var": 0.372900390625, + "learning_rate": 0.0003, + "loss": 11.1227, + "loss/aux_loss": 0.04808787349611521, + "loss/crossentropy": 2.6341135680675505, + "loss/logits": 0.8195017322897911, + "step": 41040 + }, + { + "epoch": 0.4105, + "grad_norm": 13.1875, + "grad_norm_var": 2.482796223958333, + "learning_rate": 0.0003, + "loss": 11.2554, + "loss/aux_loss": 0.04807057995349169, + "loss/crossentropy": 2.843722766637802, + "loss/logits": 0.8595122218132019, + "step": 41050 + }, + { + "epoch": 0.4106, + "grad_norm": 14.125, + "grad_norm_var": 2.5208333333333335, + "learning_rate": 0.0003, + "loss": 11.1853, + "loss/aux_loss": 0.04808246102184057, + "loss/crossentropy": 2.6582208454608915, + "loss/logits": 0.8346160590648651, + "step": 41060 + }, + { + "epoch": 0.4107, + "grad_norm": 15.3125, + "grad_norm_var": 0.71171875, + "learning_rate": 0.0003, + "loss": 11.3008, + "loss/aux_loss": 0.04806876853108406, + "loss/crossentropy": 2.6965928435325623, + "loss/logits": 0.8599708110094071, + "step": 41070 + }, + { + "epoch": 0.4108, + "grad_norm": 14.125, + "grad_norm_var": 0.8223307291666667, + "learning_rate": 0.0003, + "loss": 11.0954, + "loss/aux_loss": 0.048069384321570395, + "loss/crossentropy": 2.714770442247391, + "loss/logits": 0.8336487352848053, + "step": 41080 + }, + { + "epoch": 0.4109, + "grad_norm": 13.8125, + "grad_norm_var": 0.6469889322916667, + "learning_rate": 0.0003, + "loss": 11.2912, + "loss/aux_loss": 0.04807863663882017, + "loss/crossentropy": 2.7766244173049928, + "loss/logits": 0.8574995458126068, + "step": 41090 + }, + { + "epoch": 0.411, + "grad_norm": 14.75, + "grad_norm_var": 1.143603515625, + "learning_rate": 0.0003, + "loss": 10.9719, + "loss/aux_loss": 0.04807050917297602, + "loss/crossentropy": 2.6750208139419556, + "loss/logits": 0.8000975757837295, + "step": 41100 + }, + { + "epoch": 0.4111, + "grad_norm": 14.0625, + "grad_norm_var": 1.7415201822916666, + "learning_rate": 0.0003, + "loss": 11.0615, + "loss/aux_loss": 0.048084990307688716, + "loss/crossentropy": 2.8587915897369385, + "loss/logits": 0.859082692861557, + "step": 41110 + }, + { + "epoch": 0.4112, + "grad_norm": 12.9375, + "grad_norm_var": 0.441650390625, + "learning_rate": 0.0003, + "loss": 11.2455, + "loss/aux_loss": 0.04807133413851261, + "loss/crossentropy": 2.6838557541370394, + "loss/logits": 0.8341993808746337, + "step": 41120 + }, + { + "epoch": 0.4113, + "grad_norm": 14.5, + "grad_norm_var": 0.8056640625, + "learning_rate": 0.0003, + "loss": 11.3317, + "loss/aux_loss": 0.04806279819458723, + "loss/crossentropy": 2.934316062927246, + "loss/logits": 0.852023234963417, + "step": 41130 + }, + { + "epoch": 0.4114, + "grad_norm": 13.1875, + "grad_norm_var": 0.5887858072916666, + "learning_rate": 0.0003, + "loss": 11.1059, + "loss/aux_loss": 0.0480830904096365, + "loss/crossentropy": 2.8082756876945494, + "loss/logits": 0.814395149052143, + "step": 41140 + }, + { + "epoch": 0.4115, + "grad_norm": 14.1875, + "grad_norm_var": 3.896354166666667, + "learning_rate": 0.0003, + "loss": 11.2447, + "loss/aux_loss": 0.048078673891723156, + "loss/crossentropy": 2.7707399845123293, + "loss/logits": 0.8573799431324005, + "step": 41150 + }, + { + "epoch": 0.4116, + "grad_norm": 13.75, + "grad_norm_var": 3.89609375, + "learning_rate": 0.0003, + "loss": 11.2491, + "loss/aux_loss": 0.0480762155726552, + "loss/crossentropy": 2.9101900935173033, + "loss/logits": 0.8609474629163743, + "step": 41160 + }, + { + "epoch": 0.4117, + "grad_norm": 14.1875, + "grad_norm_var": 1.0238932291666667, + "learning_rate": 0.0003, + "loss": 11.1961, + "loss/aux_loss": 0.048080237582325935, + "loss/crossentropy": 2.606840658187866, + "loss/logits": 0.8273939996957779, + "step": 41170 + }, + { + "epoch": 0.4118, + "grad_norm": 14.5, + "grad_norm_var": 0.1619140625, + "learning_rate": 0.0003, + "loss": 11.2267, + "loss/aux_loss": 0.04807858131825924, + "loss/crossentropy": 2.684722530841827, + "loss/logits": 0.840096390247345, + "step": 41180 + }, + { + "epoch": 0.4119, + "grad_norm": 14.375, + "grad_norm_var": 0.6071451822916667, + "learning_rate": 0.0003, + "loss": 11.2679, + "loss/aux_loss": 0.04808023814111948, + "loss/crossentropy": 2.697424811124802, + "loss/logits": 0.8633444011211395, + "step": 41190 + }, + { + "epoch": 0.412, + "grad_norm": 14.0625, + "grad_norm_var": 0.28274739583333336, + "learning_rate": 0.0003, + "loss": 11.0866, + "loss/aux_loss": 0.04807175993919373, + "loss/crossentropy": 2.634129375219345, + "loss/logits": 0.8138844251632691, + "step": 41200 + }, + { + "epoch": 0.4121, + "grad_norm": 14.1875, + "grad_norm_var": 0.5567057291666667, + "learning_rate": 0.0003, + "loss": 11.0525, + "loss/aux_loss": 0.048080427944660185, + "loss/crossentropy": 2.6594059228897096, + "loss/logits": 0.8541360199451447, + "step": 41210 + }, + { + "epoch": 0.4122, + "grad_norm": 14.4375, + "grad_norm_var": 1.0149576822916666, + "learning_rate": 0.0003, + "loss": 11.2432, + "loss/aux_loss": 0.04808235038071871, + "loss/crossentropy": 2.797593057155609, + "loss/logits": 0.886846199631691, + "step": 41220 + }, + { + "epoch": 0.4123, + "grad_norm": 13.4375, + "grad_norm_var": 0.8604166666666667, + "learning_rate": 0.0003, + "loss": 11.0625, + "loss/aux_loss": 0.04806336238980293, + "loss/crossentropy": 2.474899399280548, + "loss/logits": 0.7937245279550552, + "step": 41230 + }, + { + "epoch": 0.4124, + "grad_norm": 13.625, + "grad_norm_var": 0.5230305989583334, + "learning_rate": 0.0003, + "loss": 11.2082, + "loss/aux_loss": 0.048086734302341935, + "loss/crossentropy": 2.6535877227783202, + "loss/logits": 0.8522655874490738, + "step": 41240 + }, + { + "epoch": 0.4125, + "grad_norm": 13.25, + "grad_norm_var": 13.269124348958334, + "learning_rate": 0.0003, + "loss": 11.2381, + "loss/aux_loss": 0.048068624176085, + "loss/crossentropy": 2.7681680560112, + "loss/logits": 0.8193393349647522, + "step": 41250 + }, + { + "epoch": 0.4126, + "grad_norm": 13.0, + "grad_norm_var": 14.011962890625, + "learning_rate": 0.0003, + "loss": 11.1778, + "loss/aux_loss": 0.04807403292506933, + "loss/crossentropy": 2.693704390525818, + "loss/logits": 0.861787760257721, + "step": 41260 + }, + { + "epoch": 0.4127, + "grad_norm": 14.6875, + "grad_norm_var": 0.5087076822916666, + "learning_rate": 0.0003, + "loss": 11.2949, + "loss/aux_loss": 0.04806809015572071, + "loss/crossentropy": 2.527338033914566, + "loss/logits": 0.8236821800470352, + "step": 41270 + }, + { + "epoch": 0.4128, + "grad_norm": 13.9375, + "grad_norm_var": 0.5129557291666667, + "learning_rate": 0.0003, + "loss": 11.22, + "loss/aux_loss": 0.04808800853788853, + "loss/crossentropy": 2.7711110353469848, + "loss/logits": 0.8424245923757553, + "step": 41280 + }, + { + "epoch": 0.4129, + "grad_norm": 14.375, + "grad_norm_var": 0.23878580729166668, + "learning_rate": 0.0003, + "loss": 11.1104, + "loss/aux_loss": 0.04806681144982576, + "loss/crossentropy": 2.7462151408195496, + "loss/logits": 0.878471040725708, + "step": 41290 + }, + { + "epoch": 0.413, + "grad_norm": 13.6875, + "grad_norm_var": 0.24386393229166667, + "learning_rate": 0.0003, + "loss": 11.2019, + "loss/aux_loss": 0.048082958348095416, + "loss/crossentropy": 2.857834202051163, + "loss/logits": 0.8067145884037018, + "step": 41300 + }, + { + "epoch": 0.4131, + "grad_norm": 13.6875, + "grad_norm_var": 0.2322265625, + "learning_rate": 0.0003, + "loss": 11.0486, + "loss/aux_loss": 0.048069640435278414, + "loss/crossentropy": 2.736476743221283, + "loss/logits": 0.8467221200466156, + "step": 41310 + }, + { + "epoch": 0.4132, + "grad_norm": 13.4375, + "grad_norm_var": 1.1536295572916666, + "learning_rate": 0.0003, + "loss": 11.3639, + "loss/aux_loss": 0.048079471290111545, + "loss/crossentropy": 2.822791963815689, + "loss/logits": 0.891073489189148, + "step": 41320 + }, + { + "epoch": 0.4133, + "grad_norm": 16.25, + "grad_norm_var": 1.016650390625, + "learning_rate": 0.0003, + "loss": 11.2361, + "loss/aux_loss": 0.04807520154863596, + "loss/crossentropy": 2.7339930176734923, + "loss/logits": 0.8536212533712387, + "step": 41330 + }, + { + "epoch": 0.4134, + "grad_norm": 14.5, + "grad_norm_var": 0.8587076822916667, + "learning_rate": 0.0003, + "loss": 11.0921, + "loss/aux_loss": 0.048079288192093374, + "loss/crossentropy": 2.6249010980129244, + "loss/logits": 0.8314791291952133, + "step": 41340 + }, + { + "epoch": 0.4135, + "grad_norm": 15.3125, + "grad_norm_var": 0.3551432291666667, + "learning_rate": 0.0003, + "loss": 11.1947, + "loss/aux_loss": 0.04807145558297634, + "loss/crossentropy": 2.6940083622932436, + "loss/logits": 0.8695379942655563, + "step": 41350 + }, + { + "epoch": 0.4136, + "grad_norm": 13.625, + "grad_norm_var": 0.5160807291666667, + "learning_rate": 0.0003, + "loss": 11.1861, + "loss/aux_loss": 0.04807656276971102, + "loss/crossentropy": 2.5916694521903993, + "loss/logits": 0.844970840215683, + "step": 41360 + }, + { + "epoch": 0.4137, + "grad_norm": 13.8125, + "grad_norm_var": 0.5186848958333333, + "learning_rate": 0.0003, + "loss": 11.362, + "loss/aux_loss": 0.04807461556047201, + "loss/crossentropy": 2.6328794419765473, + "loss/logits": 0.8364063590765, + "step": 41370 + }, + { + "epoch": 0.4138, + "grad_norm": 14.6875, + "grad_norm_var": 0.237744140625, + "learning_rate": 0.0003, + "loss": 11.201, + "loss/aux_loss": 0.04807669036090374, + "loss/crossentropy": 2.700971281528473, + "loss/logits": 0.8363195568323135, + "step": 41380 + }, + { + "epoch": 0.4139, + "grad_norm": 14.0625, + "grad_norm_var": 0.34524739583333336, + "learning_rate": 0.0003, + "loss": 11.3147, + "loss/aux_loss": 0.04808150418102741, + "loss/crossentropy": 2.7341397404670715, + "loss/logits": 0.8220134526491165, + "step": 41390 + }, + { + "epoch": 0.414, + "grad_norm": 13.75, + "grad_norm_var": 0.3525390625, + "learning_rate": 0.0003, + "loss": 11.303, + "loss/aux_loss": 0.04807320982217789, + "loss/crossentropy": 2.7358368039131165, + "loss/logits": 0.8331804543733596, + "step": 41400 + }, + { + "epoch": 0.4141, + "grad_norm": 15.0625, + "grad_norm_var": 1.0388020833333333, + "learning_rate": 0.0003, + "loss": 11.2284, + "loss/aux_loss": 0.04808115866035223, + "loss/crossentropy": 2.6575556874275206, + "loss/logits": 0.8795315742492675, + "step": 41410 + }, + { + "epoch": 0.4142, + "grad_norm": 14.5625, + "grad_norm_var": 0.8374348958333333, + "learning_rate": 0.0003, + "loss": 11.2057, + "loss/aux_loss": 0.04808524567633867, + "loss/crossentropy": 2.639425593614578, + "loss/logits": 0.8381777286529541, + "step": 41420 + }, + { + "epoch": 0.4143, + "grad_norm": 13.375, + "grad_norm_var": 0.7231608072916667, + "learning_rate": 0.0003, + "loss": 11.2503, + "loss/aux_loss": 0.04807372502982617, + "loss/crossentropy": 2.7047315418720244, + "loss/logits": 0.8312118053436279, + "step": 41430 + }, + { + "epoch": 0.4144, + "grad_norm": 14.3125, + "grad_norm_var": 0.2953125, + "learning_rate": 0.0003, + "loss": 11.1798, + "loss/aux_loss": 0.048076724819839003, + "loss/crossentropy": 2.651015895605087, + "loss/logits": 0.840973848104477, + "step": 41440 + }, + { + "epoch": 0.4145, + "grad_norm": 13.625, + "grad_norm_var": 0.5132649739583334, + "learning_rate": 0.0003, + "loss": 11.2366, + "loss/aux_loss": 0.04806795343756676, + "loss/crossentropy": 2.8247196197509767, + "loss/logits": 0.8217742323875428, + "step": 41450 + }, + { + "epoch": 0.4146, + "grad_norm": 14.1875, + "grad_norm_var": 0.46027018229166666, + "learning_rate": 0.0003, + "loss": 11.0727, + "loss/aux_loss": 0.04807949978858232, + "loss/crossentropy": 2.5847329258918763, + "loss/logits": 0.8199368387460708, + "step": 41460 + }, + { + "epoch": 0.4147, + "grad_norm": 13.8125, + "grad_norm_var": 0.314697265625, + "learning_rate": 0.0003, + "loss": 11.2538, + "loss/aux_loss": 0.04806741494685411, + "loss/crossentropy": 2.833824622631073, + "loss/logits": 0.8637136548757554, + "step": 41470 + }, + { + "epoch": 0.4148, + "grad_norm": 13.25, + "grad_norm_var": 0.16380208333333332, + "learning_rate": 0.0003, + "loss": 11.1968, + "loss/aux_loss": 0.04808125514537096, + "loss/crossentropy": 2.6305019736289976, + "loss/logits": 0.8417465597391128, + "step": 41480 + }, + { + "epoch": 0.4149, + "grad_norm": 25.375, + "grad_norm_var": 8.584749348958333, + "learning_rate": 0.0003, + "loss": 11.1024, + "loss/aux_loss": 0.0480692395940423, + "loss/crossentropy": 2.78939009308815, + "loss/logits": 0.8338617235422134, + "step": 41490 + }, + { + "epoch": 0.415, + "grad_norm": 15.625, + "grad_norm_var": 8.855452473958334, + "learning_rate": 0.0003, + "loss": 11.1944, + "loss/aux_loss": 0.04809089172631502, + "loss/crossentropy": 2.633754700422287, + "loss/logits": 0.8260821491479874, + "step": 41500 + }, + { + "epoch": 0.4151, + "grad_norm": 15.125, + "grad_norm_var": 1.2046223958333333, + "learning_rate": 0.0003, + "loss": 11.1703, + "loss/aux_loss": 0.048080214858055116, + "loss/crossentropy": 2.872915321588516, + "loss/logits": 0.847340676188469, + "step": 41510 + }, + { + "epoch": 0.4152, + "grad_norm": 15.4375, + "grad_norm_var": 1.2669108072916666, + "learning_rate": 0.0003, + "loss": 11.0204, + "loss/aux_loss": 0.04807033985853195, + "loss/crossentropy": 2.750588357448578, + "loss/logits": 0.8613759696483612, + "step": 41520 + }, + { + "epoch": 0.4153, + "grad_norm": 13.3125, + "grad_norm_var": 0.5091145833333334, + "learning_rate": 0.0003, + "loss": 11.2335, + "loss/aux_loss": 0.04808351546525955, + "loss/crossentropy": 2.702675199508667, + "loss/logits": 0.8650101304054261, + "step": 41530 + }, + { + "epoch": 0.4154, + "grad_norm": 15.75, + "grad_norm_var": 0.498291015625, + "learning_rate": 0.0003, + "loss": 11.2771, + "loss/aux_loss": 0.048078674264252184, + "loss/crossentropy": 2.69321893453598, + "loss/logits": 0.8795695304870605, + "step": 41540 + }, + { + "epoch": 0.4155, + "grad_norm": 13.9375, + "grad_norm_var": 1.3921223958333333, + "learning_rate": 0.0003, + "loss": 11.2882, + "loss/aux_loss": 0.0480771878734231, + "loss/crossentropy": 2.849241554737091, + "loss/logits": 0.8312081456184387, + "step": 41550 + }, + { + "epoch": 0.4156, + "grad_norm": 13.875, + "grad_norm_var": 1.506884765625, + "learning_rate": 0.0003, + "loss": 11.2257, + "loss/aux_loss": 0.04807361774146557, + "loss/crossentropy": 2.63561954498291, + "loss/logits": 0.8598904728889465, + "step": 41560 + }, + { + "epoch": 0.4157, + "grad_norm": 13.5, + "grad_norm_var": 0.570947265625, + "learning_rate": 0.0003, + "loss": 11.2278, + "loss/aux_loss": 0.04808267373591661, + "loss/crossentropy": 2.668130397796631, + "loss/logits": 0.8185748666524887, + "step": 41570 + }, + { + "epoch": 0.4158, + "grad_norm": 14.4375, + "grad_norm_var": 0.470166015625, + "learning_rate": 0.0003, + "loss": 11.1306, + "loss/aux_loss": 0.04807985983788967, + "loss/crossentropy": 2.642909526824951, + "loss/logits": 0.8164368301630021, + "step": 41580 + }, + { + "epoch": 0.4159, + "grad_norm": 14.6875, + "grad_norm_var": 0.4727701822916667, + "learning_rate": 0.0003, + "loss": 11.1133, + "loss/aux_loss": 0.04805450364947319, + "loss/crossentropy": 2.6953525304794312, + "loss/logits": 0.8492616504430771, + "step": 41590 + }, + { + "epoch": 0.416, + "grad_norm": 15.375, + "grad_norm_var": 50.5171875, + "learning_rate": 0.0003, + "loss": 11.4191, + "loss/aux_loss": 0.048086438328027725, + "loss/crossentropy": 2.696820414066315, + "loss/logits": 0.8366290658712388, + "step": 41600 + }, + { + "epoch": 0.4161, + "grad_norm": 14.875, + "grad_norm_var": 0.4183430989583333, + "learning_rate": 0.0003, + "loss": 11.178, + "loss/aux_loss": 0.04807772561907768, + "loss/crossentropy": 2.730132043361664, + "loss/logits": 0.8532672584056854, + "step": 41610 + }, + { + "epoch": 0.4162, + "grad_norm": 13.6875, + "grad_norm_var": 0.45625, + "learning_rate": 0.0003, + "loss": 11.2598, + "loss/aux_loss": 0.04807155448943377, + "loss/crossentropy": 2.8405850529670715, + "loss/logits": 0.8775646090507507, + "step": 41620 + }, + { + "epoch": 0.4163, + "grad_norm": 14.25, + "grad_norm_var": 3.5476399739583333, + "learning_rate": 0.0003, + "loss": 11.392, + "loss/aux_loss": 0.048077188059687616, + "loss/crossentropy": 2.6101203083992006, + "loss/logits": 0.8570107728242874, + "step": 41630 + }, + { + "epoch": 0.4164, + "grad_norm": 15.0625, + "grad_norm_var": 2.9973795572916666, + "learning_rate": 0.0003, + "loss": 11.3679, + "loss/aux_loss": 0.0480748301371932, + "loss/crossentropy": 2.6226659595966337, + "loss/logits": 0.8122975617647171, + "step": 41640 + }, + { + "epoch": 0.4165, + "grad_norm": 15.0, + "grad_norm_var": 0.42233072916666664, + "learning_rate": 0.0003, + "loss": 11.2247, + "loss/aux_loss": 0.04807394295930863, + "loss/crossentropy": 2.675206708908081, + "loss/logits": 0.8600716292858124, + "step": 41650 + }, + { + "epoch": 0.4166, + "grad_norm": 14.1875, + "grad_norm_var": 0.31951497395833334, + "learning_rate": 0.0003, + "loss": 11.3153, + "loss/aux_loss": 0.04808421637862921, + "loss/crossentropy": 2.7114802062511445, + "loss/logits": 0.8375776976346969, + "step": 41660 + }, + { + "epoch": 0.4167, + "grad_norm": 13.6875, + "grad_norm_var": 3.4596354166666665, + "learning_rate": 0.0003, + "loss": 11.1687, + "loss/aux_loss": 0.048071696795523165, + "loss/crossentropy": 2.6611205101013184, + "loss/logits": 0.8244008392095565, + "step": 41670 + }, + { + "epoch": 0.4168, + "grad_norm": 15.4375, + "grad_norm_var": 3.692122395833333, + "learning_rate": 0.0003, + "loss": 11.1502, + "loss/aux_loss": 0.048073360323905946, + "loss/crossentropy": 2.6855955958366393, + "loss/logits": 0.8512616366147995, + "step": 41680 + }, + { + "epoch": 0.4169, + "grad_norm": 12.75, + "grad_norm_var": 0.6722493489583333, + "learning_rate": 0.0003, + "loss": 11.2267, + "loss/aux_loss": 0.04807688854634762, + "loss/crossentropy": 2.8731314897537232, + "loss/logits": 0.8567210525274277, + "step": 41690 + }, + { + "epoch": 0.417, + "grad_norm": 14.3125, + "grad_norm_var": 0.6924479166666667, + "learning_rate": 0.0003, + "loss": 11.4032, + "loss/aux_loss": 0.04807659108191729, + "loss/crossentropy": 2.819017004966736, + "loss/logits": 0.8509372651576996, + "step": 41700 + }, + { + "epoch": 0.4171, + "grad_norm": 14.4375, + "grad_norm_var": 0.627978515625, + "learning_rate": 0.0003, + "loss": 11.1128, + "loss/aux_loss": 0.04807431064546108, + "loss/crossentropy": 2.587670737504959, + "loss/logits": 0.8356228917837143, + "step": 41710 + }, + { + "epoch": 0.4172, + "grad_norm": 13.9375, + "grad_norm_var": 0.8402180989583333, + "learning_rate": 0.0003, + "loss": 11.3018, + "loss/aux_loss": 0.048075992986559866, + "loss/crossentropy": 2.6217161655426025, + "loss/logits": 0.8416286587715149, + "step": 41720 + }, + { + "epoch": 0.4173, + "grad_norm": 12.5, + "grad_norm_var": 1.4369140625, + "learning_rate": 0.0003, + "loss": 11.2056, + "loss/aux_loss": 0.048069687001407145, + "loss/crossentropy": 2.705968415737152, + "loss/logits": 0.8546758621931076, + "step": 41730 + }, + { + "epoch": 0.4174, + "grad_norm": 14.6875, + "grad_norm_var": 1.9641764322916666, + "learning_rate": 0.0003, + "loss": 11.0999, + "loss/aux_loss": 0.04808080643415451, + "loss/crossentropy": 2.729911983013153, + "loss/logits": 0.8501161009073257, + "step": 41740 + }, + { + "epoch": 0.4175, + "grad_norm": 14.6875, + "grad_norm_var": 4.536051432291667, + "learning_rate": 0.0003, + "loss": 11.2002, + "loss/aux_loss": 0.0480776721611619, + "loss/crossentropy": 2.471704250574112, + "loss/logits": 0.8195729270577431, + "step": 41750 + }, + { + "epoch": 0.4176, + "grad_norm": 15.75, + "grad_norm_var": 0.6348307291666667, + "learning_rate": 0.0003, + "loss": 11.1175, + "loss/aux_loss": 0.04808681774884462, + "loss/crossentropy": 2.607026255130768, + "loss/logits": 0.8329499930143356, + "step": 41760 + }, + { + "epoch": 0.4177, + "grad_norm": 13.6875, + "grad_norm_var": 0.5440104166666667, + "learning_rate": 0.0003, + "loss": 11.2311, + "loss/aux_loss": 0.048072993755340576, + "loss/crossentropy": 2.548939037322998, + "loss/logits": 0.8520324468612671, + "step": 41770 + }, + { + "epoch": 0.4178, + "grad_norm": 14.0, + "grad_norm_var": 0.30130208333333336, + "learning_rate": 0.0003, + "loss": 11.1902, + "loss/aux_loss": 0.04807652682065964, + "loss/crossentropy": 2.70545357465744, + "loss/logits": 0.8085658639669419, + "step": 41780 + }, + { + "epoch": 0.4179, + "grad_norm": 14.1875, + "grad_norm_var": 0.785791015625, + "learning_rate": 0.0003, + "loss": 11.2129, + "loss/aux_loss": 0.04808551725000143, + "loss/crossentropy": 2.7015809535980226, + "loss/logits": 0.8569774001836776, + "step": 41790 + }, + { + "epoch": 0.418, + "grad_norm": 13.5625, + "grad_norm_var": 0.7012858072916667, + "learning_rate": 0.0003, + "loss": 11.1873, + "loss/aux_loss": 0.04806754495948553, + "loss/crossentropy": 2.7126809656620026, + "loss/logits": 0.8494727402925492, + "step": 41800 + }, + { + "epoch": 0.4181, + "grad_norm": 15.4375, + "grad_norm_var": 0.334375, + "learning_rate": 0.0003, + "loss": 11.2021, + "loss/aux_loss": 0.04807236008346081, + "loss/crossentropy": 2.6086998522281646, + "loss/logits": 0.854032838344574, + "step": 41810 + }, + { + "epoch": 0.4182, + "grad_norm": 14.4375, + "grad_norm_var": 0.279541015625, + "learning_rate": 0.0003, + "loss": 11.2497, + "loss/aux_loss": 0.048076076060533525, + "loss/crossentropy": 2.7394097089767455, + "loss/logits": 0.8530152827501297, + "step": 41820 + }, + { + "epoch": 0.4183, + "grad_norm": 13.5625, + "grad_norm_var": 0.3042805989583333, + "learning_rate": 0.0003, + "loss": 11.2131, + "loss/aux_loss": 0.048075619898736474, + "loss/crossentropy": 2.693272775411606, + "loss/logits": 0.8179311394691468, + "step": 41830 + }, + { + "epoch": 0.4184, + "grad_norm": 14.875, + "grad_norm_var": 0.6973958333333333, + "learning_rate": 0.0003, + "loss": 11.2869, + "loss/aux_loss": 0.048083697259426114, + "loss/crossentropy": 2.731438684463501, + "loss/logits": 0.8441088706254959, + "step": 41840 + }, + { + "epoch": 0.4185, + "grad_norm": 14.8125, + "grad_norm_var": 0.5348795572916667, + "learning_rate": 0.0003, + "loss": 11.2476, + "loss/aux_loss": 0.04807062391191721, + "loss/crossentropy": 2.7216593980789185, + "loss/logits": 0.8569325089454651, + "step": 41850 + }, + { + "epoch": 0.4186, + "grad_norm": 14.375, + "grad_norm_var": 0.318994140625, + "learning_rate": 0.0003, + "loss": 11.1622, + "loss/aux_loss": 0.04807673562318086, + "loss/crossentropy": 2.7688432216644285, + "loss/logits": 0.868677607178688, + "step": 41860 + }, + { + "epoch": 0.4187, + "grad_norm": 14.375, + "grad_norm_var": 0.549853515625, + "learning_rate": 0.0003, + "loss": 11.1659, + "loss/aux_loss": 0.04807185679674149, + "loss/crossentropy": 2.743114960193634, + "loss/logits": 0.8016538411378861, + "step": 41870 + }, + { + "epoch": 0.4188, + "grad_norm": 14.4375, + "grad_norm_var": 0.633056640625, + "learning_rate": 0.0003, + "loss": 11.4034, + "loss/aux_loss": 0.04806502480059862, + "loss/crossentropy": 2.7593605399131773, + "loss/logits": 0.8573271870613098, + "step": 41880 + }, + { + "epoch": 0.4189, + "grad_norm": 14.8125, + "grad_norm_var": 0.4806640625, + "learning_rate": 0.0003, + "loss": 11.2272, + "loss/aux_loss": 0.048074369132518766, + "loss/crossentropy": 2.729891151189804, + "loss/logits": 0.877751037478447, + "step": 41890 + }, + { + "epoch": 0.419, + "grad_norm": 15.5, + "grad_norm_var": 0.5794108072916667, + "learning_rate": 0.0003, + "loss": 11.2764, + "loss/aux_loss": 0.04806425198912621, + "loss/crossentropy": 2.7819466471672056, + "loss/logits": 0.8514897584915161, + "step": 41900 + }, + { + "epoch": 0.4191, + "grad_norm": 15.375, + "grad_norm_var": 0.35149739583333334, + "learning_rate": 0.0003, + "loss": 11.1274, + "loss/aux_loss": 0.048081301525235175, + "loss/crossentropy": 2.7535706400871276, + "loss/logits": 0.862214544415474, + "step": 41910 + }, + { + "epoch": 0.4192, + "grad_norm": 13.5, + "grad_norm_var": 0.5299479166666666, + "learning_rate": 0.0003, + "loss": 11.2672, + "loss/aux_loss": 0.0480682335793972, + "loss/crossentropy": 2.62730153799057, + "loss/logits": 0.8124941200017929, + "step": 41920 + }, + { + "epoch": 0.4193, + "grad_norm": 14.875, + "grad_norm_var": 0.52109375, + "learning_rate": 0.0003, + "loss": 11.2227, + "loss/aux_loss": 0.04808680079877377, + "loss/crossentropy": 2.6323707461357118, + "loss/logits": 0.8262597292661666, + "step": 41930 + }, + { + "epoch": 0.4194, + "grad_norm": 14.5, + "grad_norm_var": 0.290625, + "learning_rate": 0.0003, + "loss": 11.0658, + "loss/aux_loss": 0.048071213997900486, + "loss/crossentropy": 2.8325400054454803, + "loss/logits": 0.8364947497844696, + "step": 41940 + }, + { + "epoch": 0.4195, + "grad_norm": 13.75, + "grad_norm_var": 1.2986979166666666, + "learning_rate": 0.0003, + "loss": 11.3139, + "loss/aux_loss": 0.048080979473888875, + "loss/crossentropy": 2.669241964817047, + "loss/logits": 0.853290992975235, + "step": 41950 + }, + { + "epoch": 0.4196, + "grad_norm": 13.5625, + "grad_norm_var": 1.460791015625, + "learning_rate": 0.0003, + "loss": 11.1538, + "loss/aux_loss": 0.048079793155193326, + "loss/crossentropy": 2.7789398312568663, + "loss/logits": 0.8316181004047394, + "step": 41960 + }, + { + "epoch": 0.4197, + "grad_norm": 15.0625, + "grad_norm_var": 0.3932291666666667, + "learning_rate": 0.0003, + "loss": 11.411, + "loss/aux_loss": 0.04805813655257225, + "loss/crossentropy": 2.850307047367096, + "loss/logits": 0.8470256596803665, + "step": 41970 + }, + { + "epoch": 0.4198, + "grad_norm": 14.125, + "grad_norm_var": 0.590087890625, + "learning_rate": 0.0003, + "loss": 11.2341, + "loss/aux_loss": 0.04809781014919281, + "loss/crossentropy": 2.73042853474617, + "loss/logits": 0.8153778612613678, + "step": 41980 + }, + { + "epoch": 0.4199, + "grad_norm": 14.875, + "grad_norm_var": 1.03515625, + "learning_rate": 0.0003, + "loss": 11.2463, + "loss/aux_loss": 0.048078482411801814, + "loss/crossentropy": 2.771644139289856, + "loss/logits": 0.8651615500450134, + "step": 41990 + }, + { + "epoch": 0.42, + "grad_norm": 13.9375, + "grad_norm_var": 1.0833170572916666, + "learning_rate": 0.0003, + "loss": 11.2312, + "loss/aux_loss": 0.04807273019105196, + "loss/crossentropy": 2.685261583328247, + "loss/logits": 0.8609261780977249, + "step": 42000 + }, + { + "epoch": 0.4201, + "grad_norm": 15.1875, + "grad_norm_var": 0.44581705729166665, + "learning_rate": 0.0003, + "loss": 11.23, + "loss/aux_loss": 0.048074769973754886, + "loss/crossentropy": 2.605684131383896, + "loss/logits": 0.8102221429347992, + "step": 42010 + }, + { + "epoch": 0.4202, + "grad_norm": 13.5625, + "grad_norm_var": 0.5038899739583333, + "learning_rate": 0.0003, + "loss": 11.2462, + "loss/aux_loss": 0.04807645082473755, + "loss/crossentropy": 2.828294575214386, + "loss/logits": 0.8302065849304199, + "step": 42020 + }, + { + "epoch": 0.4203, + "grad_norm": 14.1875, + "grad_norm_var": 0.4630208333333333, + "learning_rate": 0.0003, + "loss": 11.2458, + "loss/aux_loss": 0.04807501658797264, + "loss/crossentropy": 2.6530769050121306, + "loss/logits": 0.8315863937139512, + "step": 42030 + }, + { + "epoch": 0.4204, + "grad_norm": 14.3125, + "grad_norm_var": 0.361962890625, + "learning_rate": 0.0003, + "loss": 11.1785, + "loss/aux_loss": 0.0480683233588934, + "loss/crossentropy": 2.774942231178284, + "loss/logits": 0.8557955861091614, + "step": 42040 + }, + { + "epoch": 0.4205, + "grad_norm": 15.8125, + "grad_norm_var": 1.2765625, + "learning_rate": 0.0003, + "loss": 11.1787, + "loss/aux_loss": 0.04808812700212002, + "loss/crossentropy": 2.7394619226455688, + "loss/logits": 0.8685553550720215, + "step": 42050 + }, + { + "epoch": 0.4206, + "grad_norm": 15.3125, + "grad_norm_var": 1.025244140625, + "learning_rate": 0.0003, + "loss": 11.141, + "loss/aux_loss": 0.04807517230510712, + "loss/crossentropy": 2.646501141786575, + "loss/logits": 0.8364500343799591, + "step": 42060 + }, + { + "epoch": 0.4207, + "grad_norm": 15.4375, + "grad_norm_var": 6.862223307291667, + "learning_rate": 0.0003, + "loss": 11.1773, + "loss/aux_loss": 0.048075311444699766, + "loss/crossentropy": 2.95753812789917, + "loss/logits": 0.8557416766881942, + "step": 42070 + }, + { + "epoch": 0.4208, + "grad_norm": 13.25, + "grad_norm_var": 2.1541015625, + "learning_rate": 0.0003, + "loss": 11.1619, + "loss/aux_loss": 0.04807597082108259, + "loss/crossentropy": 2.747110295295715, + "loss/logits": 0.8265171319246292, + "step": 42080 + }, + { + "epoch": 0.4209, + "grad_norm": 13.3125, + "grad_norm_var": 0.35651041666666666, + "learning_rate": 0.0003, + "loss": 11.2256, + "loss/aux_loss": 0.048070704378187654, + "loss/crossentropy": 2.6773354530334474, + "loss/logits": 0.8335412830114365, + "step": 42090 + }, + { + "epoch": 0.421, + "grad_norm": 14.0, + "grad_norm_var": 0.7171223958333334, + "learning_rate": 0.0003, + "loss": 11.3195, + "loss/aux_loss": 0.04807304628193378, + "loss/crossentropy": 2.7497352182865145, + "loss/logits": 0.8508964985609054, + "step": 42100 + }, + { + "epoch": 0.4211, + "grad_norm": 15.5, + "grad_norm_var": 3.123177083333333, + "learning_rate": 0.0003, + "loss": 11.2999, + "loss/aux_loss": 0.0480742210522294, + "loss/crossentropy": 2.733784317970276, + "loss/logits": 0.8499576389789582, + "step": 42110 + }, + { + "epoch": 0.4212, + "grad_norm": 14.625, + "grad_norm_var": 3.007926432291667, + "learning_rate": 0.0003, + "loss": 11.1922, + "loss/aux_loss": 0.04807363022118807, + "loss/crossentropy": 2.723239630460739, + "loss/logits": 0.8264488846063613, + "step": 42120 + }, + { + "epoch": 0.4213, + "grad_norm": 14.3125, + "grad_norm_var": 0.3167805989583333, + "learning_rate": 0.0003, + "loss": 11.2428, + "loss/aux_loss": 0.04807157460600138, + "loss/crossentropy": 2.740461474657059, + "loss/logits": 0.8402502328157425, + "step": 42130 + }, + { + "epoch": 0.4214, + "grad_norm": 14.5, + "grad_norm_var": 0.317822265625, + "learning_rate": 0.0003, + "loss": 11.3028, + "loss/aux_loss": 0.04807231556624174, + "loss/crossentropy": 2.737111634016037, + "loss/logits": 0.8425071030855179, + "step": 42140 + }, + { + "epoch": 0.4215, + "grad_norm": 13.875, + "grad_norm_var": 0.472509765625, + "learning_rate": 0.0003, + "loss": 11.1031, + "loss/aux_loss": 0.048074459098279475, + "loss/crossentropy": 2.6582253992557527, + "loss/logits": 0.8121216595172882, + "step": 42150 + }, + { + "epoch": 0.4216, + "grad_norm": 15.1875, + "grad_norm_var": 0.7161295572916667, + "learning_rate": 0.0003, + "loss": 11.2392, + "loss/aux_loss": 0.04806816950440407, + "loss/crossentropy": 2.573642885684967, + "loss/logits": 0.8212338477373123, + "step": 42160 + }, + { + "epoch": 0.4217, + "grad_norm": 14.25, + "grad_norm_var": 0.7730305989583334, + "learning_rate": 0.0003, + "loss": 11.1449, + "loss/aux_loss": 0.048084497638046744, + "loss/crossentropy": 2.7038708448410036, + "loss/logits": 0.8355427473783493, + "step": 42170 + }, + { + "epoch": 0.4218, + "grad_norm": 13.8125, + "grad_norm_var": 0.9291015625, + "learning_rate": 0.0003, + "loss": 11.1934, + "loss/aux_loss": 0.048070290684700014, + "loss/crossentropy": 2.932389295101166, + "loss/logits": 0.8793515950441361, + "step": 42180 + }, + { + "epoch": 0.4219, + "grad_norm": 13.75, + "grad_norm_var": 0.6808430989583333, + "learning_rate": 0.0003, + "loss": 11.2596, + "loss/aux_loss": 0.04806995559483766, + "loss/crossentropy": 2.848974609375, + "loss/logits": 0.887108889222145, + "step": 42190 + }, + { + "epoch": 0.422, + "grad_norm": 14.3125, + "grad_norm_var": 0.36139322916666666, + "learning_rate": 0.0003, + "loss": 11.0734, + "loss/aux_loss": 0.04808047562837601, + "loss/crossentropy": 2.7708349883556367, + "loss/logits": 0.8702079772949218, + "step": 42200 + }, + { + "epoch": 0.4221, + "grad_norm": 13.6875, + "grad_norm_var": 0.3931640625, + "learning_rate": 0.0003, + "loss": 11.0987, + "loss/aux_loss": 0.04806628059595823, + "loss/crossentropy": 2.75937157869339, + "loss/logits": 0.8796556890010834, + "step": 42210 + }, + { + "epoch": 0.4222, + "grad_norm": 14.6875, + "grad_norm_var": 0.392822265625, + "learning_rate": 0.0003, + "loss": 11.296, + "loss/aux_loss": 0.0480836084112525, + "loss/crossentropy": 2.686858814954758, + "loss/logits": 0.8247960180044174, + "step": 42220 + }, + { + "epoch": 0.4223, + "grad_norm": 14.5625, + "grad_norm_var": 0.758837890625, + "learning_rate": 0.0003, + "loss": 11.1395, + "loss/aux_loss": 0.048082459904253486, + "loss/crossentropy": 2.888724946975708, + "loss/logits": 0.840698453783989, + "step": 42230 + }, + { + "epoch": 0.4224, + "grad_norm": 15.4375, + "grad_norm_var": 2.5072265625, + "learning_rate": 0.0003, + "loss": 11.3138, + "loss/aux_loss": 0.04806939046829939, + "loss/crossentropy": 2.7978740334510803, + "loss/logits": 0.8392590701580047, + "step": 42240 + }, + { + "epoch": 0.4225, + "grad_norm": 14.6875, + "grad_norm_var": 1.006494140625, + "learning_rate": 0.0003, + "loss": 11.2709, + "loss/aux_loss": 0.048084151558578016, + "loss/crossentropy": 2.694107210636139, + "loss/logits": 0.8511635422706604, + "step": 42250 + }, + { + "epoch": 0.4226, + "grad_norm": 13.8125, + "grad_norm_var": 0.407275390625, + "learning_rate": 0.0003, + "loss": 11.1712, + "loss/aux_loss": 0.04807136338204145, + "loss/crossentropy": 2.8651691317558288, + "loss/logits": 0.8362912058830261, + "step": 42260 + }, + { + "epoch": 0.4227, + "grad_norm": 13.8125, + "grad_norm_var": 0.4098307291666667, + "learning_rate": 0.0003, + "loss": 11.301, + "loss/aux_loss": 0.04806836117058992, + "loss/crossentropy": 2.8899617552757264, + "loss/logits": 0.8420800924301147, + "step": 42270 + }, + { + "epoch": 0.4228, + "grad_norm": 14.0, + "grad_norm_var": 0.4931640625, + "learning_rate": 0.0003, + "loss": 11.2737, + "loss/aux_loss": 0.048084226250648496, + "loss/crossentropy": 2.731697905063629, + "loss/logits": 0.8795881062746048, + "step": 42280 + }, + { + "epoch": 0.4229, + "grad_norm": 14.5625, + "grad_norm_var": 0.454150390625, + "learning_rate": 0.0003, + "loss": 11.0708, + "loss/aux_loss": 0.048074116744101045, + "loss/crossentropy": 2.6348713278770446, + "loss/logits": 0.8292662829160691, + "step": 42290 + }, + { + "epoch": 0.423, + "grad_norm": 14.625, + "grad_norm_var": 0.9536295572916667, + "learning_rate": 0.0003, + "loss": 11.159, + "loss/aux_loss": 0.04808458648622036, + "loss/crossentropy": 2.6694875180721285, + "loss/logits": 0.8599371790885926, + "step": 42300 + }, + { + "epoch": 0.4231, + "grad_norm": 15.3125, + "grad_norm_var": 0.34308268229166666, + "learning_rate": 0.0003, + "loss": 11.0977, + "loss/aux_loss": 0.04808226190507412, + "loss/crossentropy": 2.7208563089370728, + "loss/logits": 0.8512184768915176, + "step": 42310 + }, + { + "epoch": 0.4232, + "grad_norm": 14.1875, + "grad_norm_var": 0.48631184895833335, + "learning_rate": 0.0003, + "loss": 11.0967, + "loss/aux_loss": 0.04806763473898172, + "loss/crossentropy": 2.7380436182022097, + "loss/logits": 0.8502931475639344, + "step": 42320 + }, + { + "epoch": 0.4233, + "grad_norm": 13.5625, + "grad_norm_var": 0.8145182291666667, + "learning_rate": 0.0003, + "loss": 11.0071, + "loss/aux_loss": 0.048071845807135104, + "loss/crossentropy": 2.7763596057891844, + "loss/logits": 0.8628914952278137, + "step": 42330 + }, + { + "epoch": 0.4234, + "grad_norm": 14.375, + "grad_norm_var": 2.943603515625, + "learning_rate": 0.0003, + "loss": 10.9369, + "loss/aux_loss": 0.04808341804891825, + "loss/crossentropy": 2.7524753272533418, + "loss/logits": 0.829263374209404, + "step": 42340 + }, + { + "epoch": 0.4235, + "grad_norm": 12.8125, + "grad_norm_var": 2.596337890625, + "learning_rate": 0.0003, + "loss": 11.1457, + "loss/aux_loss": 0.04806433003395796, + "loss/crossentropy": 2.7292973041534423, + "loss/logits": 0.8203612565994263, + "step": 42350 + }, + { + "epoch": 0.4236, + "grad_norm": 15.1875, + "grad_norm_var": 0.830322265625, + "learning_rate": 0.0003, + "loss": 11.224, + "loss/aux_loss": 0.048083837144076824, + "loss/crossentropy": 2.7010737299919128, + "loss/logits": 0.8339439123868942, + "step": 42360 + }, + { + "epoch": 0.4237, + "grad_norm": 17.75, + "grad_norm_var": 132.42135416666667, + "learning_rate": 0.0003, + "loss": 11.3355, + "loss/aux_loss": 0.04808040820062161, + "loss/crossentropy": 2.748473286628723, + "loss/logits": 0.8279530495405197, + "step": 42370 + }, + { + "epoch": 0.4238, + "grad_norm": 14.125, + "grad_norm_var": 134.461962890625, + "learning_rate": 0.0003, + "loss": 11.1351, + "loss/aux_loss": 0.04808204211294651, + "loss/crossentropy": 2.726352107524872, + "loss/logits": 0.8515573889017105, + "step": 42380 + }, + { + "epoch": 0.4239, + "grad_norm": 14.3125, + "grad_norm_var": 0.803369140625, + "learning_rate": 0.0003, + "loss": 11.1572, + "loss/aux_loss": 0.04808192439377308, + "loss/crossentropy": 2.593681287765503, + "loss/logits": 0.8259778410196305, + "step": 42390 + }, + { + "epoch": 0.424, + "grad_norm": 14.6875, + "grad_norm_var": 0.7634765625, + "learning_rate": 0.0003, + "loss": 11.1284, + "loss/aux_loss": 0.04806700516492128, + "loss/crossentropy": 2.8635907411575316, + "loss/logits": 0.8664378643035888, + "step": 42400 + }, + { + "epoch": 0.4241, + "grad_norm": 14.75, + "grad_norm_var": 1.23203125, + "learning_rate": 0.0003, + "loss": 11.085, + "loss/aux_loss": 0.048076603934168814, + "loss/crossentropy": 2.645522326231003, + "loss/logits": 0.8040447324514389, + "step": 42410 + }, + { + "epoch": 0.4242, + "grad_norm": 14.25, + "grad_norm_var": 0.42120768229166666, + "learning_rate": 0.0003, + "loss": 11.172, + "loss/aux_loss": 0.04807595741003752, + "loss/crossentropy": 2.6108031809329986, + "loss/logits": 0.8477170407772064, + "step": 42420 + }, + { + "epoch": 0.4243, + "grad_norm": 14.0, + "grad_norm_var": 0.3681640625, + "learning_rate": 0.0003, + "loss": 11.1488, + "loss/aux_loss": 0.04806941151618958, + "loss/crossentropy": 2.6864835619926453, + "loss/logits": 0.8509948909282684, + "step": 42430 + }, + { + "epoch": 0.4244, + "grad_norm": 13.375, + "grad_norm_var": 0.464697265625, + "learning_rate": 0.0003, + "loss": 11.3073, + "loss/aux_loss": 0.04807401150465011, + "loss/crossentropy": 2.864110291004181, + "loss/logits": 0.8419386476278305, + "step": 42440 + }, + { + "epoch": 0.4245, + "grad_norm": 14.6875, + "grad_norm_var": 1.1081868489583333, + "learning_rate": 0.0003, + "loss": 11.109, + "loss/aux_loss": 0.04807962235063314, + "loss/crossentropy": 2.665892016887665, + "loss/logits": 0.8466892153024673, + "step": 42450 + }, + { + "epoch": 0.4246, + "grad_norm": 14.8125, + "grad_norm_var": 0.9630045572916667, + "learning_rate": 0.0003, + "loss": 11.1531, + "loss/aux_loss": 0.048076581209897995, + "loss/crossentropy": 2.7316180169582367, + "loss/logits": 0.8452065467834473, + "step": 42460 + }, + { + "epoch": 0.4247, + "grad_norm": 13.4375, + "grad_norm_var": 0.8898274739583333, + "learning_rate": 0.0003, + "loss": 11.1563, + "loss/aux_loss": 0.04807441867887974, + "loss/crossentropy": 2.7544716358184815, + "loss/logits": 0.8261888146400451, + "step": 42470 + }, + { + "epoch": 0.4248, + "grad_norm": 15.3125, + "grad_norm_var": 0.7120930989583333, + "learning_rate": 0.0003, + "loss": 11.3566, + "loss/aux_loss": 0.04807336274534464, + "loss/crossentropy": 2.8429744720458983, + "loss/logits": 0.8528753489255905, + "step": 42480 + }, + { + "epoch": 0.4249, + "grad_norm": 13.9375, + "grad_norm_var": 0.7978515625, + "learning_rate": 0.0003, + "loss": 11.2925, + "loss/aux_loss": 0.04807384237647057, + "loss/crossentropy": 2.8680081605911254, + "loss/logits": 0.8669378757476807, + "step": 42490 + }, + { + "epoch": 0.425, + "grad_norm": 15.0, + "grad_norm_var": 1.0207682291666667, + "learning_rate": 0.0003, + "loss": 11.3442, + "loss/aux_loss": 0.04807020053267479, + "loss/crossentropy": 2.7715225398540495, + "loss/logits": 0.8685531944036484, + "step": 42500 + }, + { + "epoch": 0.4251, + "grad_norm": 14.3125, + "grad_norm_var": 0.5525390625, + "learning_rate": 0.0003, + "loss": 11.0004, + "loss/aux_loss": 0.048080707900226116, + "loss/crossentropy": 2.6281380653381348, + "loss/logits": 0.8210119009017944, + "step": 42510 + }, + { + "epoch": 0.4252, + "grad_norm": 15.125, + "grad_norm_var": 0.4669108072916667, + "learning_rate": 0.0003, + "loss": 11.2617, + "loss/aux_loss": 0.048080474697053434, + "loss/crossentropy": 2.61398241519928, + "loss/logits": 0.8228483706712723, + "step": 42520 + }, + { + "epoch": 0.4253, + "grad_norm": 12.6875, + "grad_norm_var": 1.0641764322916667, + "learning_rate": 0.0003, + "loss": 11.0279, + "loss/aux_loss": 0.04806278124451637, + "loss/crossentropy": 2.4613637685775758, + "loss/logits": 0.8284233272075653, + "step": 42530 + }, + { + "epoch": 0.4254, + "grad_norm": 15.375, + "grad_norm_var": 67.23683268229166, + "learning_rate": 0.0003, + "loss": 10.9845, + "loss/aux_loss": 0.048084072582423684, + "loss/crossentropy": 2.721170890331268, + "loss/logits": 0.8446434617042542, + "step": 42540 + }, + { + "epoch": 0.4255, + "grad_norm": 13.75, + "grad_norm_var": 67.12888997395834, + "learning_rate": 0.0003, + "loss": 11.0296, + "loss/aux_loss": 0.048082982562482356, + "loss/crossentropy": 2.51130490899086, + "loss/logits": 0.7815639197826385, + "step": 42550 + }, + { + "epoch": 0.4256, + "grad_norm": 14.875, + "grad_norm_var": 15.626302083333334, + "learning_rate": 0.0003, + "loss": 11.2562, + "loss/aux_loss": 0.04806295093148947, + "loss/crossentropy": 2.7693334579467774, + "loss/logits": 0.8466608166694641, + "step": 42560 + }, + { + "epoch": 0.4257, + "grad_norm": 14.5, + "grad_norm_var": 15.836572265625, + "learning_rate": 0.0003, + "loss": 11.2921, + "loss/aux_loss": 0.04808557108044624, + "loss/crossentropy": 2.6812986373901366, + "loss/logits": 0.841954892873764, + "step": 42570 + }, + { + "epoch": 0.4258, + "grad_norm": 14.3125, + "grad_norm_var": 0.978369140625, + "learning_rate": 0.0003, + "loss": 11.252, + "loss/aux_loss": 0.048081206530332564, + "loss/crossentropy": 2.687948948144913, + "loss/logits": 0.8682124525308609, + "step": 42580 + }, + { + "epoch": 0.4259, + "grad_norm": 15.4375, + "grad_norm_var": 0.7228515625, + "learning_rate": 0.0003, + "loss": 11.1966, + "loss/aux_loss": 0.0480744980275631, + "loss/crossentropy": 2.668538528680801, + "loss/logits": 0.8255208849906921, + "step": 42590 + }, + { + "epoch": 0.426, + "grad_norm": 14.75, + "grad_norm_var": 0.9821451822916667, + "learning_rate": 0.0003, + "loss": 11.1911, + "loss/aux_loss": 0.04807751923799515, + "loss/crossentropy": 2.661841082572937, + "loss/logits": 0.8377692878246308, + "step": 42600 + }, + { + "epoch": 0.4261, + "grad_norm": 15.6875, + "grad_norm_var": 0.9484375, + "learning_rate": 0.0003, + "loss": 11.3501, + "loss/aux_loss": 0.04808114971965551, + "loss/crossentropy": 2.902574121952057, + "loss/logits": 0.8539043575525284, + "step": 42610 + }, + { + "epoch": 0.4262, + "grad_norm": 13.5625, + "grad_norm_var": 0.9540201822916666, + "learning_rate": 0.0003, + "loss": 11.1911, + "loss/aux_loss": 0.048068844713270664, + "loss/crossentropy": 2.7202011168003084, + "loss/logits": 0.8331751823425293, + "step": 42620 + }, + { + "epoch": 0.4263, + "grad_norm": 13.9375, + "grad_norm_var": 0.24140625, + "learning_rate": 0.0003, + "loss": 11.3027, + "loss/aux_loss": 0.048070861399173735, + "loss/crossentropy": 2.889012670516968, + "loss/logits": 0.8704992473125458, + "step": 42630 + }, + { + "epoch": 0.4264, + "grad_norm": 15.125, + "grad_norm_var": 0.6125, + "learning_rate": 0.0003, + "loss": 11.1455, + "loss/aux_loss": 0.04808156322687864, + "loss/crossentropy": 2.7083074033260344, + "loss/logits": 0.8564732939004898, + "step": 42640 + }, + { + "epoch": 0.4265, + "grad_norm": 13.625, + "grad_norm_var": 0.61484375, + "learning_rate": 0.0003, + "loss": 11.2121, + "loss/aux_loss": 0.048069928959012034, + "loss/crossentropy": 2.73454931974411, + "loss/logits": 0.8486188590526581, + "step": 42650 + }, + { + "epoch": 0.4266, + "grad_norm": 12.875, + "grad_norm_var": 0.33984375, + "learning_rate": 0.0003, + "loss": 11.1947, + "loss/aux_loss": 0.04808113239705562, + "loss/crossentropy": 2.792097818851471, + "loss/logits": 0.8488514006137848, + "step": 42660 + }, + { + "epoch": 0.4267, + "grad_norm": 13.5, + "grad_norm_var": 1.0273274739583333, + "learning_rate": 0.0003, + "loss": 11.0177, + "loss/aux_loss": 0.04808440897613764, + "loss/crossentropy": 2.567507326602936, + "loss/logits": 0.8366076290607453, + "step": 42670 + }, + { + "epoch": 0.4268, + "grad_norm": 13.3125, + "grad_norm_var": 0.37941080729166665, + "learning_rate": 0.0003, + "loss": 11.0272, + "loss/aux_loss": 0.048066616617143156, + "loss/crossentropy": 2.7124005913734437, + "loss/logits": 0.8338280886411666, + "step": 42680 + }, + { + "epoch": 0.4269, + "grad_norm": 14.0625, + "grad_norm_var": 0.2353515625, + "learning_rate": 0.0003, + "loss": 11.178, + "loss/aux_loss": 0.048080355115234855, + "loss/crossentropy": 2.7802015602588654, + "loss/logits": 0.8339311271905899, + "step": 42690 + }, + { + "epoch": 0.427, + "grad_norm": 15.0625, + "grad_norm_var": 0.48956705729166666, + "learning_rate": 0.0003, + "loss": 11.0708, + "loss/aux_loss": 0.048078592866659164, + "loss/crossentropy": 2.6688977122306823, + "loss/logits": 0.7996150583028794, + "step": 42700 + }, + { + "epoch": 0.4271, + "grad_norm": 14.625, + "grad_norm_var": 6.747379557291667, + "learning_rate": 0.0003, + "loss": 11.1714, + "loss/aux_loss": 0.048077211156487464, + "loss/crossentropy": 2.715333503484726, + "loss/logits": 0.8404178529977798, + "step": 42710 + }, + { + "epoch": 0.4272, + "grad_norm": 13.5625, + "grad_norm_var": 0.27734375, + "learning_rate": 0.0003, + "loss": 11.0897, + "loss/aux_loss": 0.04808191582560539, + "loss/crossentropy": 2.709022808074951, + "loss/logits": 0.8491158545017242, + "step": 42720 + }, + { + "epoch": 0.4273, + "grad_norm": 12.9375, + "grad_norm_var": 0.72578125, + "learning_rate": 0.0003, + "loss": 11.0368, + "loss/aux_loss": 0.048076875135302545, + "loss/crossentropy": 2.7228225231170655, + "loss/logits": 0.8443385303020478, + "step": 42730 + }, + { + "epoch": 0.4274, + "grad_norm": 16.25, + "grad_norm_var": 1.623291015625, + "learning_rate": 0.0003, + "loss": 11.2172, + "loss/aux_loss": 0.04807112403213978, + "loss/crossentropy": 2.7902682304382322, + "loss/logits": 0.863338616490364, + "step": 42740 + }, + { + "epoch": 0.4275, + "grad_norm": 14.25, + "grad_norm_var": 1.578125, + "learning_rate": 0.0003, + "loss": 11.2052, + "loss/aux_loss": 0.048067062720656396, + "loss/crossentropy": 2.757642900943756, + "loss/logits": 0.8275944203138351, + "step": 42750 + }, + { + "epoch": 0.4276, + "grad_norm": 15.8125, + "grad_norm_var": 0.6692708333333334, + "learning_rate": 0.0003, + "loss": 11.0956, + "loss/aux_loss": 0.04808202516287565, + "loss/crossentropy": 2.728620332479477, + "loss/logits": 0.8508161783218384, + "step": 42760 + }, + { + "epoch": 0.4277, + "grad_norm": 13.4375, + "grad_norm_var": 0.5416015625, + "learning_rate": 0.0003, + "loss": 11.0911, + "loss/aux_loss": 0.04807655792683363, + "loss/crossentropy": 2.6857302367687224, + "loss/logits": 0.8251177936792373, + "step": 42770 + }, + { + "epoch": 0.4278, + "grad_norm": 14.25, + "grad_norm_var": 0.3346354166666667, + "learning_rate": 0.0003, + "loss": 11.2185, + "loss/aux_loss": 0.048068818263709547, + "loss/crossentropy": 2.7882674872875213, + "loss/logits": 0.8415611743927002, + "step": 42780 + }, + { + "epoch": 0.4279, + "grad_norm": 13.6875, + "grad_norm_var": 0.393603515625, + "learning_rate": 0.0003, + "loss": 11.1943, + "loss/aux_loss": 0.048084525391459465, + "loss/crossentropy": 2.8016934394836426, + "loss/logits": 0.8504360228776932, + "step": 42790 + }, + { + "epoch": 0.428, + "grad_norm": 15.5625, + "grad_norm_var": 0.8610514322916667, + "learning_rate": 0.0003, + "loss": 11.0533, + "loss/aux_loss": 0.0480705926194787, + "loss/crossentropy": 2.7852961301803587, + "loss/logits": 0.8427935183048249, + "step": 42800 + }, + { + "epoch": 0.4281, + "grad_norm": 14.375, + "grad_norm_var": 0.5283854166666667, + "learning_rate": 0.0003, + "loss": 11.1688, + "loss/aux_loss": 0.048075289465487, + "loss/crossentropy": 2.851541531085968, + "loss/logits": 0.8648887991905212, + "step": 42810 + }, + { + "epoch": 0.4282, + "grad_norm": 15.5625, + "grad_norm_var": 0.6245930989583334, + "learning_rate": 0.0003, + "loss": 11.1358, + "loss/aux_loss": 0.04808152187615633, + "loss/crossentropy": 2.728837323188782, + "loss/logits": 0.8126325309276581, + "step": 42820 + }, + { + "epoch": 0.4283, + "grad_norm": 14.0625, + "grad_norm_var": 5.513655598958334, + "learning_rate": 0.0003, + "loss": 11.3191, + "loss/aux_loss": 0.048070829920470716, + "loss/crossentropy": 2.9222333669662475, + "loss/logits": 0.8803920924663544, + "step": 42830 + }, + { + "epoch": 0.4284, + "grad_norm": 14.125, + "grad_norm_var": 5.320556640625, + "learning_rate": 0.0003, + "loss": 11.1858, + "loss/aux_loss": 0.04807902462780476, + "loss/crossentropy": 2.750357246398926, + "loss/logits": 0.8127260476350784, + "step": 42840 + }, + { + "epoch": 0.4285, + "grad_norm": 15.125, + "grad_norm_var": 0.3848795572916667, + "learning_rate": 0.0003, + "loss": 11.2752, + "loss/aux_loss": 0.048070596531033516, + "loss/crossentropy": 2.6048742115497587, + "loss/logits": 0.8551869869232178, + "step": 42850 + }, + { + "epoch": 0.4286, + "grad_norm": 14.25, + "grad_norm_var": 0.5794108072916667, + "learning_rate": 0.0003, + "loss": 11.132, + "loss/aux_loss": 0.048073590733110905, + "loss/crossentropy": 2.8263909220695496, + "loss/logits": 0.8633887559175492, + "step": 42860 + }, + { + "epoch": 0.4287, + "grad_norm": 13.375, + "grad_norm_var": 1.9244140625, + "learning_rate": 0.0003, + "loss": 11.3004, + "loss/aux_loss": 0.04807244185358286, + "loss/crossentropy": 2.7152198910713197, + "loss/logits": 0.8596496641635895, + "step": 42870 + }, + { + "epoch": 0.4288, + "grad_norm": 15.25, + "grad_norm_var": 1.720556640625, + "learning_rate": 0.0003, + "loss": 11.3347, + "loss/aux_loss": 0.04807446151971817, + "loss/crossentropy": 2.721926176548004, + "loss/logits": 0.8560007959604263, + "step": 42880 + }, + { + "epoch": 0.4289, + "grad_norm": 14.0, + "grad_norm_var": 2.203108723958333, + "learning_rate": 0.0003, + "loss": 11.0291, + "loss/aux_loss": 0.04807375371456146, + "loss/crossentropy": 2.8105222463607786, + "loss/logits": 0.8752927869558335, + "step": 42890 + }, + { + "epoch": 0.429, + "grad_norm": 15.75, + "grad_norm_var": 1.431494140625, + "learning_rate": 0.0003, + "loss": 10.9854, + "loss/aux_loss": 0.04806910492479801, + "loss/crossentropy": 2.690297359228134, + "loss/logits": 0.837578096985817, + "step": 42900 + }, + { + "epoch": 0.4291, + "grad_norm": 13.25, + "grad_norm_var": 1.439697265625, + "learning_rate": 0.0003, + "loss": 11.1458, + "loss/aux_loss": 0.04807917848229408, + "loss/crossentropy": 2.812906527519226, + "loss/logits": 0.839617344737053, + "step": 42910 + }, + { + "epoch": 0.4292, + "grad_norm": 14.125, + "grad_norm_var": 0.37185872395833336, + "learning_rate": 0.0003, + "loss": 11.2282, + "loss/aux_loss": 0.04807562492787838, + "loss/crossentropy": 2.778420227766037, + "loss/logits": 0.8231628626585007, + "step": 42920 + }, + { + "epoch": 0.4293, + "grad_norm": 14.5625, + "grad_norm_var": 0.30520833333333336, + "learning_rate": 0.0003, + "loss": 10.9583, + "loss/aux_loss": 0.048068897984921935, + "loss/crossentropy": 2.7826973259449006, + "loss/logits": 0.8252352714538574, + "step": 42930 + }, + { + "epoch": 0.4294, + "grad_norm": 14.5625, + "grad_norm_var": 0.5637858072916667, + "learning_rate": 0.0003, + "loss": 11.159, + "loss/aux_loss": 0.04807413574308157, + "loss/crossentropy": 2.5992193698883055, + "loss/logits": 0.8467897325754166, + "step": 42940 + }, + { + "epoch": 0.4295, + "grad_norm": 15.5, + "grad_norm_var": 0.315478515625, + "learning_rate": 0.0003, + "loss": 10.9422, + "loss/aux_loss": 0.04807993993163109, + "loss/crossentropy": 2.524407982826233, + "loss/logits": 0.8164256751537323, + "step": 42950 + }, + { + "epoch": 0.4296, + "grad_norm": 14.25, + "grad_norm_var": 0.6348795572916667, + "learning_rate": 0.0003, + "loss": 11.2791, + "loss/aux_loss": 0.04807226173579693, + "loss/crossentropy": 2.6596532464027405, + "loss/logits": 0.8530319899320602, + "step": 42960 + }, + { + "epoch": 0.4297, + "grad_norm": 13.9375, + "grad_norm_var": 1.3340983072916666, + "learning_rate": 0.0003, + "loss": 11.0176, + "loss/aux_loss": 0.04807053208351135, + "loss/crossentropy": 2.718638336658478, + "loss/logits": 0.8549129962921143, + "step": 42970 + }, + { + "epoch": 0.4298, + "grad_norm": 15.5, + "grad_norm_var": 1.595947265625, + "learning_rate": 0.0003, + "loss": 11.2805, + "loss/aux_loss": 0.04808430094271898, + "loss/crossentropy": 2.637483465671539, + "loss/logits": 0.8228224605321884, + "step": 42980 + }, + { + "epoch": 0.4299, + "grad_norm": 13.8125, + "grad_norm_var": 0.708056640625, + "learning_rate": 0.0003, + "loss": 11.1181, + "loss/aux_loss": 0.04807401914149523, + "loss/crossentropy": 2.7925415635108948, + "loss/logits": 0.82631796002388, + "step": 42990 + }, + { + "epoch": 0.43, + "grad_norm": 15.3125, + "grad_norm_var": 0.6119140625, + "learning_rate": 0.0003, + "loss": 11.188, + "loss/aux_loss": 0.04808122981339693, + "loss/crossentropy": 2.739946460723877, + "loss/logits": 0.833682969212532, + "step": 43000 + }, + { + "epoch": 0.4301, + "grad_norm": 14.375, + "grad_norm_var": 1809.0212890625, + "learning_rate": 0.0003, + "loss": 11.3295, + "loss/aux_loss": 0.0480794845148921, + "loss/crossentropy": 2.7596442997455597, + "loss/logits": 0.8646740794181824, + "step": 43010 + }, + { + "epoch": 0.4302, + "grad_norm": 13.25, + "grad_norm_var": 0.8145670572916667, + "learning_rate": 0.0003, + "loss": 11.0717, + "loss/aux_loss": 0.04808532949537039, + "loss/crossentropy": 2.585456448793411, + "loss/logits": 0.8001091122627259, + "step": 43020 + }, + { + "epoch": 0.4303, + "grad_norm": 13.4375, + "grad_norm_var": 0.20572916666666666, + "learning_rate": 0.0003, + "loss": 11.1748, + "loss/aux_loss": 0.04807361718267202, + "loss/crossentropy": 2.6641751885414124, + "loss/logits": 0.8602477341890336, + "step": 43030 + }, + { + "epoch": 0.4304, + "grad_norm": 14.0625, + "grad_norm_var": 1.0601399739583333, + "learning_rate": 0.0003, + "loss": 11.2242, + "loss/aux_loss": 0.04806696530431509, + "loss/crossentropy": 2.7215495467185975, + "loss/logits": 0.8751588940620423, + "step": 43040 + }, + { + "epoch": 0.4305, + "grad_norm": 14.625, + "grad_norm_var": 0.760400390625, + "learning_rate": 0.0003, + "loss": 11.0104, + "loss/aux_loss": 0.04807785041630268, + "loss/crossentropy": 2.6973119556903837, + "loss/logits": 0.8398984521627426, + "step": 43050 + }, + { + "epoch": 0.4306, + "grad_norm": 13.625, + "grad_norm_var": 0.7916015625, + "learning_rate": 0.0003, + "loss": 11.4002, + "loss/aux_loss": 0.04808125030249357, + "loss/crossentropy": 2.758901071548462, + "loss/logits": 0.8731786936521531, + "step": 43060 + }, + { + "epoch": 0.4307, + "grad_norm": 13.25, + "grad_norm_var": 0.610009765625, + "learning_rate": 0.0003, + "loss": 11.1299, + "loss/aux_loss": 0.04807206802070141, + "loss/crossentropy": 2.708112859725952, + "loss/logits": 0.7979910880327225, + "step": 43070 + }, + { + "epoch": 0.4308, + "grad_norm": 14.0625, + "grad_norm_var": 0.7150390625, + "learning_rate": 0.0003, + "loss": 11.145, + "loss/aux_loss": 0.04807481300085783, + "loss/crossentropy": 2.7164962589740753, + "loss/logits": 0.8307441174983978, + "step": 43080 + }, + { + "epoch": 0.4309, + "grad_norm": 14.3125, + "grad_norm_var": 0.603369140625, + "learning_rate": 0.0003, + "loss": 11.2082, + "loss/aux_loss": 0.048082906566560266, + "loss/crossentropy": 2.830370819568634, + "loss/logits": 0.8672911942005157, + "step": 43090 + }, + { + "epoch": 0.431, + "grad_norm": 16.0, + "grad_norm_var": 0.6677083333333333, + "learning_rate": 0.0003, + "loss": 11.2983, + "loss/aux_loss": 0.048063176684081554, + "loss/crossentropy": 2.7665525555610655, + "loss/logits": 0.8384849548339843, + "step": 43100 + }, + { + "epoch": 0.4311, + "grad_norm": 13.75, + "grad_norm_var": 0.8820149739583333, + "learning_rate": 0.0003, + "loss": 11.3458, + "loss/aux_loss": 0.0480750685557723, + "loss/crossentropy": 2.6808901131153107, + "loss/logits": 0.8562900602817536, + "step": 43110 + }, + { + "epoch": 0.4312, + "grad_norm": 15.0, + "grad_norm_var": 0.675244140625, + "learning_rate": 0.0003, + "loss": 11.2144, + "loss/aux_loss": 0.048073401860892775, + "loss/crossentropy": 2.792336130142212, + "loss/logits": 0.8535761684179306, + "step": 43120 + }, + { + "epoch": 0.4313, + "grad_norm": 13.0, + "grad_norm_var": 0.264306640625, + "learning_rate": 0.0003, + "loss": 11.3156, + "loss/aux_loss": 0.048070548288524154, + "loss/crossentropy": 2.618429493904114, + "loss/logits": 0.8526875019073487, + "step": 43130 + }, + { + "epoch": 0.4314, + "grad_norm": 14.6875, + "grad_norm_var": 0.3589680989583333, + "learning_rate": 0.0003, + "loss": 10.973, + "loss/aux_loss": 0.048080014809966085, + "loss/crossentropy": 2.6525802075862885, + "loss/logits": 0.8168764710426331, + "step": 43140 + }, + { + "epoch": 0.4315, + "grad_norm": 14.3125, + "grad_norm_var": 0.3770670572916667, + "learning_rate": 0.0003, + "loss": 11.207, + "loss/aux_loss": 0.048070788569748404, + "loss/crossentropy": 2.828217601776123, + "loss/logits": 0.8801011204719543, + "step": 43150 + }, + { + "epoch": 0.4316, + "grad_norm": 13.8125, + "grad_norm_var": 0.122509765625, + "learning_rate": 0.0003, + "loss": 11.0918, + "loss/aux_loss": 0.04807632640004158, + "loss/crossentropy": 2.74559006690979, + "loss/logits": 0.8384597927331925, + "step": 43160 + }, + { + "epoch": 0.4317, + "grad_norm": 14.375, + "grad_norm_var": 0.35149739583333334, + "learning_rate": 0.0003, + "loss": 11.1667, + "loss/aux_loss": 0.0480788629502058, + "loss/crossentropy": 2.6675486505031585, + "loss/logits": 0.858903244137764, + "step": 43170 + }, + { + "epoch": 0.4318, + "grad_norm": 15.625, + "grad_norm_var": 0.9427083333333334, + "learning_rate": 0.0003, + "loss": 11.1853, + "loss/aux_loss": 0.04807556346058846, + "loss/crossentropy": 2.764018404483795, + "loss/logits": 0.8240988850593567, + "step": 43180 + }, + { + "epoch": 0.4319, + "grad_norm": 14.0, + "grad_norm_var": 0.96875, + "learning_rate": 0.0003, + "loss": 11.271, + "loss/aux_loss": 0.04806935228407383, + "loss/crossentropy": 2.8441020369529726, + "loss/logits": 0.8296251714229583, + "step": 43190 + }, + { + "epoch": 0.432, + "grad_norm": 15.0, + "grad_norm_var": 0.245556640625, + "learning_rate": 0.0003, + "loss": 11.2131, + "loss/aux_loss": 0.04808467049151659, + "loss/crossentropy": 2.6359627187252044, + "loss/logits": 0.8088801056146622, + "step": 43200 + }, + { + "epoch": 0.4321, + "grad_norm": 13.375, + "grad_norm_var": 0.28370768229166665, + "learning_rate": 0.0003, + "loss": 11.1807, + "loss/aux_loss": 0.04807442501187324, + "loss/crossentropy": 2.629367303848267, + "loss/logits": 0.8538803130388259, + "step": 43210 + }, + { + "epoch": 0.4322, + "grad_norm": 13.9375, + "grad_norm_var": 0.3994140625, + "learning_rate": 0.0003, + "loss": 11.1458, + "loss/aux_loss": 0.0480644728988409, + "loss/crossentropy": 2.70165359377861, + "loss/logits": 0.8403062671422958, + "step": 43220 + }, + { + "epoch": 0.4323, + "grad_norm": 14.0, + "grad_norm_var": 0.28253580729166666, + "learning_rate": 0.0003, + "loss": 11.2449, + "loss/aux_loss": 0.048089561983942986, + "loss/crossentropy": 2.599137383699417, + "loss/logits": 0.8236678332090378, + "step": 43230 + }, + { + "epoch": 0.4324, + "grad_norm": 13.375, + "grad_norm_var": 0.305322265625, + "learning_rate": 0.0003, + "loss": 11.2337, + "loss/aux_loss": 0.04807605054229498, + "loss/crossentropy": 2.8088149547576906, + "loss/logits": 0.8818845838308335, + "step": 43240 + }, + { + "epoch": 0.4325, + "grad_norm": 15.0, + "grad_norm_var": 0.4161295572916667, + "learning_rate": 0.0003, + "loss": 11.204, + "loss/aux_loss": 0.04806992541998625, + "loss/crossentropy": 2.8428435802459715, + "loss/logits": 0.8899227410554886, + "step": 43250 + }, + { + "epoch": 0.4326, + "grad_norm": 14.875, + "grad_norm_var": 0.388134765625, + "learning_rate": 0.0003, + "loss": 11.1815, + "loss/aux_loss": 0.04808182567358017, + "loss/crossentropy": 2.685653477907181, + "loss/logits": 0.8344039708375931, + "step": 43260 + }, + { + "epoch": 0.4327, + "grad_norm": 13.6875, + "grad_norm_var": 0.3270182291666667, + "learning_rate": 0.0003, + "loss": 11.3153, + "loss/aux_loss": 0.04808889031410217, + "loss/crossentropy": 2.976236271858215, + "loss/logits": 0.8441417008638382, + "step": 43270 + }, + { + "epoch": 0.4328, + "grad_norm": 14.125, + "grad_norm_var": 0.93203125, + "learning_rate": 0.0003, + "loss": 11.0959, + "loss/aux_loss": 0.04806580301374197, + "loss/crossentropy": 2.711184060573578, + "loss/logits": 0.840970367193222, + "step": 43280 + }, + { + "epoch": 0.4329, + "grad_norm": 16.25, + "grad_norm_var": 0.7880208333333333, + "learning_rate": 0.0003, + "loss": 11.2848, + "loss/aux_loss": 0.04807568024843931, + "loss/crossentropy": 2.764776086807251, + "loss/logits": 0.8647139281034469, + "step": 43290 + }, + { + "epoch": 0.433, + "grad_norm": 13.5, + "grad_norm_var": 58.57421875, + "learning_rate": 0.0003, + "loss": 11.2579, + "loss/aux_loss": 0.0480875076726079, + "loss/crossentropy": 2.639911252260208, + "loss/logits": 0.855933940410614, + "step": 43300 + }, + { + "epoch": 0.4331, + "grad_norm": 13.625, + "grad_norm_var": 58.1744140625, + "learning_rate": 0.0003, + "loss": 11.1596, + "loss/aux_loss": 0.048066693171858785, + "loss/crossentropy": 2.7509276986122133, + "loss/logits": 0.8506129652261734, + "step": 43310 + }, + { + "epoch": 0.4332, + "grad_norm": 14.125, + "grad_norm_var": 0.2994140625, + "learning_rate": 0.0003, + "loss": 11.2364, + "loss/aux_loss": 0.04807441793382168, + "loss/crossentropy": 2.7721996307373047, + "loss/logits": 0.841002207994461, + "step": 43320 + }, + { + "epoch": 0.4333, + "grad_norm": 12.9375, + "grad_norm_var": 0.411962890625, + "learning_rate": 0.0003, + "loss": 11.0887, + "loss/aux_loss": 0.04807002730667591, + "loss/crossentropy": 2.702434003353119, + "loss/logits": 0.8071956008672714, + "step": 43330 + }, + { + "epoch": 0.4334, + "grad_norm": 15.125, + "grad_norm_var": 0.3262858072916667, + "learning_rate": 0.0003, + "loss": 11.1554, + "loss/aux_loss": 0.04807128868997097, + "loss/crossentropy": 2.727681612968445, + "loss/logits": 0.8188419610261917, + "step": 43340 + }, + { + "epoch": 0.4335, + "grad_norm": 13.875, + "grad_norm_var": 0.27447916666666666, + "learning_rate": 0.0003, + "loss": 11.196, + "loss/aux_loss": 0.04806978348642588, + "loss/crossentropy": 2.779603922367096, + "loss/logits": 0.8584359914064408, + "step": 43350 + }, + { + "epoch": 0.4336, + "grad_norm": 14.125, + "grad_norm_var": 0.5494140625, + "learning_rate": 0.0003, + "loss": 11.2599, + "loss/aux_loss": 0.04807741772383452, + "loss/crossentropy": 2.7377023220062258, + "loss/logits": 0.8400143414735795, + "step": 43360 + }, + { + "epoch": 0.4337, + "grad_norm": 15.9375, + "grad_norm_var": 1.5880208333333334, + "learning_rate": 0.0003, + "loss": 11.14, + "loss/aux_loss": 0.04808063004165888, + "loss/crossentropy": 2.7512278735637663, + "loss/logits": 0.8345052689313889, + "step": 43370 + }, + { + "epoch": 0.4338, + "grad_norm": 13.875, + "grad_norm_var": 1.5528645833333334, + "learning_rate": 0.0003, + "loss": 11.0079, + "loss/aux_loss": 0.04806357547640801, + "loss/crossentropy": 2.7596873223781584, + "loss/logits": 0.8277094513177872, + "step": 43380 + }, + { + "epoch": 0.4339, + "grad_norm": 14.4375, + "grad_norm_var": 0.5911458333333334, + "learning_rate": 0.0003, + "loss": 11.0468, + "loss/aux_loss": 0.04807555004954338, + "loss/crossentropy": 2.7843938052654265, + "loss/logits": 0.8598534375429153, + "step": 43390 + }, + { + "epoch": 0.434, + "grad_norm": 14.375, + "grad_norm_var": 0.410400390625, + "learning_rate": 0.0003, + "loss": 11.1959, + "loss/aux_loss": 0.048078845627605914, + "loss/crossentropy": 2.5064321935176848, + "loss/logits": 0.8137326329946518, + "step": 43400 + }, + { + "epoch": 0.4341, + "grad_norm": 14.5, + "grad_norm_var": 0.521728515625, + "learning_rate": 0.0003, + "loss": 11.2099, + "loss/aux_loss": 0.04807638339698315, + "loss/crossentropy": 2.8900754928588865, + "loss/logits": 0.8562443405389786, + "step": 43410 + }, + { + "epoch": 0.4342, + "grad_norm": 14.875, + "grad_norm_var": 0.3265462239583333, + "learning_rate": 0.0003, + "loss": 11.3643, + "loss/aux_loss": 0.04807485099881888, + "loss/crossentropy": 2.76791490316391, + "loss/logits": 0.8608255743980407, + "step": 43420 + }, + { + "epoch": 0.4343, + "grad_norm": 13.75, + "grad_norm_var": 0.7968098958333333, + "learning_rate": 0.0003, + "loss": 11.0301, + "loss/aux_loss": 0.04807616826146841, + "loss/crossentropy": 2.6704135179519652, + "loss/logits": 0.8353795439004899, + "step": 43430 + }, + { + "epoch": 0.4344, + "grad_norm": 14.4375, + "grad_norm_var": 0.878369140625, + "learning_rate": 0.0003, + "loss": 11.1472, + "loss/aux_loss": 0.04807905219495297, + "loss/crossentropy": 2.6787062883377075, + "loss/logits": 0.8412859380245209, + "step": 43440 + }, + { + "epoch": 0.4345, + "grad_norm": 14.75, + "grad_norm_var": 0.46027018229166666, + "learning_rate": 0.0003, + "loss": 11.1762, + "loss/aux_loss": 0.04807356093078852, + "loss/crossentropy": 2.7421591579914093, + "loss/logits": 0.8316115468740464, + "step": 43450 + }, + { + "epoch": 0.4346, + "grad_norm": 14.8125, + "grad_norm_var": 0.47732747395833336, + "learning_rate": 0.0003, + "loss": 11.0474, + "loss/aux_loss": 0.04807029739022255, + "loss/crossentropy": 2.731302946805954, + "loss/logits": 0.8437435656785965, + "step": 43460 + }, + { + "epoch": 0.4347, + "grad_norm": 14.4375, + "grad_norm_var": 0.42864583333333334, + "learning_rate": 0.0003, + "loss": 11.1795, + "loss/aux_loss": 0.048083835281431675, + "loss/crossentropy": 2.633001279830933, + "loss/logits": 0.8292666167020798, + "step": 43470 + }, + { + "epoch": 0.4348, + "grad_norm": 13.75, + "grad_norm_var": 0.6476399739583333, + "learning_rate": 0.0003, + "loss": 11.071, + "loss/aux_loss": 0.04806802663952112, + "loss/crossentropy": 2.698196220397949, + "loss/logits": 0.811674302816391, + "step": 43480 + }, + { + "epoch": 0.4349, + "grad_norm": 15.8125, + "grad_norm_var": 0.7286295572916667, + "learning_rate": 0.0003, + "loss": 11.106, + "loss/aux_loss": 0.04807435814291239, + "loss/crossentropy": 2.8671145260334017, + "loss/logits": 0.8537077218294143, + "step": 43490 + }, + { + "epoch": 0.435, + "grad_norm": 15.375, + "grad_norm_var": 0.6048014322916667, + "learning_rate": 0.0003, + "loss": 11.0003, + "loss/aux_loss": 0.048080187290906906, + "loss/crossentropy": 2.722633057832718, + "loss/logits": 0.8278620541095734, + "step": 43500 + }, + { + "epoch": 0.4351, + "grad_norm": 14.3125, + "grad_norm_var": 0.3934733072916667, + "learning_rate": 0.0003, + "loss": 11.2418, + "loss/aux_loss": 0.048065092600882056, + "loss/crossentropy": 2.733147954940796, + "loss/logits": 0.8363703429698944, + "step": 43510 + }, + { + "epoch": 0.4352, + "grad_norm": 13.875, + "grad_norm_var": 0.43162434895833335, + "learning_rate": 0.0003, + "loss": 11.1544, + "loss/aux_loss": 0.04809010047465563, + "loss/crossentropy": 2.6833800315856933, + "loss/logits": 0.8280310302972793, + "step": 43520 + }, + { + "epoch": 0.4353, + "grad_norm": 14.0625, + "grad_norm_var": 0.5497395833333333, + "learning_rate": 0.0003, + "loss": 11.153, + "loss/aux_loss": 0.04807169977575541, + "loss/crossentropy": 2.723561632633209, + "loss/logits": 0.8542778968811036, + "step": 43530 + }, + { + "epoch": 0.4354, + "grad_norm": 15.1875, + "grad_norm_var": 0.8591145833333333, + "learning_rate": 0.0003, + "loss": 11.1498, + "loss/aux_loss": 0.0480776134878397, + "loss/crossentropy": 2.627856492996216, + "loss/logits": 0.8360980361700058, + "step": 43540 + }, + { + "epoch": 0.4355, + "grad_norm": 14.5, + "grad_norm_var": 0.4905598958333333, + "learning_rate": 0.0003, + "loss": 11.2714, + "loss/aux_loss": 0.04807308316230774, + "loss/crossentropy": 2.766408783197403, + "loss/logits": 0.8315641492605209, + "step": 43550 + }, + { + "epoch": 0.4356, + "grad_norm": 14.75, + "grad_norm_var": 0.3055826822916667, + "learning_rate": 0.0003, + "loss": 11.3886, + "loss/aux_loss": 0.048083190061151984, + "loss/crossentropy": 2.806036615371704, + "loss/logits": 0.883603885769844, + "step": 43560 + }, + { + "epoch": 0.4357, + "grad_norm": 15.0, + "grad_norm_var": 1.5067057291666666, + "learning_rate": 0.0003, + "loss": 11.0006, + "loss/aux_loss": 0.0480629924684763, + "loss/crossentropy": 2.5885447025299073, + "loss/logits": 0.8374526888132096, + "step": 43570 + }, + { + "epoch": 0.4358, + "grad_norm": 14.625, + "grad_norm_var": 1.2667805989583334, + "learning_rate": 0.0003, + "loss": 11.0437, + "loss/aux_loss": 0.04808878097683191, + "loss/crossentropy": 2.7568553149700166, + "loss/logits": 0.8364752948284149, + "step": 43580 + }, + { + "epoch": 0.4359, + "grad_norm": 14.875, + "grad_norm_var": 0.2431640625, + "learning_rate": 0.0003, + "loss": 11.2293, + "loss/aux_loss": 0.048077457770705226, + "loss/crossentropy": 2.718920850753784, + "loss/logits": 0.8596648782491684, + "step": 43590 + }, + { + "epoch": 0.436, + "grad_norm": 13.625, + "grad_norm_var": 0.323681640625, + "learning_rate": 0.0003, + "loss": 11.2943, + "loss/aux_loss": 0.04807487428188324, + "loss/crossentropy": 2.6981576442718507, + "loss/logits": 0.8179612189531327, + "step": 43600 + }, + { + "epoch": 0.4361, + "grad_norm": 14.1875, + "grad_norm_var": 0.404931640625, + "learning_rate": 0.0003, + "loss": 11.187, + "loss/aux_loss": 0.04809190686792135, + "loss/crossentropy": 2.6059759140014647, + "loss/logits": 0.8273712396621704, + "step": 43610 + }, + { + "epoch": 0.4362, + "grad_norm": 14.5, + "grad_norm_var": 0.39646809895833335, + "learning_rate": 0.0003, + "loss": 11.1114, + "loss/aux_loss": 0.048068244755268094, + "loss/crossentropy": 2.7903677105903624, + "loss/logits": 0.859819746017456, + "step": 43620 + }, + { + "epoch": 0.4363, + "grad_norm": 13.6875, + "grad_norm_var": 0.8854166666666666, + "learning_rate": 0.0003, + "loss": 11.1303, + "loss/aux_loss": 0.048077091202139856, + "loss/crossentropy": 2.811820614337921, + "loss/logits": 0.8357854694128036, + "step": 43630 + }, + { + "epoch": 0.4364, + "grad_norm": 14.3125, + "grad_norm_var": 0.396875, + "learning_rate": 0.0003, + "loss": 11.2525, + "loss/aux_loss": 0.048074031434953216, + "loss/crossentropy": 2.7376105189323425, + "loss/logits": 0.8477279067039489, + "step": 43640 + }, + { + "epoch": 0.4365, + "grad_norm": 15.4375, + "grad_norm_var": 0.595166015625, + "learning_rate": 0.0003, + "loss": 11.1445, + "loss/aux_loss": 0.048072155378758905, + "loss/crossentropy": 2.633346253633499, + "loss/logits": 0.8097257345914841, + "step": 43650 + }, + { + "epoch": 0.4366, + "grad_norm": 14.25, + "grad_norm_var": 0.479541015625, + "learning_rate": 0.0003, + "loss": 11.4783, + "loss/aux_loss": 0.04806961789727211, + "loss/crossentropy": 2.7709633708000183, + "loss/logits": 0.8549257487058639, + "step": 43660 + }, + { + "epoch": 0.4367, + "grad_norm": 14.4375, + "grad_norm_var": 0.6348795572916667, + "learning_rate": 0.0003, + "loss": 11.2698, + "loss/aux_loss": 0.04807803872972727, + "loss/crossentropy": 2.8225900530815125, + "loss/logits": 0.860035040974617, + "step": 43670 + }, + { + "epoch": 0.4368, + "grad_norm": 14.75, + "grad_norm_var": 1.0791666666666666, + "learning_rate": 0.0003, + "loss": 10.9831, + "loss/aux_loss": 0.0480777345597744, + "loss/crossentropy": 2.673772931098938, + "loss/logits": 0.8094230264425277, + "step": 43680 + }, + { + "epoch": 0.4369, + "grad_norm": 13.0, + "grad_norm_var": 0.876025390625, + "learning_rate": 0.0003, + "loss": 11.1929, + "loss/aux_loss": 0.048065176233649254, + "loss/crossentropy": 2.7113537013530733, + "loss/logits": 0.8660049647092819, + "step": 43690 + }, + { + "epoch": 0.437, + "grad_norm": 14.3125, + "grad_norm_var": 0.5234375, + "learning_rate": 0.0003, + "loss": 11.1811, + "loss/aux_loss": 0.048079431615769865, + "loss/crossentropy": 2.649705445766449, + "loss/logits": 0.7998382925987244, + "step": 43700 + }, + { + "epoch": 0.4371, + "grad_norm": 14.0, + "grad_norm_var": 0.822119140625, + "learning_rate": 0.0003, + "loss": 11.2201, + "loss/aux_loss": 0.04807834941893816, + "loss/crossentropy": 2.8499518752098085, + "loss/logits": 0.8855004251003266, + "step": 43710 + }, + { + "epoch": 0.4372, + "grad_norm": 15.3125, + "grad_norm_var": 0.8173014322916666, + "learning_rate": 0.0003, + "loss": 11.035, + "loss/aux_loss": 0.04806433636695147, + "loss/crossentropy": 2.820804786682129, + "loss/logits": 0.85684075653553, + "step": 43720 + }, + { + "epoch": 0.4373, + "grad_norm": 15.1875, + "grad_norm_var": 0.5952962239583334, + "learning_rate": 0.0003, + "loss": 11.0527, + "loss/aux_loss": 0.04808305986225605, + "loss/crossentropy": 2.71166330575943, + "loss/logits": 0.8343310207128525, + "step": 43730 + }, + { + "epoch": 0.4374, + "grad_norm": 14.125, + "grad_norm_var": 0.485791015625, + "learning_rate": 0.0003, + "loss": 11.2295, + "loss/aux_loss": 0.04806539099663496, + "loss/crossentropy": 2.766424697637558, + "loss/logits": 0.8757703483104706, + "step": 43740 + }, + { + "epoch": 0.4375, + "grad_norm": 14.375, + "grad_norm_var": 0.6057291666666667, + "learning_rate": 0.0003, + "loss": 11.1771, + "loss/aux_loss": 0.04808200504630804, + "loss/crossentropy": 2.7064111471176147, + "loss/logits": 0.8637802988290787, + "step": 43750 + }, + { + "epoch": 0.4376, + "grad_norm": 18.625, + "grad_norm_var": 1.48203125, + "learning_rate": 0.0003, + "loss": 11.1127, + "loss/aux_loss": 0.04807017743587494, + "loss/crossentropy": 2.653383284807205, + "loss/logits": 0.85880506336689, + "step": 43760 + }, + { + "epoch": 0.4377, + "grad_norm": 14.125, + "grad_norm_var": 1.5556640625, + "learning_rate": 0.0003, + "loss": 11.3311, + "loss/aux_loss": 0.04808002356439829, + "loss/crossentropy": 2.712994170188904, + "loss/logits": 0.853711587190628, + "step": 43770 + }, + { + "epoch": 0.4378, + "grad_norm": 14.3125, + "grad_norm_var": 0.3636555989583333, + "learning_rate": 0.0003, + "loss": 10.9948, + "loss/aux_loss": 0.04807909522205591, + "loss/crossentropy": 2.5925404846668245, + "loss/logits": 0.804463854432106, + "step": 43780 + }, + { + "epoch": 0.4379, + "grad_norm": 14.25, + "grad_norm_var": 0.32420247395833335, + "learning_rate": 0.0003, + "loss": 11.23, + "loss/aux_loss": 0.04806744996458292, + "loss/crossentropy": 2.6414481580257414, + "loss/logits": 0.8415878742933274, + "step": 43790 + }, + { + "epoch": 0.438, + "grad_norm": 14.4375, + "grad_norm_var": 0.140625, + "learning_rate": 0.0003, + "loss": 11.2518, + "loss/aux_loss": 0.04808863271027804, + "loss/crossentropy": 2.6875229835510255, + "loss/logits": 0.8200345158576965, + "step": 43800 + }, + { + "epoch": 0.4381, + "grad_norm": 16.5, + "grad_norm_var": 0.6113932291666667, + "learning_rate": 0.0003, + "loss": 11.152, + "loss/aux_loss": 0.04806707743555307, + "loss/crossentropy": 2.513974744081497, + "loss/logits": 0.8057096034288407, + "step": 43810 + }, + { + "epoch": 0.4382, + "grad_norm": 14.25, + "grad_norm_var": 0.7825520833333334, + "learning_rate": 0.0003, + "loss": 11.2658, + "loss/aux_loss": 0.04807666204869747, + "loss/crossentropy": 2.51427965760231, + "loss/logits": 0.7930530071258545, + "step": 43820 + }, + { + "epoch": 0.4383, + "grad_norm": 13.875, + "grad_norm_var": 0.5705729166666667, + "learning_rate": 0.0003, + "loss": 11.2529, + "loss/aux_loss": 0.048075707629323006, + "loss/crossentropy": 2.711172878742218, + "loss/logits": 0.8392647117376327, + "step": 43830 + }, + { + "epoch": 0.4384, + "grad_norm": 14.5, + "grad_norm_var": 0.6738118489583333, + "learning_rate": 0.0003, + "loss": 11.2097, + "loss/aux_loss": 0.04807080589234829, + "loss/crossentropy": 2.796042335033417, + "loss/logits": 0.8655966311693192, + "step": 43840 + }, + { + "epoch": 0.4385, + "grad_norm": 14.0625, + "grad_norm_var": 0.6041015625, + "learning_rate": 0.0003, + "loss": 11.1667, + "loss/aux_loss": 0.048078172095119956, + "loss/crossentropy": 2.5107653200626374, + "loss/logits": 0.7942128717899323, + "step": 43850 + }, + { + "epoch": 0.4386, + "grad_norm": 14.8125, + "grad_norm_var": 0.18411458333333333, + "learning_rate": 0.0003, + "loss": 10.9717, + "loss/aux_loss": 0.048072948679327966, + "loss/crossentropy": 2.6562119662761687, + "loss/logits": 0.839997673034668, + "step": 43860 + }, + { + "epoch": 0.4387, + "grad_norm": 13.875, + "grad_norm_var": 0.5127604166666667, + "learning_rate": 0.0003, + "loss": 11.1122, + "loss/aux_loss": 0.048071831464767456, + "loss/crossentropy": 2.632074463367462, + "loss/logits": 0.8598015516996383, + "step": 43870 + }, + { + "epoch": 0.4388, + "grad_norm": 14.625, + "grad_norm_var": 0.656884765625, + "learning_rate": 0.0003, + "loss": 11.1054, + "loss/aux_loss": 0.04807578288018703, + "loss/crossentropy": 2.6886400461196898, + "loss/logits": 0.8491002053022385, + "step": 43880 + }, + { + "epoch": 0.4389, + "grad_norm": 15.0, + "grad_norm_var": 0.3563639322916667, + "learning_rate": 0.0003, + "loss": 10.9558, + "loss/aux_loss": 0.0480809960514307, + "loss/crossentropy": 2.4997453689575195, + "loss/logits": 0.7939290121197701, + "step": 43890 + }, + { + "epoch": 0.439, + "grad_norm": 13.25, + "grad_norm_var": 0.34765625, + "learning_rate": 0.0003, + "loss": 11.3424, + "loss/aux_loss": 0.048082989640533924, + "loss/crossentropy": 2.65654296875, + "loss/logits": 0.8427457630634307, + "step": 43900 + }, + { + "epoch": 0.4391, + "grad_norm": 14.0625, + "grad_norm_var": 0.14542643229166666, + "learning_rate": 0.0003, + "loss": 11.0091, + "loss/aux_loss": 0.04807342197746038, + "loss/crossentropy": 2.6360219061374663, + "loss/logits": 0.8580428868532181, + "step": 43910 + }, + { + "epoch": 0.4392, + "grad_norm": 13.75, + "grad_norm_var": 1.5449055989583333, + "learning_rate": 0.0003, + "loss": 11.1625, + "loss/aux_loss": 0.04808421973139047, + "loss/crossentropy": 2.6617501974105835, + "loss/logits": 0.8398198932409286, + "step": 43920 + }, + { + "epoch": 0.4393, + "grad_norm": 14.375, + "grad_norm_var": 1.5051920572916666, + "learning_rate": 0.0003, + "loss": 11.1698, + "loss/aux_loss": 0.048086178675293925, + "loss/crossentropy": 2.554797637462616, + "loss/logits": 0.7996778011322021, + "step": 43930 + }, + { + "epoch": 0.4394, + "grad_norm": 13.75, + "grad_norm_var": 0.4822265625, + "learning_rate": 0.0003, + "loss": 11.0295, + "loss/aux_loss": 0.04808267541229725, + "loss/crossentropy": 2.7181775331497193, + "loss/logits": 0.8317540198564529, + "step": 43940 + }, + { + "epoch": 0.4395, + "grad_norm": 13.125, + "grad_norm_var": 0.399072265625, + "learning_rate": 0.0003, + "loss": 11.0659, + "loss/aux_loss": 0.0480749236419797, + "loss/crossentropy": 2.7755655884742736, + "loss/logits": 0.8220769613981247, + "step": 43950 + }, + { + "epoch": 0.4396, + "grad_norm": 14.375, + "grad_norm_var": 0.241259765625, + "learning_rate": 0.0003, + "loss": 11.143, + "loss/aux_loss": 0.04807134997099638, + "loss/crossentropy": 2.8343628644943237, + "loss/logits": 0.8345601588487626, + "step": 43960 + }, + { + "epoch": 0.4397, + "grad_norm": 13.5625, + "grad_norm_var": 0.24894205729166666, + "learning_rate": 0.0003, + "loss": 11.0354, + "loss/aux_loss": 0.04807570315897465, + "loss/crossentropy": 2.7424150824546816, + "loss/logits": 0.8381778568029403, + "step": 43970 + }, + { + "epoch": 0.4398, + "grad_norm": 13.9375, + "grad_norm_var": 0.27493489583333336, + "learning_rate": 0.0003, + "loss": 11.1642, + "loss/aux_loss": 0.048077446036040784, + "loss/crossentropy": 2.688514918088913, + "loss/logits": 0.8474517434835434, + "step": 43980 + }, + { + "epoch": 0.4399, + "grad_norm": 13.625, + "grad_norm_var": 1.0007649739583333, + "learning_rate": 0.0003, + "loss": 11.216, + "loss/aux_loss": 0.04808139279484749, + "loss/crossentropy": 2.829476696252823, + "loss/logits": 0.8397254914045333, + "step": 43990 + }, + { + "epoch": 0.44, + "grad_norm": 13.8125, + "grad_norm_var": 0.865869140625, + "learning_rate": 0.0003, + "loss": 11.2128, + "loss/aux_loss": 0.04807093515992165, + "loss/crossentropy": 2.7557824432849882, + "loss/logits": 0.8229851201176643, + "step": 44000 + }, + { + "epoch": 0.4401, + "grad_norm": 15.8125, + "grad_norm_var": 0.5528645833333333, + "learning_rate": 0.0003, + "loss": 11.1385, + "loss/aux_loss": 0.04807540941983461, + "loss/crossentropy": 2.7282972991466523, + "loss/logits": 0.8258565187454223, + "step": 44010 + }, + { + "epoch": 0.4402, + "grad_norm": 13.1875, + "grad_norm_var": 0.6098307291666667, + "learning_rate": 0.0003, + "loss": 11.3424, + "loss/aux_loss": 0.048072741366922855, + "loss/crossentropy": 2.8332776546478273, + "loss/logits": 0.8698825478553772, + "step": 44020 + }, + { + "epoch": 0.4403, + "grad_norm": 13.875, + "grad_norm_var": 0.6869140625, + "learning_rate": 0.0003, + "loss": 11.1079, + "loss/aux_loss": 0.04808550868183374, + "loss/crossentropy": 2.728848767280579, + "loss/logits": 0.878032585978508, + "step": 44030 + }, + { + "epoch": 0.4404, + "grad_norm": 13.5, + "grad_norm_var": 0.10983072916666667, + "learning_rate": 0.0003, + "loss": 11.182, + "loss/aux_loss": 0.04806646145880222, + "loss/crossentropy": 2.660983008146286, + "loss/logits": 0.8294680565595627, + "step": 44040 + }, + { + "epoch": 0.4405, + "grad_norm": 15.0, + "grad_norm_var": 1346.16953125, + "learning_rate": 0.0003, + "loss": 11.2939, + "loss/aux_loss": 0.04808670189231634, + "loss/crossentropy": 2.8212135076522826, + "loss/logits": 0.8692754089832306, + "step": 44050 + }, + { + "epoch": 0.4406, + "grad_norm": 14.75, + "grad_norm_var": 1335.9011555989584, + "learning_rate": 0.0003, + "loss": 10.9897, + "loss/aux_loss": 0.04808017909526825, + "loss/crossentropy": 2.73088259100914, + "loss/logits": 0.8444351434707642, + "step": 44060 + }, + { + "epoch": 0.4407, + "grad_norm": 16.25, + "grad_norm_var": 0.8820149739583333, + "learning_rate": 0.0003, + "loss": 11.2521, + "loss/aux_loss": 0.04806812740862369, + "loss/crossentropy": 2.777199387550354, + "loss/logits": 0.8744244068861008, + "step": 44070 + }, + { + "epoch": 0.4408, + "grad_norm": 15.5, + "grad_norm_var": 0.5431640625, + "learning_rate": 0.0003, + "loss": 11.0439, + "loss/aux_loss": 0.04805951733142137, + "loss/crossentropy": 2.804098057746887, + "loss/logits": 0.864593580365181, + "step": 44080 + }, + { + "epoch": 0.4409, + "grad_norm": 13.375, + "grad_norm_var": 0.634375, + "learning_rate": 0.0003, + "loss": 11.1746, + "loss/aux_loss": 0.048096229508519175, + "loss/crossentropy": 2.6453644156455995, + "loss/logits": 0.8177176743745804, + "step": 44090 + }, + { + "epoch": 0.441, + "grad_norm": 15.25, + "grad_norm_var": 0.5972493489583334, + "learning_rate": 0.0003, + "loss": 11.1598, + "loss/aux_loss": 0.048072378523647784, + "loss/crossentropy": 2.8311945855617524, + "loss/logits": 0.8428573668003082, + "step": 44100 + }, + { + "epoch": 0.4411, + "grad_norm": 14.0, + "grad_norm_var": 0.32693684895833336, + "learning_rate": 0.0003, + "loss": 11.1579, + "loss/aux_loss": 0.048064601607620716, + "loss/crossentropy": 2.780519354343414, + "loss/logits": 0.8639049649238586, + "step": 44110 + }, + { + "epoch": 0.4412, + "grad_norm": 14.1875, + "grad_norm_var": 0.214697265625, + "learning_rate": 0.0003, + "loss": 10.999, + "loss/aux_loss": 0.048081880807876586, + "loss/crossentropy": 2.7817383885383604, + "loss/logits": 0.8632400244474411, + "step": 44120 + }, + { + "epoch": 0.4413, + "grad_norm": 13.625, + "grad_norm_var": 7.629427083333334, + "learning_rate": 0.0003, + "loss": 11.0469, + "loss/aux_loss": 0.048066666908562185, + "loss/crossentropy": 2.7238622844219207, + "loss/logits": 0.8448743641376495, + "step": 44130 + }, + { + "epoch": 0.4414, + "grad_norm": 13.9375, + "grad_norm_var": 0.6567057291666667, + "learning_rate": 0.0003, + "loss": 11.154, + "loss/aux_loss": 0.04808225836604833, + "loss/crossentropy": 2.7381427764892576, + "loss/logits": 0.8270679324865341, + "step": 44140 + }, + { + "epoch": 0.4415, + "grad_norm": 14.9375, + "grad_norm_var": 0.5067708333333333, + "learning_rate": 0.0003, + "loss": 11.2288, + "loss/aux_loss": 0.04807639848440885, + "loss/crossentropy": 2.7136885285377503, + "loss/logits": 0.8549921065568924, + "step": 44150 + }, + { + "epoch": 0.4416, + "grad_norm": 14.625, + "grad_norm_var": 0.46295572916666666, + "learning_rate": 0.0003, + "loss": 11.2024, + "loss/aux_loss": 0.04807550571858883, + "loss/crossentropy": 2.909850722551346, + "loss/logits": 0.8444527328014374, + "step": 44160 + }, + { + "epoch": 0.4417, + "grad_norm": 15.0, + "grad_norm_var": 0.3348307291666667, + "learning_rate": 0.0003, + "loss": 11.2275, + "loss/aux_loss": 0.04807161632925272, + "loss/crossentropy": 2.8793214321136475, + "loss/logits": 0.8599152326583862, + "step": 44170 + }, + { + "epoch": 0.4418, + "grad_norm": 15.4375, + "grad_norm_var": 0.7601399739583333, + "learning_rate": 0.0003, + "loss": 11.1919, + "loss/aux_loss": 0.04808258600533009, + "loss/crossentropy": 2.723651033639908, + "loss/logits": 0.8421857535839081, + "step": 44180 + }, + { + "epoch": 0.4419, + "grad_norm": 16.625, + "grad_norm_var": 1.0032389322916666, + "learning_rate": 0.0003, + "loss": 11.1651, + "loss/aux_loss": 0.04808426704257727, + "loss/crossentropy": 2.745072239637375, + "loss/logits": 0.8335127264261246, + "step": 44190 + }, + { + "epoch": 0.442, + "grad_norm": 13.4375, + "grad_norm_var": 0.7898274739583333, + "learning_rate": 0.0003, + "loss": 11.0781, + "loss/aux_loss": 0.048068450205028056, + "loss/crossentropy": 2.478744846582413, + "loss/logits": 0.8013067185878754, + "step": 44200 + }, + { + "epoch": 0.4421, + "grad_norm": 15.1875, + "grad_norm_var": 0.43136393229166664, + "learning_rate": 0.0003, + "loss": 11.0637, + "loss/aux_loss": 0.04808264952152967, + "loss/crossentropy": 2.5987396478652953, + "loss/logits": 0.8236001014709473, + "step": 44210 + }, + { + "epoch": 0.4422, + "grad_norm": 14.3125, + "grad_norm_var": 0.348291015625, + "learning_rate": 0.0003, + "loss": 11.243, + "loss/aux_loss": 0.04807177521288395, + "loss/crossentropy": 2.930016368627548, + "loss/logits": 0.8525474965572357, + "step": 44220 + }, + { + "epoch": 0.4423, + "grad_norm": 14.8125, + "grad_norm_var": 2.8152180989583333, + "learning_rate": 0.0003, + "loss": 11.1347, + "loss/aux_loss": 0.04807257354259491, + "loss/crossentropy": 2.85026136636734, + "loss/logits": 0.8361983984708786, + "step": 44230 + }, + { + "epoch": 0.4424, + "grad_norm": 14.25, + "grad_norm_var": 3.3347493489583333, + "learning_rate": 0.0003, + "loss": 11.3583, + "loss/aux_loss": 0.04808483067899942, + "loss/crossentropy": 2.742392921447754, + "loss/logits": 0.8808601886034012, + "step": 44240 + }, + { + "epoch": 0.4425, + "grad_norm": 14.3125, + "grad_norm_var": 1.027587890625, + "learning_rate": 0.0003, + "loss": 11.1345, + "loss/aux_loss": 0.0480776023119688, + "loss/crossentropy": 2.7934012949466704, + "loss/logits": 0.8520541161298751, + "step": 44250 + }, + { + "epoch": 0.4426, + "grad_norm": 12.6875, + "grad_norm_var": 0.8328125, + "learning_rate": 0.0003, + "loss": 10.9229, + "loss/aux_loss": 0.0480777820572257, + "loss/crossentropy": 2.8462532997131347, + "loss/logits": 0.8383017539978027, + "step": 44260 + }, + { + "epoch": 0.4427, + "grad_norm": 14.6875, + "grad_norm_var": 0.5344889322916667, + "learning_rate": 0.0003, + "loss": 11.1232, + "loss/aux_loss": 0.04807592108845711, + "loss/crossentropy": 2.671667981147766, + "loss/logits": 0.8324110358953476, + "step": 44270 + }, + { + "epoch": 0.4428, + "grad_norm": 14.6875, + "grad_norm_var": 1.0883951822916667, + "learning_rate": 0.0003, + "loss": 11.0707, + "loss/aux_loss": 0.04807809516787529, + "loss/crossentropy": 2.7182459354400637, + "loss/logits": 0.8334134668111801, + "step": 44280 + }, + { + "epoch": 0.4429, + "grad_norm": 13.9375, + "grad_norm_var": 1.4891764322916667, + "learning_rate": 0.0003, + "loss": 11.3104, + "loss/aux_loss": 0.04806990176439285, + "loss/crossentropy": 2.6499986171722414, + "loss/logits": 0.8422462284564972, + "step": 44290 + }, + { + "epoch": 0.443, + "grad_norm": 16.125, + "grad_norm_var": 0.8169270833333333, + "learning_rate": 0.0003, + "loss": 11.087, + "loss/aux_loss": 0.048081264831125736, + "loss/crossentropy": 2.6629028499126433, + "loss/logits": 0.7944082587957382, + "step": 44300 + }, + { + "epoch": 0.4431, + "grad_norm": 14.25, + "grad_norm_var": 1.045556640625, + "learning_rate": 0.0003, + "loss": 11.1663, + "loss/aux_loss": 0.04805992990732193, + "loss/crossentropy": 2.6393331587314606, + "loss/logits": 0.8219176232814789, + "step": 44310 + }, + { + "epoch": 0.4432, + "grad_norm": 13.625, + "grad_norm_var": 1.5378743489583333, + "learning_rate": 0.0003, + "loss": 11.1893, + "loss/aux_loss": 0.048083177767693996, + "loss/crossentropy": 2.829824334383011, + "loss/logits": 0.8309338241815567, + "step": 44320 + }, + { + "epoch": 0.4433, + "grad_norm": 35.75, + "grad_norm_var": 28.234309895833334, + "learning_rate": 0.0003, + "loss": 10.9533, + "loss/aux_loss": 0.048075624741613865, + "loss/crossentropy": 2.6464429974555967, + "loss/logits": 0.8063034623861313, + "step": 44330 + }, + { + "epoch": 0.4434, + "grad_norm": 14.4375, + "grad_norm_var": 28.2, + "learning_rate": 0.0003, + "loss": 11.0942, + "loss/aux_loss": 0.048067497089505196, + "loss/crossentropy": 2.6193343341350555, + "loss/logits": 0.8304236233234406, + "step": 44340 + }, + { + "epoch": 0.4435, + "grad_norm": 14.9375, + "grad_norm_var": 0.934375, + "learning_rate": 0.0003, + "loss": 10.9155, + "loss/aux_loss": 0.04807949960231781, + "loss/crossentropy": 2.7103063344955443, + "loss/logits": 0.8495049208402634, + "step": 44350 + }, + { + "epoch": 0.4436, + "grad_norm": 14.625, + "grad_norm_var": 0.452587890625, + "learning_rate": 0.0003, + "loss": 11.1538, + "loss/aux_loss": 0.048082873411476615, + "loss/crossentropy": 2.634609413146973, + "loss/logits": 0.8501360476016998, + "step": 44360 + }, + { + "epoch": 0.4437, + "grad_norm": 14.0, + "grad_norm_var": 0.25974934895833335, + "learning_rate": 0.0003, + "loss": 11.1608, + "loss/aux_loss": 0.04807829111814499, + "loss/crossentropy": 2.619146168231964, + "loss/logits": 0.8419374793767929, + "step": 44370 + }, + { + "epoch": 0.4438, + "grad_norm": 14.625, + "grad_norm_var": 0.38644205729166664, + "learning_rate": 0.0003, + "loss": 11.1946, + "loss/aux_loss": 0.04806345794349909, + "loss/crossentropy": 2.7630446314811707, + "loss/logits": 0.8234979271888733, + "step": 44380 + }, + { + "epoch": 0.4439, + "grad_norm": 14.8125, + "grad_norm_var": 188.98396809895834, + "learning_rate": 0.0003, + "loss": 11.1839, + "loss/aux_loss": 0.04808218106627464, + "loss/crossentropy": 2.7055815279483797, + "loss/logits": 0.8304955214262009, + "step": 44390 + }, + { + "epoch": 0.444, + "grad_norm": 14.625, + "grad_norm_var": 0.49347330729166666, + "learning_rate": 0.0003, + "loss": 11.1252, + "loss/aux_loss": 0.048086220771074294, + "loss/crossentropy": 2.6687645077705384, + "loss/logits": 0.8254845380783081, + "step": 44400 + }, + { + "epoch": 0.4441, + "grad_norm": 14.1875, + "grad_norm_var": 22.872249348958334, + "learning_rate": 0.0003, + "loss": 11.2087, + "loss/aux_loss": 0.04806323740631342, + "loss/crossentropy": 2.7094571113586428, + "loss/logits": 0.8551447689533234, + "step": 44410 + }, + { + "epoch": 0.4442, + "grad_norm": 13.375, + "grad_norm_var": 0.490478515625, + "learning_rate": 0.0003, + "loss": 11.0191, + "loss/aux_loss": 0.0480901513248682, + "loss/crossentropy": 2.6127541959285736, + "loss/logits": 0.7744009613990783, + "step": 44420 + }, + { + "epoch": 0.4443, + "grad_norm": 16.125, + "grad_norm_var": 0.8528483072916667, + "learning_rate": 0.0003, + "loss": 11.1025, + "loss/aux_loss": 0.04808180164545774, + "loss/crossentropy": 2.7499490082263947, + "loss/logits": 0.8446706473827362, + "step": 44430 + }, + { + "epoch": 0.4444, + "grad_norm": 13.75, + "grad_norm_var": 0.781103515625, + "learning_rate": 0.0003, + "loss": 11.3893, + "loss/aux_loss": 0.048070714622735974, + "loss/crossentropy": 2.789631450176239, + "loss/logits": 0.8639173865318298, + "step": 44440 + }, + { + "epoch": 0.4445, + "grad_norm": 13.6875, + "grad_norm_var": 0.14855143229166667, + "learning_rate": 0.0003, + "loss": 11.1676, + "loss/aux_loss": 0.0480786357074976, + "loss/crossentropy": 2.794544792175293, + "loss/logits": 0.8530236780643463, + "step": 44450 + }, + { + "epoch": 0.4446, + "grad_norm": 13.1875, + "grad_norm_var": 0.23430989583333334, + "learning_rate": 0.0003, + "loss": 10.9838, + "loss/aux_loss": 0.04807055927813053, + "loss/crossentropy": 2.6851485848426817, + "loss/logits": 0.8450033336877822, + "step": 44460 + }, + { + "epoch": 0.4447, + "grad_norm": 13.375, + "grad_norm_var": 0.5980305989583333, + "learning_rate": 0.0003, + "loss": 11.1546, + "loss/aux_loss": 0.04807008933275938, + "loss/crossentropy": 2.6952660202980043, + "loss/logits": 0.806543692946434, + "step": 44470 + }, + { + "epoch": 0.4448, + "grad_norm": 14.6875, + "grad_norm_var": 1.6238932291666666, + "learning_rate": 0.0003, + "loss": 11.2524, + "loss/aux_loss": 0.04807232767343521, + "loss/crossentropy": 2.8130900621414185, + "loss/logits": 0.8416864901781083, + "step": 44480 + }, + { + "epoch": 0.4449, + "grad_norm": 14.6875, + "grad_norm_var": 1.2067708333333333, + "learning_rate": 0.0003, + "loss": 11.2551, + "loss/aux_loss": 0.04807587340474129, + "loss/crossentropy": 2.982305383682251, + "loss/logits": 0.8985978931188583, + "step": 44490 + }, + { + "epoch": 0.445, + "grad_norm": 14.3125, + "grad_norm_var": 0.45358072916666664, + "learning_rate": 0.0003, + "loss": 11.1725, + "loss/aux_loss": 0.048066375963389876, + "loss/crossentropy": 2.6484339118003843, + "loss/logits": 0.8086622357368469, + "step": 44500 + }, + { + "epoch": 0.4451, + "grad_norm": 15.1875, + "grad_norm_var": 0.7822265625, + "learning_rate": 0.0003, + "loss": 11.2345, + "loss/aux_loss": 0.04808136448264122, + "loss/crossentropy": 2.829063284397125, + "loss/logits": 0.8414050981402397, + "step": 44510 + }, + { + "epoch": 0.4452, + "grad_norm": 13.8125, + "grad_norm_var": 0.3245930989583333, + "learning_rate": 0.0003, + "loss": 11.2001, + "loss/aux_loss": 0.0480697114020586, + "loss/crossentropy": 2.8107310473918914, + "loss/logits": 0.8371037811040878, + "step": 44520 + }, + { + "epoch": 0.4453, + "grad_norm": 14.0, + "grad_norm_var": 0.4483723958333333, + "learning_rate": 0.0003, + "loss": 11.3441, + "loss/aux_loss": 0.04806445110589266, + "loss/crossentropy": 2.7447421967983248, + "loss/logits": 0.8555226683616638, + "step": 44530 + }, + { + "epoch": 0.4454, + "grad_norm": 14.5625, + "grad_norm_var": 0.9785807291666667, + "learning_rate": 0.0003, + "loss": 11.0857, + "loss/aux_loss": 0.04807347375899553, + "loss/crossentropy": 2.7596997022628784, + "loss/logits": 0.8404556185007095, + "step": 44540 + }, + { + "epoch": 0.4455, + "grad_norm": 14.625, + "grad_norm_var": 0.675634765625, + "learning_rate": 0.0003, + "loss": 11.1599, + "loss/aux_loss": 0.048077495954930785, + "loss/crossentropy": 2.838477683067322, + "loss/logits": 0.8429495930671692, + "step": 44550 + }, + { + "epoch": 0.4456, + "grad_norm": 14.1875, + "grad_norm_var": 0.159619140625, + "learning_rate": 0.0003, + "loss": 11.005, + "loss/aux_loss": 0.04807081557810307, + "loss/crossentropy": 2.608337712287903, + "loss/logits": 0.8179334878921509, + "step": 44560 + }, + { + "epoch": 0.4457, + "grad_norm": 13.25, + "grad_norm_var": 0.2166015625, + "learning_rate": 0.0003, + "loss": 11.0062, + "loss/aux_loss": 0.04806958455592394, + "loss/crossentropy": 2.4894912481307983, + "loss/logits": 0.8116365820169449, + "step": 44570 + }, + { + "epoch": 0.4458, + "grad_norm": 14.9375, + "grad_norm_var": 1.2166015625, + "learning_rate": 0.0003, + "loss": 11.0503, + "loss/aux_loss": 0.04808680806308985, + "loss/crossentropy": 2.543110156059265, + "loss/logits": 0.8014299184083938, + "step": 44580 + }, + { + "epoch": 0.4459, + "grad_norm": 13.625, + "grad_norm_var": 0.38743489583333335, + "learning_rate": 0.0003, + "loss": 11.2837, + "loss/aux_loss": 0.048072746768593785, + "loss/crossentropy": 2.6654117584228514, + "loss/logits": 0.8336873948574066, + "step": 44590 + }, + { + "epoch": 0.446, + "grad_norm": 14.0625, + "grad_norm_var": 0.6465983072916667, + "learning_rate": 0.0003, + "loss": 11.1985, + "loss/aux_loss": 0.04807266443967819, + "loss/crossentropy": 2.6742038309574125, + "loss/logits": 0.8307890117168426, + "step": 44600 + }, + { + "epoch": 0.4461, + "grad_norm": 19.0, + "grad_norm_var": 1.7292805989583333, + "learning_rate": 0.0003, + "loss": 11.1619, + "loss/aux_loss": 0.04808099921792745, + "loss/crossentropy": 2.602944529056549, + "loss/logits": 0.8340393453836441, + "step": 44610 + }, + { + "epoch": 0.4462, + "grad_norm": 14.5, + "grad_norm_var": 1.643603515625, + "learning_rate": 0.0003, + "loss": 11.2413, + "loss/aux_loss": 0.048074822127819064, + "loss/crossentropy": 2.7373409271240234, + "loss/logits": 0.8392162501811982, + "step": 44620 + }, + { + "epoch": 0.4463, + "grad_norm": 13.875, + "grad_norm_var": 0.268603515625, + "learning_rate": 0.0003, + "loss": 11.2579, + "loss/aux_loss": 0.04809278659522533, + "loss/crossentropy": 2.75430805683136, + "loss/logits": 0.8329770535230636, + "step": 44630 + }, + { + "epoch": 0.4464, + "grad_norm": 13.9375, + "grad_norm_var": 0.543603515625, + "learning_rate": 0.0003, + "loss": 11.333, + "loss/aux_loss": 0.04807210359722376, + "loss/crossentropy": 2.8144919753074644, + "loss/logits": 0.8301558136940003, + "step": 44640 + }, + { + "epoch": 0.4465, + "grad_norm": 14.25, + "grad_norm_var": 0.38800455729166666, + "learning_rate": 0.0003, + "loss": 11.0804, + "loss/aux_loss": 0.048076588474214074, + "loss/crossentropy": 2.5720925986766816, + "loss/logits": 0.8426287531852722, + "step": 44650 + }, + { + "epoch": 0.4466, + "grad_norm": 14.0625, + "grad_norm_var": 0.9827962239583333, + "learning_rate": 0.0003, + "loss": 11.1989, + "loss/aux_loss": 0.04807624667882919, + "loss/crossentropy": 2.7645800590515135, + "loss/logits": 0.867130133509636, + "step": 44660 + }, + { + "epoch": 0.4467, + "grad_norm": 13.4375, + "grad_norm_var": 0.46087239583333334, + "learning_rate": 0.0003, + "loss": 11.1464, + "loss/aux_loss": 0.04807679317891598, + "loss/crossentropy": 2.819456601142883, + "loss/logits": 0.8306093007326126, + "step": 44670 + }, + { + "epoch": 0.4468, + "grad_norm": 14.125, + "grad_norm_var": 0.261181640625, + "learning_rate": 0.0003, + "loss": 11.1236, + "loss/aux_loss": 0.04808004982769489, + "loss/crossentropy": 2.7177935242652893, + "loss/logits": 0.8448736160993576, + "step": 44680 + }, + { + "epoch": 0.4469, + "grad_norm": 14.4375, + "grad_norm_var": 0.390869140625, + "learning_rate": 0.0003, + "loss": 11.2744, + "loss/aux_loss": 0.04807827845215797, + "loss/crossentropy": 2.7277093112468718, + "loss/logits": 0.8623002141714096, + "step": 44690 + }, + { + "epoch": 0.447, + "grad_norm": 15.375, + "grad_norm_var": 0.3541015625, + "learning_rate": 0.0003, + "loss": 11.0419, + "loss/aux_loss": 0.04807532671838999, + "loss/crossentropy": 2.633826696872711, + "loss/logits": 0.8072617381811142, + "step": 44700 + }, + { + "epoch": 0.4471, + "grad_norm": 13.0625, + "grad_norm_var": 0.8296875, + "learning_rate": 0.0003, + "loss": 11.1652, + "loss/aux_loss": 0.04807002525776625, + "loss/crossentropy": 2.8819324254989622, + "loss/logits": 0.8609935432672501, + "step": 44710 + }, + { + "epoch": 0.4472, + "grad_norm": 12.8125, + "grad_norm_var": 67.39420572916667, + "learning_rate": 0.0003, + "loss": 11.1658, + "loss/aux_loss": 0.04808681160211563, + "loss/crossentropy": 2.6865515530109407, + "loss/logits": 0.8306647807359695, + "step": 44720 + }, + { + "epoch": 0.4473, + "grad_norm": 15.0, + "grad_norm_var": 1.3202473958333334, + "learning_rate": 0.0003, + "loss": 11.311, + "loss/aux_loss": 0.04807041622698307, + "loss/crossentropy": 2.785504710674286, + "loss/logits": 0.8608017772436142, + "step": 44730 + }, + { + "epoch": 0.4474, + "grad_norm": 14.25, + "grad_norm_var": 0.5462076822916667, + "learning_rate": 0.0003, + "loss": 11.0018, + "loss/aux_loss": 0.048068745993077755, + "loss/crossentropy": 2.671027088165283, + "loss/logits": 0.8593548953533172, + "step": 44740 + }, + { + "epoch": 0.4475, + "grad_norm": 14.6875, + "grad_norm_var": 0.5098795572916667, + "learning_rate": 0.0003, + "loss": 11.244, + "loss/aux_loss": 0.0480734009295702, + "loss/crossentropy": 2.8455959856510162, + "loss/logits": 0.8765670835971833, + "step": 44750 + }, + { + "epoch": 0.4476, + "grad_norm": 15.0625, + "grad_norm_var": 0.29609375, + "learning_rate": 0.0003, + "loss": 11.3258, + "loss/aux_loss": 0.04808003343641758, + "loss/crossentropy": 2.717009627819061, + "loss/logits": 0.8516732335090638, + "step": 44760 + }, + { + "epoch": 0.4477, + "grad_norm": 13.875, + "grad_norm_var": 0.2508951822916667, + "learning_rate": 0.0003, + "loss": 11.0886, + "loss/aux_loss": 0.04807702694088221, + "loss/crossentropy": 2.591119593381882, + "loss/logits": 0.8043858855962753, + "step": 44770 + }, + { + "epoch": 0.4478, + "grad_norm": 13.75, + "grad_norm_var": 0.37701822916666666, + "learning_rate": 0.0003, + "loss": 11.2574, + "loss/aux_loss": 0.04807428196072579, + "loss/crossentropy": 2.826832854747772, + "loss/logits": 0.8612869143486023, + "step": 44780 + }, + { + "epoch": 0.4479, + "grad_norm": 14.5625, + "grad_norm_var": 2.029801432291667, + "learning_rate": 0.0003, + "loss": 11.0258, + "loss/aux_loss": 0.048076750710606575, + "loss/crossentropy": 2.607262873649597, + "loss/logits": 0.8060549914836883, + "step": 44790 + }, + { + "epoch": 0.448, + "grad_norm": 15.0625, + "grad_norm_var": 0.7778645833333333, + "learning_rate": 0.0003, + "loss": 11.1719, + "loss/aux_loss": 0.04807757344096899, + "loss/crossentropy": 2.5432204246520995, + "loss/logits": 0.7854818969964981, + "step": 44800 + }, + { + "epoch": 0.4481, + "grad_norm": 14.0625, + "grad_norm_var": 0.18274739583333333, + "learning_rate": 0.0003, + "loss": 11.2529, + "loss/aux_loss": 0.048074766620993616, + "loss/crossentropy": 2.85523384809494, + "loss/logits": 0.8906599700450897, + "step": 44810 + }, + { + "epoch": 0.4482, + "grad_norm": 14.0625, + "grad_norm_var": 0.38723958333333336, + "learning_rate": 0.0003, + "loss": 11.3863, + "loss/aux_loss": 0.04806411787867546, + "loss/crossentropy": 2.7882557988166807, + "loss/logits": 0.8717421501874923, + "step": 44820 + }, + { + "epoch": 0.4483, + "grad_norm": 14.4375, + "grad_norm_var": 0.31027018229166664, + "learning_rate": 0.0003, + "loss": 11.1229, + "loss/aux_loss": 0.04807241186499596, + "loss/crossentropy": 2.737172317504883, + "loss/logits": 0.8286905974149704, + "step": 44830 + }, + { + "epoch": 0.4484, + "grad_norm": 13.5625, + "grad_norm_var": 0.32810872395833335, + "learning_rate": 0.0003, + "loss": 11.3924, + "loss/aux_loss": 0.048084205389022826, + "loss/crossentropy": 2.8003673791885375, + "loss/logits": 0.8761254161596298, + "step": 44840 + }, + { + "epoch": 0.4485, + "grad_norm": 15.625, + "grad_norm_var": 7.253580729166667, + "learning_rate": 0.0003, + "loss": 11.2739, + "loss/aux_loss": 0.04808361791074276, + "loss/crossentropy": 2.7707901895046234, + "loss/logits": 0.8379460781812668, + "step": 44850 + }, + { + "epoch": 0.4486, + "grad_norm": 13.75, + "grad_norm_var": 7.713134765625, + "learning_rate": 0.0003, + "loss": 11.1027, + "loss/aux_loss": 0.04806753098964691, + "loss/crossentropy": 2.857259654998779, + "loss/logits": 0.8505940139293671, + "step": 44860 + }, + { + "epoch": 0.4487, + "grad_norm": 14.5625, + "grad_norm_var": 0.3846354166666667, + "learning_rate": 0.0003, + "loss": 11.0449, + "loss/aux_loss": 0.048075702600181104, + "loss/crossentropy": 2.698404437303543, + "loss/logits": 0.817566591501236, + "step": 44870 + }, + { + "epoch": 0.4488, + "grad_norm": 13.4375, + "grad_norm_var": 0.4984212239583333, + "learning_rate": 0.0003, + "loss": 11.0498, + "loss/aux_loss": 0.04807261247187853, + "loss/crossentropy": 2.7336514472961424, + "loss/logits": 0.8175310790538788, + "step": 44880 + }, + { + "epoch": 0.4489, + "grad_norm": 13.5625, + "grad_norm_var": 0.32493489583333335, + "learning_rate": 0.0003, + "loss": 11.2769, + "loss/aux_loss": 0.0480836022645235, + "loss/crossentropy": 2.6234305024147035, + "loss/logits": 0.7813559800386429, + "step": 44890 + }, + { + "epoch": 0.449, + "grad_norm": 13.8125, + "grad_norm_var": 3.256103515625, + "learning_rate": 0.0003, + "loss": 11.2918, + "loss/aux_loss": 0.048073142766952515, + "loss/crossentropy": 2.686808633804321, + "loss/logits": 0.831533208489418, + "step": 44900 + }, + { + "epoch": 0.4491, + "grad_norm": 13.75, + "grad_norm_var": 0.213525390625, + "learning_rate": 0.0003, + "loss": 11.2277, + "loss/aux_loss": 0.0480730053037405, + "loss/crossentropy": 2.5610205233097076, + "loss/logits": 0.8396010220050811, + "step": 44910 + }, + { + "epoch": 0.4492, + "grad_norm": 15.1875, + "grad_norm_var": 0.4244140625, + "learning_rate": 0.0003, + "loss": 11.3523, + "loss/aux_loss": 0.04808488227427006, + "loss/crossentropy": 2.7543952822685243, + "loss/logits": 0.8435162544250489, + "step": 44920 + }, + { + "epoch": 0.4493, + "grad_norm": 13.5, + "grad_norm_var": 0.6640625, + "learning_rate": 0.0003, + "loss": 11.066, + "loss/aux_loss": 0.04807263296097517, + "loss/crossentropy": 2.755419361591339, + "loss/logits": 0.8490778416395187, + "step": 44930 + }, + { + "epoch": 0.4494, + "grad_norm": 13.875, + "grad_norm_var": 0.21053059895833334, + "learning_rate": 0.0003, + "loss": 11.2204, + "loss/aux_loss": 0.048071037791669366, + "loss/crossentropy": 2.6401973962783813, + "loss/logits": 0.8432885766029358, + "step": 44940 + }, + { + "epoch": 0.4495, + "grad_norm": 14.8125, + "grad_norm_var": 0.7831868489583333, + "learning_rate": 0.0003, + "loss": 11.0892, + "loss/aux_loss": 0.04807609617710114, + "loss/crossentropy": 2.685221529006958, + "loss/logits": 0.8308875828981399, + "step": 44950 + }, + { + "epoch": 0.4496, + "grad_norm": 13.1875, + "grad_norm_var": 0.596337890625, + "learning_rate": 0.0003, + "loss": 11.2131, + "loss/aux_loss": 0.048079296760261056, + "loss/crossentropy": 2.694732528924942, + "loss/logits": 0.8388465225696564, + "step": 44960 + }, + { + "epoch": 0.4497, + "grad_norm": 13.625, + "grad_norm_var": 0.5374348958333334, + "learning_rate": 0.0003, + "loss": 11.0489, + "loss/aux_loss": 0.048061837814748286, + "loss/crossentropy": 2.737008786201477, + "loss/logits": 0.850694689154625, + "step": 44970 + }, + { + "epoch": 0.4498, + "grad_norm": 13.6875, + "grad_norm_var": 0.3843587239583333, + "learning_rate": 0.0003, + "loss": 11.1391, + "loss/aux_loss": 0.048087633959949014, + "loss/crossentropy": 2.8430655121803285, + "loss/logits": 0.8736658453941345, + "step": 44980 + }, + { + "epoch": 0.4499, + "grad_norm": 13.6875, + "grad_norm_var": 0.2945149739583333, + "learning_rate": 0.0003, + "loss": 11.1745, + "loss/aux_loss": 0.04807091951370239, + "loss/crossentropy": 2.856264519691467, + "loss/logits": 0.8631505787372589, + "step": 44990 + }, + { + "epoch": 0.45, + "grad_norm": 14.4375, + "grad_norm_var": 0.34576822916666666, + "learning_rate": 0.0003, + "loss": 11.1792, + "loss/aux_loss": 0.048074228875339034, + "loss/crossentropy": 2.7000016987323763, + "loss/logits": 0.8304085314273835, + "step": 45000 + }, + { + "epoch": 0.4501, + "grad_norm": 19.375, + "grad_norm_var": 1.9964680989583334, + "learning_rate": 0.0003, + "loss": 11.201, + "loss/aux_loss": 0.04807187356054783, + "loss/crossentropy": 2.5905265331268312, + "loss/logits": 0.8301917672157287, + "step": 45010 + }, + { + "epoch": 0.4502, + "grad_norm": 14.5625, + "grad_norm_var": 1.995166015625, + "learning_rate": 0.0003, + "loss": 11.3012, + "loss/aux_loss": 0.048079838044941425, + "loss/crossentropy": 2.655366039276123, + "loss/logits": 0.8250725924968719, + "step": 45020 + }, + { + "epoch": 0.4503, + "grad_norm": 13.9375, + "grad_norm_var": 0.453125, + "learning_rate": 0.0003, + "loss": 11.0779, + "loss/aux_loss": 0.048072745092213154, + "loss/crossentropy": 2.873497819900513, + "loss/logits": 0.8643653631210327, + "step": 45030 + }, + { + "epoch": 0.4504, + "grad_norm": 13.875, + "grad_norm_var": 0.2747395833333333, + "learning_rate": 0.0003, + "loss": 11.0926, + "loss/aux_loss": 0.04807302486151457, + "loss/crossentropy": 2.816511571407318, + "loss/logits": 0.8430682748556138, + "step": 45040 + }, + { + "epoch": 0.4505, + "grad_norm": 15.0, + "grad_norm_var": 9.468994140625, + "learning_rate": 0.0003, + "loss": 11.2874, + "loss/aux_loss": 0.0480752307921648, + "loss/crossentropy": 2.796900761127472, + "loss/logits": 0.8639124810695649, + "step": 45050 + }, + { + "epoch": 0.4506, + "grad_norm": 15.4375, + "grad_norm_var": 0.5764973958333334, + "learning_rate": 0.0003, + "loss": 11.2776, + "loss/aux_loss": 0.048078755289316176, + "loss/crossentropy": 2.770525109767914, + "loss/logits": 0.8221762269735337, + "step": 45060 + }, + { + "epoch": 0.4507, + "grad_norm": 14.0625, + "grad_norm_var": 0.5235514322916667, + "learning_rate": 0.0003, + "loss": 11.0756, + "loss/aux_loss": 0.048073511384427545, + "loss/crossentropy": 2.642447865009308, + "loss/logits": 0.823766753077507, + "step": 45070 + }, + { + "epoch": 0.4508, + "grad_norm": 14.25, + "grad_norm_var": 0.667822265625, + "learning_rate": 0.0003, + "loss": 11.2534, + "loss/aux_loss": 0.04807865601032972, + "loss/crossentropy": 2.773304843902588, + "loss/logits": 0.8674245417118073, + "step": 45080 + }, + { + "epoch": 0.4509, + "grad_norm": 14.0625, + "grad_norm_var": 1.1044108072916667, + "learning_rate": 0.0003, + "loss": 10.9283, + "loss/aux_loss": 0.04806472901254892, + "loss/crossentropy": 2.545005625486374, + "loss/logits": 0.7938053220510483, + "step": 45090 + }, + { + "epoch": 0.451, + "grad_norm": 14.625, + "grad_norm_var": 0.41287434895833336, + "learning_rate": 0.0003, + "loss": 10.9847, + "loss/aux_loss": 0.04808530602604151, + "loss/crossentropy": 2.6405605256557463, + "loss/logits": 0.8240345329046249, + "step": 45100 + }, + { + "epoch": 0.4511, + "grad_norm": 14.1875, + "grad_norm_var": 0.48605143229166664, + "learning_rate": 0.0003, + "loss": 11.2396, + "loss/aux_loss": 0.04806354120373726, + "loss/crossentropy": 2.7839693784713746, + "loss/logits": 0.8868257701396942, + "step": 45110 + }, + { + "epoch": 0.4512, + "grad_norm": 16.875, + "grad_norm_var": 0.9191243489583333, + "learning_rate": 0.0003, + "loss": 11.0655, + "loss/aux_loss": 0.048075102269649506, + "loss/crossentropy": 2.752034366130829, + "loss/logits": 0.840661883354187, + "step": 45120 + }, + { + "epoch": 0.4513, + "grad_norm": 16.25, + "grad_norm_var": 1.6202473958333334, + "learning_rate": 0.0003, + "loss": 11.2822, + "loss/aux_loss": 0.048065843246877196, + "loss/crossentropy": 2.7595421195030214, + "loss/logits": 0.8664416402578354, + "step": 45130 + }, + { + "epoch": 0.4514, + "grad_norm": 14.5625, + "grad_norm_var": 0.522509765625, + "learning_rate": 0.0003, + "loss": 11.1742, + "loss/aux_loss": 0.04807279203087091, + "loss/crossentropy": 2.7141676127910612, + "loss/logits": 0.8610961318016053, + "step": 45140 + }, + { + "epoch": 0.4515, + "grad_norm": 15.8125, + "grad_norm_var": 0.7348307291666667, + "learning_rate": 0.0003, + "loss": 11.1127, + "loss/aux_loss": 0.04807740245014429, + "loss/crossentropy": 2.674371284246445, + "loss/logits": 0.8470130562782288, + "step": 45150 + }, + { + "epoch": 0.4516, + "grad_norm": 13.25, + "grad_norm_var": 5.373958333333333, + "learning_rate": 0.0003, + "loss": 11.1149, + "loss/aux_loss": 0.04808497317135334, + "loss/crossentropy": 2.658644849061966, + "loss/logits": 0.8162630528211594, + "step": 45160 + }, + { + "epoch": 0.4517, + "grad_norm": 14.75, + "grad_norm_var": 4.355322265625, + "learning_rate": 0.0003, + "loss": 11.1443, + "loss/aux_loss": 0.048073196038603785, + "loss/crossentropy": 2.693699061870575, + "loss/logits": 0.8589479506015778, + "step": 45170 + }, + { + "epoch": 0.4518, + "grad_norm": 16.875, + "grad_norm_var": 0.7890625, + "learning_rate": 0.0003, + "loss": 11.2395, + "loss/aux_loss": 0.04807229600846767, + "loss/crossentropy": 2.809789764881134, + "loss/logits": 0.8285229980945588, + "step": 45180 + }, + { + "epoch": 0.4519, + "grad_norm": 14.0625, + "grad_norm_var": 0.75, + "learning_rate": 0.0003, + "loss": 10.9856, + "loss/aux_loss": 0.048070633225142954, + "loss/crossentropy": 2.75104238986969, + "loss/logits": 0.8169887810945511, + "step": 45190 + }, + { + "epoch": 0.452, + "grad_norm": 14.0, + "grad_norm_var": 0.47239583333333335, + "learning_rate": 0.0003, + "loss": 11.2444, + "loss/aux_loss": 0.04806809239089489, + "loss/crossentropy": 2.7128111243247988, + "loss/logits": 0.8189259111881256, + "step": 45200 + }, + { + "epoch": 0.4521, + "grad_norm": 14.0625, + "grad_norm_var": 0.6692057291666667, + "learning_rate": 0.0003, + "loss": 10.9503, + "loss/aux_loss": 0.0480776846408844, + "loss/crossentropy": 2.7245679616928102, + "loss/logits": 0.8289970546960831, + "step": 45210 + }, + { + "epoch": 0.4522, + "grad_norm": 13.3125, + "grad_norm_var": 0.5874348958333333, + "learning_rate": 0.0003, + "loss": 11.094, + "loss/aux_loss": 0.04808888658881187, + "loss/crossentropy": 2.482409542798996, + "loss/logits": 0.8053638786077499, + "step": 45220 + }, + { + "epoch": 0.4523, + "grad_norm": 13.875, + "grad_norm_var": 0.5699055989583334, + "learning_rate": 0.0003, + "loss": 11.1009, + "loss/aux_loss": 0.04806402549147606, + "loss/crossentropy": 2.8999637961387634, + "loss/logits": 0.8342925250530243, + "step": 45230 + }, + { + "epoch": 0.4524, + "grad_norm": 13.0, + "grad_norm_var": 0.3544108072916667, + "learning_rate": 0.0003, + "loss": 11.1598, + "loss/aux_loss": 0.04807737078517675, + "loss/crossentropy": 2.8522120237350466, + "loss/logits": 0.8593276113271713, + "step": 45240 + }, + { + "epoch": 0.4525, + "grad_norm": 14.875, + "grad_norm_var": 0.5133951822916667, + "learning_rate": 0.0003, + "loss": 10.9334, + "loss/aux_loss": 0.048071842454373834, + "loss/crossentropy": 2.540039598941803, + "loss/logits": 0.8037934333086014, + "step": 45250 + }, + { + "epoch": 0.4526, + "grad_norm": 14.625, + "grad_norm_var": 0.62109375, + "learning_rate": 0.0003, + "loss": 11.1593, + "loss/aux_loss": 0.04807714056223631, + "loss/crossentropy": 2.7473401188850404, + "loss/logits": 0.8323444128036499, + "step": 45260 + }, + { + "epoch": 0.4527, + "grad_norm": 13.375, + "grad_norm_var": 0.5430826822916667, + "learning_rate": 0.0003, + "loss": 11.1556, + "loss/aux_loss": 0.048075577989220616, + "loss/crossentropy": 2.639462560415268, + "loss/logits": 0.8283006697893143, + "step": 45270 + }, + { + "epoch": 0.4528, + "grad_norm": 14.5, + "grad_norm_var": 0.459619140625, + "learning_rate": 0.0003, + "loss": 11.0925, + "loss/aux_loss": 0.04807265214622021, + "loss/crossentropy": 2.733612394332886, + "loss/logits": 0.8260679453611374, + "step": 45280 + }, + { + "epoch": 0.4529, + "grad_norm": 15.25, + "grad_norm_var": 0.3228515625, + "learning_rate": 0.0003, + "loss": 11.2664, + "loss/aux_loss": 0.04807320795953274, + "loss/crossentropy": 2.817549741268158, + "loss/logits": 0.872169628739357, + "step": 45290 + }, + { + "epoch": 0.453, + "grad_norm": 14.625, + "grad_norm_var": 1.0290201822916667, + "learning_rate": 0.0003, + "loss": 11.2883, + "loss/aux_loss": 0.04808189887553453, + "loss/crossentropy": 2.7291213452816008, + "loss/logits": 0.8284125924110413, + "step": 45300 + }, + { + "epoch": 0.4531, + "grad_norm": 14.25, + "grad_norm_var": 0.38619791666666664, + "learning_rate": 0.0003, + "loss": 11.3393, + "loss/aux_loss": 0.04806639589369297, + "loss/crossentropy": 2.888676828145981, + "loss/logits": 0.8628419786691666, + "step": 45310 + }, + { + "epoch": 0.4532, + "grad_norm": 16.125, + "grad_norm_var": 30.763395182291667, + "learning_rate": 0.0003, + "loss": 11.3253, + "loss/aux_loss": 0.04808086268603802, + "loss/crossentropy": 2.8755642414093017, + "loss/logits": 0.8577650129795075, + "step": 45320 + }, + { + "epoch": 0.4533, + "grad_norm": 14.625, + "grad_norm_var": 26.501936848958334, + "learning_rate": 0.0003, + "loss": 11.3604, + "loss/aux_loss": 0.04807670023292303, + "loss/crossentropy": 2.795285141468048, + "loss/logits": 0.8813230514526367, + "step": 45330 + }, + { + "epoch": 0.4534, + "grad_norm": 14.1875, + "grad_norm_var": 8.334830729166667, + "learning_rate": 0.0003, + "loss": 10.9839, + "loss/aux_loss": 0.048065191879868505, + "loss/crossentropy": 2.9003730535507204, + "loss/logits": 0.839951154589653, + "step": 45340 + }, + { + "epoch": 0.4535, + "grad_norm": 13.25, + "grad_norm_var": 0.870166015625, + "learning_rate": 0.0003, + "loss": 11.0878, + "loss/aux_loss": 0.048059961013495925, + "loss/crossentropy": 2.8457574963569643, + "loss/logits": 0.8674950510263443, + "step": 45350 + }, + { + "epoch": 0.4536, + "grad_norm": 14.8125, + "grad_norm_var": 2.363264973958333, + "learning_rate": 0.0003, + "loss": 11.1738, + "loss/aux_loss": 0.04807946924120188, + "loss/crossentropy": 2.8464213252067565, + "loss/logits": 0.8665098369121551, + "step": 45360 + }, + { + "epoch": 0.4537, + "grad_norm": 13.25, + "grad_norm_var": 2.085400390625, + "learning_rate": 0.0003, + "loss": 11.1825, + "loss/aux_loss": 0.04806684292852879, + "loss/crossentropy": 2.7150739192962647, + "loss/logits": 0.8413305938243866, + "step": 45370 + }, + { + "epoch": 0.4538, + "grad_norm": 13.3125, + "grad_norm_var": 1.294775390625, + "learning_rate": 0.0003, + "loss": 11.2586, + "loss/aux_loss": 0.048075356893241404, + "loss/crossentropy": 2.809218281507492, + "loss/logits": 0.8536065101623536, + "step": 45380 + }, + { + "epoch": 0.4539, + "grad_norm": 13.8125, + "grad_norm_var": 1.880712890625, + "learning_rate": 0.0003, + "loss": 11.1555, + "loss/aux_loss": 0.04809560999274254, + "loss/crossentropy": 2.602170443534851, + "loss/logits": 0.8421902984380722, + "step": 45390 + }, + { + "epoch": 0.454, + "grad_norm": 13.25, + "grad_norm_var": 2.2202962239583335, + "learning_rate": 0.0003, + "loss": 11.1732, + "loss/aux_loss": 0.04805552512407303, + "loss/crossentropy": 2.8673832774162293, + "loss/logits": 0.8450622230768203, + "step": 45400 + }, + { + "epoch": 0.4541, + "grad_norm": 13.75, + "grad_norm_var": 0.9009765625, + "learning_rate": 0.0003, + "loss": 11.023, + "loss/aux_loss": 0.04807879459112883, + "loss/crossentropy": 2.6993161380290984, + "loss/logits": 0.8065011203289032, + "step": 45410 + }, + { + "epoch": 0.4542, + "grad_norm": 13.5, + "grad_norm_var": 0.83046875, + "learning_rate": 0.0003, + "loss": 11.0644, + "loss/aux_loss": 0.04806725066155195, + "loss/crossentropy": 2.628107964992523, + "loss/logits": 0.8442522406578064, + "step": 45420 + }, + { + "epoch": 0.4543, + "grad_norm": 14.0, + "grad_norm_var": 0.7358723958333333, + "learning_rate": 0.0003, + "loss": 11.1131, + "loss/aux_loss": 0.04807357750833034, + "loss/crossentropy": 2.7379313945770263, + "loss/logits": 0.8388034462928772, + "step": 45430 + }, + { + "epoch": 0.4544, + "grad_norm": 13.8125, + "grad_norm_var": 0.3941243489583333, + "learning_rate": 0.0003, + "loss": 11.2006, + "loss/aux_loss": 0.04807491805404425, + "loss/crossentropy": 2.750547635555267, + "loss/logits": 0.8313853859901428, + "step": 45440 + }, + { + "epoch": 0.4545, + "grad_norm": 13.875, + "grad_norm_var": 0.5580729166666667, + "learning_rate": 0.0003, + "loss": 11.2105, + "loss/aux_loss": 0.04807582795619965, + "loss/crossentropy": 2.771452808380127, + "loss/logits": 0.8088362455368042, + "step": 45450 + }, + { + "epoch": 0.4546, + "grad_norm": 13.6875, + "grad_norm_var": 0.7884765625, + "learning_rate": 0.0003, + "loss": 11.1884, + "loss/aux_loss": 0.04807773567736149, + "loss/crossentropy": 2.833453130722046, + "loss/logits": 0.8409688085317611, + "step": 45460 + }, + { + "epoch": 0.4547, + "grad_norm": 16.875, + "grad_norm_var": 1.33984375, + "learning_rate": 0.0003, + "loss": 11.1772, + "loss/aux_loss": 0.04806816857308149, + "loss/crossentropy": 2.5675463676452637, + "loss/logits": 0.8408284574747086, + "step": 45470 + }, + { + "epoch": 0.4548, + "grad_norm": 13.3125, + "grad_norm_var": 1.1582682291666666, + "learning_rate": 0.0003, + "loss": 11.3571, + "loss/aux_loss": 0.04808399137109518, + "loss/crossentropy": 2.719745373725891, + "loss/logits": 0.8130148202180862, + "step": 45480 + }, + { + "epoch": 0.4549, + "grad_norm": 14.8125, + "grad_norm_var": 0.7718587239583333, + "learning_rate": 0.0003, + "loss": 11.2929, + "loss/aux_loss": 0.048072703368961814, + "loss/crossentropy": 2.8237855315208433, + "loss/logits": 0.886102220416069, + "step": 45490 + }, + { + "epoch": 0.455, + "grad_norm": 14.75, + "grad_norm_var": 0.449072265625, + "learning_rate": 0.0003, + "loss": 11.0946, + "loss/aux_loss": 0.04806439485400915, + "loss/crossentropy": 2.6629326224327086, + "loss/logits": 0.8396694749593735, + "step": 45500 + }, + { + "epoch": 0.4551, + "grad_norm": 14.125, + "grad_norm_var": 0.4596354166666667, + "learning_rate": 0.0003, + "loss": 11.3091, + "loss/aux_loss": 0.04807718340307474, + "loss/crossentropy": 2.878407192230225, + "loss/logits": 0.8687193512916564, + "step": 45510 + }, + { + "epoch": 0.4552, + "grad_norm": 13.375, + "grad_norm_var": 0.4332682291666667, + "learning_rate": 0.0003, + "loss": 11.1584, + "loss/aux_loss": 0.04807638432830572, + "loss/crossentropy": 2.7596149682998656, + "loss/logits": 0.8342228949069976, + "step": 45520 + }, + { + "epoch": 0.4553, + "grad_norm": 13.5625, + "grad_norm_var": 0.2872395833333333, + "learning_rate": 0.0003, + "loss": 11.2693, + "loss/aux_loss": 0.04807369913905859, + "loss/crossentropy": 3.037293183803558, + "loss/logits": 0.8476502895355225, + "step": 45530 + }, + { + "epoch": 0.4554, + "grad_norm": 14.375, + "grad_norm_var": 0.8960774739583334, + "learning_rate": 0.0003, + "loss": 10.9162, + "loss/aux_loss": 0.04806880354881286, + "loss/crossentropy": 2.5978680908679963, + "loss/logits": 0.8057895511388778, + "step": 45540 + }, + { + "epoch": 0.4555, + "grad_norm": 14.5625, + "grad_norm_var": 0.954931640625, + "learning_rate": 0.0003, + "loss": 11.0929, + "loss/aux_loss": 0.04807190727442503, + "loss/crossentropy": 2.7636757493019104, + "loss/logits": 0.8655385166406632, + "step": 45550 + }, + { + "epoch": 0.4556, + "grad_norm": 14.875, + "grad_norm_var": 0.3963541666666667, + "learning_rate": 0.0003, + "loss": 11.1835, + "loss/aux_loss": 0.048070300556719306, + "loss/crossentropy": 2.5805073499679567, + "loss/logits": 0.846220064163208, + "step": 45560 + }, + { + "epoch": 0.4557, + "grad_norm": 13.875, + "grad_norm_var": 0.7149576822916667, + "learning_rate": 0.0003, + "loss": 11.0291, + "loss/aux_loss": 0.04808596391230822, + "loss/crossentropy": 2.6298948764801025, + "loss/logits": 0.8278081536293029, + "step": 45570 + }, + { + "epoch": 0.4558, + "grad_norm": 13.4375, + "grad_norm_var": 0.8462890625, + "learning_rate": 0.0003, + "loss": 11.0205, + "loss/aux_loss": 0.04807253833860159, + "loss/crossentropy": 2.6736935675144196, + "loss/logits": 0.8378624528646469, + "step": 45580 + }, + { + "epoch": 0.4559, + "grad_norm": 13.9375, + "grad_norm_var": 0.6683430989583333, + "learning_rate": 0.0003, + "loss": 11.2032, + "loss/aux_loss": 0.04806935098022223, + "loss/crossentropy": 2.6951875925064086, + "loss/logits": 0.818251371383667, + "step": 45590 + }, + { + "epoch": 0.456, + "grad_norm": 14.3125, + "grad_norm_var": 0.2919108072916667, + "learning_rate": 0.0003, + "loss": 11.2407, + "loss/aux_loss": 0.04809526577591896, + "loss/crossentropy": 2.7838382720947266, + "loss/logits": 0.8391630411148071, + "step": 45600 + }, + { + "epoch": 0.4561, + "grad_norm": 14.5625, + "grad_norm_var": 0.6301432291666667, + "learning_rate": 0.0003, + "loss": 11.1217, + "loss/aux_loss": 0.048053614981472495, + "loss/crossentropy": 2.632568824291229, + "loss/logits": 0.845693039894104, + "step": 45610 + }, + { + "epoch": 0.4562, + "grad_norm": 15.8125, + "grad_norm_var": 0.6426920572916667, + "learning_rate": 0.0003, + "loss": 11.1653, + "loss/aux_loss": 0.048069927655160424, + "loss/crossentropy": 2.8206692337989807, + "loss/logits": 0.8522645890712738, + "step": 45620 + }, + { + "epoch": 0.4563, + "grad_norm": 14.75, + "grad_norm_var": 0.8895833333333333, + "learning_rate": 0.0003, + "loss": 11.037, + "loss/aux_loss": 0.048098363913595676, + "loss/crossentropy": 2.769987952709198, + "loss/logits": 0.8345672219991684, + "step": 45630 + }, + { + "epoch": 0.4564, + "grad_norm": 16.625, + "grad_norm_var": 0.988525390625, + "learning_rate": 0.0003, + "loss": 11.143, + "loss/aux_loss": 0.04806884527206421, + "loss/crossentropy": 2.755461257696152, + "loss/logits": 0.8266684681177139, + "step": 45640 + }, + { + "epoch": 0.4565, + "grad_norm": 13.8125, + "grad_norm_var": 1.1817708333333334, + "learning_rate": 0.0003, + "loss": 11.2068, + "loss/aux_loss": 0.04807171430438757, + "loss/crossentropy": 2.7919964730739593, + "loss/logits": 0.8207480728626251, + "step": 45650 + }, + { + "epoch": 0.4566, + "grad_norm": 13.125, + "grad_norm_var": 0.7269368489583333, + "learning_rate": 0.0003, + "loss": 11.0707, + "loss/aux_loss": 0.04808006528764963, + "loss/crossentropy": 2.6854530811309814, + "loss/logits": 0.8648782402276993, + "step": 45660 + }, + { + "epoch": 0.4567, + "grad_norm": 13.6875, + "grad_norm_var": 0.7372395833333333, + "learning_rate": 0.0003, + "loss": 11.2434, + "loss/aux_loss": 0.04807576704770326, + "loss/crossentropy": 2.7998494148254394, + "loss/logits": 0.812621483206749, + "step": 45670 + }, + { + "epoch": 0.4568, + "grad_norm": 16.625, + "grad_norm_var": 0.718603515625, + "learning_rate": 0.0003, + "loss": 11.2177, + "loss/aux_loss": 0.04807013794779778, + "loss/crossentropy": 2.7176717042922975, + "loss/logits": 0.8751024842262268, + "step": 45680 + }, + { + "epoch": 0.4569, + "grad_norm": 14.25, + "grad_norm_var": 0.5817057291666666, + "learning_rate": 0.0003, + "loss": 11.094, + "loss/aux_loss": 0.0480797715485096, + "loss/crossentropy": 2.630698436498642, + "loss/logits": 0.8169205486774445, + "step": 45690 + }, + { + "epoch": 0.457, + "grad_norm": 15.0, + "grad_norm_var": 0.6380208333333334, + "learning_rate": 0.0003, + "loss": 11.2579, + "loss/aux_loss": 0.048068609088659286, + "loss/crossentropy": 2.7162768125534056, + "loss/logits": 0.8566293030977249, + "step": 45700 + }, + { + "epoch": 0.4571, + "grad_norm": 14.1875, + "grad_norm_var": 0.6869791666666667, + "learning_rate": 0.0003, + "loss": 11.0956, + "loss/aux_loss": 0.048069410026073456, + "loss/crossentropy": 2.666369599103928, + "loss/logits": 0.846130108833313, + "step": 45710 + }, + { + "epoch": 0.4572, + "grad_norm": 14.875, + "grad_norm_var": 0.4239420572916667, + "learning_rate": 0.0003, + "loss": 11.3907, + "loss/aux_loss": 0.04808373041450977, + "loss/crossentropy": 2.7648482978343965, + "loss/logits": 0.8433201760053635, + "step": 45720 + }, + { + "epoch": 0.4573, + "grad_norm": 14.3125, + "grad_norm_var": 0.3359375, + "learning_rate": 0.0003, + "loss": 10.9668, + "loss/aux_loss": 0.04806965310126543, + "loss/crossentropy": 2.628385591506958, + "loss/logits": 0.8232957303524018, + "step": 45730 + }, + { + "epoch": 0.4574, + "grad_norm": 13.5625, + "grad_norm_var": 0.17526041666666667, + "learning_rate": 0.0003, + "loss": 11.028, + "loss/aux_loss": 0.0480685856193304, + "loss/crossentropy": 2.8586939454078673, + "loss/logits": 0.8752603858709336, + "step": 45740 + }, + { + "epoch": 0.4575, + "grad_norm": 14.1875, + "grad_norm_var": 1.039697265625, + "learning_rate": 0.0003, + "loss": 11.2328, + "loss/aux_loss": 0.04807407390326261, + "loss/crossentropy": 2.7650853276252745, + "loss/logits": 0.8581269145011902, + "step": 45750 + }, + { + "epoch": 0.4576, + "grad_norm": 14.25, + "grad_norm_var": 1.2577962239583333, + "learning_rate": 0.0003, + "loss": 11.2529, + "loss/aux_loss": 0.04806558396667242, + "loss/crossentropy": 2.670504766702652, + "loss/logits": 0.8334173530340194, + "step": 45760 + }, + { + "epoch": 0.4577, + "grad_norm": 13.25, + "grad_norm_var": 0.2718098958333333, + "learning_rate": 0.0003, + "loss": 10.9535, + "loss/aux_loss": 0.04807530362159014, + "loss/crossentropy": 2.6863086402416227, + "loss/logits": 0.7867847800254821, + "step": 45770 + }, + { + "epoch": 0.4578, + "grad_norm": 14.75, + "grad_norm_var": 0.29739583333333336, + "learning_rate": 0.0003, + "loss": 11.0572, + "loss/aux_loss": 0.04807478487491608, + "loss/crossentropy": 2.7828167259693144, + "loss/logits": 0.8462360620498657, + "step": 45780 + }, + { + "epoch": 0.4579, + "grad_norm": 14.25, + "grad_norm_var": 0.5660807291666666, + "learning_rate": 0.0003, + "loss": 11.212, + "loss/aux_loss": 0.04808699581772089, + "loss/crossentropy": 2.7481481969356536, + "loss/logits": 0.8268178194761276, + "step": 45790 + }, + { + "epoch": 0.458, + "grad_norm": 15.4375, + "grad_norm_var": 0.563525390625, + "learning_rate": 0.0003, + "loss": 11.1529, + "loss/aux_loss": 0.0480684332549572, + "loss/crossentropy": 2.6542268633842467, + "loss/logits": 0.8398558348417282, + "step": 45800 + }, + { + "epoch": 0.4581, + "grad_norm": 14.5625, + "grad_norm_var": 0.570556640625, + "learning_rate": 0.0003, + "loss": 11.0782, + "loss/aux_loss": 0.04807002916932106, + "loss/crossentropy": 2.724322813749313, + "loss/logits": 0.8166536599397659, + "step": 45810 + }, + { + "epoch": 0.4582, + "grad_norm": 15.25, + "grad_norm_var": 0.9817545572916667, + "learning_rate": 0.0003, + "loss": 11.1127, + "loss/aux_loss": 0.04808499440550804, + "loss/crossentropy": 2.623278909921646, + "loss/logits": 0.8362865537405014, + "step": 45820 + }, + { + "epoch": 0.4583, + "grad_norm": 14.875, + "grad_norm_var": 1.3544270833333334, + "learning_rate": 0.0003, + "loss": 11.1638, + "loss/aux_loss": 0.048072373308241365, + "loss/crossentropy": 2.7822245001792907, + "loss/logits": 0.8362233757972717, + "step": 45830 + }, + { + "epoch": 0.4584, + "grad_norm": 15.25, + "grad_norm_var": 0.7874837239583333, + "learning_rate": 0.0003, + "loss": 11.0392, + "loss/aux_loss": 0.04805925581604242, + "loss/crossentropy": 2.83905189037323, + "loss/logits": 0.8411778301000595, + "step": 45840 + }, + { + "epoch": 0.4585, + "grad_norm": 14.0, + "grad_norm_var": 0.319384765625, + "learning_rate": 0.0003, + "loss": 11.136, + "loss/aux_loss": 0.048078327998518945, + "loss/crossentropy": 2.6659455597400665, + "loss/logits": 0.8175705790519714, + "step": 45850 + }, + { + "epoch": 0.4586, + "grad_norm": 14.0625, + "grad_norm_var": 0.1556640625, + "learning_rate": 0.0003, + "loss": 11.1928, + "loss/aux_loss": 0.04805846642702818, + "loss/crossentropy": 2.7064037203788756, + "loss/logits": 0.8587689280509949, + "step": 45860 + }, + { + "epoch": 0.4587, + "grad_norm": 14.8125, + "grad_norm_var": 0.6977701822916667, + "learning_rate": 0.0003, + "loss": 11.2205, + "loss/aux_loss": 0.04808104075491428, + "loss/crossentropy": 2.6719759106636047, + "loss/logits": 0.8492055386304855, + "step": 45870 + }, + { + "epoch": 0.4588, + "grad_norm": 14.1875, + "grad_norm_var": 68.06521809895834, + "learning_rate": 0.0003, + "loss": 11.3008, + "loss/aux_loss": 0.048071546480059624, + "loss/crossentropy": 2.944806432723999, + "loss/logits": 0.8660283535718918, + "step": 45880 + }, + { + "epoch": 0.4589, + "grad_norm": 13.1875, + "grad_norm_var": 0.8034993489583333, + "learning_rate": 0.0003, + "loss": 11.1377, + "loss/aux_loss": 0.04807901885360479, + "loss/crossentropy": 2.7466224670410155, + "loss/logits": 0.8269301950931549, + "step": 45890 + }, + { + "epoch": 0.459, + "grad_norm": 14.5, + "grad_norm_var": 1.1934733072916666, + "learning_rate": 0.0003, + "loss": 10.9827, + "loss/aux_loss": 0.04806364104151726, + "loss/crossentropy": 2.5919273018836977, + "loss/logits": 0.8305024951696396, + "step": 45900 + }, + { + "epoch": 0.4591, + "grad_norm": 14.75, + "grad_norm_var": 0.9688639322916667, + "learning_rate": 0.0003, + "loss": 11.1109, + "loss/aux_loss": 0.04807860106229782, + "loss/crossentropy": 2.6946555733680726, + "loss/logits": 0.8430137366056443, + "step": 45910 + }, + { + "epoch": 0.4592, + "grad_norm": 13.8125, + "grad_norm_var": 0.5634765625, + "learning_rate": 0.0003, + "loss": 10.9809, + "loss/aux_loss": 0.048061893321573734, + "loss/crossentropy": 2.7854873657226564, + "loss/logits": 0.8397706598043442, + "step": 45920 + }, + { + "epoch": 0.4593, + "grad_norm": 14.3125, + "grad_norm_var": 0.615869140625, + "learning_rate": 0.0003, + "loss": 11.193, + "loss/aux_loss": 0.04807352740317583, + "loss/crossentropy": 2.7639912247657774, + "loss/logits": 0.8295485734939575, + "step": 45930 + }, + { + "epoch": 0.4594, + "grad_norm": 14.3125, + "grad_norm_var": 0.585400390625, + "learning_rate": 0.0003, + "loss": 11.0833, + "loss/aux_loss": 0.04807315729558468, + "loss/crossentropy": 2.7072408556938172, + "loss/logits": 0.83205626308918, + "step": 45940 + }, + { + "epoch": 0.4595, + "grad_norm": 13.375, + "grad_norm_var": 0.324462890625, + "learning_rate": 0.0003, + "loss": 11.1237, + "loss/aux_loss": 0.04807840902358294, + "loss/crossentropy": 2.775273883342743, + "loss/logits": 0.8464349508285522, + "step": 45950 + }, + { + "epoch": 0.4596, + "grad_norm": 14.3125, + "grad_norm_var": 0.7369791666666666, + "learning_rate": 0.0003, + "loss": 11.1625, + "loss/aux_loss": 0.048066251538693906, + "loss/crossentropy": 2.761694145202637, + "loss/logits": 0.8378835052251816, + "step": 45960 + }, + { + "epoch": 0.4597, + "grad_norm": 13.875, + "grad_norm_var": 0.39453125, + "learning_rate": 0.0003, + "loss": 11.16, + "loss/aux_loss": 0.048080692254006865, + "loss/crossentropy": 2.6108572721481322, + "loss/logits": 0.8397493064403534, + "step": 45970 + }, + { + "epoch": 0.4598, + "grad_norm": 14.25, + "grad_norm_var": 0.30572916666666666, + "learning_rate": 0.0003, + "loss": 11.1456, + "loss/aux_loss": 0.04807419683784246, + "loss/crossentropy": 2.7630624175071716, + "loss/logits": 0.8382513612508774, + "step": 45980 + }, + { + "epoch": 0.4599, + "grad_norm": 13.875, + "grad_norm_var": 0.295556640625, + "learning_rate": 0.0003, + "loss": 11.0327, + "loss/aux_loss": 0.048076769709587096, + "loss/crossentropy": 2.7021873712539675, + "loss/logits": 0.8546818345785141, + "step": 45990 + }, + { + "epoch": 0.46, + "grad_norm": 14.4375, + "grad_norm_var": 0.33932291666666664, + "learning_rate": 0.0003, + "loss": 11.1386, + "loss/aux_loss": 0.04808690138161183, + "loss/crossentropy": 2.621007192134857, + "loss/logits": 0.8594667464494705, + "step": 46000 + }, + { + "epoch": 0.4601, + "grad_norm": 17.5, + "grad_norm_var": 0.825, + "learning_rate": 0.0003, + "loss": 11.2569, + "loss/aux_loss": 0.04806713555008173, + "loss/crossentropy": 2.6976438403129577, + "loss/logits": 0.8213419556617737, + "step": 46010 + }, + { + "epoch": 0.4602, + "grad_norm": 13.875, + "grad_norm_var": 1.0632649739583333, + "learning_rate": 0.0003, + "loss": 11.113, + "loss/aux_loss": 0.048088106140494344, + "loss/crossentropy": 2.520746982097626, + "loss/logits": 0.8219372004270553, + "step": 46020 + }, + { + "epoch": 0.4603, + "grad_norm": 15.9375, + "grad_norm_var": 0.590478515625, + "learning_rate": 0.0003, + "loss": 11.2881, + "loss/aux_loss": 0.048078617081046104, + "loss/crossentropy": 2.840721619129181, + "loss/logits": 0.8715362250804901, + "step": 46030 + }, + { + "epoch": 0.4604, + "grad_norm": 13.4375, + "grad_norm_var": 0.7445149739583333, + "learning_rate": 0.0003, + "loss": 10.9794, + "loss/aux_loss": 0.048061452060937884, + "loss/crossentropy": 2.6260744273662566, + "loss/logits": 0.8213737875223159, + "step": 46040 + }, + { + "epoch": 0.4605, + "grad_norm": 14.0625, + "grad_norm_var": 0.2738118489583333, + "learning_rate": 0.0003, + "loss": 11.3335, + "loss/aux_loss": 0.048074982687830926, + "loss/crossentropy": 2.783993864059448, + "loss/logits": 0.8809378027915955, + "step": 46050 + }, + { + "epoch": 0.4606, + "grad_norm": 14.0625, + "grad_norm_var": 1.7400390625, + "learning_rate": 0.0003, + "loss": 11.1385, + "loss/aux_loss": 0.0480780715122819, + "loss/crossentropy": 2.767115068435669, + "loss/logits": 0.8657530009746551, + "step": 46060 + }, + { + "epoch": 0.4607, + "grad_norm": 14.5625, + "grad_norm_var": 1.1791015625, + "learning_rate": 0.0003, + "loss": 11.0435, + "loss/aux_loss": 0.04808122143149376, + "loss/crossentropy": 2.465041011571884, + "loss/logits": 0.8050199329853058, + "step": 46070 + }, + { + "epoch": 0.4608, + "grad_norm": 14.8125, + "grad_norm_var": 0.37135416666666665, + "learning_rate": 0.0003, + "loss": 11.1579, + "loss/aux_loss": 0.04807000420987606, + "loss/crossentropy": 2.614718121290207, + "loss/logits": 0.8386980295181274, + "step": 46080 + }, + { + "epoch": 0.4609, + "grad_norm": 14.8125, + "grad_norm_var": 2.321728515625, + "learning_rate": 0.0003, + "loss": 11.1314, + "loss/aux_loss": 0.0480863306671381, + "loss/crossentropy": 2.7782493591308595, + "loss/logits": 0.8749098181724548, + "step": 46090 + }, + { + "epoch": 0.461, + "grad_norm": 13.8125, + "grad_norm_var": 0.5106770833333333, + "learning_rate": 0.0003, + "loss": 11.1323, + "loss/aux_loss": 0.048072931729257105, + "loss/crossentropy": 2.8069660782814028, + "loss/logits": 0.827642685174942, + "step": 46100 + }, + { + "epoch": 0.4611, + "grad_norm": 13.75, + "grad_norm_var": 0.5239583333333333, + "learning_rate": 0.0003, + "loss": 11.1249, + "loss/aux_loss": 0.048074934631586075, + "loss/crossentropy": 2.5645545959472655, + "loss/logits": 0.8122676819562912, + "step": 46110 + }, + { + "epoch": 0.4612, + "grad_norm": 14.375, + "grad_norm_var": 0.8458170572916667, + "learning_rate": 0.0003, + "loss": 11.1389, + "loss/aux_loss": 0.04807667378336191, + "loss/crossentropy": 2.7756105303764342, + "loss/logits": 0.8374488890171051, + "step": 46120 + }, + { + "epoch": 0.4613, + "grad_norm": 14.5, + "grad_norm_var": 0.643212890625, + "learning_rate": 0.0003, + "loss": 11.1476, + "loss/aux_loss": 0.048084722831845284, + "loss/crossentropy": 2.862342894077301, + "loss/logits": 0.8817748308181763, + "step": 46130 + }, + { + "epoch": 0.4614, + "grad_norm": 19.375, + "grad_norm_var": 185.560791015625, + "learning_rate": 0.0003, + "loss": 11.1808, + "loss/aux_loss": 0.048067561350762844, + "loss/crossentropy": 2.596503585577011, + "loss/logits": 0.8253944367170334, + "step": 46140 + }, + { + "epoch": 0.4615, + "grad_norm": 14.5, + "grad_norm_var": 185.8166015625, + "learning_rate": 0.0003, + "loss": 11.1384, + "loss/aux_loss": 0.04807464778423309, + "loss/crossentropy": 2.654829728603363, + "loss/logits": 0.8233345150947571, + "step": 46150 + }, + { + "epoch": 0.4616, + "grad_norm": 17.125, + "grad_norm_var": 0.84609375, + "learning_rate": 0.0003, + "loss": 11.0069, + "loss/aux_loss": 0.048072910867631435, + "loss/crossentropy": 2.758739471435547, + "loss/logits": 0.8712568372488022, + "step": 46160 + }, + { + "epoch": 0.4617, + "grad_norm": 13.875, + "grad_norm_var": 0.951025390625, + "learning_rate": 0.0003, + "loss": 11.0499, + "loss/aux_loss": 0.0480669179931283, + "loss/crossentropy": 2.609746116399765, + "loss/logits": 0.8174644142389298, + "step": 46170 + }, + { + "epoch": 0.4618, + "grad_norm": 16.5, + "grad_norm_var": 0.5794270833333334, + "learning_rate": 0.0003, + "loss": 11.1406, + "loss/aux_loss": 0.04809269942343235, + "loss/crossentropy": 2.7125259757041933, + "loss/logits": 0.829924201965332, + "step": 46180 + }, + { + "epoch": 0.4619, + "grad_norm": 14.6875, + "grad_norm_var": 0.4805826822916667, + "learning_rate": 0.0003, + "loss": 11.194, + "loss/aux_loss": 0.04806003961712122, + "loss/crossentropy": 2.7480635344982147, + "loss/logits": 0.844669246673584, + "step": 46190 + }, + { + "epoch": 0.462, + "grad_norm": 14.25, + "grad_norm_var": 0.4934895833333333, + "learning_rate": 0.0003, + "loss": 11.2594, + "loss/aux_loss": 0.0480703953653574, + "loss/crossentropy": 2.7278804779052734, + "loss/logits": 0.8578185975551605, + "step": 46200 + }, + { + "epoch": 0.4621, + "grad_norm": 15.125, + "grad_norm_var": 0.4827473958333333, + "learning_rate": 0.0003, + "loss": 11.1531, + "loss/aux_loss": 0.04808447137475014, + "loss/crossentropy": 2.583157116174698, + "loss/logits": 0.8229734599590302, + "step": 46210 + }, + { + "epoch": 0.4622, + "grad_norm": 14.375, + "grad_norm_var": 0.181884765625, + "learning_rate": 0.0003, + "loss": 11.2071, + "loss/aux_loss": 0.048074091970920566, + "loss/crossentropy": 2.6130159497261047, + "loss/logits": 0.8274956196546555, + "step": 46220 + }, + { + "epoch": 0.4623, + "grad_norm": 14.875, + "grad_norm_var": 0.326806640625, + "learning_rate": 0.0003, + "loss": 11.1961, + "loss/aux_loss": 0.04808221161365509, + "loss/crossentropy": 2.7028493165969847, + "loss/logits": 0.8648825436830521, + "step": 46230 + }, + { + "epoch": 0.4624, + "grad_norm": 13.9375, + "grad_norm_var": 0.405712890625, + "learning_rate": 0.0003, + "loss": 11.0438, + "loss/aux_loss": 0.04807335082441568, + "loss/crossentropy": 2.5999584436416625, + "loss/logits": 0.8028821110725403, + "step": 46240 + }, + { + "epoch": 0.4625, + "grad_norm": 13.0, + "grad_norm_var": 0.47263997395833335, + "learning_rate": 0.0003, + "loss": 11.0311, + "loss/aux_loss": 0.04807595033198595, + "loss/crossentropy": 2.7554810464382173, + "loss/logits": 0.841679847240448, + "step": 46250 + }, + { + "epoch": 0.4626, + "grad_norm": 13.9375, + "grad_norm_var": 0.47858072916666666, + "learning_rate": 0.0003, + "loss": 11.0339, + "loss/aux_loss": 0.04807290639728308, + "loss/crossentropy": 2.826436698436737, + "loss/logits": 0.8312930345535279, + "step": 46260 + }, + { + "epoch": 0.4627, + "grad_norm": 14.875, + "grad_norm_var": 0.5843587239583333, + "learning_rate": 0.0003, + "loss": 11.194, + "loss/aux_loss": 0.04807629976421594, + "loss/crossentropy": 2.6330519676208497, + "loss/logits": 0.8189602941274643, + "step": 46270 + }, + { + "epoch": 0.4628, + "grad_norm": 14.0, + "grad_norm_var": 0.36912434895833335, + "learning_rate": 0.0003, + "loss": 11.1749, + "loss/aux_loss": 0.048078407719731334, + "loss/crossentropy": 2.704203653335571, + "loss/logits": 0.8706455767154694, + "step": 46280 + }, + { + "epoch": 0.4629, + "grad_norm": 14.3125, + "grad_norm_var": 0.361181640625, + "learning_rate": 0.0003, + "loss": 11.0766, + "loss/aux_loss": 0.04808397404849529, + "loss/crossentropy": 2.679097306728363, + "loss/logits": 0.8224625796079635, + "step": 46290 + }, + { + "epoch": 0.463, + "grad_norm": 14.25, + "grad_norm_var": 0.33982747395833335, + "learning_rate": 0.0003, + "loss": 11.0087, + "loss/aux_loss": 0.04806742705404758, + "loss/crossentropy": 2.6513377904891966, + "loss/logits": 0.7877238169312477, + "step": 46300 + }, + { + "epoch": 0.4631, + "grad_norm": 14.875, + "grad_norm_var": 0.46484375, + "learning_rate": 0.0003, + "loss": 11.1393, + "loss/aux_loss": 0.04807655327022076, + "loss/crossentropy": 2.542707550525665, + "loss/logits": 0.8172414094209671, + "step": 46310 + }, + { + "epoch": 0.4632, + "grad_norm": 14.1875, + "grad_norm_var": 0.9903483072916667, + "learning_rate": 0.0003, + "loss": 11.1643, + "loss/aux_loss": 0.04806794133037329, + "loss/crossentropy": 2.8341934442520142, + "loss/logits": 0.8416117280721664, + "step": 46320 + }, + { + "epoch": 0.4633, + "grad_norm": 14.25, + "grad_norm_var": 0.546875, + "learning_rate": 0.0003, + "loss": 11.1235, + "loss/aux_loss": 0.04808292984962463, + "loss/crossentropy": 2.7436033606529238, + "loss/logits": 0.8215555369853973, + "step": 46330 + }, + { + "epoch": 0.4634, + "grad_norm": 15.125, + "grad_norm_var": 0.32472330729166665, + "learning_rate": 0.0003, + "loss": 11.4896, + "loss/aux_loss": 0.04807242415845394, + "loss/crossentropy": 2.7257355570793154, + "loss/logits": 0.8585422575473786, + "step": 46340 + }, + { + "epoch": 0.4635, + "grad_norm": 14.1875, + "grad_norm_var": 7.888004557291667, + "learning_rate": 0.0003, + "loss": 11.112, + "loss/aux_loss": 0.0480771217495203, + "loss/crossentropy": 2.5921223521232606, + "loss/logits": 0.8013135939836502, + "step": 46350 + }, + { + "epoch": 0.4636, + "grad_norm": 15.625, + "grad_norm_var": 6.84375, + "learning_rate": 0.0003, + "loss": 11.2715, + "loss/aux_loss": 0.04807939790189266, + "loss/crossentropy": 2.74160099029541, + "loss/logits": 0.8368293017148971, + "step": 46360 + }, + { + "epoch": 0.4637, + "grad_norm": 14.4375, + "grad_norm_var": 0.4964680989583333, + "learning_rate": 0.0003, + "loss": 11.2342, + "loss/aux_loss": 0.04807531572878361, + "loss/crossentropy": 2.690195268392563, + "loss/logits": 0.8286543309688568, + "step": 46370 + }, + { + "epoch": 0.4638, + "grad_norm": 13.8125, + "grad_norm_var": 0.37161458333333336, + "learning_rate": 0.0003, + "loss": 11.1557, + "loss/aux_loss": 0.048067241348326205, + "loss/crossentropy": 2.800563335418701, + "loss/logits": 0.8472881704568863, + "step": 46380 + }, + { + "epoch": 0.4639, + "grad_norm": 14.875, + "grad_norm_var": 0.4212890625, + "learning_rate": 0.0003, + "loss": 11.2161, + "loss/aux_loss": 0.048077549785375595, + "loss/crossentropy": 2.8395881056785583, + "loss/logits": 0.8345950871706009, + "step": 46390 + }, + { + "epoch": 0.464, + "grad_norm": 13.4375, + "grad_norm_var": 0.580712890625, + "learning_rate": 0.0003, + "loss": 11.2054, + "loss/aux_loss": 0.048074125126004216, + "loss/crossentropy": 2.6557404458522798, + "loss/logits": 0.8473072737455368, + "step": 46400 + }, + { + "epoch": 0.4641, + "grad_norm": 14.25, + "grad_norm_var": 0.5900390625, + "learning_rate": 0.0003, + "loss": 11.1693, + "loss/aux_loss": 0.04806842841207981, + "loss/crossentropy": 2.796481668949127, + "loss/logits": 0.8443670809268952, + "step": 46410 + }, + { + "epoch": 0.4642, + "grad_norm": 14.0625, + "grad_norm_var": 0.6135416666666667, + "learning_rate": 0.0003, + "loss": 11.1548, + "loss/aux_loss": 0.048084372840821746, + "loss/crossentropy": 2.760413956642151, + "loss/logits": 0.8425527215003967, + "step": 46420 + }, + { + "epoch": 0.4643, + "grad_norm": 15.3125, + "grad_norm_var": 0.633056640625, + "learning_rate": 0.0003, + "loss": 11.2473, + "loss/aux_loss": 0.048075103759765626, + "loss/crossentropy": 2.7352624416351317, + "loss/logits": 0.8507170170545578, + "step": 46430 + }, + { + "epoch": 0.4644, + "grad_norm": 14.5625, + "grad_norm_var": 1.6200358072916667, + "learning_rate": 0.0003, + "loss": 11.2188, + "loss/aux_loss": 0.048076110705733296, + "loss/crossentropy": 2.759999096393585, + "loss/logits": 0.8683469414710998, + "step": 46440 + }, + { + "epoch": 0.4645, + "grad_norm": 14.0, + "grad_norm_var": 0.44296875, + "learning_rate": 0.0003, + "loss": 11.1575, + "loss/aux_loss": 0.04807174541056156, + "loss/crossentropy": 2.7848057746887207, + "loss/logits": 0.8197889029979706, + "step": 46450 + }, + { + "epoch": 0.4646, + "grad_norm": 14.5, + "grad_norm_var": 0.5098307291666667, + "learning_rate": 0.0003, + "loss": 11.0128, + "loss/aux_loss": 0.04808044023811817, + "loss/crossentropy": 2.6938924133777618, + "loss/logits": 0.8240805625915527, + "step": 46460 + }, + { + "epoch": 0.4647, + "grad_norm": 14.0, + "grad_norm_var": 0.403125, + "learning_rate": 0.0003, + "loss": 11.2575, + "loss/aux_loss": 0.04806338362395764, + "loss/crossentropy": 2.746562212705612, + "loss/logits": 0.8269091069698333, + "step": 46470 + }, + { + "epoch": 0.4648, + "grad_norm": 13.125, + "grad_norm_var": 0.37057291666666664, + "learning_rate": 0.0003, + "loss": 11.1455, + "loss/aux_loss": 0.048089843057096, + "loss/crossentropy": 2.7687614023685456, + "loss/logits": 0.8314522951841354, + "step": 46480 + }, + { + "epoch": 0.4649, + "grad_norm": 13.6875, + "grad_norm_var": 60.507747395833334, + "learning_rate": 0.0003, + "loss": 11.321, + "loss/aux_loss": 0.04807740245014429, + "loss/crossentropy": 2.713161385059357, + "loss/logits": 0.8670831322669983, + "step": 46490 + }, + { + "epoch": 0.465, + "grad_norm": 15.75, + "grad_norm_var": 59.106770833333336, + "learning_rate": 0.0003, + "loss": 11.1114, + "loss/aux_loss": 0.04807022921741009, + "loss/crossentropy": 2.7916195511817934, + "loss/logits": 0.8448772758245469, + "step": 46500 + }, + { + "epoch": 0.4651, + "grad_norm": 14.375, + "grad_norm_var": 0.8102701822916667, + "learning_rate": 0.0003, + "loss": 11.2403, + "loss/aux_loss": 0.048087266460061076, + "loss/crossentropy": 2.7339360535144808, + "loss/logits": 0.8473565667867661, + "step": 46510 + }, + { + "epoch": 0.4652, + "grad_norm": 14.1875, + "grad_norm_var": 1.5085774739583333, + "learning_rate": 0.0003, + "loss": 11.1092, + "loss/aux_loss": 0.04807992558926344, + "loss/crossentropy": 2.879719001054764, + "loss/logits": 0.8366673439741135, + "step": 46520 + }, + { + "epoch": 0.4653, + "grad_norm": 14.1875, + "grad_norm_var": 1.065478515625, + "learning_rate": 0.0003, + "loss": 11.0443, + "loss/aux_loss": 0.04807272665202618, + "loss/crossentropy": 2.8038435697555544, + "loss/logits": 0.8307929456233978, + "step": 46530 + }, + { + "epoch": 0.4654, + "grad_norm": 15.3125, + "grad_norm_var": 1.0056640625, + "learning_rate": 0.0003, + "loss": 11.02, + "loss/aux_loss": 0.048068790882825854, + "loss/crossentropy": 2.8201247453689575, + "loss/logits": 0.8609393984079361, + "step": 46540 + }, + { + "epoch": 0.4655, + "grad_norm": 15.0625, + "grad_norm_var": 0.7639973958333334, + "learning_rate": 0.0003, + "loss": 10.9846, + "loss/aux_loss": 0.04807808455079794, + "loss/crossentropy": 2.641385281085968, + "loss/logits": 0.8092559695243835, + "step": 46550 + }, + { + "epoch": 0.4656, + "grad_norm": 14.75, + "grad_norm_var": 0.5659993489583334, + "learning_rate": 0.0003, + "loss": 11.0114, + "loss/aux_loss": 0.048075922578573224, + "loss/crossentropy": 2.549112868309021, + "loss/logits": 0.7909631967544556, + "step": 46560 + }, + { + "epoch": 0.4657, + "grad_norm": 14.875, + "grad_norm_var": 0.309375, + "learning_rate": 0.0003, + "loss": 11.1391, + "loss/aux_loss": 0.04808427933603525, + "loss/crossentropy": 2.796935510635376, + "loss/logits": 0.8834913015365601, + "step": 46570 + }, + { + "epoch": 0.4658, + "grad_norm": 13.4375, + "grad_norm_var": 0.385400390625, + "learning_rate": 0.0003, + "loss": 10.9412, + "loss/aux_loss": 0.04806541427969933, + "loss/crossentropy": 2.649865931272507, + "loss/logits": 0.8530022531747818, + "step": 46580 + }, + { + "epoch": 0.4659, + "grad_norm": 13.625, + "grad_norm_var": 0.34609375, + "learning_rate": 0.0003, + "loss": 11.3094, + "loss/aux_loss": 0.04808234199881554, + "loss/crossentropy": 2.7041312396526336, + "loss/logits": 0.8479943692684173, + "step": 46590 + }, + { + "epoch": 0.466, + "grad_norm": 14.0625, + "grad_norm_var": 0.37810872395833334, + "learning_rate": 0.0003, + "loss": 11.2404, + "loss/aux_loss": 0.04807773306965828, + "loss/crossentropy": 2.6949519872665406, + "loss/logits": 0.828350055217743, + "step": 46600 + }, + { + "epoch": 0.4661, + "grad_norm": 14.3125, + "grad_norm_var": 0.5723307291666667, + "learning_rate": 0.0003, + "loss": 11.1212, + "loss/aux_loss": 0.048074806481599806, + "loss/crossentropy": 2.8176488399505617, + "loss/logits": 0.8534007757902146, + "step": 46610 + }, + { + "epoch": 0.4662, + "grad_norm": 15.625, + "grad_norm_var": 0.8304524739583333, + "learning_rate": 0.0003, + "loss": 10.9068, + "loss/aux_loss": 0.04807550571858883, + "loss/crossentropy": 2.683753031492233, + "loss/logits": 0.8132462590932846, + "step": 46620 + }, + { + "epoch": 0.4663, + "grad_norm": 14.6875, + "grad_norm_var": 0.7883951822916667, + "learning_rate": 0.0003, + "loss": 11.1563, + "loss/aux_loss": 0.04807454776018858, + "loss/crossentropy": 2.742726969718933, + "loss/logits": 0.8348707377910614, + "step": 46630 + }, + { + "epoch": 0.4664, + "grad_norm": 14.1875, + "grad_norm_var": 0.3150390625, + "learning_rate": 0.0003, + "loss": 11.1891, + "loss/aux_loss": 0.04808441940695048, + "loss/crossentropy": 2.798567849397659, + "loss/logits": 0.8553645014762878, + "step": 46640 + }, + { + "epoch": 0.4665, + "grad_norm": 15.0625, + "grad_norm_var": 0.252978515625, + "learning_rate": 0.0003, + "loss": 11.0609, + "loss/aux_loss": 0.04807608053088188, + "loss/crossentropy": 2.6830251634120943, + "loss/logits": 0.8196864813566208, + "step": 46650 + }, + { + "epoch": 0.4666, + "grad_norm": 13.5625, + "grad_norm_var": 0.5020833333333333, + "learning_rate": 0.0003, + "loss": 11.0233, + "loss/aux_loss": 0.048064926825463775, + "loss/crossentropy": 2.7553923606872557, + "loss/logits": 0.853671881556511, + "step": 46660 + }, + { + "epoch": 0.4667, + "grad_norm": 14.0625, + "grad_norm_var": 0.645166015625, + "learning_rate": 0.0003, + "loss": 11.1551, + "loss/aux_loss": 0.0480745829641819, + "loss/crossentropy": 2.7849517345428465, + "loss/logits": 0.8592245787382126, + "step": 46670 + }, + { + "epoch": 0.4668, + "grad_norm": 13.5, + "grad_norm_var": 0.5312337239583333, + "learning_rate": 0.0003, + "loss": 11.0834, + "loss/aux_loss": 0.04807850234210491, + "loss/crossentropy": 2.877470552921295, + "loss/logits": 0.8332111418247223, + "step": 46680 + }, + { + "epoch": 0.4669, + "grad_norm": 15.375, + "grad_norm_var": 0.5258951822916667, + "learning_rate": 0.0003, + "loss": 11.1315, + "loss/aux_loss": 0.048073893412947655, + "loss/crossentropy": 2.6425564885139465, + "loss/logits": 0.834293258190155, + "step": 46690 + }, + { + "epoch": 0.467, + "grad_norm": 13.9375, + "grad_norm_var": 0.4778645833333333, + "learning_rate": 0.0003, + "loss": 11.1072, + "loss/aux_loss": 0.048084541223943233, + "loss/crossentropy": 2.7174774527549745, + "loss/logits": 0.8334024339914322, + "step": 46700 + }, + { + "epoch": 0.4671, + "grad_norm": 14.5, + "grad_norm_var": 0.657275390625, + "learning_rate": 0.0003, + "loss": 11.0016, + "loss/aux_loss": 0.04806803483515978, + "loss/crossentropy": 2.5842558205127717, + "loss/logits": 0.8057064324617386, + "step": 46710 + }, + { + "epoch": 0.4672, + "grad_norm": 13.9375, + "grad_norm_var": 0.7184733072916667, + "learning_rate": 0.0003, + "loss": 11.0592, + "loss/aux_loss": 0.04807887524366379, + "loss/crossentropy": 2.8033472537994384, + "loss/logits": 0.827720096707344, + "step": 46720 + }, + { + "epoch": 0.4673, + "grad_norm": 14.6875, + "grad_norm_var": 1.1671223958333334, + "learning_rate": 0.0003, + "loss": 11.0515, + "loss/aux_loss": 0.04807654283940792, + "loss/crossentropy": 2.6938624501228334, + "loss/logits": 0.8351145356893539, + "step": 46730 + }, + { + "epoch": 0.4674, + "grad_norm": 14.125, + "grad_norm_var": 0.8932291666666666, + "learning_rate": 0.0003, + "loss": 11.1301, + "loss/aux_loss": 0.0480727557092905, + "loss/crossentropy": 2.7404383420944214, + "loss/logits": 0.8500682055950165, + "step": 46740 + }, + { + "epoch": 0.4675, + "grad_norm": 15.25, + "grad_norm_var": 0.9286458333333333, + "learning_rate": 0.0003, + "loss": 11.1978, + "loss/aux_loss": 0.04807423073798418, + "loss/crossentropy": 2.6602605104446413, + "loss/logits": 0.847180300951004, + "step": 46750 + }, + { + "epoch": 0.4676, + "grad_norm": 14.375, + "grad_norm_var": 0.8609375, + "learning_rate": 0.0003, + "loss": 11.2002, + "loss/aux_loss": 0.048065530881285665, + "loss/crossentropy": 2.704084634780884, + "loss/logits": 0.8421103477478027, + "step": 46760 + }, + { + "epoch": 0.4677, + "grad_norm": 14.6875, + "grad_norm_var": 0.30859375, + "learning_rate": 0.0003, + "loss": 11.1246, + "loss/aux_loss": 0.048081099055707455, + "loss/crossentropy": 2.7489688992500305, + "loss/logits": 0.8322398364543915, + "step": 46770 + }, + { + "epoch": 0.4678, + "grad_norm": 13.5, + "grad_norm_var": 0.24412434895833332, + "learning_rate": 0.0003, + "loss": 10.8646, + "loss/aux_loss": 0.04807508103549481, + "loss/crossentropy": 2.630194664001465, + "loss/logits": 0.8167033612728118, + "step": 46780 + }, + { + "epoch": 0.4679, + "grad_norm": 13.5625, + "grad_norm_var": 0.66875, + "learning_rate": 0.0003, + "loss": 11.1296, + "loss/aux_loss": 0.04808248318731785, + "loss/crossentropy": 2.754181432723999, + "loss/logits": 0.8682912677526474, + "step": 46790 + }, + { + "epoch": 0.468, + "grad_norm": 13.9375, + "grad_norm_var": 0.7300618489583334, + "learning_rate": 0.0003, + "loss": 11.1366, + "loss/aux_loss": 0.04807512406259775, + "loss/crossentropy": 2.807320773601532, + "loss/logits": 0.8542409300804138, + "step": 46800 + }, + { + "epoch": 0.4681, + "grad_norm": 13.625, + "grad_norm_var": 0.6877604166666667, + "learning_rate": 0.0003, + "loss": 11.0965, + "loss/aux_loss": 0.04807012528181076, + "loss/crossentropy": 2.7169011294841767, + "loss/logits": 0.8518575340509414, + "step": 46810 + }, + { + "epoch": 0.4682, + "grad_norm": 15.0, + "grad_norm_var": 0.5700358072916667, + "learning_rate": 0.0003, + "loss": 11.1799, + "loss/aux_loss": 0.048083074390888214, + "loss/crossentropy": 2.6793820321559907, + "loss/logits": 0.8517037600278854, + "step": 46820 + }, + { + "epoch": 0.4683, + "grad_norm": 15.6875, + "grad_norm_var": 0.6593098958333333, + "learning_rate": 0.0003, + "loss": 10.9502, + "loss/aux_loss": 0.04807259049266577, + "loss/crossentropy": 2.71062428355217, + "loss/logits": 0.8300145417451859, + "step": 46830 + }, + { + "epoch": 0.4684, + "grad_norm": 13.1875, + "grad_norm_var": 0.7988118489583333, + "learning_rate": 0.0003, + "loss": 11.1379, + "loss/aux_loss": 0.04807367566972971, + "loss/crossentropy": 2.77940719127655, + "loss/logits": 0.8435407996177673, + "step": 46840 + }, + { + "epoch": 0.4685, + "grad_norm": 14.0625, + "grad_norm_var": 0.5723795572916667, + "learning_rate": 0.0003, + "loss": 11.0454, + "loss/aux_loss": 0.048079926520586014, + "loss/crossentropy": 2.5133470952510835, + "loss/logits": 0.8150019586086273, + "step": 46850 + }, + { + "epoch": 0.4686, + "grad_norm": 14.0, + "grad_norm_var": 0.6298014322916666, + "learning_rate": 0.0003, + "loss": 11.1473, + "loss/aux_loss": 0.04806667976081371, + "loss/crossentropy": 2.8036882996559145, + "loss/logits": 0.8526762962341309, + "step": 46860 + }, + { + "epoch": 0.4687, + "grad_norm": 13.9375, + "grad_norm_var": 0.437353515625, + "learning_rate": 0.0003, + "loss": 11.0997, + "loss/aux_loss": 0.04806803800165653, + "loss/crossentropy": 2.7410527229309083, + "loss/logits": 0.8402520000934601, + "step": 46870 + }, + { + "epoch": 0.4688, + "grad_norm": 15.4375, + "grad_norm_var": 0.5257649739583333, + "learning_rate": 0.0003, + "loss": 11.2314, + "loss/aux_loss": 0.0480783374980092, + "loss/crossentropy": 2.5751788198947905, + "loss/logits": 0.8502856940031052, + "step": 46880 + }, + { + "epoch": 0.4689, + "grad_norm": 14.625, + "grad_norm_var": 0.39055989583333334, + "learning_rate": 0.0003, + "loss": 11.1735, + "loss/aux_loss": 0.048073621653020385, + "loss/crossentropy": 2.7046410202980042, + "loss/logits": 0.8371324121952057, + "step": 46890 + }, + { + "epoch": 0.469, + "grad_norm": 15.125, + "grad_norm_var": 0.6059895833333333, + "learning_rate": 0.0003, + "loss": 10.9607, + "loss/aux_loss": 0.04807861391454935, + "loss/crossentropy": 2.786002492904663, + "loss/logits": 0.8466158479452133, + "step": 46900 + }, + { + "epoch": 0.4691, + "grad_norm": 14.375, + "grad_norm_var": 0.49920247395833334, + "learning_rate": 0.0003, + "loss": 11.2057, + "loss/aux_loss": 0.04806822370737791, + "loss/crossentropy": 2.6618904173374176, + "loss/logits": 0.8399115055799484, + "step": 46910 + }, + { + "epoch": 0.4692, + "grad_norm": 12.875, + "grad_norm_var": 0.8572265625, + "learning_rate": 0.0003, + "loss": 11.0673, + "loss/aux_loss": 0.048087282478809355, + "loss/crossentropy": 2.707621991634369, + "loss/logits": 0.8286347270011902, + "step": 46920 + }, + { + "epoch": 0.4693, + "grad_norm": 14.9375, + "grad_norm_var": 0.9629557291666667, + "learning_rate": 0.0003, + "loss": 11.0683, + "loss/aux_loss": 0.048071885108947755, + "loss/crossentropy": 2.7881909012794495, + "loss/logits": 0.8593872129917145, + "step": 46930 + }, + { + "epoch": 0.4694, + "grad_norm": 14.6875, + "grad_norm_var": 0.6229166666666667, + "learning_rate": 0.0003, + "loss": 11.2006, + "loss/aux_loss": 0.048074318841099736, + "loss/crossentropy": 2.8505053877830506, + "loss/logits": 0.824359530210495, + "step": 46940 + }, + { + "epoch": 0.4695, + "grad_norm": 13.4375, + "grad_norm_var": 0.9066243489583333, + "learning_rate": 0.0003, + "loss": 10.9586, + "loss/aux_loss": 0.04807030726224184, + "loss/crossentropy": 2.7617590546607973, + "loss/logits": 0.8363817751407623, + "step": 46950 + }, + { + "epoch": 0.4696, + "grad_norm": 15.0625, + "grad_norm_var": 0.5494140625, + "learning_rate": 0.0003, + "loss": 11.2257, + "loss/aux_loss": 0.048083371855318545, + "loss/crossentropy": 2.7488768696784973, + "loss/logits": 0.8112467706203461, + "step": 46960 + }, + { + "epoch": 0.4697, + "grad_norm": 14.625, + "grad_norm_var": 0.5541015625, + "learning_rate": 0.0003, + "loss": 11.1934, + "loss/aux_loss": 0.048067718744277954, + "loss/crossentropy": 2.713601952791214, + "loss/logits": 0.8483431667089463, + "step": 46970 + }, + { + "epoch": 0.4698, + "grad_norm": 14.375, + "grad_norm_var": 1.0541666666666667, + "learning_rate": 0.0003, + "loss": 11.2815, + "loss/aux_loss": 0.04806978404521942, + "loss/crossentropy": 2.7652095437049864, + "loss/logits": 0.899308231472969, + "step": 46980 + }, + { + "epoch": 0.4699, + "grad_norm": 16.5, + "grad_norm_var": 0.4723307291666667, + "learning_rate": 0.0003, + "loss": 11.1383, + "loss/aux_loss": 0.04807682782411575, + "loss/crossentropy": 2.7637027978897093, + "loss/logits": 0.8606138914823532, + "step": 46990 + }, + { + "epoch": 0.47, + "grad_norm": 13.875, + "grad_norm_var": 0.6843587239583333, + "learning_rate": 0.0003, + "loss": 11.0771, + "loss/aux_loss": 0.04807099532335997, + "loss/crossentropy": 2.546469062566757, + "loss/logits": 0.8381029695272446, + "step": 47000 + }, + { + "epoch": 0.4701, + "grad_norm": 13.625, + "grad_norm_var": 0.7808430989583334, + "learning_rate": 0.0003, + "loss": 11.2179, + "loss/aux_loss": 0.048072867281734946, + "loss/crossentropy": 2.650752639770508, + "loss/logits": 0.808814725279808, + "step": 47010 + }, + { + "epoch": 0.4702, + "grad_norm": 15.25, + "grad_norm_var": 3.489697265625, + "learning_rate": 0.0003, + "loss": 11.1539, + "loss/aux_loss": 0.04807664155960083, + "loss/crossentropy": 2.579551470279694, + "loss/logits": 0.8188108772039413, + "step": 47020 + }, + { + "epoch": 0.4703, + "grad_norm": 15.375, + "grad_norm_var": 3.3722493489583334, + "learning_rate": 0.0003, + "loss": 11.1198, + "loss/aux_loss": 0.04807888753712177, + "loss/crossentropy": 2.577794688940048, + "loss/logits": 0.7898141339421272, + "step": 47030 + }, + { + "epoch": 0.4704, + "grad_norm": 13.4375, + "grad_norm_var": 0.8204264322916667, + "learning_rate": 0.0003, + "loss": 10.9596, + "loss/aux_loss": 0.04807488694787025, + "loss/crossentropy": 2.592330676317215, + "loss/logits": 0.8327637195587159, + "step": 47040 + }, + { + "epoch": 0.4705, + "grad_norm": 14.0625, + "grad_norm_var": 0.6800618489583333, + "learning_rate": 0.0003, + "loss": 11.0693, + "loss/aux_loss": 0.048071105033159256, + "loss/crossentropy": 2.6126275599002837, + "loss/logits": 0.7871117860078811, + "step": 47050 + }, + { + "epoch": 0.4706, + "grad_norm": 13.625, + "grad_norm_var": 0.5578125, + "learning_rate": 0.0003, + "loss": 11.225, + "loss/aux_loss": 0.04807652160525322, + "loss/crossentropy": 2.8688260078430177, + "loss/logits": 0.874985545873642, + "step": 47060 + }, + { + "epoch": 0.4707, + "grad_norm": 13.8125, + "grad_norm_var": 0.5534993489583333, + "learning_rate": 0.0003, + "loss": 11.0904, + "loss/aux_loss": 0.04806956704705954, + "loss/crossentropy": 2.6465198278427122, + "loss/logits": 0.8147805094718933, + "step": 47070 + }, + { + "epoch": 0.4708, + "grad_norm": 14.375, + "grad_norm_var": 0.702978515625, + "learning_rate": 0.0003, + "loss": 11.1015, + "loss/aux_loss": 0.04807991813868284, + "loss/crossentropy": 2.845957559347153, + "loss/logits": 0.8246800363063812, + "step": 47080 + }, + { + "epoch": 0.4709, + "grad_norm": 15.25, + "grad_norm_var": 1.0942057291666667, + "learning_rate": 0.0003, + "loss": 10.9138, + "loss/aux_loss": 0.04807828050106764, + "loss/crossentropy": 2.6598312139511107, + "loss/logits": 0.7844241559505463, + "step": 47090 + }, + { + "epoch": 0.471, + "grad_norm": 15.0, + "grad_norm_var": 4.145833333333333, + "learning_rate": 0.0003, + "loss": 10.998, + "loss/aux_loss": 0.0480682672932744, + "loss/crossentropy": 2.7192655980587004, + "loss/logits": 0.8166234135627747, + "step": 47100 + }, + { + "epoch": 0.4711, + "grad_norm": 14.5, + "grad_norm_var": 3.998421223958333, + "learning_rate": 0.0003, + "loss": 11.1524, + "loss/aux_loss": 0.04806790165603161, + "loss/crossentropy": 2.8544438123703, + "loss/logits": 0.8227733701467514, + "step": 47110 + }, + { + "epoch": 0.4712, + "grad_norm": 20.375, + "grad_norm_var": 2.564957682291667, + "learning_rate": 0.0003, + "loss": 10.9704, + "loss/aux_loss": 0.04809897541999817, + "loss/crossentropy": 2.6712704062461854, + "loss/logits": 0.8468765825033188, + "step": 47120 + }, + { + "epoch": 0.4713, + "grad_norm": 14.5, + "grad_norm_var": 2.6884765625, + "learning_rate": 0.0003, + "loss": 11.1002, + "loss/aux_loss": 0.04806103594601154, + "loss/crossentropy": 2.654213637113571, + "loss/logits": 0.8358179897069931, + "step": 47130 + }, + { + "epoch": 0.4714, + "grad_norm": 14.0625, + "grad_norm_var": 0.2447265625, + "learning_rate": 0.0003, + "loss": 11.2835, + "loss/aux_loss": 0.048075387999415395, + "loss/crossentropy": 2.828604817390442, + "loss/logits": 0.8241938591003418, + "step": 47140 + }, + { + "epoch": 0.4715, + "grad_norm": 16.125, + "grad_norm_var": 1.079541015625, + "learning_rate": 0.0003, + "loss": 11.092, + "loss/aux_loss": 0.04808494281023741, + "loss/crossentropy": 2.693093776702881, + "loss/logits": 0.8137997329235077, + "step": 47150 + }, + { + "epoch": 0.4716, + "grad_norm": 14.875, + "grad_norm_var": 0.7993326822916667, + "learning_rate": 0.0003, + "loss": 11.0348, + "loss/aux_loss": 0.04807308670133352, + "loss/crossentropy": 2.648904633522034, + "loss/logits": 0.8604849994182586, + "step": 47160 + }, + { + "epoch": 0.4717, + "grad_norm": 14.3125, + "grad_norm_var": 0.6218098958333333, + "learning_rate": 0.0003, + "loss": 11.0803, + "loss/aux_loss": 0.04807374849915504, + "loss/crossentropy": 2.6493657648563387, + "loss/logits": 0.8324615597724915, + "step": 47170 + }, + { + "epoch": 0.4718, + "grad_norm": 15.75, + "grad_norm_var": 2.496468098958333, + "learning_rate": 0.0003, + "loss": 10.9996, + "loss/aux_loss": 0.048081782087683676, + "loss/crossentropy": 2.64060292840004, + "loss/logits": 0.8085512012243271, + "step": 47180 + }, + { + "epoch": 0.4719, + "grad_norm": 14.25, + "grad_norm_var": 2.939957682291667, + "learning_rate": 0.0003, + "loss": 11.111, + "loss/aux_loss": 0.04806481916457415, + "loss/crossentropy": 2.6764299035072328, + "loss/logits": 0.8713664382696151, + "step": 47190 + }, + { + "epoch": 0.472, + "grad_norm": 14.9375, + "grad_norm_var": 0.23709309895833333, + "learning_rate": 0.0003, + "loss": 10.9265, + "loss/aux_loss": 0.048081744089722635, + "loss/crossentropy": 2.605439066886902, + "loss/logits": 0.8055230677127838, + "step": 47200 + }, + { + "epoch": 0.4721, + "grad_norm": 15.125, + "grad_norm_var": 0.211572265625, + "learning_rate": 0.0003, + "loss": 11.0814, + "loss/aux_loss": 0.048065853863954545, + "loss/crossentropy": 2.8016534447669983, + "loss/logits": 0.821602874994278, + "step": 47210 + }, + { + "epoch": 0.4722, + "grad_norm": 14.5625, + "grad_norm_var": 1.84140625, + "learning_rate": 0.0003, + "loss": 11.0479, + "loss/aux_loss": 0.048074455559253694, + "loss/crossentropy": 2.6670961678028107, + "loss/logits": 0.8343064039945602, + "step": 47220 + }, + { + "epoch": 0.4723, + "grad_norm": 14.5625, + "grad_norm_var": 1.4374837239583333, + "learning_rate": 0.0003, + "loss": 11.1253, + "loss/aux_loss": 0.0480674734339118, + "loss/crossentropy": 2.7862078309059144, + "loss/logits": 0.8591863840818406, + "step": 47230 + }, + { + "epoch": 0.4724, + "grad_norm": 14.3125, + "grad_norm_var": 0.7679524739583333, + "learning_rate": 0.0003, + "loss": 11.1595, + "loss/aux_loss": 0.048076837323606014, + "loss/crossentropy": 2.742043745517731, + "loss/logits": 0.8124003469944, + "step": 47240 + }, + { + "epoch": 0.4725, + "grad_norm": 14.5, + "grad_norm_var": 0.8222493489583333, + "learning_rate": 0.0003, + "loss": 11.1575, + "loss/aux_loss": 0.04806795790791511, + "loss/crossentropy": 2.688308924436569, + "loss/logits": 0.8328756958246231, + "step": 47250 + }, + { + "epoch": 0.4726, + "grad_norm": 13.9375, + "grad_norm_var": 0.24947916666666667, + "learning_rate": 0.0003, + "loss": 11.1297, + "loss/aux_loss": 0.04808314982801676, + "loss/crossentropy": 2.64586056470871, + "loss/logits": 0.8115894719958305, + "step": 47260 + }, + { + "epoch": 0.4727, + "grad_norm": 14.875, + "grad_norm_var": 0.315869140625, + "learning_rate": 0.0003, + "loss": 11.1337, + "loss/aux_loss": 0.048072290048003195, + "loss/crossentropy": 2.638070636987686, + "loss/logits": 0.8229748249053955, + "step": 47270 + }, + { + "epoch": 0.4728, + "grad_norm": 15.0, + "grad_norm_var": 0.5014973958333333, + "learning_rate": 0.0003, + "loss": 11.2275, + "loss/aux_loss": 0.04807377941906452, + "loss/crossentropy": 2.794324481487274, + "loss/logits": 0.8639295071363449, + "step": 47280 + }, + { + "epoch": 0.4729, + "grad_norm": 14.625, + "grad_norm_var": 0.2816243489583333, + "learning_rate": 0.0003, + "loss": 11.087, + "loss/aux_loss": 0.04807295482605696, + "loss/crossentropy": 2.8186910152435303, + "loss/logits": 0.8727422952651978, + "step": 47290 + }, + { + "epoch": 0.473, + "grad_norm": 15.3125, + "grad_norm_var": 0.7311848958333333, + "learning_rate": 0.0003, + "loss": 11.0382, + "loss/aux_loss": 0.04807542134076357, + "loss/crossentropy": 2.7608199238777162, + "loss/logits": 0.8439187675714492, + "step": 47300 + }, + { + "epoch": 0.4731, + "grad_norm": 13.125, + "grad_norm_var": 0.4222493489583333, + "learning_rate": 0.0003, + "loss": 11.0159, + "loss/aux_loss": 0.04807105585932732, + "loss/crossentropy": 2.8253512501716616, + "loss/logits": 0.8332709580659866, + "step": 47310 + }, + { + "epoch": 0.4732, + "grad_norm": 14.9375, + "grad_norm_var": 0.3082682291666667, + "learning_rate": 0.0003, + "loss": 11.2657, + "loss/aux_loss": 0.04807895310223102, + "loss/crossentropy": 2.7635598182678223, + "loss/logits": 0.862557715177536, + "step": 47320 + }, + { + "epoch": 0.4733, + "grad_norm": 13.875, + "grad_norm_var": 0.43214518229166665, + "learning_rate": 0.0003, + "loss": 11.1253, + "loss/aux_loss": 0.04807691089808941, + "loss/crossentropy": 2.7444288194179536, + "loss/logits": 0.8490573525428772, + "step": 47330 + }, + { + "epoch": 0.4734, + "grad_norm": 14.875, + "grad_norm_var": 0.39034830729166664, + "learning_rate": 0.0003, + "loss": 11.0076, + "loss/aux_loss": 0.04806413035839796, + "loss/crossentropy": 2.6496753454208375, + "loss/logits": 0.8270862758159637, + "step": 47340 + }, + { + "epoch": 0.4735, + "grad_norm": 15.125, + "grad_norm_var": 0.543994140625, + "learning_rate": 0.0003, + "loss": 11.0769, + "loss/aux_loss": 0.04807628635317087, + "loss/crossentropy": 2.6999243259429933, + "loss/logits": 0.8481123268604278, + "step": 47350 + }, + { + "epoch": 0.4736, + "grad_norm": 13.75, + "grad_norm_var": 0.37838541666666664, + "learning_rate": 0.0003, + "loss": 11.0257, + "loss/aux_loss": 0.04807580206543207, + "loss/crossentropy": 2.6780034720897676, + "loss/logits": 0.7950571686029434, + "step": 47360 + }, + { + "epoch": 0.4737, + "grad_norm": 13.3125, + "grad_norm_var": 0.3567708333333333, + "learning_rate": 0.0003, + "loss": 11.0902, + "loss/aux_loss": 0.048067801631987095, + "loss/crossentropy": 2.508834218978882, + "loss/logits": 0.8000911891460418, + "step": 47370 + }, + { + "epoch": 0.4738, + "grad_norm": 13.75, + "grad_norm_var": 1.1479166666666667, + "learning_rate": 0.0003, + "loss": 11.1167, + "loss/aux_loss": 0.04807884600013494, + "loss/crossentropy": 2.683100324869156, + "loss/logits": 0.8076074302196503, + "step": 47380 + }, + { + "epoch": 0.4739, + "grad_norm": 14.375, + "grad_norm_var": 0.7238118489583333, + "learning_rate": 0.0003, + "loss": 11.2828, + "loss/aux_loss": 0.048065136186778545, + "loss/crossentropy": 2.7504623413085936, + "loss/logits": 0.8754628151655197, + "step": 47390 + }, + { + "epoch": 0.474, + "grad_norm": 14.375, + "grad_norm_var": 3.4449055989583335, + "learning_rate": 0.0003, + "loss": 11.2768, + "loss/aux_loss": 0.048078180849552156, + "loss/crossentropy": 2.7185048401355743, + "loss/logits": 0.8097441285848618, + "step": 47400 + }, + { + "epoch": 0.4741, + "grad_norm": 15.0, + "grad_norm_var": 3.372509765625, + "learning_rate": 0.0003, + "loss": 11.0728, + "loss/aux_loss": 0.04806989543139935, + "loss/crossentropy": 2.5462702572345735, + "loss/logits": 0.8237005978822708, + "step": 47410 + }, + { + "epoch": 0.4742, + "grad_norm": 14.875, + "grad_norm_var": 0.3337890625, + "learning_rate": 0.0003, + "loss": 11.1406, + "loss/aux_loss": 0.04807539358735084, + "loss/crossentropy": 2.801107907295227, + "loss/logits": 0.8503676056861877, + "step": 47420 + }, + { + "epoch": 0.4743, + "grad_norm": 14.0, + "grad_norm_var": 1.088916015625, + "learning_rate": 0.0003, + "loss": 11.3099, + "loss/aux_loss": 0.04808101002126932, + "loss/crossentropy": 2.658205211162567, + "loss/logits": 0.8350113093852997, + "step": 47430 + }, + { + "epoch": 0.4744, + "grad_norm": 13.1875, + "grad_norm_var": 1.206103515625, + "learning_rate": 0.0003, + "loss": 10.9774, + "loss/aux_loss": 0.048071620799601075, + "loss/crossentropy": 2.6002185225486754, + "loss/logits": 0.8004775673151017, + "step": 47440 + }, + { + "epoch": 0.4745, + "grad_norm": 15.375, + "grad_norm_var": 0.4525390625, + "learning_rate": 0.0003, + "loss": 11.0775, + "loss/aux_loss": 0.04807077720761299, + "loss/crossentropy": 2.785941880941391, + "loss/logits": 0.8646159768104553, + "step": 47450 + }, + { + "epoch": 0.4746, + "grad_norm": 15.0625, + "grad_norm_var": 0.427197265625, + "learning_rate": 0.0003, + "loss": 11.056, + "loss/aux_loss": 0.048071554861962795, + "loss/crossentropy": 2.7326180696487428, + "loss/logits": 0.8212353408336639, + "step": 47460 + }, + { + "epoch": 0.4747, + "grad_norm": 14.5, + "grad_norm_var": 0.28567708333333336, + "learning_rate": 0.0003, + "loss": 11.1585, + "loss/aux_loss": 0.04807238578796387, + "loss/crossentropy": 2.755151998996735, + "loss/logits": 0.8459553897380829, + "step": 47470 + }, + { + "epoch": 0.4748, + "grad_norm": 14.5, + "grad_norm_var": 0.6296223958333333, + "learning_rate": 0.0003, + "loss": 11.1363, + "loss/aux_loss": 0.04808005690574646, + "loss/crossentropy": 2.7062522768974304, + "loss/logits": 0.823663991689682, + "step": 47480 + }, + { + "epoch": 0.4749, + "grad_norm": 12.8125, + "grad_norm_var": 1.0574055989583333, + "learning_rate": 0.0003, + "loss": 11.078, + "loss/aux_loss": 0.04807120095938444, + "loss/crossentropy": 2.7074629366397858, + "loss/logits": 0.8275787591934204, + "step": 47490 + }, + { + "epoch": 0.475, + "grad_norm": 14.0, + "grad_norm_var": 0.7460774739583333, + "learning_rate": 0.0003, + "loss": 11.2052, + "loss/aux_loss": 0.04807096980512142, + "loss/crossentropy": 2.7104438126087187, + "loss/logits": 0.8310169786214828, + "step": 47500 + }, + { + "epoch": 0.4751, + "grad_norm": 14.625, + "grad_norm_var": 0.5322265625, + "learning_rate": 0.0003, + "loss": 11.1362, + "loss/aux_loss": 0.04808025192469358, + "loss/crossentropy": 2.709762120246887, + "loss/logits": 0.8251390606164932, + "step": 47510 + }, + { + "epoch": 0.4752, + "grad_norm": 14.6875, + "grad_norm_var": 0.620166015625, + "learning_rate": 0.0003, + "loss": 11.1076, + "loss/aux_loss": 0.04807448033243418, + "loss/crossentropy": 2.773437148332596, + "loss/logits": 0.8604608118534088, + "step": 47520 + }, + { + "epoch": 0.4753, + "grad_norm": 15.3125, + "grad_norm_var": 0.2972493489583333, + "learning_rate": 0.0003, + "loss": 11.2152, + "loss/aux_loss": 0.048080704919993875, + "loss/crossentropy": 2.793582892417908, + "loss/logits": 0.8527662813663482, + "step": 47530 + }, + { + "epoch": 0.4754, + "grad_norm": 17.5, + "grad_norm_var": 0.8360514322916667, + "learning_rate": 0.0003, + "loss": 11.1323, + "loss/aux_loss": 0.04807116650044918, + "loss/crossentropy": 2.6637362360954286, + "loss/logits": 0.8446590304374695, + "step": 47540 + }, + { + "epoch": 0.4755, + "grad_norm": 13.9375, + "grad_norm_var": 0.8988118489583333, + "learning_rate": 0.0003, + "loss": 11.0892, + "loss/aux_loss": 0.04808674175292253, + "loss/crossentropy": 2.6643483340740204, + "loss/logits": 0.8172670543193817, + "step": 47550 + }, + { + "epoch": 0.4756, + "grad_norm": 15.0625, + "grad_norm_var": 0.41534830729166666, + "learning_rate": 0.0003, + "loss": 11.0397, + "loss/aux_loss": 0.0480627054348588, + "loss/crossentropy": 2.537232494354248, + "loss/logits": 0.8236280262470246, + "step": 47560 + }, + { + "epoch": 0.4757, + "grad_norm": 13.25, + "grad_norm_var": 0.5806640625, + "learning_rate": 0.0003, + "loss": 11.1531, + "loss/aux_loss": 0.04807612933218479, + "loss/crossentropy": 2.6730732560157775, + "loss/logits": 0.8434269517660141, + "step": 47570 + }, + { + "epoch": 0.4758, + "grad_norm": 13.625, + "grad_norm_var": 0.6679524739583333, + "learning_rate": 0.0003, + "loss": 11.4003, + "loss/aux_loss": 0.04807697795331478, + "loss/crossentropy": 2.80619136095047, + "loss/logits": 0.8598318874835968, + "step": 47580 + }, + { + "epoch": 0.4759, + "grad_norm": 15.75, + "grad_norm_var": 0.7728515625, + "learning_rate": 0.0003, + "loss": 11.1052, + "loss/aux_loss": 0.04807619452476501, + "loss/crossentropy": 2.746232843399048, + "loss/logits": 0.8442793190479279, + "step": 47590 + }, + { + "epoch": 0.476, + "grad_norm": 15.3125, + "grad_norm_var": 0.46920572916666664, + "learning_rate": 0.0003, + "loss": 10.9343, + "loss/aux_loss": 0.04806558098644018, + "loss/crossentropy": 2.794121563434601, + "loss/logits": 0.8108414888381958, + "step": 47600 + }, + { + "epoch": 0.4761, + "grad_norm": 13.75, + "grad_norm_var": 0.22962239583333333, + "learning_rate": 0.0003, + "loss": 11.1234, + "loss/aux_loss": 0.048085267655551434, + "loss/crossentropy": 2.7552334010601043, + "loss/logits": 0.8350756138563156, + "step": 47610 + }, + { + "epoch": 0.4762, + "grad_norm": 14.6875, + "grad_norm_var": 0.37180989583333335, + "learning_rate": 0.0003, + "loss": 11.0146, + "loss/aux_loss": 0.048061388358473775, + "loss/crossentropy": 2.721570539474487, + "loss/logits": 0.8332007586956024, + "step": 47620 + }, + { + "epoch": 0.4763, + "grad_norm": 28.0, + "grad_norm_var": 11.769010416666667, + "learning_rate": 0.0003, + "loss": 11.1894, + "loss/aux_loss": 0.048081564158201216, + "loss/crossentropy": 2.689694482088089, + "loss/logits": 0.8350923985242844, + "step": 47630 + }, + { + "epoch": 0.4764, + "grad_norm": 14.5625, + "grad_norm_var": 15.7728515625, + "learning_rate": 0.0003, + "loss": 11.178, + "loss/aux_loss": 0.048070958070456984, + "loss/crossentropy": 2.8211479425430297, + "loss/logits": 0.8629825711250305, + "step": 47640 + }, + { + "epoch": 0.4765, + "grad_norm": 14.25, + "grad_norm_var": 1.2567057291666666, + "learning_rate": 0.0003, + "loss": 11.0979, + "loss/aux_loss": 0.04807667341083288, + "loss/crossentropy": 2.6792636036872866, + "loss/logits": 0.8190094619989395, + "step": 47650 + }, + { + "epoch": 0.4766, + "grad_norm": 15.0, + "grad_norm_var": 0.68828125, + "learning_rate": 0.0003, + "loss": 11.2105, + "loss/aux_loss": 0.04806965459138155, + "loss/crossentropy": 2.6743164896965026, + "loss/logits": 0.8335071861743927, + "step": 47660 + }, + { + "epoch": 0.4767, + "grad_norm": 15.9375, + "grad_norm_var": 3.9567057291666665, + "learning_rate": 0.0003, + "loss": 11.1965, + "loss/aux_loss": 0.04807929284870625, + "loss/crossentropy": 2.7867653131484986, + "loss/logits": 0.8356254577636719, + "step": 47670 + }, + { + "epoch": 0.4768, + "grad_norm": 15.875, + "grad_norm_var": 0.7306640625, + "learning_rate": 0.0003, + "loss": 11.3107, + "loss/aux_loss": 0.04806722085922956, + "loss/crossentropy": 2.628416657447815, + "loss/logits": 0.827005535364151, + "step": 47680 + }, + { + "epoch": 0.4769, + "grad_norm": 14.9375, + "grad_norm_var": 2.5714680989583334, + "learning_rate": 0.0003, + "loss": 11.065, + "loss/aux_loss": 0.048084712401032445, + "loss/crossentropy": 2.7183729648590087, + "loss/logits": 0.798021674156189, + "step": 47690 + }, + { + "epoch": 0.477, + "grad_norm": 13.25, + "grad_norm_var": 2.953108723958333, + "learning_rate": 0.0003, + "loss": 11.115, + "loss/aux_loss": 0.04808032158762217, + "loss/crossentropy": 2.842688000202179, + "loss/logits": 0.8267738074064255, + "step": 47700 + }, + { + "epoch": 0.4771, + "grad_norm": 15.0625, + "grad_norm_var": 0.9541015625, + "learning_rate": 0.0003, + "loss": 11.1761, + "loss/aux_loss": 0.04806661438196898, + "loss/crossentropy": 2.811937117576599, + "loss/logits": 0.8213659793138504, + "step": 47710 + }, + { + "epoch": 0.4772, + "grad_norm": 15.3125, + "grad_norm_var": 0.8820149739583333, + "learning_rate": 0.0003, + "loss": 11.1166, + "loss/aux_loss": 0.048074382916092874, + "loss/crossentropy": 2.6995534360408784, + "loss/logits": 0.8364298850297928, + "step": 47720 + }, + { + "epoch": 0.4773, + "grad_norm": 17.625, + "grad_norm_var": 51.601497395833334, + "learning_rate": 0.0003, + "loss": 11.2294, + "loss/aux_loss": 0.04806399717926979, + "loss/crossentropy": 2.6920024275779726, + "loss/logits": 0.8450191617012024, + "step": 47730 + }, + { + "epoch": 0.4774, + "grad_norm": 13.8125, + "grad_norm_var": 51.483317057291664, + "learning_rate": 0.0003, + "loss": 11.047, + "loss/aux_loss": 0.04807861316949129, + "loss/crossentropy": 2.694988691806793, + "loss/logits": 0.8258244037628174, + "step": 47740 + }, + { + "epoch": 0.4775, + "grad_norm": 14.8125, + "grad_norm_var": 0.6005045572916666, + "learning_rate": 0.0003, + "loss": 11.1232, + "loss/aux_loss": 0.048065470159053804, + "loss/crossentropy": 2.887584125995636, + "loss/logits": 0.8933877527713776, + "step": 47750 + }, + { + "epoch": 0.4776, + "grad_norm": 16.75, + "grad_norm_var": 0.788525390625, + "learning_rate": 0.0003, + "loss": 10.8924, + "loss/aux_loss": 0.04808261953294277, + "loss/crossentropy": 2.6200126349925994, + "loss/logits": 0.8102252304553985, + "step": 47760 + }, + { + "epoch": 0.4777, + "grad_norm": 14.0625, + "grad_norm_var": 0.9286458333333333, + "learning_rate": 0.0003, + "loss": 10.9407, + "loss/aux_loss": 0.04806542750447988, + "loss/crossentropy": 2.6927346110343935, + "loss/logits": 0.8065591782331467, + "step": 47770 + }, + { + "epoch": 0.4778, + "grad_norm": 16.625, + "grad_norm_var": 0.6200520833333333, + "learning_rate": 0.0003, + "loss": 10.9372, + "loss/aux_loss": 0.0480722114443779, + "loss/crossentropy": 2.7056717574596405, + "loss/logits": 0.825093024969101, + "step": 47780 + }, + { + "epoch": 0.4779, + "grad_norm": 14.75, + "grad_norm_var": 0.5973795572916667, + "learning_rate": 0.0003, + "loss": 11.1361, + "loss/aux_loss": 0.04807592462748289, + "loss/crossentropy": 2.7489038705825806, + "loss/logits": 0.8381727159023284, + "step": 47790 + }, + { + "epoch": 0.478, + "grad_norm": 14.1875, + "grad_norm_var": 0.3947265625, + "learning_rate": 0.0003, + "loss": 11.0163, + "loss/aux_loss": 0.04807247947901487, + "loss/crossentropy": 2.7729135751724243, + "loss/logits": 0.8364667683839798, + "step": 47800 + }, + { + "epoch": 0.4781, + "grad_norm": 14.8125, + "grad_norm_var": 0.43826497395833336, + "learning_rate": 0.0003, + "loss": 11.1055, + "loss/aux_loss": 0.04807865787297487, + "loss/crossentropy": 2.699937582015991, + "loss/logits": 0.8402740955352783, + "step": 47810 + }, + { + "epoch": 0.4782, + "grad_norm": 14.8125, + "grad_norm_var": 0.33489583333333334, + "learning_rate": 0.0003, + "loss": 11.0608, + "loss/aux_loss": 0.04806937780231237, + "loss/crossentropy": 2.7677336633205414, + "loss/logits": 0.8421557247638702, + "step": 47820 + }, + { + "epoch": 0.4783, + "grad_norm": 14.5625, + "grad_norm_var": 0.29373372395833336, + "learning_rate": 0.0003, + "loss": 11.1592, + "loss/aux_loss": 0.04806809443980455, + "loss/crossentropy": 2.534024041891098, + "loss/logits": 0.7998458266258239, + "step": 47830 + }, + { + "epoch": 0.4784, + "grad_norm": 14.9375, + "grad_norm_var": 0.7176432291666667, + "learning_rate": 0.0003, + "loss": 11.1525, + "loss/aux_loss": 0.04807998463511467, + "loss/crossentropy": 2.6285907328128815, + "loss/logits": 0.8426926136016846, + "step": 47840 + }, + { + "epoch": 0.4785, + "grad_norm": 14.3125, + "grad_norm_var": 0.3020833333333333, + "learning_rate": 0.0003, + "loss": 11.071, + "loss/aux_loss": 0.04807542841881514, + "loss/crossentropy": 2.613954132795334, + "loss/logits": 0.8264847338199616, + "step": 47850 + }, + { + "epoch": 0.4786, + "grad_norm": 14.25, + "grad_norm_var": 0.42849934895833336, + "learning_rate": 0.0003, + "loss": 10.9981, + "loss/aux_loss": 0.04806617666035891, + "loss/crossentropy": 2.7965017437934874, + "loss/logits": 0.8339490979909897, + "step": 47860 + }, + { + "epoch": 0.4787, + "grad_norm": 14.4375, + "grad_norm_var": 0.98203125, + "learning_rate": 0.0003, + "loss": 11.0962, + "loss/aux_loss": 0.048079381324350835, + "loss/crossentropy": 2.7282164812088014, + "loss/logits": 0.8202213078737259, + "step": 47870 + }, + { + "epoch": 0.4788, + "grad_norm": 15.375, + "grad_norm_var": 0.6374348958333333, + "learning_rate": 0.0003, + "loss": 11.1136, + "loss/aux_loss": 0.048078177496790886, + "loss/crossentropy": 2.6539010763168336, + "loss/logits": 0.812344890832901, + "step": 47880 + }, + { + "epoch": 0.4789, + "grad_norm": 15.125, + "grad_norm_var": 1.1622233072916666, + "learning_rate": 0.0003, + "loss": 10.9553, + "loss/aux_loss": 0.048067126609385016, + "loss/crossentropy": 2.7369919776916505, + "loss/logits": 0.8319634586572647, + "step": 47890 + }, + { + "epoch": 0.479, + "grad_norm": 15.6875, + "grad_norm_var": 7.665559895833334, + "learning_rate": 0.0003, + "loss": 11.136, + "loss/aux_loss": 0.048075918667018415, + "loss/crossentropy": 2.895642626285553, + "loss/logits": 0.8190632820129394, + "step": 47900 + }, + { + "epoch": 0.4791, + "grad_norm": 15.0, + "grad_norm_var": 7.870768229166667, + "learning_rate": 0.0003, + "loss": 11.0985, + "loss/aux_loss": 0.048073847964406016, + "loss/crossentropy": 2.734054809808731, + "loss/logits": 0.8439525783061981, + "step": 47910 + }, + { + "epoch": 0.4792, + "grad_norm": 14.8125, + "grad_norm_var": 0.39088541666666665, + "learning_rate": 0.0003, + "loss": 11.0615, + "loss/aux_loss": 0.048062325455248356, + "loss/crossentropy": 2.5045742869377134, + "loss/logits": 0.8234647005796433, + "step": 47920 + }, + { + "epoch": 0.4793, + "grad_norm": 14.9375, + "grad_norm_var": 0.32198893229166664, + "learning_rate": 0.0003, + "loss": 11.0936, + "loss/aux_loss": 0.048081851191818716, + "loss/crossentropy": 2.55519557595253, + "loss/logits": 0.8111729115247727, + "step": 47930 + }, + { + "epoch": 0.4794, + "grad_norm": 14.1875, + "grad_norm_var": 0.4494140625, + "learning_rate": 0.0003, + "loss": 11.0921, + "loss/aux_loss": 0.04807233922183514, + "loss/crossentropy": 2.8699767351150514, + "loss/logits": 0.8300641059875489, + "step": 47940 + }, + { + "epoch": 0.4795, + "grad_norm": 14.0, + "grad_norm_var": 1.0027180989583333, + "learning_rate": 0.0003, + "loss": 11.0988, + "loss/aux_loss": 0.048071389086544514, + "loss/crossentropy": 2.839382266998291, + "loss/logits": 0.8115487396717072, + "step": 47950 + }, + { + "epoch": 0.4796, + "grad_norm": 13.4375, + "grad_norm_var": 0.6723795572916667, + "learning_rate": 0.0003, + "loss": 11.0085, + "loss/aux_loss": 0.048065657168626784, + "loss/crossentropy": 2.7501906633377073, + "loss/logits": 0.8143287628889084, + "step": 47960 + }, + { + "epoch": 0.4797, + "grad_norm": 16.25, + "grad_norm_var": 0.8417805989583333, + "learning_rate": 0.0003, + "loss": 11.0563, + "loss/aux_loss": 0.048076402582228187, + "loss/crossentropy": 2.676505321264267, + "loss/logits": 0.8360762178897858, + "step": 47970 + }, + { + "epoch": 0.4798, + "grad_norm": 14.6875, + "grad_norm_var": 0.9072265625, + "learning_rate": 0.0003, + "loss": 11.1315, + "loss/aux_loss": 0.04806589502841234, + "loss/crossentropy": 2.6232878804206847, + "loss/logits": 0.8199890315532684, + "step": 47980 + }, + { + "epoch": 0.4799, + "grad_norm": 13.5625, + "grad_norm_var": 0.6792805989583334, + "learning_rate": 0.0003, + "loss": 11.1561, + "loss/aux_loss": 0.048061208054423335, + "loss/crossentropy": 2.7454289555549622, + "loss/logits": 0.8468107730150223, + "step": 47990 + }, + { + "epoch": 0.48, + "grad_norm": 14.3125, + "grad_norm_var": 0.32447916666666665, + "learning_rate": 0.0003, + "loss": 11.0602, + "loss/aux_loss": 0.0480829494073987, + "loss/crossentropy": 2.7469854950904846, + "loss/logits": 0.8431436151266098, + "step": 48000 + }, + { + "epoch": 0.4801, + "grad_norm": 14.75, + "grad_norm_var": 0.6816243489583333, + "learning_rate": 0.0003, + "loss": 11.2187, + "loss/aux_loss": 0.04806633796542883, + "loss/crossentropy": 2.800862890481949, + "loss/logits": 0.8245023936033249, + "step": 48010 + }, + { + "epoch": 0.4802, + "grad_norm": 14.9375, + "grad_norm_var": 0.6129557291666666, + "learning_rate": 0.0003, + "loss": 11.0638, + "loss/aux_loss": 0.04809022005647421, + "loss/crossentropy": 2.6285562753677367, + "loss/logits": 0.8116117030382156, + "step": 48020 + }, + { + "epoch": 0.4803, + "grad_norm": 14.6875, + "grad_norm_var": 6.8197265625, + "learning_rate": 0.0003, + "loss": 11.0978, + "loss/aux_loss": 0.048075268231332305, + "loss/crossentropy": 2.732567811012268, + "loss/logits": 0.844546177983284, + "step": 48030 + }, + { + "epoch": 0.4804, + "grad_norm": 15.25, + "grad_norm_var": 6.534879557291666, + "learning_rate": 0.0003, + "loss": 11.0056, + "loss/aux_loss": 0.048071599751710894, + "loss/crossentropy": 2.695717829465866, + "loss/logits": 0.7987852036952973, + "step": 48040 + }, + { + "epoch": 0.4805, + "grad_norm": 14.8125, + "grad_norm_var": 0.43776041666666665, + "learning_rate": 0.0003, + "loss": 11.0473, + "loss/aux_loss": 0.04807654451578856, + "loss/crossentropy": 2.877095127105713, + "loss/logits": 0.8336584985256195, + "step": 48050 + }, + { + "epoch": 0.4806, + "grad_norm": 14.75, + "grad_norm_var": 0.758447265625, + "learning_rate": 0.0003, + "loss": 11.0729, + "loss/aux_loss": 0.04807484894990921, + "loss/crossentropy": 2.8576854825019837, + "loss/logits": 0.8484826743602752, + "step": 48060 + }, + { + "epoch": 0.4807, + "grad_norm": 14.5625, + "grad_norm_var": 0.6466145833333333, + "learning_rate": 0.0003, + "loss": 10.9455, + "loss/aux_loss": 0.04806608278304338, + "loss/crossentropy": 2.7044803380966185, + "loss/logits": 0.8198621302843094, + "step": 48070 + }, + { + "epoch": 0.4808, + "grad_norm": 14.0625, + "grad_norm_var": 0.234228515625, + "learning_rate": 0.0003, + "loss": 11.2132, + "loss/aux_loss": 0.048075415566563605, + "loss/crossentropy": 2.8697200059890746, + "loss/logits": 0.8511811017990112, + "step": 48080 + }, + { + "epoch": 0.4809, + "grad_norm": 15.3125, + "grad_norm_var": 0.3804524739583333, + "learning_rate": 0.0003, + "loss": 11.1382, + "loss/aux_loss": 0.04807231742888689, + "loss/crossentropy": 2.600096642971039, + "loss/logits": 0.8244834512472152, + "step": 48090 + }, + { + "epoch": 0.481, + "grad_norm": 16.5, + "grad_norm_var": 0.5700520833333333, + "learning_rate": 0.0003, + "loss": 10.9186, + "loss/aux_loss": 0.04807635005563497, + "loss/crossentropy": 2.604368954896927, + "loss/logits": 0.8412629932165145, + "step": 48100 + }, + { + "epoch": 0.4811, + "grad_norm": 14.0625, + "grad_norm_var": 0.5356770833333333, + "learning_rate": 0.0003, + "loss": 11.1471, + "loss/aux_loss": 0.04806556645780802, + "loss/crossentropy": 2.717805975675583, + "loss/logits": 0.829289898276329, + "step": 48110 + }, + { + "epoch": 0.4812, + "grad_norm": 13.875, + "grad_norm_var": 0.698291015625, + "learning_rate": 0.0003, + "loss": 11.1148, + "loss/aux_loss": 0.04808323364704847, + "loss/crossentropy": 2.707306903600693, + "loss/logits": 0.8816626042127609, + "step": 48120 + }, + { + "epoch": 0.4813, + "grad_norm": 14.8125, + "grad_norm_var": 0.7191243489583333, + "learning_rate": 0.0003, + "loss": 11.1623, + "loss/aux_loss": 0.04807705953717232, + "loss/crossentropy": 2.7037087202072145, + "loss/logits": 0.8497424215078354, + "step": 48130 + }, + { + "epoch": 0.4814, + "grad_norm": 13.8125, + "grad_norm_var": 0.7626139322916666, + "learning_rate": 0.0003, + "loss": 11.158, + "loss/aux_loss": 0.04806127455085516, + "loss/crossentropy": 2.565345358848572, + "loss/logits": 0.8115527182817459, + "step": 48140 + }, + { + "epoch": 0.4815, + "grad_norm": 18.0, + "grad_norm_var": 63.133447265625, + "learning_rate": 0.0003, + "loss": 11.2609, + "loss/aux_loss": 0.04808726757764816, + "loss/crossentropy": 2.6367207527160645, + "loss/logits": 0.831730630993843, + "step": 48150 + }, + { + "epoch": 0.4816, + "grad_norm": 15.875, + "grad_norm_var": 177.33839518229166, + "learning_rate": 0.0003, + "loss": 11.1555, + "loss/aux_loss": 0.04808530658483505, + "loss/crossentropy": 2.6289633989334105, + "loss/logits": 0.846698772907257, + "step": 48160 + }, + { + "epoch": 0.4817, + "grad_norm": 14.125, + "grad_norm_var": 135.46302083333333, + "learning_rate": 0.0003, + "loss": 10.9952, + "loss/aux_loss": 0.0480637326836586, + "loss/crossentropy": 2.6193889021873473, + "loss/logits": 0.8518955647945404, + "step": 48170 + }, + { + "epoch": 0.4818, + "grad_norm": 14.6875, + "grad_norm_var": 0.23631184895833332, + "learning_rate": 0.0003, + "loss": 10.9472, + "loss/aux_loss": 0.04807006679475308, + "loss/crossentropy": 2.5161637544631956, + "loss/logits": 0.7855435490608216, + "step": 48180 + }, + { + "epoch": 0.4819, + "grad_norm": 15.75, + "grad_norm_var": 1.4041015625, + "learning_rate": 0.0003, + "loss": 11.1728, + "loss/aux_loss": 0.04807739406824112, + "loss/crossentropy": 2.578041511774063, + "loss/logits": 0.8267721891403198, + "step": 48190 + }, + { + "epoch": 0.482, + "grad_norm": 14.75, + "grad_norm_var": 1.2307291666666667, + "learning_rate": 0.0003, + "loss": 11.1619, + "loss/aux_loss": 0.04807015117257833, + "loss/crossentropy": 2.851909363269806, + "loss/logits": 0.862451794743538, + "step": 48200 + }, + { + "epoch": 0.4821, + "grad_norm": 15.625, + "grad_norm_var": 0.6079264322916667, + "learning_rate": 0.0003, + "loss": 11.1798, + "loss/aux_loss": 0.04807384721934795, + "loss/crossentropy": 2.7018039345741274, + "loss/logits": 0.8276410967111587, + "step": 48210 + }, + { + "epoch": 0.4822, + "grad_norm": 14.9375, + "grad_norm_var": 0.8374837239583334, + "learning_rate": 0.0003, + "loss": 11.1851, + "loss/aux_loss": 0.04808012768626213, + "loss/crossentropy": 2.792585861682892, + "loss/logits": 0.8121782958507537, + "step": 48220 + }, + { + "epoch": 0.4823, + "grad_norm": 14.25, + "grad_norm_var": 0.5713541666666667, + "learning_rate": 0.0003, + "loss": 11.0181, + "loss/aux_loss": 0.04806952588260174, + "loss/crossentropy": 2.7548343539237976, + "loss/logits": 0.8300218850374221, + "step": 48230 + }, + { + "epoch": 0.4824, + "grad_norm": 13.625, + "grad_norm_var": 0.5208333333333334, + "learning_rate": 0.0003, + "loss": 10.8143, + "loss/aux_loss": 0.04807311985641718, + "loss/crossentropy": 2.582223576307297, + "loss/logits": 0.8206641644239425, + "step": 48240 + }, + { + "epoch": 0.4825, + "grad_norm": 14.125, + "grad_norm_var": 0.4598307291666667, + "learning_rate": 0.0003, + "loss": 11.1807, + "loss/aux_loss": 0.04807979855686426, + "loss/crossentropy": 2.6816562175750733, + "loss/logits": 0.8363949626684188, + "step": 48250 + }, + { + "epoch": 0.4826, + "grad_norm": 14.125, + "grad_norm_var": 0.2869791666666667, + "learning_rate": 0.0003, + "loss": 11.0489, + "loss/aux_loss": 0.04806763082742691, + "loss/crossentropy": 2.7296660900115968, + "loss/logits": 0.8218880474567414, + "step": 48260 + }, + { + "epoch": 0.4827, + "grad_norm": 13.9375, + "grad_norm_var": 0.29347330729166665, + "learning_rate": 0.0003, + "loss": 11.1751, + "loss/aux_loss": 0.04807308837771416, + "loss/crossentropy": 2.7679852724075316, + "loss/logits": 0.8572757095098495, + "step": 48270 + }, + { + "epoch": 0.4828, + "grad_norm": 14.75, + "grad_norm_var": 0.2908854166666667, + "learning_rate": 0.0003, + "loss": 10.9314, + "loss/aux_loss": 0.04806960169225931, + "loss/crossentropy": 2.6580540001392365, + "loss/logits": 0.8273628979921341, + "step": 48280 + }, + { + "epoch": 0.4829, + "grad_norm": 14.625, + "grad_norm_var": 0.41795247395833335, + "learning_rate": 0.0003, + "loss": 11.0294, + "loss/aux_loss": 0.04808035921305418, + "loss/crossentropy": 2.713185727596283, + "loss/logits": 0.8252017825841904, + "step": 48290 + }, + { + "epoch": 0.483, + "grad_norm": 13.9375, + "grad_norm_var": 0.28592122395833336, + "learning_rate": 0.0003, + "loss": 11.0288, + "loss/aux_loss": 0.04807136319577694, + "loss/crossentropy": 2.6470188081264494, + "loss/logits": 0.8096356302499771, + "step": 48300 + }, + { + "epoch": 0.4831, + "grad_norm": 14.6875, + "grad_norm_var": 0.33865559895833336, + "learning_rate": 0.0003, + "loss": 11.0124, + "loss/aux_loss": 0.048075702227652076, + "loss/crossentropy": 2.628252637386322, + "loss/logits": 0.8306295484304428, + "step": 48310 + }, + { + "epoch": 0.4832, + "grad_norm": 13.9375, + "grad_norm_var": 0.333837890625, + "learning_rate": 0.0003, + "loss": 11.2257, + "loss/aux_loss": 0.04808404166251421, + "loss/crossentropy": 2.575061935186386, + "loss/logits": 0.8175129801034927, + "step": 48320 + }, + { + "epoch": 0.4833, + "grad_norm": 14.0625, + "grad_norm_var": 0.39386393229166666, + "learning_rate": 0.0003, + "loss": 11.2456, + "loss/aux_loss": 0.04806752149015665, + "loss/crossentropy": 2.6643171072006226, + "loss/logits": 0.8468601524829864, + "step": 48330 + }, + { + "epoch": 0.4834, + "grad_norm": 16.375, + "grad_norm_var": 0.9049479166666666, + "learning_rate": 0.0003, + "loss": 11.0414, + "loss/aux_loss": 0.048077189922332765, + "loss/crossentropy": 2.659744346141815, + "loss/logits": 0.7988389104604721, + "step": 48340 + }, + { + "epoch": 0.4835, + "grad_norm": 14.375, + "grad_norm_var": 367.3979166666667, + "learning_rate": 0.0003, + "loss": 11.1791, + "loss/aux_loss": 0.04807696957141161, + "loss/crossentropy": 2.824947530031204, + "loss/logits": 0.8144560337066651, + "step": 48350 + }, + { + "epoch": 0.4836, + "grad_norm": 14.75, + "grad_norm_var": 2.502067057291667, + "learning_rate": 0.0003, + "loss": 11.0857, + "loss/aux_loss": 0.048078769072890284, + "loss/crossentropy": 2.700359559059143, + "loss/logits": 0.8216308414936065, + "step": 48360 + }, + { + "epoch": 0.4837, + "grad_norm": 15.0625, + "grad_norm_var": 0.2712890625, + "learning_rate": 0.0003, + "loss": 11.0512, + "loss/aux_loss": 0.048067495599389075, + "loss/crossentropy": 2.7094205021858215, + "loss/logits": 0.848452877998352, + "step": 48370 + }, + { + "epoch": 0.4838, + "grad_norm": 14.3125, + "grad_norm_var": 0.5499348958333333, + "learning_rate": 0.0003, + "loss": 11.2143, + "loss/aux_loss": 0.04807147514075041, + "loss/crossentropy": 2.610717463493347, + "loss/logits": 0.8028295308351516, + "step": 48380 + }, + { + "epoch": 0.4839, + "grad_norm": 15.875, + "grad_norm_var": 0.8441243489583333, + "learning_rate": 0.0003, + "loss": 11.0868, + "loss/aux_loss": 0.04808458909392357, + "loss/crossentropy": 2.649008184671402, + "loss/logits": 0.7971860766410828, + "step": 48390 + }, + { + "epoch": 0.484, + "grad_norm": 16.875, + "grad_norm_var": 0.6166666666666667, + "learning_rate": 0.0003, + "loss": 11.1314, + "loss/aux_loss": 0.048060483299195766, + "loss/crossentropy": 2.60991570353508, + "loss/logits": 0.8266617238521576, + "step": 48400 + }, + { + "epoch": 0.4841, + "grad_norm": 14.5, + "grad_norm_var": 0.7341145833333333, + "learning_rate": 0.0003, + "loss": 11.2854, + "loss/aux_loss": 0.04807337708771229, + "loss/crossentropy": 2.8312358379364015, + "loss/logits": 0.8423346072435379, + "step": 48410 + }, + { + "epoch": 0.4842, + "grad_norm": 14.25, + "grad_norm_var": 0.461572265625, + "learning_rate": 0.0003, + "loss": 11.0738, + "loss/aux_loss": 0.048069039918482305, + "loss/crossentropy": 2.6287239670753477, + "loss/logits": 0.8132491081953048, + "step": 48420 + }, + { + "epoch": 0.4843, + "grad_norm": 14.375, + "grad_norm_var": 0.38409830729166666, + "learning_rate": 0.0003, + "loss": 11.1052, + "loss/aux_loss": 0.04807084016501904, + "loss/crossentropy": 2.6560630083084105, + "loss/logits": 0.8387343198060989, + "step": 48430 + }, + { + "epoch": 0.4844, + "grad_norm": 15.3125, + "grad_norm_var": 0.22786458333333334, + "learning_rate": 0.0003, + "loss": 11.218, + "loss/aux_loss": 0.04808529261499643, + "loss/crossentropy": 2.574995279312134, + "loss/logits": 0.8190798044204712, + "step": 48440 + }, + { + "epoch": 0.4845, + "grad_norm": 14.5, + "grad_norm_var": 1.1051920572916667, + "learning_rate": 0.0003, + "loss": 11.2403, + "loss/aux_loss": 0.04806236382573843, + "loss/crossentropy": 2.641158491373062, + "loss/logits": 0.8146007388830185, + "step": 48450 + }, + { + "epoch": 0.4846, + "grad_norm": 13.125, + "grad_norm_var": 0.8126139322916667, + "learning_rate": 0.0003, + "loss": 11.1928, + "loss/aux_loss": 0.048080836050212385, + "loss/crossentropy": 2.9179535865783692, + "loss/logits": 0.8601721286773681, + "step": 48460 + }, + { + "epoch": 0.4847, + "grad_norm": 14.5625, + "grad_norm_var": 0.4988932291666667, + "learning_rate": 0.0003, + "loss": 11.1159, + "loss/aux_loss": 0.04806997887790203, + "loss/crossentropy": 2.536268186569214, + "loss/logits": 0.8264925092458725, + "step": 48470 + }, + { + "epoch": 0.4848, + "grad_norm": 13.3125, + "grad_norm_var": 1.3744791666666667, + "learning_rate": 0.0003, + "loss": 11.0528, + "loss/aux_loss": 0.0480733098462224, + "loss/crossentropy": 2.5422776341438293, + "loss/logits": 0.7887350648641587, + "step": 48480 + }, + { + "epoch": 0.4849, + "grad_norm": 14.125, + "grad_norm_var": 0.5436848958333333, + "learning_rate": 0.0003, + "loss": 11.1683, + "loss/aux_loss": 0.048076307587325576, + "loss/crossentropy": 2.744097375869751, + "loss/logits": 0.8400337219238281, + "step": 48490 + }, + { + "epoch": 0.485, + "grad_norm": 14.3125, + "grad_norm_var": 0.22473958333333333, + "learning_rate": 0.0003, + "loss": 11.0867, + "loss/aux_loss": 0.048070046678185464, + "loss/crossentropy": 2.8123038172721864, + "loss/logits": 0.8171926707029342, + "step": 48500 + }, + { + "epoch": 0.4851, + "grad_norm": 14.0, + "grad_norm_var": 0.318994140625, + "learning_rate": 0.0003, + "loss": 10.9393, + "loss/aux_loss": 0.048068254627287386, + "loss/crossentropy": 2.693702256679535, + "loss/logits": 0.8272054940462112, + "step": 48510 + }, + { + "epoch": 0.4852, + "grad_norm": 15.4375, + "grad_norm_var": 0.492822265625, + "learning_rate": 0.0003, + "loss": 11.2476, + "loss/aux_loss": 0.04807813167572021, + "loss/crossentropy": 2.7948949217796324, + "loss/logits": 0.8314435452222824, + "step": 48520 + }, + { + "epoch": 0.4853, + "grad_norm": 14.0625, + "grad_norm_var": 0.4557291666666667, + "learning_rate": 0.0003, + "loss": 11.1876, + "loss/aux_loss": 0.04806694649159908, + "loss/crossentropy": 2.935932195186615, + "loss/logits": 0.8360010713338852, + "step": 48530 + }, + { + "epoch": 0.4854, + "grad_norm": 14.625, + "grad_norm_var": 59.37890625, + "learning_rate": 0.0003, + "loss": 11.0565, + "loss/aux_loss": 0.048075059242546556, + "loss/crossentropy": 2.6938049614429476, + "loss/logits": 0.8414293229579926, + "step": 48540 + }, + { + "epoch": 0.4855, + "grad_norm": 14.0, + "grad_norm_var": 51.1009765625, + "learning_rate": 0.0003, + "loss": 11.2106, + "loss/aux_loss": 0.0480806240811944, + "loss/crossentropy": 2.7439634084701536, + "loss/logits": 0.8203217297792434, + "step": 48550 + }, + { + "epoch": 0.4856, + "grad_norm": 15.4375, + "grad_norm_var": 0.3509765625, + "learning_rate": 0.0003, + "loss": 11.0734, + "loss/aux_loss": 0.04806556981056929, + "loss/crossentropy": 2.665660631656647, + "loss/logits": 0.8277056187391281, + "step": 48560 + }, + { + "epoch": 0.4857, + "grad_norm": 14.4375, + "grad_norm_var": 0.25128580729166666, + "learning_rate": 0.0003, + "loss": 11.0559, + "loss/aux_loss": 0.04807299673557282, + "loss/crossentropy": 2.8438867926597595, + "loss/logits": 0.8425880312919617, + "step": 48570 + }, + { + "epoch": 0.4858, + "grad_norm": 13.5625, + "grad_norm_var": 0.641650390625, + "learning_rate": 0.0003, + "loss": 10.9983, + "loss/aux_loss": 0.04807940311729908, + "loss/crossentropy": 2.7427866578102114, + "loss/logits": 0.8035318404436111, + "step": 48580 + }, + { + "epoch": 0.4859, + "grad_norm": 14.5, + "grad_norm_var": 0.7212890625, + "learning_rate": 0.0003, + "loss": 10.964, + "loss/aux_loss": 0.04807807970792055, + "loss/crossentropy": 2.6292437076568604, + "loss/logits": 0.800726181268692, + "step": 48590 + }, + { + "epoch": 0.486, + "grad_norm": 15.625, + "grad_norm_var": 0.913916015625, + "learning_rate": 0.0003, + "loss": 11.1338, + "loss/aux_loss": 0.04807404633611441, + "loss/crossentropy": 2.686028057336807, + "loss/logits": 0.8427057951688767, + "step": 48600 + }, + { + "epoch": 0.4861, + "grad_norm": 14.875, + "grad_norm_var": 0.8051432291666667, + "learning_rate": 0.0003, + "loss": 10.9843, + "loss/aux_loss": 0.04807655718177557, + "loss/crossentropy": 2.5984533965587615, + "loss/logits": 0.7909109711647033, + "step": 48610 + }, + { + "epoch": 0.4862, + "grad_norm": 16.625, + "grad_norm_var": 1.9972493489583334, + "learning_rate": 0.0003, + "loss": 10.9925, + "loss/aux_loss": 0.04807711597532034, + "loss/crossentropy": 2.593876451253891, + "loss/logits": 0.831505474448204, + "step": 48620 + }, + { + "epoch": 0.4863, + "grad_norm": 13.9375, + "grad_norm_var": 1.0317545572916667, + "learning_rate": 0.0003, + "loss": 11.0305, + "loss/aux_loss": 0.04807915035635233, + "loss/crossentropy": 2.761583888530731, + "loss/logits": 0.8605498760938645, + "step": 48630 + }, + { + "epoch": 0.4864, + "grad_norm": 14.8125, + "grad_norm_var": 0.37233072916666665, + "learning_rate": 0.0003, + "loss": 11.1418, + "loss/aux_loss": 0.04806447252631187, + "loss/crossentropy": 2.606149101257324, + "loss/logits": 0.8358212620019912, + "step": 48640 + }, + { + "epoch": 0.4865, + "grad_norm": 15.0, + "grad_norm_var": 0.32962239583333336, + "learning_rate": 0.0003, + "loss": 10.9811, + "loss/aux_loss": 0.04808054771274328, + "loss/crossentropy": 2.6689969122409822, + "loss/logits": 0.8446838974952697, + "step": 48650 + }, + { + "epoch": 0.4866, + "grad_norm": 15.1875, + "grad_norm_var": 2.2749348958333333, + "learning_rate": 0.0003, + "loss": 11.2319, + "loss/aux_loss": 0.048064498230814934, + "loss/crossentropy": 2.760690987110138, + "loss/logits": 0.8313882291316986, + "step": 48660 + }, + { + "epoch": 0.4867, + "grad_norm": 14.5, + "grad_norm_var": 0.6442057291666666, + "learning_rate": 0.0003, + "loss": 11.0473, + "loss/aux_loss": 0.04807339608669281, + "loss/crossentropy": 2.789001631736755, + "loss/logits": 0.8379965245723724, + "step": 48670 + }, + { + "epoch": 0.4868, + "grad_norm": 14.5625, + "grad_norm_var": 0.3728515625, + "learning_rate": 0.0003, + "loss": 11.193, + "loss/aux_loss": 0.04807340279221535, + "loss/crossentropy": 2.65660617351532, + "loss/logits": 0.8233199805021286, + "step": 48680 + }, + { + "epoch": 0.4869, + "grad_norm": 15.625, + "grad_norm_var": 0.5338541666666666, + "learning_rate": 0.0003, + "loss": 10.828, + "loss/aux_loss": 0.04807305708527565, + "loss/crossentropy": 2.6044947862625123, + "loss/logits": 0.7953520357608795, + "step": 48690 + }, + { + "epoch": 0.487, + "grad_norm": 14.8125, + "grad_norm_var": 0.7331868489583333, + "learning_rate": 0.0003, + "loss": 11.0447, + "loss/aux_loss": 0.04807252772152424, + "loss/crossentropy": 2.5909561276435853, + "loss/logits": 0.8053539365530014, + "step": 48700 + }, + { + "epoch": 0.4871, + "grad_norm": 17.0, + "grad_norm_var": 0.653759765625, + "learning_rate": 0.0003, + "loss": 11.1363, + "loss/aux_loss": 0.04807318150997162, + "loss/crossentropy": 2.6690219819545744, + "loss/logits": 0.8558798760175705, + "step": 48710 + }, + { + "epoch": 0.4872, + "grad_norm": 13.1875, + "grad_norm_var": 0.7322916666666667, + "learning_rate": 0.0003, + "loss": 10.9482, + "loss/aux_loss": 0.048087730258703235, + "loss/crossentropy": 2.7921680390834807, + "loss/logits": 0.8220372408628464, + "step": 48720 + }, + { + "epoch": 0.4873, + "grad_norm": 14.3125, + "grad_norm_var": 0.6102701822916666, + "learning_rate": 0.0003, + "loss": 11.0286, + "loss/aux_loss": 0.04806871749460697, + "loss/crossentropy": 2.6840153992176057, + "loss/logits": 0.850041389465332, + "step": 48730 + }, + { + "epoch": 0.4874, + "grad_norm": 15.5625, + "grad_norm_var": 0.7270670572916667, + "learning_rate": 0.0003, + "loss": 11.1639, + "loss/aux_loss": 0.04806739930063486, + "loss/crossentropy": 2.858708620071411, + "loss/logits": 0.8481850981712341, + "step": 48740 + }, + { + "epoch": 0.4875, + "grad_norm": 16.25, + "grad_norm_var": 3.2315104166666666, + "learning_rate": 0.0003, + "loss": 10.9632, + "loss/aux_loss": 0.04807026702910662, + "loss/crossentropy": 2.6791675448417664, + "loss/logits": 0.8031369209289551, + "step": 48750 + }, + { + "epoch": 0.4876, + "grad_norm": 13.375, + "grad_norm_var": 0.8124837239583333, + "learning_rate": 0.0003, + "loss": 10.9988, + "loss/aux_loss": 0.04808725789189339, + "loss/crossentropy": 2.6807437360286714, + "loss/logits": 0.8061759442090988, + "step": 48760 + }, + { + "epoch": 0.4877, + "grad_norm": 15.5625, + "grad_norm_var": 0.7791666666666667, + "learning_rate": 0.0003, + "loss": 11.1572, + "loss/aux_loss": 0.04806392826139927, + "loss/crossentropy": 2.44208277463913, + "loss/logits": 0.8261926531791687, + "step": 48770 + }, + { + "epoch": 0.4878, + "grad_norm": 15.625, + "grad_norm_var": 0.9025390625, + "learning_rate": 0.0003, + "loss": 11.1327, + "loss/aux_loss": 0.04807030875235796, + "loss/crossentropy": 2.6741757929325103, + "loss/logits": 0.8217558234930038, + "step": 48780 + }, + { + "epoch": 0.4879, + "grad_norm": 14.9375, + "grad_norm_var": 0.49412434895833335, + "learning_rate": 0.0003, + "loss": 11.1311, + "loss/aux_loss": 0.04807980488985777, + "loss/crossentropy": 2.5993297338485717, + "loss/logits": 0.8183979272842408, + "step": 48790 + }, + { + "epoch": 0.488, + "grad_norm": 15.0625, + "grad_norm_var": 0.2634765625, + "learning_rate": 0.0003, + "loss": 11.1117, + "loss/aux_loss": 0.04807808380573988, + "loss/crossentropy": 2.6094757199287413, + "loss/logits": 0.7935949236154556, + "step": 48800 + }, + { + "epoch": 0.4881, + "grad_norm": 15.6875, + "grad_norm_var": 0.7231770833333333, + "learning_rate": 0.0003, + "loss": 11.2077, + "loss/aux_loss": 0.04806241802871227, + "loss/crossentropy": 2.876955544948578, + "loss/logits": 0.8346400111913681, + "step": 48810 + }, + { + "epoch": 0.4882, + "grad_norm": 15.9375, + "grad_norm_var": 0.9645182291666666, + "learning_rate": 0.0003, + "loss": 11.1004, + "loss/aux_loss": 0.048071075975894925, + "loss/crossentropy": 2.8417654395103455, + "loss/logits": 0.8361264318227768, + "step": 48820 + }, + { + "epoch": 0.4883, + "grad_norm": 14.1875, + "grad_norm_var": 0.551416015625, + "learning_rate": 0.0003, + "loss": 11.171, + "loss/aux_loss": 0.04807602632790804, + "loss/crossentropy": 2.799322694540024, + "loss/logits": 0.8414522469043731, + "step": 48830 + }, + { + "epoch": 0.4884, + "grad_norm": 15.875, + "grad_norm_var": 0.8311848958333333, + "learning_rate": 0.0003, + "loss": 11.2567, + "loss/aux_loss": 0.04806129559874535, + "loss/crossentropy": 2.689255505800247, + "loss/logits": 0.8547393798828125, + "step": 48840 + }, + { + "epoch": 0.4885, + "grad_norm": 13.9375, + "grad_norm_var": 0.563916015625, + "learning_rate": 0.0003, + "loss": 11.1285, + "loss/aux_loss": 0.04807682503014803, + "loss/crossentropy": 2.797248286008835, + "loss/logits": 0.8470206201076508, + "step": 48850 + }, + { + "epoch": 0.4886, + "grad_norm": 14.625, + "grad_norm_var": 0.78046875, + "learning_rate": 0.0003, + "loss": 11.0018, + "loss/aux_loss": 0.04807422161102295, + "loss/crossentropy": 2.511183685064316, + "loss/logits": 0.7976685196161271, + "step": 48860 + }, + { + "epoch": 0.4887, + "grad_norm": 14.9375, + "grad_norm_var": 7.047509765625, + "learning_rate": 0.0003, + "loss": 11.0695, + "loss/aux_loss": 0.04808024186640978, + "loss/crossentropy": 2.8470928072929382, + "loss/logits": 0.8435232043266296, + "step": 48870 + }, + { + "epoch": 0.4888, + "grad_norm": 15.625, + "grad_norm_var": 7.532666015625, + "learning_rate": 0.0003, + "loss": 11.0045, + "loss/aux_loss": 0.048071150295436384, + "loss/crossentropy": 2.7967050075531006, + "loss/logits": 0.8275404214859009, + "step": 48880 + }, + { + "epoch": 0.4889, + "grad_norm": 14.5625, + "grad_norm_var": 0.4337890625, + "learning_rate": 0.0003, + "loss": 11.0734, + "loss/aux_loss": 0.04806635808199644, + "loss/crossentropy": 2.776483827829361, + "loss/logits": 0.8333809942007064, + "step": 48890 + }, + { + "epoch": 0.489, + "grad_norm": 13.8125, + "grad_norm_var": 0.5416015625, + "learning_rate": 0.0003, + "loss": 11.1054, + "loss/aux_loss": 0.04807385727763176, + "loss/crossentropy": 2.7456183671951293, + "loss/logits": 0.8244105398654937, + "step": 48900 + }, + { + "epoch": 0.4891, + "grad_norm": 14.5625, + "grad_norm_var": 0.9114420572916667, + "learning_rate": 0.0003, + "loss": 11.2572, + "loss/aux_loss": 0.04807461760938168, + "loss/crossentropy": 2.9388389587402344, + "loss/logits": 0.8438066065311431, + "step": 48910 + }, + { + "epoch": 0.4892, + "grad_norm": 14.5625, + "grad_norm_var": 0.9707682291666667, + "learning_rate": 0.0003, + "loss": 11.3584, + "loss/aux_loss": 0.04808568153530359, + "loss/crossentropy": 2.8007669508457185, + "loss/logits": 0.8510422587394715, + "step": 48920 + }, + { + "epoch": 0.4893, + "grad_norm": 14.1875, + "grad_norm_var": 0.916650390625, + "learning_rate": 0.0003, + "loss": 10.971, + "loss/aux_loss": 0.0480602802708745, + "loss/crossentropy": 2.614273113012314, + "loss/logits": 0.8163867175579071, + "step": 48930 + }, + { + "epoch": 0.4894, + "grad_norm": 15.125, + "grad_norm_var": 3.5825520833333333, + "learning_rate": 0.0003, + "loss": 11.1483, + "loss/aux_loss": 0.04808527324348688, + "loss/crossentropy": 2.74897957444191, + "loss/logits": 0.8206755816936493, + "step": 48940 + }, + { + "epoch": 0.4895, + "grad_norm": 14.5, + "grad_norm_var": 2.756705729166667, + "learning_rate": 0.0003, + "loss": 11.139, + "loss/aux_loss": 0.04805995114147663, + "loss/crossentropy": 2.8490783333778382, + "loss/logits": 0.8311156839132309, + "step": 48950 + }, + { + "epoch": 0.4896, + "grad_norm": 14.5, + "grad_norm_var": 0.779150390625, + "learning_rate": 0.0003, + "loss": 11.1204, + "loss/aux_loss": 0.04806854724884033, + "loss/crossentropy": 2.634866565465927, + "loss/logits": 0.8357069045305252, + "step": 48960 + }, + { + "epoch": 0.4897, + "grad_norm": 14.875, + "grad_norm_var": 0.4479166666666667, + "learning_rate": 0.0003, + "loss": 10.908, + "loss/aux_loss": 0.04807532802224159, + "loss/crossentropy": 2.716035795211792, + "loss/logits": 0.8380373746156693, + "step": 48970 + }, + { + "epoch": 0.4898, + "grad_norm": 20.375, + "grad_norm_var": 2.3684895833333335, + "learning_rate": 0.0003, + "loss": 11.1729, + "loss/aux_loss": 0.04806866105645895, + "loss/crossentropy": 2.523072302341461, + "loss/logits": 0.809576940536499, + "step": 48980 + }, + { + "epoch": 0.4899, + "grad_norm": 14.5625, + "grad_norm_var": 2.4712890625, + "learning_rate": 0.0003, + "loss": 10.9139, + "loss/aux_loss": 0.0480740724131465, + "loss/crossentropy": 2.5105146706104278, + "loss/logits": 0.8070461362600326, + "step": 48990 + }, + { + "epoch": 0.49, + "grad_norm": 14.125, + "grad_norm_var": 0.3726399739583333, + "learning_rate": 0.0003, + "loss": 10.9705, + "loss/aux_loss": 0.048071536049246785, + "loss/crossentropy": 2.8065383076667785, + "loss/logits": 0.8255183070898056, + "step": 49000 + }, + { + "epoch": 0.4901, + "grad_norm": 14.9375, + "grad_norm_var": 0.33917643229166666, + "learning_rate": 0.0003, + "loss": 11.1304, + "loss/aux_loss": 0.048070686869323254, + "loss/crossentropy": 2.783307147026062, + "loss/logits": 0.817973655462265, + "step": 49010 + }, + { + "epoch": 0.4902, + "grad_norm": 13.25, + "grad_norm_var": 0.7018229166666666, + "learning_rate": 0.0003, + "loss": 11.1179, + "loss/aux_loss": 0.048075612261891366, + "loss/crossentropy": 2.7615358352661135, + "loss/logits": 0.8785318732261658, + "step": 49020 + }, + { + "epoch": 0.4903, + "grad_norm": 14.0, + "grad_norm_var": 0.8627604166666667, + "learning_rate": 0.0003, + "loss": 11.1486, + "loss/aux_loss": 0.04808089416474104, + "loss/crossentropy": 2.7252781689167023, + "loss/logits": 0.8219867736101151, + "step": 49030 + }, + { + "epoch": 0.4904, + "grad_norm": 15.0625, + "grad_norm_var": 0.7822916666666667, + "learning_rate": 0.0003, + "loss": 10.9766, + "loss/aux_loss": 0.048066638968884946, + "loss/crossentropy": 2.6021959662437437, + "loss/logits": 0.8107183337211609, + "step": 49040 + }, + { + "epoch": 0.4905, + "grad_norm": 14.25, + "grad_norm_var": 0.6675618489583334, + "learning_rate": 0.0003, + "loss": 11.0778, + "loss/aux_loss": 0.04807810839265585, + "loss/crossentropy": 2.7399057030677794, + "loss/logits": 0.8506060719490052, + "step": 49050 + }, + { + "epoch": 0.4906, + "grad_norm": 14.0625, + "grad_norm_var": 0.299853515625, + "learning_rate": 0.0003, + "loss": 10.9571, + "loss/aux_loss": 0.04807314686477184, + "loss/crossentropy": 2.8067448258399965, + "loss/logits": 0.8424109250307084, + "step": 49060 + }, + { + "epoch": 0.4907, + "grad_norm": 15.125, + "grad_norm_var": 0.6353515625, + "learning_rate": 0.0003, + "loss": 11.0461, + "loss/aux_loss": 0.048080555908381936, + "loss/crossentropy": 2.6311775505542756, + "loss/logits": 0.8175216227769851, + "step": 49070 + }, + { + "epoch": 0.4908, + "grad_norm": 13.125, + "grad_norm_var": 1538.88359375, + "learning_rate": 0.0003, + "loss": 11.1342, + "loss/aux_loss": 0.04807594697922468, + "loss/crossentropy": 2.646185690164566, + "loss/logits": 0.8125636070966721, + "step": 49080 + }, + { + "epoch": 0.4909, + "grad_norm": 13.75, + "grad_norm_var": 0.865087890625, + "learning_rate": 0.0003, + "loss": 11.1334, + "loss/aux_loss": 0.04806858729571104, + "loss/crossentropy": 2.7989614844322204, + "loss/logits": 0.825323560833931, + "step": 49090 + }, + { + "epoch": 0.491, + "grad_norm": 14.75, + "grad_norm_var": 0.31197916666666664, + "learning_rate": 0.0003, + "loss": 11.1018, + "loss/aux_loss": 0.04807522725313902, + "loss/crossentropy": 2.722189944982529, + "loss/logits": 0.8272465378046036, + "step": 49100 + }, + { + "epoch": 0.4911, + "grad_norm": 15.3125, + "grad_norm_var": 0.37180989583333335, + "learning_rate": 0.0003, + "loss": 11.0668, + "loss/aux_loss": 0.048069358244538306, + "loss/crossentropy": 2.7910789966583254, + "loss/logits": 0.8452953428030014, + "step": 49110 + }, + { + "epoch": 0.4912, + "grad_norm": 14.5625, + "grad_norm_var": 0.38430989583333336, + "learning_rate": 0.0003, + "loss": 11.0669, + "loss/aux_loss": 0.04807008057832718, + "loss/crossentropy": 2.660131776332855, + "loss/logits": 0.8045534908771514, + "step": 49120 + }, + { + "epoch": 0.4913, + "grad_norm": 13.875, + "grad_norm_var": 1.168603515625, + "learning_rate": 0.0003, + "loss": 10.9811, + "loss/aux_loss": 0.048069944977760314, + "loss/crossentropy": 2.664647787809372, + "loss/logits": 0.8101317912340165, + "step": 49130 + }, + { + "epoch": 0.4914, + "grad_norm": 14.875, + "grad_norm_var": 1.0976399739583333, + "learning_rate": 0.0003, + "loss": 11.0029, + "loss/aux_loss": 0.04808374773710966, + "loss/crossentropy": 2.6824153780937197, + "loss/logits": 0.8375868052244186, + "step": 49140 + }, + { + "epoch": 0.4915, + "grad_norm": 14.0625, + "grad_norm_var": 1.1554524739583334, + "learning_rate": 0.0003, + "loss": 11.1305, + "loss/aux_loss": 0.04806366134434938, + "loss/crossentropy": 2.7819727063179016, + "loss/logits": 0.848738157749176, + "step": 49150 + }, + { + "epoch": 0.4916, + "grad_norm": 15.8125, + "grad_norm_var": 0.650634765625, + "learning_rate": 0.0003, + "loss": 11.2014, + "loss/aux_loss": 0.048080661334097385, + "loss/crossentropy": 2.6901016354560854, + "loss/logits": 0.850712725520134, + "step": 49160 + }, + { + "epoch": 0.4917, + "grad_norm": 14.1875, + "grad_norm_var": 7.600634765625, + "learning_rate": 0.0003, + "loss": 11.1302, + "loss/aux_loss": 0.04806759636849165, + "loss/crossentropy": 2.750749206542969, + "loss/logits": 0.8357772469520569, + "step": 49170 + }, + { + "epoch": 0.4918, + "grad_norm": 16.625, + "grad_norm_var": 15.297330729166667, + "learning_rate": 0.0003, + "loss": 11.0257, + "loss/aux_loss": 0.048100476153194904, + "loss/crossentropy": 2.585780292749405, + "loss/logits": 0.7976143449544907, + "step": 49180 + }, + { + "epoch": 0.4919, + "grad_norm": 16.875, + "grad_norm_var": 8.941259765625, + "learning_rate": 0.0003, + "loss": 11.1719, + "loss/aux_loss": 0.048067899979650976, + "loss/crossentropy": 2.7343214392662047, + "loss/logits": 0.835987788438797, + "step": 49190 + }, + { + "epoch": 0.492, + "grad_norm": 15.375, + "grad_norm_var": 0.9044270833333333, + "learning_rate": 0.0003, + "loss": 11.0691, + "loss/aux_loss": 0.04807320646941662, + "loss/crossentropy": 2.6477014422416687, + "loss/logits": 0.8445640474557876, + "step": 49200 + }, + { + "epoch": 0.4921, + "grad_norm": 14.6875, + "grad_norm_var": 0.6184733072916667, + "learning_rate": 0.0003, + "loss": 11.0543, + "loss/aux_loss": 0.048070452734828, + "loss/crossentropy": 2.7898465573787687, + "loss/logits": 0.8426847785711289, + "step": 49210 + }, + { + "epoch": 0.4922, + "grad_norm": 16.75, + "grad_norm_var": 0.6379557291666667, + "learning_rate": 0.0003, + "loss": 11.094, + "loss/aux_loss": 0.04807724431157112, + "loss/crossentropy": 2.6970956563949584, + "loss/logits": 0.8135352551937103, + "step": 49220 + }, + { + "epoch": 0.4923, + "grad_norm": 16.125, + "grad_norm_var": 1.0978515625, + "learning_rate": 0.0003, + "loss": 11.1644, + "loss/aux_loss": 0.048074241168797015, + "loss/crossentropy": 2.668863868713379, + "loss/logits": 0.803708478808403, + "step": 49230 + }, + { + "epoch": 0.4924, + "grad_norm": 14.25, + "grad_norm_var": 0.9801432291666666, + "learning_rate": 0.0003, + "loss": 11.1072, + "loss/aux_loss": 0.04808567836880684, + "loss/crossentropy": 2.7105720579624175, + "loss/logits": 0.8392592817544937, + "step": 49240 + }, + { + "epoch": 0.4925, + "grad_norm": 14.0625, + "grad_norm_var": 0.61328125, + "learning_rate": 0.0003, + "loss": 11.1155, + "loss/aux_loss": 0.04807297699153423, + "loss/crossentropy": 2.7127108812332152, + "loss/logits": 0.8241129338741302, + "step": 49250 + }, + { + "epoch": 0.4926, + "grad_norm": 13.875, + "grad_norm_var": 1.4054524739583334, + "learning_rate": 0.0003, + "loss": 10.8193, + "loss/aux_loss": 0.04807514287531376, + "loss/crossentropy": 2.4936522424221037, + "loss/logits": 0.7675803631544114, + "step": 49260 + }, + { + "epoch": 0.4927, + "grad_norm": 14.4375, + "grad_norm_var": 0.5218098958333334, + "learning_rate": 0.0003, + "loss": 11.0741, + "loss/aux_loss": 0.0480735182762146, + "loss/crossentropy": 2.5747238457202912, + "loss/logits": 0.825323086977005, + "step": 49270 + }, + { + "epoch": 0.4928, + "grad_norm": 14.75, + "grad_norm_var": 0.5516764322916666, + "learning_rate": 0.0003, + "loss": 10.9145, + "loss/aux_loss": 0.048077659122645854, + "loss/crossentropy": 2.6410838782787325, + "loss/logits": 0.7915389269590378, + "step": 49280 + }, + { + "epoch": 0.4929, + "grad_norm": 14.25, + "grad_norm_var": 1.9891764322916667, + "learning_rate": 0.0003, + "loss": 11.1417, + "loss/aux_loss": 0.04807187095284462, + "loss/crossentropy": 2.687062478065491, + "loss/logits": 0.842128136754036, + "step": 49290 + }, + { + "epoch": 0.493, + "grad_norm": 14.5625, + "grad_norm_var": 1.7817545572916667, + "learning_rate": 0.0003, + "loss": 11.1977, + "loss/aux_loss": 0.048072554357349875, + "loss/crossentropy": 2.6798668265342713, + "loss/logits": 0.8448872178792953, + "step": 49300 + }, + { + "epoch": 0.4931, + "grad_norm": 15.625, + "grad_norm_var": 2.8541666666666665, + "learning_rate": 0.0003, + "loss": 11.2154, + "loss/aux_loss": 0.04807398393750191, + "loss/crossentropy": 2.708115738630295, + "loss/logits": 0.8346493154764175, + "step": 49310 + }, + { + "epoch": 0.4932, + "grad_norm": 15.4375, + "grad_norm_var": 3.0263020833333334, + "learning_rate": 0.0003, + "loss": 11.0153, + "loss/aux_loss": 0.04807023722678423, + "loss/crossentropy": 2.7580654978752137, + "loss/logits": 0.8456075847148895, + "step": 49320 + }, + { + "epoch": 0.4933, + "grad_norm": 14.25, + "grad_norm_var": 0.3138020833333333, + "learning_rate": 0.0003, + "loss": 10.9872, + "loss/aux_loss": 0.04807710256427526, + "loss/crossentropy": 2.983762502670288, + "loss/logits": 0.8636047869920731, + "step": 49330 + }, + { + "epoch": 0.4934, + "grad_norm": 14.8125, + "grad_norm_var": 1.1015462239583333, + "learning_rate": 0.0003, + "loss": 10.9637, + "loss/aux_loss": 0.04806807395070791, + "loss/crossentropy": 2.467972230911255, + "loss/logits": 0.7947597026824951, + "step": 49340 + }, + { + "epoch": 0.4935, + "grad_norm": 13.1875, + "grad_norm_var": 0.37628580729166666, + "learning_rate": 0.0003, + "loss": 11.0743, + "loss/aux_loss": 0.04806096330285072, + "loss/crossentropy": 2.7554059624671936, + "loss/logits": 0.8382692068815232, + "step": 49350 + }, + { + "epoch": 0.4936, + "grad_norm": 14.4375, + "grad_norm_var": 0.47708333333333336, + "learning_rate": 0.0003, + "loss": 10.9667, + "loss/aux_loss": 0.04808943476527929, + "loss/crossentropy": 2.6665013074874877, + "loss/logits": 0.8158300817012787, + "step": 49360 + }, + { + "epoch": 0.4937, + "grad_norm": 14.5, + "grad_norm_var": 0.49152018229166666, + "learning_rate": 0.0003, + "loss": 10.8597, + "loss/aux_loss": 0.0480616694316268, + "loss/crossentropy": 2.788412946462631, + "loss/logits": 0.8223504841327667, + "step": 49370 + }, + { + "epoch": 0.4938, + "grad_norm": 13.375, + "grad_norm_var": 0.46087239583333334, + "learning_rate": 0.0003, + "loss": 11.2213, + "loss/aux_loss": 0.04808288011699915, + "loss/crossentropy": 2.643638551235199, + "loss/logits": 0.8171029478311539, + "step": 49380 + }, + { + "epoch": 0.4939, + "grad_norm": 14.4375, + "grad_norm_var": 0.3973795572916667, + "learning_rate": 0.0003, + "loss": 10.9418, + "loss/aux_loss": 0.04807625114917755, + "loss/crossentropy": 2.7338321208953857, + "loss/logits": 0.8375842243432998, + "step": 49390 + }, + { + "epoch": 0.494, + "grad_norm": 14.6875, + "grad_norm_var": 2.592431640625, + "learning_rate": 0.0003, + "loss": 11.3181, + "loss/aux_loss": 0.04807534031569958, + "loss/crossentropy": 2.6576287031173704, + "loss/logits": 0.8461039811372757, + "step": 49400 + }, + { + "epoch": 0.4941, + "grad_norm": 14.1875, + "grad_norm_var": 0.3700358072916667, + "learning_rate": 0.0003, + "loss": 10.883, + "loss/aux_loss": 0.04808159098029137, + "loss/crossentropy": 2.8762070536613464, + "loss/logits": 0.8369950473308563, + "step": 49410 + }, + { + "epoch": 0.4942, + "grad_norm": 14.25, + "grad_norm_var": 0.33326822916666665, + "learning_rate": 0.0003, + "loss": 11.0212, + "loss/aux_loss": 0.04807743299752474, + "loss/crossentropy": 2.612814891338348, + "loss/logits": 0.7918889284133911, + "step": 49420 + }, + { + "epoch": 0.4943, + "grad_norm": 14.8125, + "grad_norm_var": 6.32265625, + "learning_rate": 0.0003, + "loss": 11.0833, + "loss/aux_loss": 0.04807397872209549, + "loss/crossentropy": 2.7480882346630096, + "loss/logits": 0.837815847992897, + "step": 49430 + }, + { + "epoch": 0.4944, + "grad_norm": 14.5625, + "grad_norm_var": 5.894905598958333, + "learning_rate": 0.0003, + "loss": 11.1483, + "loss/aux_loss": 0.04806387610733509, + "loss/crossentropy": 2.7095935344696045, + "loss/logits": 0.8372041195631027, + "step": 49440 + }, + { + "epoch": 0.4945, + "grad_norm": 15.5625, + "grad_norm_var": 0.7262858072916667, + "learning_rate": 0.0003, + "loss": 10.9738, + "loss/aux_loss": 0.048079794831573965, + "loss/crossentropy": 2.595053482055664, + "loss/logits": 0.8303221762180328, + "step": 49450 + }, + { + "epoch": 0.4946, + "grad_norm": 15.0, + "grad_norm_var": 0.693212890625, + "learning_rate": 0.0003, + "loss": 11.1594, + "loss/aux_loss": 0.04807852059602737, + "loss/crossentropy": 2.8263864398002623, + "loss/logits": 0.8446054220199585, + "step": 49460 + }, + { + "epoch": 0.4947, + "grad_norm": 15.1875, + "grad_norm_var": 0.38865559895833335, + "learning_rate": 0.0003, + "loss": 11.0476, + "loss/aux_loss": 0.04807306993752718, + "loss/crossentropy": 2.781277060508728, + "loss/logits": 0.8279220938682557, + "step": 49470 + }, + { + "epoch": 0.4948, + "grad_norm": 13.25, + "grad_norm_var": 0.8374837239583334, + "learning_rate": 0.0003, + "loss": 11.0972, + "loss/aux_loss": 0.0480706974864006, + "loss/crossentropy": 2.7130113363265993, + "loss/logits": 0.8263016819953919, + "step": 49480 + }, + { + "epoch": 0.4949, + "grad_norm": 15.375, + "grad_norm_var": 0.3653645833333333, + "learning_rate": 0.0003, + "loss": 11.1089, + "loss/aux_loss": 0.04807698726654053, + "loss/crossentropy": 2.870068061351776, + "loss/logits": 0.8862683087587356, + "step": 49490 + }, + { + "epoch": 0.495, + "grad_norm": 14.625, + "grad_norm_var": 0.790625, + "learning_rate": 0.0003, + "loss": 11.0814, + "loss/aux_loss": 0.04808134399354458, + "loss/crossentropy": 2.724984419345856, + "loss/logits": 0.8414475739002227, + "step": 49500 + }, + { + "epoch": 0.4951, + "grad_norm": 15.0625, + "grad_norm_var": 0.5382649739583333, + "learning_rate": 0.0003, + "loss": 11.0471, + "loss/aux_loss": 0.04806778896600008, + "loss/crossentropy": 2.652865248918533, + "loss/logits": 0.8197323232889175, + "step": 49510 + }, + { + "epoch": 0.4952, + "grad_norm": 14.625, + "grad_norm_var": 0.4546712239583333, + "learning_rate": 0.0003, + "loss": 11.051, + "loss/aux_loss": 0.04808030594140291, + "loss/crossentropy": 2.7049909591674806, + "loss/logits": 0.8460562914609909, + "step": 49520 + }, + { + "epoch": 0.4953, + "grad_norm": 14.5625, + "grad_norm_var": 1.0590983072916667, + "learning_rate": 0.0003, + "loss": 10.8739, + "loss/aux_loss": 0.0480698561295867, + "loss/crossentropy": 2.702311968803406, + "loss/logits": 0.8319470345973968, + "step": 49530 + }, + { + "epoch": 0.4954, + "grad_norm": 14.6875, + "grad_norm_var": 1.3048014322916666, + "learning_rate": 0.0003, + "loss": 11.0651, + "loss/aux_loss": 0.048078288696706294, + "loss/crossentropy": 2.6818348348140715, + "loss/logits": 0.8396122336387635, + "step": 49540 + }, + { + "epoch": 0.4955, + "grad_norm": 14.0625, + "grad_norm_var": 0.73203125, + "learning_rate": 0.0003, + "loss": 11.0647, + "loss/aux_loss": 0.04807879626750946, + "loss/crossentropy": 2.7367110908031465, + "loss/logits": 0.824239781498909, + "step": 49550 + }, + { + "epoch": 0.4956, + "grad_norm": 14.9375, + "grad_norm_var": 0.648291015625, + "learning_rate": 0.0003, + "loss": 10.9588, + "loss/aux_loss": 0.04806343484669924, + "loss/crossentropy": 2.6144358277320863, + "loss/logits": 0.8504854917526246, + "step": 49560 + }, + { + "epoch": 0.4957, + "grad_norm": 14.5, + "grad_norm_var": 0.9452473958333333, + "learning_rate": 0.0003, + "loss": 11.0769, + "loss/aux_loss": 0.0480787593871355, + "loss/crossentropy": 2.649821126461029, + "loss/logits": 0.8459902018308639, + "step": 49570 + }, + { + "epoch": 0.4958, + "grad_norm": 15.375, + "grad_norm_var": 0.488134765625, + "learning_rate": 0.0003, + "loss": 11.1721, + "loss/aux_loss": 0.04807934109121561, + "loss/crossentropy": 2.710836374759674, + "loss/logits": 0.8465037196874619, + "step": 49580 + }, + { + "epoch": 0.4959, + "grad_norm": 14.25, + "grad_norm_var": 0.59921875, + "learning_rate": 0.0003, + "loss": 10.9767, + "loss/aux_loss": 0.04806499667465687, + "loss/crossentropy": 2.8792532682418823, + "loss/logits": 0.8230845898389816, + "step": 49590 + }, + { + "epoch": 0.496, + "grad_norm": 16.25, + "grad_norm_var": 1.7556640625, + "learning_rate": 0.0003, + "loss": 11.0328, + "loss/aux_loss": 0.04807982686907053, + "loss/crossentropy": 2.6688818752765657, + "loss/logits": 0.8213403493165969, + "step": 49600 + }, + { + "epoch": 0.4961, + "grad_norm": 13.375, + "grad_norm_var": 1.2947265625, + "learning_rate": 0.0003, + "loss": 11.1903, + "loss/aux_loss": 0.04807685986161232, + "loss/crossentropy": 2.7829610109329224, + "loss/logits": 0.8443383306264878, + "step": 49610 + }, + { + "epoch": 0.4962, + "grad_norm": 14.0, + "grad_norm_var": 0.4874348958333333, + "learning_rate": 0.0003, + "loss": 10.9827, + "loss/aux_loss": 0.048072018660604954, + "loss/crossentropy": 2.6003858983516692, + "loss/logits": 0.823526531457901, + "step": 49620 + }, + { + "epoch": 0.4963, + "grad_norm": 14.375, + "grad_norm_var": 0.306494140625, + "learning_rate": 0.0003, + "loss": 11.0844, + "loss/aux_loss": 0.04806656241416931, + "loss/crossentropy": 2.573668730258942, + "loss/logits": 0.818437111377716, + "step": 49630 + }, + { + "epoch": 0.4964, + "grad_norm": 14.4375, + "grad_norm_var": 0.44073893229166666, + "learning_rate": 0.0003, + "loss": 10.9656, + "loss/aux_loss": 0.04807706866413355, + "loss/crossentropy": 2.759202075004578, + "loss/logits": 0.840299728512764, + "step": 49640 + }, + { + "epoch": 0.4965, + "grad_norm": 14.375, + "grad_norm_var": 0.4557291666666667, + "learning_rate": 0.0003, + "loss": 11.0086, + "loss/aux_loss": 0.0480681087821722, + "loss/crossentropy": 2.6497737407684325, + "loss/logits": 0.8393264710903168, + "step": 49650 + }, + { + "epoch": 0.4966, + "grad_norm": 14.0, + "grad_norm_var": 0.43474934895833334, + "learning_rate": 0.0003, + "loss": 11.1813, + "loss/aux_loss": 0.048070674762129784, + "loss/crossentropy": 2.611664170026779, + "loss/logits": 0.8550961494445801, + "step": 49660 + }, + { + "epoch": 0.4967, + "grad_norm": 15.375, + "grad_norm_var": 10.99765625, + "learning_rate": 0.0003, + "loss": 11.1897, + "loss/aux_loss": 0.048074728436768056, + "loss/crossentropy": 2.723458409309387, + "loss/logits": 0.842160576581955, + "step": 49670 + }, + { + "epoch": 0.4968, + "grad_norm": 15.625, + "grad_norm_var": 0.83046875, + "learning_rate": 0.0003, + "loss": 11.0535, + "loss/aux_loss": 0.048070961609482765, + "loss/crossentropy": 2.845665168762207, + "loss/logits": 0.8630867063999176, + "step": 49680 + }, + { + "epoch": 0.4969, + "grad_norm": 15.5, + "grad_norm_var": 3.896614583333333, + "learning_rate": 0.0003, + "loss": 11.1466, + "loss/aux_loss": 0.048077776283025744, + "loss/crossentropy": 2.748256707191467, + "loss/logits": 0.8359945237636566, + "step": 49690 + }, + { + "epoch": 0.497, + "grad_norm": 14.25, + "grad_norm_var": 1.504931640625, + "learning_rate": 0.0003, + "loss": 11.1832, + "loss/aux_loss": 0.04807632230222225, + "loss/crossentropy": 2.6894050359725954, + "loss/logits": 0.8393000155687332, + "step": 49700 + }, + { + "epoch": 0.4971, + "grad_norm": 15.3125, + "grad_norm_var": 1.5921223958333333, + "learning_rate": 0.0003, + "loss": 11.2199, + "loss/aux_loss": 0.04806073512881994, + "loss/crossentropy": 2.835462886095047, + "loss/logits": 0.8458648949861527, + "step": 49710 + }, + { + "epoch": 0.4972, + "grad_norm": 15.3125, + "grad_norm_var": 0.45078125, + "learning_rate": 0.0003, + "loss": 11.07, + "loss/aux_loss": 0.048081550374627115, + "loss/crossentropy": 2.640725481510162, + "loss/logits": 0.8050734728574753, + "step": 49720 + }, + { + "epoch": 0.4973, + "grad_norm": 14.8125, + "grad_norm_var": 0.299853515625, + "learning_rate": 0.0003, + "loss": 10.9709, + "loss/aux_loss": 0.04808553606271744, + "loss/crossentropy": 2.663204771280289, + "loss/logits": 0.7960788905620575, + "step": 49730 + }, + { + "epoch": 0.4974, + "grad_norm": 15.25, + "grad_norm_var": 60.1416015625, + "learning_rate": 0.0003, + "loss": 11.0414, + "loss/aux_loss": 0.04805216509848833, + "loss/crossentropy": 2.5938608229160307, + "loss/logits": 0.8149016201496124, + "step": 49740 + }, + { + "epoch": 0.4975, + "grad_norm": 13.875, + "grad_norm_var": 59.84060872395833, + "learning_rate": 0.0003, + "loss": 11.2352, + "loss/aux_loss": 0.048083586245775224, + "loss/crossentropy": 2.6643555045127867, + "loss/logits": 0.815577107667923, + "step": 49750 + }, + { + "epoch": 0.4976, + "grad_norm": 14.6875, + "grad_norm_var": 0.23645833333333333, + "learning_rate": 0.0003, + "loss": 10.9069, + "loss/aux_loss": 0.048074550181627276, + "loss/crossentropy": 2.618529570102692, + "loss/logits": 0.8274006098508835, + "step": 49760 + }, + { + "epoch": 0.4977, + "grad_norm": 14.8125, + "grad_norm_var": 0.4942708333333333, + "learning_rate": 0.0003, + "loss": 11.1073, + "loss/aux_loss": 0.04806795679032803, + "loss/crossentropy": 2.694086503982544, + "loss/logits": 0.8305206030607224, + "step": 49770 + }, + { + "epoch": 0.4978, + "grad_norm": 14.25, + "grad_norm_var": 0.37107747395833335, + "learning_rate": 0.0003, + "loss": 11.1821, + "loss/aux_loss": 0.048074840754270556, + "loss/crossentropy": 2.715044713020325, + "loss/logits": 0.8461979001760482, + "step": 49780 + }, + { + "epoch": 0.4979, + "grad_norm": 14.5, + "grad_norm_var": 0.7588541666666667, + "learning_rate": 0.0003, + "loss": 11.1215, + "loss/aux_loss": 0.048070978559553626, + "loss/crossentropy": 2.643089586496353, + "loss/logits": 0.806346595287323, + "step": 49790 + }, + { + "epoch": 0.498, + "grad_norm": 28.625, + "grad_norm_var": 12.72578125, + "learning_rate": 0.0003, + "loss": 11.1285, + "loss/aux_loss": 0.048064558580517766, + "loss/crossentropy": 2.699583125114441, + "loss/logits": 0.8594643115997315, + "step": 49800 + }, + { + "epoch": 0.4981, + "grad_norm": 14.6875, + "grad_norm_var": 13.309309895833334, + "learning_rate": 0.0003, + "loss": 11.0075, + "loss/aux_loss": 0.04808203764259815, + "loss/crossentropy": 2.7328962683677673, + "loss/logits": 0.8031840980052948, + "step": 49810 + }, + { + "epoch": 0.4982, + "grad_norm": 14.6875, + "grad_norm_var": 0.5218098958333334, + "learning_rate": 0.0003, + "loss": 11.2001, + "loss/aux_loss": 0.04806886278092861, + "loss/crossentropy": 2.808696722984314, + "loss/logits": 0.8115378528833389, + "step": 49820 + }, + { + "epoch": 0.4983, + "grad_norm": 15.0, + "grad_norm_var": 0.32745768229166666, + "learning_rate": 0.0003, + "loss": 10.9526, + "loss/aux_loss": 0.04807125814259052, + "loss/crossentropy": 2.6571006894111635, + "loss/logits": 0.8229701191186904, + "step": 49830 + }, + { + "epoch": 0.4984, + "grad_norm": 13.5625, + "grad_norm_var": 0.766259765625, + "learning_rate": 0.0003, + "loss": 11.1019, + "loss/aux_loss": 0.04808720909059048, + "loss/crossentropy": 2.7945067286491394, + "loss/logits": 0.8268558502197265, + "step": 49840 + }, + { + "epoch": 0.4985, + "grad_norm": 15.375, + "grad_norm_var": 0.4476399739583333, + "learning_rate": 0.0003, + "loss": 10.9448, + "loss/aux_loss": 0.04806675110012293, + "loss/crossentropy": 2.5788372695446014, + "loss/logits": 0.7906678229570389, + "step": 49850 + }, + { + "epoch": 0.4986, + "grad_norm": 14.8125, + "grad_norm_var": 0.4019368489583333, + "learning_rate": 0.0003, + "loss": 11.1355, + "loss/aux_loss": 0.04806358329951763, + "loss/crossentropy": 2.6663641929626465, + "loss/logits": 0.8542564064264297, + "step": 49860 + }, + { + "epoch": 0.4987, + "grad_norm": 13.875, + "grad_norm_var": 0.32864583333333336, + "learning_rate": 0.0003, + "loss": 11.111, + "loss/aux_loss": 0.04807950519025326, + "loss/crossentropy": 2.5420661509037017, + "loss/logits": 0.8125041216611862, + "step": 49870 + }, + { + "epoch": 0.4988, + "grad_norm": 14.9375, + "grad_norm_var": 0.3811848958333333, + "learning_rate": 0.0003, + "loss": 11.1871, + "loss/aux_loss": 0.04807094018906355, + "loss/crossentropy": 2.7237226247787474, + "loss/logits": 0.8434429466724396, + "step": 49880 + }, + { + "epoch": 0.4989, + "grad_norm": 13.5625, + "grad_norm_var": 0.6645670572916667, + "learning_rate": 0.0003, + "loss": 11.0105, + "loss/aux_loss": 0.0480699697509408, + "loss/crossentropy": 2.797856557369232, + "loss/logits": 0.8193605899810791, + "step": 49890 + }, + { + "epoch": 0.499, + "grad_norm": 15.0625, + "grad_norm_var": 0.38800455729166666, + "learning_rate": 0.0003, + "loss": 11.002, + "loss/aux_loss": 0.048073521442711355, + "loss/crossentropy": 2.7964406251907348, + "loss/logits": 0.8461063802242279, + "step": 49900 + }, + { + "epoch": 0.4991, + "grad_norm": 14.0625, + "grad_norm_var": 0.3094889322916667, + "learning_rate": 0.0003, + "loss": 11.2589, + "loss/aux_loss": 0.04806688260287047, + "loss/crossentropy": 2.6836509346961974, + "loss/logits": 0.821695277094841, + "step": 49910 + }, + { + "epoch": 0.4992, + "grad_norm": 15.125, + "grad_norm_var": 0.240478515625, + "learning_rate": 0.0003, + "loss": 11.0785, + "loss/aux_loss": 0.048078537732362744, + "loss/crossentropy": 2.5942283451557158, + "loss/logits": 0.8679295003414154, + "step": 49920 + }, + { + "epoch": 0.4993, + "grad_norm": 15.0, + "grad_norm_var": 0.33058268229166665, + "learning_rate": 0.0003, + "loss": 11.1418, + "loss/aux_loss": 0.0480769170448184, + "loss/crossentropy": 2.7477990865707396, + "loss/logits": 0.8367729008197784, + "step": 49930 + }, + { + "epoch": 0.4994, + "grad_norm": 14.5625, + "grad_norm_var": 0.8858723958333333, + "learning_rate": 0.0003, + "loss": 10.927, + "loss/aux_loss": 0.04806565400213003, + "loss/crossentropy": 2.6473158240318297, + "loss/logits": 0.8119089126586914, + "step": 49940 + }, + { + "epoch": 0.4995, + "grad_norm": 14.9375, + "grad_norm_var": 0.5980305989583333, + "learning_rate": 0.0003, + "loss": 11.1737, + "loss/aux_loss": 0.04807810839265585, + "loss/crossentropy": 2.774595522880554, + "loss/logits": 0.8349743068218232, + "step": 49950 + }, + { + "epoch": 0.4996, + "grad_norm": 15.1875, + "grad_norm_var": 0.6206868489583334, + "learning_rate": 0.0003, + "loss": 11.195, + "loss/aux_loss": 0.04806303158402443, + "loss/crossentropy": 2.6357213258743286, + "loss/logits": 0.8456666976213455, + "step": 49960 + }, + { + "epoch": 0.4997, + "grad_norm": 15.1875, + "grad_norm_var": 0.9304524739583333, + "learning_rate": 0.0003, + "loss": 10.9898, + "loss/aux_loss": 0.048077212646603584, + "loss/crossentropy": 2.6961658537387847, + "loss/logits": 0.8146316468715668, + "step": 49970 + }, + { + "epoch": 0.4998, + "grad_norm": 13.25, + "grad_norm_var": 0.326025390625, + "learning_rate": 0.0003, + "loss": 11.0497, + "loss/aux_loss": 0.0480698449537158, + "loss/crossentropy": 2.809601533412933, + "loss/logits": 0.8374509602785111, + "step": 49980 + }, + { + "epoch": 0.4999, + "grad_norm": 13.75, + "grad_norm_var": 0.523291015625, + "learning_rate": 0.0003, + "loss": 11.1053, + "loss/aux_loss": 0.0480751309543848, + "loss/crossentropy": 2.734011006355286, + "loss/logits": 0.8388842344284058, + "step": 49990 + }, + { + "epoch": 0.5, + "grad_norm": 15.5, + "grad_norm_var": 0.29791666666666666, + "learning_rate": 0.0003, + "loss": 11.0715, + "loss/aux_loss": 0.04807236734777689, + "loss/crossentropy": 2.8549141943454743, + "loss/logits": 0.8532051771879197, + "step": 50000 + }, + { + "epoch": 0.5001, + "grad_norm": 15.375, + "grad_norm_var": 0.7484212239583333, + "learning_rate": 0.0003, + "loss": 11.1863, + "loss/aux_loss": 0.0480760183185339, + "loss/crossentropy": 2.526221138238907, + "loss/logits": 0.8279460847377778, + "step": 50010 + }, + { + "epoch": 0.5002, + "grad_norm": 14.75, + "grad_norm_var": 0.8884765625, + "learning_rate": 0.0003, + "loss": 11.0844, + "loss/aux_loss": 0.04807603172957897, + "loss/crossentropy": 2.786004549264908, + "loss/logits": 0.8343773394823074, + "step": 50020 + }, + { + "epoch": 0.5003, + "grad_norm": 14.75, + "grad_norm_var": 0.40234375, + "learning_rate": 0.0003, + "loss": 11.1362, + "loss/aux_loss": 0.04807902593165636, + "loss/crossentropy": 2.614196312427521, + "loss/logits": 0.8386217921972274, + "step": 50030 + }, + { + "epoch": 0.5004, + "grad_norm": 14.25, + "grad_norm_var": 0.4337076822916667, + "learning_rate": 0.0003, + "loss": 11.0241, + "loss/aux_loss": 0.048064802400767805, + "loss/crossentropy": 2.7048224210739136, + "loss/logits": 0.8228438705205917, + "step": 50040 + }, + { + "epoch": 0.5005, + "grad_norm": 15.4375, + "grad_norm_var": 188.100244140625, + "learning_rate": 0.0003, + "loss": 11.1056, + "loss/aux_loss": 0.04807979743927717, + "loss/crossentropy": 2.6778744578361513, + "loss/logits": 0.8488940119743347, + "step": 50050 + }, + { + "epoch": 0.5006, + "grad_norm": 16.875, + "grad_norm_var": 2.1015625, + "learning_rate": 0.0003, + "loss": 11.0635, + "loss/aux_loss": 0.0480663301423192, + "loss/crossentropy": 2.6144404113292694, + "loss/logits": 0.7824886530637741, + "step": 50060 + }, + { + "epoch": 0.5007, + "grad_norm": 16.5, + "grad_norm_var": 0.9775390625, + "learning_rate": 0.0003, + "loss": 11.13, + "loss/aux_loss": 0.048070177994668485, + "loss/crossentropy": 2.738340699672699, + "loss/logits": 0.823601758480072, + "step": 50070 + }, + { + "epoch": 0.5008, + "grad_norm": 13.0, + "grad_norm_var": 0.8587076822916667, + "learning_rate": 0.0003, + "loss": 11.0701, + "loss/aux_loss": 0.0480703879147768, + "loss/crossentropy": 2.7732195377349855, + "loss/logits": 0.8485528379678726, + "step": 50080 + }, + { + "epoch": 0.5009, + "grad_norm": 15.9375, + "grad_norm_var": 0.47902018229166665, + "learning_rate": 0.0003, + "loss": 10.9742, + "loss/aux_loss": 0.04807235468178987, + "loss/crossentropy": 2.750224161148071, + "loss/logits": 0.8227509766817093, + "step": 50090 + }, + { + "epoch": 0.501, + "grad_norm": 13.0625, + "grad_norm_var": 0.8212076822916666, + "learning_rate": 0.0003, + "loss": 11.0486, + "loss/aux_loss": 0.048071041516959664, + "loss/crossentropy": 2.774020862579346, + "loss/logits": 0.8568490296602249, + "step": 50100 + }, + { + "epoch": 0.5011, + "grad_norm": 14.25, + "grad_norm_var": 0.4593587239583333, + "learning_rate": 0.0003, + "loss": 11.04, + "loss/aux_loss": 0.048077203519642356, + "loss/crossentropy": 2.6621095538139343, + "loss/logits": 0.8019068986177444, + "step": 50110 + }, + { + "epoch": 0.5012, + "grad_norm": 15.0, + "grad_norm_var": 15.922395833333333, + "learning_rate": 0.0003, + "loss": 11.2264, + "loss/aux_loss": 0.04807031415402889, + "loss/crossentropy": 2.7066645860671996, + "loss/logits": 0.8399304032325745, + "step": 50120 + }, + { + "epoch": 0.5013, + "grad_norm": 14.3125, + "grad_norm_var": 157.83645833333333, + "learning_rate": 0.0003, + "loss": 11.1837, + "loss/aux_loss": 0.048072899132966994, + "loss/crossentropy": 2.6874527156353, + "loss/logits": 0.7938042402267456, + "step": 50130 + }, + { + "epoch": 0.5014, + "grad_norm": 14.9375, + "grad_norm_var": 1.9280598958333333, + "learning_rate": 0.0003, + "loss": 11.0273, + "loss/aux_loss": 0.04807997718453407, + "loss/crossentropy": 2.8585541009902955, + "loss/logits": 0.8428541749715805, + "step": 50140 + }, + { + "epoch": 0.5015, + "grad_norm": 13.9375, + "grad_norm_var": 0.8619140625, + "learning_rate": 0.0003, + "loss": 11.1604, + "loss/aux_loss": 0.04806944746524096, + "loss/crossentropy": 2.7434488892555238, + "loss/logits": 0.809849202632904, + "step": 50150 + }, + { + "epoch": 0.5016, + "grad_norm": 14.5, + "grad_norm_var": 0.5499837239583333, + "learning_rate": 0.0003, + "loss": 10.9876, + "loss/aux_loss": 0.04807947650551796, + "loss/crossentropy": 2.6629399359226227, + "loss/logits": 0.8340989917516708, + "step": 50160 + }, + { + "epoch": 0.5017, + "grad_norm": 14.1875, + "grad_norm_var": 0.3348307291666667, + "learning_rate": 0.0003, + "loss": 10.9673, + "loss/aux_loss": 0.048072699643671515, + "loss/crossentropy": 2.620541423559189, + "loss/logits": 0.7972109645605088, + "step": 50170 + }, + { + "epoch": 0.5018, + "grad_norm": 14.9375, + "grad_norm_var": 0.36451822916666665, + "learning_rate": 0.0003, + "loss": 11.0375, + "loss/aux_loss": 0.04806852545589209, + "loss/crossentropy": 2.6545013010501863, + "loss/logits": 0.7923622548580169, + "step": 50180 + }, + { + "epoch": 0.5019, + "grad_norm": 13.8125, + "grad_norm_var": 0.321875, + "learning_rate": 0.0003, + "loss": 11.1828, + "loss/aux_loss": 0.048075834102928636, + "loss/crossentropy": 2.8539741396903993, + "loss/logits": 0.8799011826515197, + "step": 50190 + }, + { + "epoch": 0.502, + "grad_norm": 13.875, + "grad_norm_var": 0.4442545572916667, + "learning_rate": 0.0003, + "loss": 10.9104, + "loss/aux_loss": 0.04806615300476551, + "loss/crossentropy": 2.641600948572159, + "loss/logits": 0.7879884839057922, + "step": 50200 + }, + { + "epoch": 0.5021, + "grad_norm": 16.0, + "grad_norm_var": 1.0354166666666667, + "learning_rate": 0.0003, + "loss": 10.984, + "loss/aux_loss": 0.04808139093220234, + "loss/crossentropy": 2.5606437027454376, + "loss/logits": 0.8123639971017838, + "step": 50210 + }, + { + "epoch": 0.5022, + "grad_norm": 13.75, + "grad_norm_var": 0.7514973958333333, + "learning_rate": 0.0003, + "loss": 11.198, + "loss/aux_loss": 0.0480662377551198, + "loss/crossentropy": 2.697253167629242, + "loss/logits": 0.8449769735336303, + "step": 50220 + }, + { + "epoch": 0.5023, + "grad_norm": 16.375, + "grad_norm_var": 0.793994140625, + "learning_rate": 0.0003, + "loss": 11.1197, + "loss/aux_loss": 0.04807373005896807, + "loss/crossentropy": 2.7797034323215484, + "loss/logits": 0.8115016400814057, + "step": 50230 + }, + { + "epoch": 0.5024, + "grad_norm": 14.5625, + "grad_norm_var": 269.56243489583335, + "learning_rate": 0.0003, + "loss": 11.1053, + "loss/aux_loss": 0.0480750922113657, + "loss/crossentropy": 2.799087393283844, + "loss/logits": 0.8153054699301719, + "step": 50240 + }, + { + "epoch": 0.5025, + "grad_norm": 15.625, + "grad_norm_var": 267.090625, + "learning_rate": 0.0003, + "loss": 11.0676, + "loss/aux_loss": 0.04806772004812956, + "loss/crossentropy": 2.7093150496482847, + "loss/logits": 0.8238262414932251, + "step": 50250 + }, + { + "epoch": 0.5026, + "grad_norm": 16.25, + "grad_norm_var": 0.7735514322916667, + "learning_rate": 0.0003, + "loss": 11.1285, + "loss/aux_loss": 0.04807775299996138, + "loss/crossentropy": 2.690925532579422, + "loss/logits": 0.8347889751195907, + "step": 50260 + }, + { + "epoch": 0.5027, + "grad_norm": 16.5, + "grad_norm_var": 0.8484212239583333, + "learning_rate": 0.0003, + "loss": 11.0363, + "loss/aux_loss": 0.04806351810693741, + "loss/crossentropy": 2.7472257018089294, + "loss/logits": 0.8274175226688385, + "step": 50270 + }, + { + "epoch": 0.5028, + "grad_norm": 18.375, + "grad_norm_var": 233.536572265625, + "learning_rate": 0.0003, + "loss": 11.3035, + "loss/aux_loss": 0.048083963245153426, + "loss/crossentropy": 2.9165143728256226, + "loss/logits": 0.8531142026185989, + "step": 50280 + }, + { + "epoch": 0.5029, + "grad_norm": 13.875, + "grad_norm_var": 228.95670572916666, + "learning_rate": 0.0003, + "loss": 11.0395, + "loss/aux_loss": 0.048072627559304235, + "loss/crossentropy": 2.9179752588272097, + "loss/logits": 0.8547284364700317, + "step": 50290 + }, + { + "epoch": 0.503, + "grad_norm": 14.1875, + "grad_norm_var": 1.7687337239583334, + "learning_rate": 0.0003, + "loss": 11.0327, + "loss/aux_loss": 0.048072236590087414, + "loss/crossentropy": 2.6800991177558897, + "loss/logits": 0.8386821538209915, + "step": 50300 + }, + { + "epoch": 0.5031, + "grad_norm": 14.4375, + "grad_norm_var": 0.724462890625, + "learning_rate": 0.0003, + "loss": 11.1287, + "loss/aux_loss": 0.04807893894612789, + "loss/crossentropy": 2.683143067359924, + "loss/logits": 0.8309021919965744, + "step": 50310 + }, + { + "epoch": 0.5032, + "grad_norm": 13.8125, + "grad_norm_var": 0.3902180989583333, + "learning_rate": 0.0003, + "loss": 10.955, + "loss/aux_loss": 0.048069473914802076, + "loss/crossentropy": 2.6512105405330657, + "loss/logits": 0.8183946311473846, + "step": 50320 + }, + { + "epoch": 0.5033, + "grad_norm": 14.8125, + "grad_norm_var": 150.9041015625, + "learning_rate": 0.0003, + "loss": 11.2423, + "loss/aux_loss": 0.04808805175125599, + "loss/crossentropy": 2.5937359273433684, + "loss/logits": 0.8347382307052612, + "step": 50330 + }, + { + "epoch": 0.5034, + "grad_norm": 13.9375, + "grad_norm_var": 0.5128743489583333, + "learning_rate": 0.0003, + "loss": 11.0916, + "loss/aux_loss": 0.04806785080581903, + "loss/crossentropy": 2.7667890906333925, + "loss/logits": 0.8234162241220474, + "step": 50340 + }, + { + "epoch": 0.5035, + "grad_norm": 13.6875, + "grad_norm_var": 0.48020833333333335, + "learning_rate": 0.0003, + "loss": 10.9453, + "loss/aux_loss": 0.04806566257029772, + "loss/crossentropy": 2.5693565726280214, + "loss/logits": 0.8084887236356735, + "step": 50350 + }, + { + "epoch": 0.5036, + "grad_norm": 14.9375, + "grad_norm_var": 1.7186848958333334, + "learning_rate": 0.0003, + "loss": 11.1153, + "loss/aux_loss": 0.04807969201356173, + "loss/crossentropy": 2.739818775653839, + "loss/logits": 0.8364595293998718, + "step": 50360 + }, + { + "epoch": 0.5037, + "grad_norm": 14.8125, + "grad_norm_var": 1.5858723958333334, + "learning_rate": 0.0003, + "loss": 11.1912, + "loss/aux_loss": 0.04808463733643294, + "loss/crossentropy": 2.6995759308338165, + "loss/logits": 0.8050726383924485, + "step": 50370 + }, + { + "epoch": 0.5038, + "grad_norm": 13.8125, + "grad_norm_var": 0.5778645833333333, + "learning_rate": 0.0003, + "loss": 10.9566, + "loss/aux_loss": 0.048061018250882624, + "loss/crossentropy": 2.6875993072986604, + "loss/logits": 0.8345743596553803, + "step": 50380 + }, + { + "epoch": 0.5039, + "grad_norm": 13.9375, + "grad_norm_var": 0.45362955729166665, + "learning_rate": 0.0003, + "loss": 11.041, + "loss/aux_loss": 0.04808121174573898, + "loss/crossentropy": 2.851317548751831, + "loss/logits": 0.8381609439849853, + "step": 50390 + }, + { + "epoch": 0.504, + "grad_norm": 14.9375, + "grad_norm_var": 0.32317708333333334, + "learning_rate": 0.0003, + "loss": 10.9708, + "loss/aux_loss": 0.04808126352727413, + "loss/crossentropy": 2.739950382709503, + "loss/logits": 0.8330327928066253, + "step": 50400 + }, + { + "epoch": 0.5041, + "grad_norm": 14.625, + "grad_norm_var": 0.2955729166666667, + "learning_rate": 0.0003, + "loss": 11.0153, + "loss/aux_loss": 0.04806138556450605, + "loss/crossentropy": 2.7034616589546205, + "loss/logits": 0.851497569680214, + "step": 50410 + }, + { + "epoch": 0.5042, + "grad_norm": 15.3125, + "grad_norm_var": 0.2337890625, + "learning_rate": 0.0003, + "loss": 11.1483, + "loss/aux_loss": 0.04807694610208273, + "loss/crossentropy": 2.784847009181976, + "loss/logits": 0.8578290939331055, + "step": 50420 + }, + { + "epoch": 0.5043, + "grad_norm": 14.125, + "grad_norm_var": 0.3153483072916667, + "learning_rate": 0.0003, + "loss": 10.8493, + "loss/aux_loss": 0.048078867606818676, + "loss/crossentropy": 2.6146656930446626, + "loss/logits": 0.7973528385162354, + "step": 50430 + }, + { + "epoch": 0.5044, + "grad_norm": 14.75, + "grad_norm_var": 0.341650390625, + "learning_rate": 0.0003, + "loss": 11.0265, + "loss/aux_loss": 0.048073571361601355, + "loss/crossentropy": 2.8554004311561583, + "loss/logits": 0.8406151056289672, + "step": 50440 + }, + { + "epoch": 0.5045, + "grad_norm": 14.0625, + "grad_norm_var": 0.4567057291666667, + "learning_rate": 0.0003, + "loss": 10.9926, + "loss/aux_loss": 0.048083682730793956, + "loss/crossentropy": 2.713830453157425, + "loss/logits": 0.8488957345485687, + "step": 50450 + }, + { + "epoch": 0.5046, + "grad_norm": 13.0625, + "grad_norm_var": 1.5556640625, + "learning_rate": 0.0003, + "loss": 11.11, + "loss/aux_loss": 0.048066365718841556, + "loss/crossentropy": 2.902998661994934, + "loss/logits": 0.8279580295085907, + "step": 50460 + }, + { + "epoch": 0.5047, + "grad_norm": 15.5625, + "grad_norm_var": 0.7299479166666667, + "learning_rate": 0.0003, + "loss": 10.9543, + "loss/aux_loss": 0.04807742275297642, + "loss/crossentropy": 2.7396446764469147, + "loss/logits": 0.8536162942647934, + "step": 50470 + }, + { + "epoch": 0.5048, + "grad_norm": 13.625, + "grad_norm_var": 0.7792805989583333, + "learning_rate": 0.0003, + "loss": 10.9662, + "loss/aux_loss": 0.048072476498782636, + "loss/crossentropy": 2.8252785921096804, + "loss/logits": 0.869564825296402, + "step": 50480 + }, + { + "epoch": 0.5049, + "grad_norm": 14.25, + "grad_norm_var": 0.6042805989583333, + "learning_rate": 0.0003, + "loss": 11.0867, + "loss/aux_loss": 0.04807360861450434, + "loss/crossentropy": 2.7099923491477966, + "loss/logits": 0.8268058747053146, + "step": 50490 + }, + { + "epoch": 0.505, + "grad_norm": 14.5625, + "grad_norm_var": 0.49138997395833334, + "learning_rate": 0.0003, + "loss": 10.9612, + "loss/aux_loss": 0.04806963559240103, + "loss/crossentropy": 2.7237312316894533, + "loss/logits": 0.8322340279817582, + "step": 50500 + }, + { + "epoch": 0.5051, + "grad_norm": 16.75, + "grad_norm_var": 0.5423014322916667, + "learning_rate": 0.0003, + "loss": 11.0486, + "loss/aux_loss": 0.04807584658265114, + "loss/crossentropy": 2.7208118796348573, + "loss/logits": 0.8030982494354248, + "step": 50510 + }, + { + "epoch": 0.5052, + "grad_norm": 15.6875, + "grad_norm_var": 0.9128743489583333, + "learning_rate": 0.0003, + "loss": 11.1668, + "loss/aux_loss": 0.04807027783244848, + "loss/crossentropy": 2.654416823387146, + "loss/logits": 0.8475210994482041, + "step": 50520 + }, + { + "epoch": 0.5053, + "grad_norm": 14.5625, + "grad_norm_var": 15.680143229166667, + "learning_rate": 0.0003, + "loss": 10.9435, + "loss/aux_loss": 0.04807609729468822, + "loss/crossentropy": 2.5730921030044556, + "loss/logits": 0.838240772485733, + "step": 50530 + }, + { + "epoch": 0.5054, + "grad_norm": 13.9375, + "grad_norm_var": 15.242171223958334, + "learning_rate": 0.0003, + "loss": 11.1737, + "loss/aux_loss": 0.04807768948376179, + "loss/crossentropy": 2.771801221370697, + "loss/logits": 0.8654070168733596, + "step": 50540 + }, + { + "epoch": 0.5055, + "grad_norm": 15.1875, + "grad_norm_var": 0.717822265625, + "learning_rate": 0.0003, + "loss": 11.2251, + "loss/aux_loss": 0.048080765083432196, + "loss/crossentropy": 2.7945044159889223, + "loss/logits": 0.838275796175003, + "step": 50550 + }, + { + "epoch": 0.5056, + "grad_norm": 13.9375, + "grad_norm_var": 0.4163899739583333, + "learning_rate": 0.0003, + "loss": 11.2706, + "loss/aux_loss": 0.048064058646559715, + "loss/crossentropy": 2.9519375801086425, + "loss/logits": 0.8616402268409729, + "step": 50560 + }, + { + "epoch": 0.5057, + "grad_norm": 14.875, + "grad_norm_var": 0.3119140625, + "learning_rate": 0.0003, + "loss": 11.0596, + "loss/aux_loss": 0.04807540029287338, + "loss/crossentropy": 2.5487895905971527, + "loss/logits": 0.7579917728900909, + "step": 50570 + }, + { + "epoch": 0.5058, + "grad_norm": 15.375, + "grad_norm_var": 0.15857747395833333, + "learning_rate": 0.0003, + "loss": 11.0143, + "loss/aux_loss": 0.04808062519878149, + "loss/crossentropy": 2.639469766616821, + "loss/logits": 0.8144773453474045, + "step": 50580 + }, + { + "epoch": 0.5059, + "grad_norm": 13.5625, + "grad_norm_var": 0.40358072916666665, + "learning_rate": 0.0003, + "loss": 11.0849, + "loss/aux_loss": 0.048076164163649085, + "loss/crossentropy": 2.898840081691742, + "loss/logits": 0.8714124709367752, + "step": 50590 + }, + { + "epoch": 0.506, + "grad_norm": 15.9375, + "grad_norm_var": 0.6910807291666666, + "learning_rate": 0.0003, + "loss": 11.1215, + "loss/aux_loss": 0.048068304732441905, + "loss/crossentropy": 2.8098879933357237, + "loss/logits": 0.8603219360113143, + "step": 50600 + }, + { + "epoch": 0.5061, + "grad_norm": 14.6875, + "grad_norm_var": 0.397119140625, + "learning_rate": 0.0003, + "loss": 10.9947, + "loss/aux_loss": 0.04808787330985069, + "loss/crossentropy": 2.6829119682312013, + "loss/logits": 0.8325252383947372, + "step": 50610 + }, + { + "epoch": 0.5062, + "grad_norm": 14.9375, + "grad_norm_var": 0.3572916666666667, + "learning_rate": 0.0003, + "loss": 11.0705, + "loss/aux_loss": 0.04808214660733938, + "loss/crossentropy": 2.7047139048576354, + "loss/logits": 0.8234624296426774, + "step": 50620 + }, + { + "epoch": 0.5063, + "grad_norm": 14.125, + "grad_norm_var": 0.484375, + "learning_rate": 0.0003, + "loss": 10.9123, + "loss/aux_loss": 0.04805761631578207, + "loss/crossentropy": 2.766616940498352, + "loss/logits": 0.8159997165203094, + "step": 50630 + }, + { + "epoch": 0.5064, + "grad_norm": 14.8125, + "grad_norm_var": 0.30857747395833335, + "learning_rate": 0.0003, + "loss": 11.0885, + "loss/aux_loss": 0.048085050843656066, + "loss/crossentropy": 2.773435640335083, + "loss/logits": 0.8345916509628296, + "step": 50640 + }, + { + "epoch": 0.5065, + "grad_norm": 14.625, + "grad_norm_var": 0.511181640625, + "learning_rate": 0.0003, + "loss": 11.0337, + "loss/aux_loss": 0.048079907149076465, + "loss/crossentropy": 2.7371358036994935, + "loss/logits": 0.8105407744646073, + "step": 50650 + }, + { + "epoch": 0.5066, + "grad_norm": 14.4375, + "grad_norm_var": 0.6534993489583333, + "learning_rate": 0.0003, + "loss": 10.9133, + "loss/aux_loss": 0.048066824488341806, + "loss/crossentropy": 2.6571763515472413, + "loss/logits": 0.800066152215004, + "step": 50660 + }, + { + "epoch": 0.5067, + "grad_norm": 14.875, + "grad_norm_var": 0.4369140625, + "learning_rate": 0.0003, + "loss": 11.2353, + "loss/aux_loss": 0.0480772802606225, + "loss/crossentropy": 2.7930760741233827, + "loss/logits": 0.8640264600515366, + "step": 50670 + }, + { + "epoch": 0.5068, + "grad_norm": 14.5625, + "grad_norm_var": 0.313916015625, + "learning_rate": 0.0003, + "loss": 11.1041, + "loss/aux_loss": 0.04806650523096323, + "loss/crossentropy": 2.7191444516181944, + "loss/logits": 0.8551998734474182, + "step": 50680 + }, + { + "epoch": 0.5069, + "grad_norm": 16.0, + "grad_norm_var": 0.5488118489583333, + "learning_rate": 0.0003, + "loss": 11.088, + "loss/aux_loss": 0.04807362388819456, + "loss/crossentropy": 2.662330609560013, + "loss/logits": 0.8373664259910584, + "step": 50690 + }, + { + "epoch": 0.507, + "grad_norm": 17.0, + "grad_norm_var": 0.87109375, + "learning_rate": 0.0003, + "loss": 10.8954, + "loss/aux_loss": 0.04806805476546287, + "loss/crossentropy": 2.762675553560257, + "loss/logits": 0.8527992933988571, + "step": 50700 + }, + { + "epoch": 0.5071, + "grad_norm": 14.1875, + "grad_norm_var": 0.7587890625, + "learning_rate": 0.0003, + "loss": 11.0398, + "loss/aux_loss": 0.04807548895478249, + "loss/crossentropy": 2.6396145045757295, + "loss/logits": 0.8134666383266449, + "step": 50710 + }, + { + "epoch": 0.5072, + "grad_norm": 14.6875, + "grad_norm_var": 0.17805989583333334, + "learning_rate": 0.0003, + "loss": 10.959, + "loss/aux_loss": 0.04807210974395275, + "loss/crossentropy": 2.8681382477283477, + "loss/logits": 0.8086955964565277, + "step": 50720 + }, + { + "epoch": 0.5073, + "grad_norm": 15.5625, + "grad_norm_var": 0.2509765625, + "learning_rate": 0.0003, + "loss": 11.1709, + "loss/aux_loss": 0.04807695783674717, + "loss/crossentropy": 2.681786209344864, + "loss/logits": 0.820175650715828, + "step": 50730 + }, + { + "epoch": 0.5074, + "grad_norm": 15.375, + "grad_norm_var": 0.4979166666666667, + "learning_rate": 0.0003, + "loss": 10.9011, + "loss/aux_loss": 0.0480791661888361, + "loss/crossentropy": 2.6313997209072113, + "loss/logits": 0.8222554922103882, + "step": 50740 + }, + { + "epoch": 0.5075, + "grad_norm": 14.375, + "grad_norm_var": 0.42303059895833334, + "learning_rate": 0.0003, + "loss": 11.059, + "loss/aux_loss": 0.048068783991038797, + "loss/crossentropy": 2.8632388710975647, + "loss/logits": 0.8794094920158386, + "step": 50750 + }, + { + "epoch": 0.5076, + "grad_norm": 16.0, + "grad_norm_var": 2.855712890625, + "learning_rate": 0.0003, + "loss": 10.9921, + "loss/aux_loss": 0.04807401727885008, + "loss/crossentropy": 2.740087425708771, + "loss/logits": 0.8258580267429352, + "step": 50760 + }, + { + "epoch": 0.5077, + "grad_norm": 14.0, + "grad_norm_var": 3.0181640625, + "learning_rate": 0.0003, + "loss": 11.2036, + "loss/aux_loss": 0.04808244872838259, + "loss/crossentropy": 2.6240702331066132, + "loss/logits": 0.8118566811084748, + "step": 50770 + }, + { + "epoch": 0.5078, + "grad_norm": 15.5625, + "grad_norm_var": 0.7315104166666667, + "learning_rate": 0.0003, + "loss": 11.0808, + "loss/aux_loss": 0.04807658027857542, + "loss/crossentropy": 2.756423282623291, + "loss/logits": 0.8248686224222184, + "step": 50780 + }, + { + "epoch": 0.5079, + "grad_norm": 16.5, + "grad_norm_var": 0.6945149739583333, + "learning_rate": 0.0003, + "loss": 10.9433, + "loss/aux_loss": 0.04807685222476721, + "loss/crossentropy": 2.8291961908340455, + "loss/logits": 0.8037528693675995, + "step": 50790 + }, + { + "epoch": 0.508, + "grad_norm": 15.1875, + "grad_norm_var": 0.467431640625, + "learning_rate": 0.0003, + "loss": 11.0704, + "loss/aux_loss": 0.04805314373224974, + "loss/crossentropy": 2.850136566162109, + "loss/logits": 0.8551195234060287, + "step": 50800 + }, + { + "epoch": 0.5081, + "grad_norm": 15.75, + "grad_norm_var": 0.4166666666666667, + "learning_rate": 0.0003, + "loss": 11.0691, + "loss/aux_loss": 0.04807635135948658, + "loss/crossentropy": 2.7100765228271486, + "loss/logits": 0.8092273443937301, + "step": 50810 + }, + { + "epoch": 0.5082, + "grad_norm": 15.8125, + "grad_norm_var": 0.768603515625, + "learning_rate": 0.0003, + "loss": 11.1062, + "loss/aux_loss": 0.04807732086628676, + "loss/crossentropy": 2.6739711463451385, + "loss/logits": 0.8558676153421402, + "step": 50820 + }, + { + "epoch": 0.5083, + "grad_norm": 15.0, + "grad_norm_var": 0.3651041666666667, + "learning_rate": 0.0003, + "loss": 11.1696, + "loss/aux_loss": 0.048065226152539255, + "loss/crossentropy": 2.7774511337280274, + "loss/logits": 0.8358054220676422, + "step": 50830 + }, + { + "epoch": 0.5084, + "grad_norm": 14.5, + "grad_norm_var": 1.025244140625, + "learning_rate": 0.0003, + "loss": 11.0029, + "loss/aux_loss": 0.04808473084121943, + "loss/crossentropy": 2.6720672845840454, + "loss/logits": 0.8290561676025391, + "step": 50840 + }, + { + "epoch": 0.5085, + "grad_norm": 15.75, + "grad_norm_var": 0.6338541666666667, + "learning_rate": 0.0003, + "loss": 11.2649, + "loss/aux_loss": 0.04807058796286583, + "loss/crossentropy": 2.7967530369758604, + "loss/logits": 0.8311424374580383, + "step": 50850 + }, + { + "epoch": 0.5086, + "grad_norm": 14.875, + "grad_norm_var": 0.503125, + "learning_rate": 0.0003, + "loss": 11.1975, + "loss/aux_loss": 0.048081024549901485, + "loss/crossentropy": 2.813568663597107, + "loss/logits": 0.858703076839447, + "step": 50860 + }, + { + "epoch": 0.5087, + "grad_norm": 14.5625, + "grad_norm_var": 0.4905598958333333, + "learning_rate": 0.0003, + "loss": 10.9619, + "loss/aux_loss": 0.04806181099265814, + "loss/crossentropy": 2.6827987372875213, + "loss/logits": 0.8177167236804962, + "step": 50870 + }, + { + "epoch": 0.5088, + "grad_norm": 13.9375, + "grad_norm_var": 0.7285807291666667, + "learning_rate": 0.0003, + "loss": 11.0417, + "loss/aux_loss": 0.04808285180479288, + "loss/crossentropy": 2.5345280170440674, + "loss/logits": 0.7993739306926727, + "step": 50880 + }, + { + "epoch": 0.5089, + "grad_norm": 14.625, + "grad_norm_var": 0.7882649739583333, + "learning_rate": 0.0003, + "loss": 11.1877, + "loss/aux_loss": 0.04806562829762697, + "loss/crossentropy": 2.7045423090457916, + "loss/logits": 0.8371960252523423, + "step": 50890 + }, + { + "epoch": 0.509, + "grad_norm": 14.5, + "grad_norm_var": 0.4561848958333333, + "learning_rate": 0.0003, + "loss": 11.0157, + "loss/aux_loss": 0.048074452206492424, + "loss/crossentropy": 2.830974745750427, + "loss/logits": 0.8685533732175827, + "step": 50900 + }, + { + "epoch": 0.5091, + "grad_norm": 13.4375, + "grad_norm_var": 0.30441080729166664, + "learning_rate": 0.0003, + "loss": 11.0398, + "loss/aux_loss": 0.04807122685015201, + "loss/crossentropy": 2.736586630344391, + "loss/logits": 0.8220911502838135, + "step": 50910 + }, + { + "epoch": 0.5092, + "grad_norm": 14.125, + "grad_norm_var": 0.612353515625, + "learning_rate": 0.0003, + "loss": 10.9015, + "loss/aux_loss": 0.04807872846722603, + "loss/crossentropy": 2.7401693642139433, + "loss/logits": 0.8360977441072464, + "step": 50920 + }, + { + "epoch": 0.5093, + "grad_norm": 16.25, + "grad_norm_var": 0.44698893229166664, + "learning_rate": 0.0003, + "loss": 10.9974, + "loss/aux_loss": 0.04807762745767832, + "loss/crossentropy": 2.7443562030792235, + "loss/logits": 0.7941692680120468, + "step": 50930 + }, + { + "epoch": 0.5094, + "grad_norm": 15.625, + "grad_norm_var": 0.503369140625, + "learning_rate": 0.0003, + "loss": 11.2314, + "loss/aux_loss": 0.048069654405117034, + "loss/crossentropy": 2.642306762933731, + "loss/logits": 0.8203934520483017, + "step": 50940 + }, + { + "epoch": 0.5095, + "grad_norm": 14.625, + "grad_norm_var": 0.38697916666666665, + "learning_rate": 0.0003, + "loss": 11.1083, + "loss/aux_loss": 0.0480804480612278, + "loss/crossentropy": 2.747548055648804, + "loss/logits": 0.8185478031635285, + "step": 50950 + }, + { + "epoch": 0.5096, + "grad_norm": 16.25, + "grad_norm_var": 0.4195149739583333, + "learning_rate": 0.0003, + "loss": 10.9981, + "loss/aux_loss": 0.048063276521861556, + "loss/crossentropy": 2.8265784859657286, + "loss/logits": 0.8263007819652557, + "step": 50960 + }, + { + "epoch": 0.5097, + "grad_norm": 14.25, + "grad_norm_var": 0.6395182291666667, + "learning_rate": 0.0003, + "loss": 11.0577, + "loss/aux_loss": 0.048073398880660534, + "loss/crossentropy": 2.760447859764099, + "loss/logits": 0.8406284034252167, + "step": 50970 + }, + { + "epoch": 0.5098, + "grad_norm": 13.375, + "grad_norm_var": 0.5361979166666667, + "learning_rate": 0.0003, + "loss": 11.0733, + "loss/aux_loss": 0.04807597603648901, + "loss/crossentropy": 2.785585403442383, + "loss/logits": 0.8304526567459106, + "step": 50980 + }, + { + "epoch": 0.5099, + "grad_norm": 13.1875, + "grad_norm_var": 0.4671223958333333, + "learning_rate": 0.0003, + "loss": 11.1251, + "loss/aux_loss": 0.0480725534260273, + "loss/crossentropy": 2.7471178472042084, + "loss/logits": 0.8607824087142945, + "step": 50990 + }, + { + "epoch": 0.51, + "grad_norm": 14.75, + "grad_norm_var": 0.396875, + "learning_rate": 0.0003, + "loss": 11.0672, + "loss/aux_loss": 0.04808218758553266, + "loss/crossentropy": 2.7346564173698424, + "loss/logits": 0.8275700658559799, + "step": 51000 + }, + { + "epoch": 0.5101, + "grad_norm": 14.875, + "grad_norm_var": 0.26015625, + "learning_rate": 0.0003, + "loss": 11.1911, + "loss/aux_loss": 0.048065698333084585, + "loss/crossentropy": 2.800529360771179, + "loss/logits": 0.8583203822374343, + "step": 51010 + }, + { + "epoch": 0.5102, + "grad_norm": 15.4375, + "grad_norm_var": 0.20545247395833333, + "learning_rate": 0.0003, + "loss": 11.1362, + "loss/aux_loss": 0.04808482229709625, + "loss/crossentropy": 2.708675539493561, + "loss/logits": 0.8227375984191895, + "step": 51020 + }, + { + "epoch": 0.5103, + "grad_norm": 13.5, + "grad_norm_var": 0.45514322916666666, + "learning_rate": 0.0003, + "loss": 10.9633, + "loss/aux_loss": 0.04807134531438351, + "loss/crossentropy": 2.5863205909729006, + "loss/logits": 0.833231994509697, + "step": 51030 + }, + { + "epoch": 0.5104, + "grad_norm": 15.625, + "grad_norm_var": 0.8641764322916666, + "learning_rate": 0.0003, + "loss": 10.992, + "loss/aux_loss": 0.04806938972324133, + "loss/crossentropy": 2.5987710535526274, + "loss/logits": 0.8221124142408371, + "step": 51040 + }, + { + "epoch": 0.5105, + "grad_norm": 15.4375, + "grad_norm_var": 2.3384765625, + "learning_rate": 0.0003, + "loss": 11.1323, + "loss/aux_loss": 0.04807955361902714, + "loss/crossentropy": 2.7755866408348084, + "loss/logits": 0.8378236562013626, + "step": 51050 + }, + { + "epoch": 0.5106, + "grad_norm": 14.9375, + "grad_norm_var": 0.4400390625, + "learning_rate": 0.0003, + "loss": 11.0057, + "loss/aux_loss": 0.048068666271865367, + "loss/crossentropy": 2.6365352988243105, + "loss/logits": 0.8126265555620193, + "step": 51060 + }, + { + "epoch": 0.5107, + "grad_norm": 14.4375, + "grad_norm_var": 0.4025390625, + "learning_rate": 0.0003, + "loss": 11.1041, + "loss/aux_loss": 0.0480704678222537, + "loss/crossentropy": 2.6664901852607725, + "loss/logits": 0.826370707154274, + "step": 51070 + }, + { + "epoch": 0.5108, + "grad_norm": 14.625, + "grad_norm_var": 0.30911458333333336, + "learning_rate": 0.0003, + "loss": 11.0334, + "loss/aux_loss": 0.048087571002542975, + "loss/crossentropy": 2.511679470539093, + "loss/logits": 0.8164161443710327, + "step": 51080 + }, + { + "epoch": 0.5109, + "grad_norm": 15.3125, + "grad_norm_var": 0.208837890625, + "learning_rate": 0.0003, + "loss": 11.0154, + "loss/aux_loss": 0.04806430675089359, + "loss/crossentropy": 2.721172201633453, + "loss/logits": 0.7954764574766159, + "step": 51090 + }, + { + "epoch": 0.511, + "grad_norm": 12.9375, + "grad_norm_var": 0.5202962239583333, + "learning_rate": 0.0003, + "loss": 11.0233, + "loss/aux_loss": 0.04807245638221502, + "loss/crossentropy": 2.619920516014099, + "loss/logits": 0.8742161899805069, + "step": 51100 + }, + { + "epoch": 0.5111, + "grad_norm": 14.375, + "grad_norm_var": 75.28722330729167, + "learning_rate": 0.0003, + "loss": 11.1598, + "loss/aux_loss": 0.04808569718152285, + "loss/crossentropy": 2.64174947142601, + "loss/logits": 0.7941539883613586, + "step": 51110 + }, + { + "epoch": 0.5112, + "grad_norm": 14.75, + "grad_norm_var": 2.0212890625, + "learning_rate": 0.0003, + "loss": 10.969, + "loss/aux_loss": 0.04807308297604322, + "loss/crossentropy": 2.7143703937530517, + "loss/logits": 0.8080418884754181, + "step": 51120 + }, + { + "epoch": 0.5113, + "grad_norm": 15.75, + "grad_norm_var": 0.504541015625, + "learning_rate": 0.0003, + "loss": 11.1646, + "loss/aux_loss": 0.0480702068656683, + "loss/crossentropy": 2.8439256310462953, + "loss/logits": 0.849370151758194, + "step": 51130 + }, + { + "epoch": 0.5114, + "grad_norm": 14.1875, + "grad_norm_var": 0.309228515625, + "learning_rate": 0.0003, + "loss": 11.2208, + "loss/aux_loss": 0.048074052482843396, + "loss/crossentropy": 2.691173183917999, + "loss/logits": 0.8135675758123397, + "step": 51140 + }, + { + "epoch": 0.5115, + "grad_norm": 14.625, + "grad_norm_var": 1.3165201822916666, + "learning_rate": 0.0003, + "loss": 11.055, + "loss/aux_loss": 0.04807727038860321, + "loss/crossentropy": 2.8615395545959474, + "loss/logits": 0.833811953663826, + "step": 51150 + }, + { + "epoch": 0.5116, + "grad_norm": 14.3125, + "grad_norm_var": 1.818994140625, + "learning_rate": 0.0003, + "loss": 11.23, + "loss/aux_loss": 0.04806257952004671, + "loss/crossentropy": 2.637695002555847, + "loss/logits": 0.8264784872531891, + "step": 51160 + }, + { + "epoch": 0.5117, + "grad_norm": 15.9375, + "grad_norm_var": 1.2104166666666667, + "learning_rate": 0.0003, + "loss": 11.1183, + "loss/aux_loss": 0.0480752969160676, + "loss/crossentropy": 2.65714670419693, + "loss/logits": 0.8472563087940216, + "step": 51170 + }, + { + "epoch": 0.5118, + "grad_norm": 14.375, + "grad_norm_var": 0.372900390625, + "learning_rate": 0.0003, + "loss": 11.0255, + "loss/aux_loss": 0.04808554705232382, + "loss/crossentropy": 2.6965773463249207, + "loss/logits": 0.7938098013401031, + "step": 51180 + }, + { + "epoch": 0.5119, + "grad_norm": 14.375, + "grad_norm_var": 0.45826822916666665, + "learning_rate": 0.0003, + "loss": 11.0631, + "loss/aux_loss": 0.04806725028902292, + "loss/crossentropy": 2.699047327041626, + "loss/logits": 0.8511703968048095, + "step": 51190 + }, + { + "epoch": 0.512, + "grad_norm": 20.0, + "grad_norm_var": 2.3203125, + "learning_rate": 0.0003, + "loss": 11.016, + "loss/aux_loss": 0.04806656241416931, + "loss/crossentropy": 2.580675709247589, + "loss/logits": 0.7733906388282776, + "step": 51200 + }, + { + "epoch": 0.5121, + "grad_norm": 15.3125, + "grad_norm_var": 2.1278483072916665, + "learning_rate": 0.0003, + "loss": 11.2027, + "loss/aux_loss": 0.04808239191770554, + "loss/crossentropy": 2.8701157569885254, + "loss/logits": 0.8477713167667389, + "step": 51210 + }, + { + "epoch": 0.5122, + "grad_norm": 14.25, + "grad_norm_var": 0.595166015625, + "learning_rate": 0.0003, + "loss": 11.1915, + "loss/aux_loss": 0.04806038942188025, + "loss/crossentropy": 2.67539005279541, + "loss/logits": 0.8615487456321717, + "step": 51220 + }, + { + "epoch": 0.5123, + "grad_norm": 14.3125, + "grad_norm_var": 1.3512858072916667, + "learning_rate": 0.0003, + "loss": 10.9781, + "loss/aux_loss": 0.04807022716850042, + "loss/crossentropy": 2.6567323327064516, + "loss/logits": 0.8351798057556152, + "step": 51230 + }, + { + "epoch": 0.5124, + "grad_norm": 15.25, + "grad_norm_var": 0.6555826822916667, + "learning_rate": 0.0003, + "loss": 11.0322, + "loss/aux_loss": 0.04807539042085409, + "loss/crossentropy": 2.5088176906108854, + "loss/logits": 0.8250403732061387, + "step": 51240 + }, + { + "epoch": 0.5125, + "grad_norm": 15.5, + "grad_norm_var": 0.8641764322916666, + "learning_rate": 0.0003, + "loss": 10.8031, + "loss/aux_loss": 0.0480835122987628, + "loss/crossentropy": 2.4206930220127107, + "loss/logits": 0.7768561899662018, + "step": 51250 + }, + { + "epoch": 0.5126, + "grad_norm": 14.1875, + "grad_norm_var": 0.865087890625, + "learning_rate": 0.0003, + "loss": 11.0423, + "loss/aux_loss": 0.04806485194712877, + "loss/crossentropy": 2.7578662991523744, + "loss/logits": 0.8342026203870774, + "step": 51260 + }, + { + "epoch": 0.5127, + "grad_norm": 15.0625, + "grad_norm_var": 0.5695149739583333, + "learning_rate": 0.0003, + "loss": 11.0093, + "loss/aux_loss": 0.04807938933372498, + "loss/crossentropy": 2.6144285678863524, + "loss/logits": 0.8025279492139816, + "step": 51270 + }, + { + "epoch": 0.5128, + "grad_norm": 14.375, + "grad_norm_var": 0.41901041666666666, + "learning_rate": 0.0003, + "loss": 11.0911, + "loss/aux_loss": 0.04807790834456682, + "loss/crossentropy": 2.751217710971832, + "loss/logits": 0.8257042407989502, + "step": 51280 + }, + { + "epoch": 0.5129, + "grad_norm": 16.875, + "grad_norm_var": 0.5968098958333333, + "learning_rate": 0.0003, + "loss": 11.0336, + "loss/aux_loss": 0.04806835390627384, + "loss/crossentropy": 2.7564366936683653, + "loss/logits": 0.8366163045167923, + "step": 51290 + }, + { + "epoch": 0.513, + "grad_norm": 15.9375, + "grad_norm_var": 1.1509765625, + "learning_rate": 0.0003, + "loss": 11.0734, + "loss/aux_loss": 0.04808133132755756, + "loss/crossentropy": 2.557454949617386, + "loss/logits": 0.8112009972333908, + "step": 51300 + }, + { + "epoch": 0.5131, + "grad_norm": 14.625, + "grad_norm_var": 0.8239583333333333, + "learning_rate": 0.0003, + "loss": 11.1618, + "loss/aux_loss": 0.04807974435389042, + "loss/crossentropy": 2.628247785568237, + "loss/logits": 0.820238995552063, + "step": 51310 + }, + { + "epoch": 0.5132, + "grad_norm": 15.1875, + "grad_norm_var": 0.7140462239583333, + "learning_rate": 0.0003, + "loss": 11.0499, + "loss/aux_loss": 0.048069828934967515, + "loss/crossentropy": 2.68018000125885, + "loss/logits": 0.8339303702116012, + "step": 51320 + }, + { + "epoch": 0.5133, + "grad_norm": 13.75, + "grad_norm_var": 0.470947265625, + "learning_rate": 0.0003, + "loss": 11.0399, + "loss/aux_loss": 0.04807551633566618, + "loss/crossentropy": 2.8822931230068205, + "loss/logits": 0.8533807754516601, + "step": 51330 + }, + { + "epoch": 0.5134, + "grad_norm": 14.8125, + "grad_norm_var": 0.5947916666666667, + "learning_rate": 0.0003, + "loss": 11.0277, + "loss/aux_loss": 0.04806902166455984, + "loss/crossentropy": 2.6886990547180174, + "loss/logits": 0.8332021862268448, + "step": 51340 + }, + { + "epoch": 0.5135, + "grad_norm": 14.1875, + "grad_norm_var": 0.4669108072916667, + "learning_rate": 0.0003, + "loss": 11.0988, + "loss/aux_loss": 0.04808459766209126, + "loss/crossentropy": 2.685576003789902, + "loss/logits": 0.8132620543241501, + "step": 51350 + }, + { + "epoch": 0.5136, + "grad_norm": 14.5625, + "grad_norm_var": 0.240478515625, + "learning_rate": 0.0003, + "loss": 11.1777, + "loss/aux_loss": 0.04807017575949431, + "loss/crossentropy": 2.7257566928863524, + "loss/logits": 0.8060571432113648, + "step": 51360 + }, + { + "epoch": 0.5137, + "grad_norm": 16.75, + "grad_norm_var": 0.6994140625, + "learning_rate": 0.0003, + "loss": 10.858, + "loss/aux_loss": 0.0480698412284255, + "loss/crossentropy": 2.5365270376205444, + "loss/logits": 0.816512593626976, + "step": 51370 + }, + { + "epoch": 0.5138, + "grad_norm": 14.5, + "grad_norm_var": 0.754541015625, + "learning_rate": 0.0003, + "loss": 11.1839, + "loss/aux_loss": 0.0480722613632679, + "loss/crossentropy": 2.6611290633678437, + "loss/logits": 0.8183812767267227, + "step": 51380 + }, + { + "epoch": 0.5139, + "grad_norm": 14.5625, + "grad_norm_var": 1.1984375, + "learning_rate": 0.0003, + "loss": 10.9507, + "loss/aux_loss": 0.04807326439768076, + "loss/crossentropy": 2.77123561501503, + "loss/logits": 0.8091616094112396, + "step": 51390 + }, + { + "epoch": 0.514, + "grad_norm": 14.5, + "grad_norm_var": 49.47433268229167, + "learning_rate": 0.0003, + "loss": 11.1145, + "loss/aux_loss": 0.04806629903614521, + "loss/crossentropy": 2.870545446872711, + "loss/logits": 0.8608437448740005, + "step": 51400 + }, + { + "epoch": 0.5141, + "grad_norm": 14.75, + "grad_norm_var": 48.878580729166664, + "learning_rate": 0.0003, + "loss": 10.9984, + "loss/aux_loss": 0.04807921946048736, + "loss/crossentropy": 2.808860683441162, + "loss/logits": 0.8428457826375961, + "step": 51410 + }, + { + "epoch": 0.5142, + "grad_norm": 15.0, + "grad_norm_var": 0.7305826822916667, + "learning_rate": 0.0003, + "loss": 11.1773, + "loss/aux_loss": 0.04806988965719938, + "loss/crossentropy": 2.8295932352542876, + "loss/logits": 0.8579282373189926, + "step": 51420 + }, + { + "epoch": 0.5143, + "grad_norm": 15.0625, + "grad_norm_var": 0.3619140625, + "learning_rate": 0.0003, + "loss": 11.0726, + "loss/aux_loss": 0.048077551648020744, + "loss/crossentropy": 2.625492978096008, + "loss/logits": 0.8137675523757935, + "step": 51430 + }, + { + "epoch": 0.5144, + "grad_norm": 15.1875, + "grad_norm_var": 0.2757649739583333, + "learning_rate": 0.0003, + "loss": 11.1048, + "loss/aux_loss": 0.0480729004368186, + "loss/crossentropy": 2.861330282688141, + "loss/logits": 0.8448628783226013, + "step": 51440 + }, + { + "epoch": 0.5145, + "grad_norm": 14.8125, + "grad_norm_var": 0.32472330729166665, + "learning_rate": 0.0003, + "loss": 11.0903, + "loss/aux_loss": 0.04806978609412908, + "loss/crossentropy": 2.701605361700058, + "loss/logits": 0.7941394478082657, + "step": 51450 + }, + { + "epoch": 0.5146, + "grad_norm": 15.3125, + "grad_norm_var": 0.33670247395833336, + "learning_rate": 0.0003, + "loss": 11.028, + "loss/aux_loss": 0.04808089081197977, + "loss/crossentropy": 2.683489578962326, + "loss/logits": 0.8333400577306748, + "step": 51460 + }, + { + "epoch": 0.5147, + "grad_norm": 15.1875, + "grad_norm_var": 0.9140462239583333, + "learning_rate": 0.0003, + "loss": 10.9866, + "loss/aux_loss": 0.04807344228029251, + "loss/crossentropy": 2.7665489315986633, + "loss/logits": 0.8190008670091629, + "step": 51470 + }, + { + "epoch": 0.5148, + "grad_norm": 13.8125, + "grad_norm_var": 1.077978515625, + "learning_rate": 0.0003, + "loss": 10.8875, + "loss/aux_loss": 0.04807010628283024, + "loss/crossentropy": 2.58920761346817, + "loss/logits": 0.8068946480751038, + "step": 51480 + }, + { + "epoch": 0.5149, + "grad_norm": 15.5625, + "grad_norm_var": 1.1469889322916667, + "learning_rate": 0.0003, + "loss": 11.0627, + "loss/aux_loss": 0.04806904457509518, + "loss/crossentropy": 2.661976617574692, + "loss/logits": 0.8472563207149506, + "step": 51490 + }, + { + "epoch": 0.515, + "grad_norm": 15.0625, + "grad_norm_var": 1.404931640625, + "learning_rate": 0.0003, + "loss": 11.0402, + "loss/aux_loss": 0.048074154369533065, + "loss/crossentropy": 2.8123088240623475, + "loss/logits": 0.8572196811437607, + "step": 51500 + }, + { + "epoch": 0.5151, + "grad_norm": 13.875, + "grad_norm_var": 1.3155598958333334, + "learning_rate": 0.0003, + "loss": 10.9436, + "loss/aux_loss": 0.04807008523494005, + "loss/crossentropy": 2.711523699760437, + "loss/logits": 0.8486291140317916, + "step": 51510 + }, + { + "epoch": 0.5152, + "grad_norm": 14.125, + "grad_norm_var": 0.4384765625, + "learning_rate": 0.0003, + "loss": 11.0391, + "loss/aux_loss": 0.04808166529983282, + "loss/crossentropy": 2.766609239578247, + "loss/logits": 0.8237248331308364, + "step": 51520 + }, + { + "epoch": 0.5153, + "grad_norm": 15.8125, + "grad_norm_var": 2.97265625, + "learning_rate": 0.0003, + "loss": 11.0567, + "loss/aux_loss": 0.048059662245213984, + "loss/crossentropy": 2.716182154417038, + "loss/logits": 0.8255683243274688, + "step": 51530 + }, + { + "epoch": 0.5154, + "grad_norm": 17.75, + "grad_norm_var": 1760.974853515625, + "learning_rate": 0.0003, + "loss": 11.0691, + "loss/aux_loss": 0.048087839223444465, + "loss/crossentropy": 2.7150439620018005, + "loss/logits": 0.8070930659770965, + "step": 51540 + }, + { + "epoch": 0.5155, + "grad_norm": 15.625, + "grad_norm_var": 8.13046875, + "learning_rate": 0.0003, + "loss": 11.0962, + "loss/aux_loss": 0.048063835315406325, + "loss/crossentropy": 2.81580011844635, + "loss/logits": 0.8323242962360382, + "step": 51550 + }, + { + "epoch": 0.5156, + "grad_norm": 15.0625, + "grad_norm_var": 0.43331705729166664, + "learning_rate": 0.0003, + "loss": 11.0541, + "loss/aux_loss": 0.04805823341012001, + "loss/crossentropy": 2.722984766960144, + "loss/logits": 0.8511014252901077, + "step": 51560 + }, + { + "epoch": 0.5157, + "grad_norm": 15.1875, + "grad_norm_var": 0.193603515625, + "learning_rate": 0.0003, + "loss": 11.0758, + "loss/aux_loss": 0.04808205440640449, + "loss/crossentropy": 2.716118276119232, + "loss/logits": 0.8201917320489883, + "step": 51570 + }, + { + "epoch": 0.5158, + "grad_norm": 13.9375, + "grad_norm_var": 0.5306640625, + "learning_rate": 0.0003, + "loss": 11.2349, + "loss/aux_loss": 0.048078343458473685, + "loss/crossentropy": 2.661349093914032, + "loss/logits": 0.8327278316020965, + "step": 51580 + }, + { + "epoch": 0.5159, + "grad_norm": 15.3125, + "grad_norm_var": 0.6602701822916667, + "learning_rate": 0.0003, + "loss": 10.9827, + "loss/aux_loss": 0.04806844424456358, + "loss/crossentropy": 2.665369528532028, + "loss/logits": 0.8237587451934815, + "step": 51590 + }, + { + "epoch": 0.516, + "grad_norm": 14.1875, + "grad_norm_var": 0.8885416666666667, + "learning_rate": 0.0003, + "loss": 10.9778, + "loss/aux_loss": 0.04806563388556242, + "loss/crossentropy": 2.744618034362793, + "loss/logits": 0.8178337156772614, + "step": 51600 + }, + { + "epoch": 0.5161, + "grad_norm": 14.375, + "grad_norm_var": 0.753759765625, + "learning_rate": 0.0003, + "loss": 10.9837, + "loss/aux_loss": 0.04807935301214457, + "loss/crossentropy": 2.6647940456867216, + "loss/logits": 0.8174872279167176, + "step": 51610 + }, + { + "epoch": 0.5162, + "grad_norm": 14.4375, + "grad_norm_var": 0.8535807291666667, + "learning_rate": 0.0003, + "loss": 11.1282, + "loss/aux_loss": 0.0480736693367362, + "loss/crossentropy": 2.6697156190872193, + "loss/logits": 0.8190987050533295, + "step": 51620 + }, + { + "epoch": 0.5163, + "grad_norm": 14.3125, + "grad_norm_var": 0.8660807291666667, + "learning_rate": 0.0003, + "loss": 10.9985, + "loss/aux_loss": 0.04808506760746241, + "loss/crossentropy": 2.6137089908123015, + "loss/logits": 0.8341957181692123, + "step": 51630 + }, + { + "epoch": 0.5164, + "grad_norm": 19.375, + "grad_norm_var": 1.9468587239583333, + "learning_rate": 0.0003, + "loss": 11.0263, + "loss/aux_loss": 0.048055645637214185, + "loss/crossentropy": 2.7922864675521852, + "loss/logits": 0.8405203580856323, + "step": 51640 + }, + { + "epoch": 0.5165, + "grad_norm": 15.8125, + "grad_norm_var": 1.5942545572916667, + "learning_rate": 0.0003, + "loss": 11.0421, + "loss/aux_loss": 0.04808124210685492, + "loss/crossentropy": 2.6083596289157867, + "loss/logits": 0.7906621545553207, + "step": 51650 + }, + { + "epoch": 0.5166, + "grad_norm": 14.6875, + "grad_norm_var": 0.3421223958333333, + "learning_rate": 0.0003, + "loss": 11.1676, + "loss/aux_loss": 0.04807032104581595, + "loss/crossentropy": 2.7878468513488768, + "loss/logits": 0.8364533364772797, + "step": 51660 + }, + { + "epoch": 0.5167, + "grad_norm": 16.125, + "grad_norm_var": 0.48240559895833335, + "learning_rate": 0.0003, + "loss": 11.083, + "loss/aux_loss": 0.04807390999048948, + "loss/crossentropy": 2.8989575624465944, + "loss/logits": 0.7967435866594315, + "step": 51670 + }, + { + "epoch": 0.5168, + "grad_norm": 15.3125, + "grad_norm_var": 0.4363932291666667, + "learning_rate": 0.0003, + "loss": 10.8368, + "loss/aux_loss": 0.04807572904974222, + "loss/crossentropy": 2.6682200372219085, + "loss/logits": 0.7805459082126618, + "step": 51680 + }, + { + "epoch": 0.5169, + "grad_norm": 14.3125, + "grad_norm_var": 0.5583170572916667, + "learning_rate": 0.0003, + "loss": 10.9825, + "loss/aux_loss": 0.048071629367768764, + "loss/crossentropy": 2.6587085843086244, + "loss/logits": 0.8401948183774948, + "step": 51690 + }, + { + "epoch": 0.517, + "grad_norm": 16.5, + "grad_norm_var": 0.7034993489583333, + "learning_rate": 0.0003, + "loss": 11.1472, + "loss/aux_loss": 0.04806802216917276, + "loss/crossentropy": 2.8146503806114196, + "loss/logits": 0.8239098250865936, + "step": 51700 + }, + { + "epoch": 0.5171, + "grad_norm": 16.0, + "grad_norm_var": 187.33723958333334, + "learning_rate": 0.0003, + "loss": 11.1429, + "loss/aux_loss": 0.04808622244745493, + "loss/crossentropy": 2.851152813434601, + "loss/logits": 0.8723404318094253, + "step": 51710 + }, + { + "epoch": 0.5172, + "grad_norm": 14.375, + "grad_norm_var": 188.23019205729167, + "learning_rate": 0.0003, + "loss": 10.9794, + "loss/aux_loss": 0.04806773141026497, + "loss/crossentropy": 2.7131691336631776, + "loss/logits": 0.8205563336610794, + "step": 51720 + }, + { + "epoch": 0.5173, + "grad_norm": 14.5625, + "grad_norm_var": 0.41380208333333335, + "learning_rate": 0.0003, + "loss": 11.0507, + "loss/aux_loss": 0.04806727990508079, + "loss/crossentropy": 2.589705538749695, + "loss/logits": 0.8440980285406112, + "step": 51730 + }, + { + "epoch": 0.5174, + "grad_norm": 14.5, + "grad_norm_var": 0.5446451822916667, + "learning_rate": 0.0003, + "loss": 10.9806, + "loss/aux_loss": 0.04807034377008677, + "loss/crossentropy": 2.791779488325119, + "loss/logits": 0.8750499516725541, + "step": 51740 + }, + { + "epoch": 0.5175, + "grad_norm": 13.375, + "grad_norm_var": 0.39791666666666664, + "learning_rate": 0.0003, + "loss": 10.8982, + "loss/aux_loss": 0.048070017248392105, + "loss/crossentropy": 2.7478320360183717, + "loss/logits": 0.8556131899356842, + "step": 51750 + }, + { + "epoch": 0.5176, + "grad_norm": 13.4375, + "grad_norm_var": 0.6917805989583333, + "learning_rate": 0.0003, + "loss": 11.1149, + "loss/aux_loss": 0.04807256907224655, + "loss/crossentropy": 2.7422623872756957, + "loss/logits": 0.8193183451890945, + "step": 51760 + }, + { + "epoch": 0.5177, + "grad_norm": 14.5, + "grad_norm_var": 0.597900390625, + "learning_rate": 0.0003, + "loss": 11.1238, + "loss/aux_loss": 0.048071306012570855, + "loss/crossentropy": 2.6727042496204376, + "loss/logits": 0.8390705615282059, + "step": 51770 + }, + { + "epoch": 0.5178, + "grad_norm": 15.4375, + "grad_norm_var": 0.311962890625, + "learning_rate": 0.0003, + "loss": 10.9609, + "loss/aux_loss": 0.04806660022586584, + "loss/crossentropy": 2.560292327404022, + "loss/logits": 0.8443563103675842, + "step": 51780 + }, + { + "epoch": 0.5179, + "grad_norm": 13.9375, + "grad_norm_var": 0.5431640625, + "learning_rate": 0.0003, + "loss": 10.9787, + "loss/aux_loss": 0.04807846024632454, + "loss/crossentropy": 2.757317876815796, + "loss/logits": 0.841183426976204, + "step": 51790 + }, + { + "epoch": 0.518, + "grad_norm": 15.4375, + "grad_norm_var": 0.5400390625, + "learning_rate": 0.0003, + "loss": 11.0011, + "loss/aux_loss": 0.04806059673428535, + "loss/crossentropy": 2.6568395853042603, + "loss/logits": 0.8394730240106583, + "step": 51800 + }, + { + "epoch": 0.5181, + "grad_norm": 14.0, + "grad_norm_var": 0.3941243489583333, + "learning_rate": 0.0003, + "loss": 10.9822, + "loss/aux_loss": 0.04807980302721262, + "loss/crossentropy": 2.7917647838592528, + "loss/logits": 0.8249937295913696, + "step": 51810 + }, + { + "epoch": 0.5182, + "grad_norm": 14.125, + "grad_norm_var": 0.6313639322916667, + "learning_rate": 0.0003, + "loss": 11.235, + "loss/aux_loss": 0.04807339478284121, + "loss/crossentropy": 2.748648017644882, + "loss/logits": 0.8033011108636856, + "step": 51820 + }, + { + "epoch": 0.5183, + "grad_norm": 15.3125, + "grad_norm_var": 0.4869140625, + "learning_rate": 0.0003, + "loss": 10.9818, + "loss/aux_loss": 0.04806631077080965, + "loss/crossentropy": 2.7276119709014894, + "loss/logits": 0.8400139749050141, + "step": 51830 + }, + { + "epoch": 0.5184, + "grad_norm": 13.5, + "grad_norm_var": 0.6833170572916667, + "learning_rate": 0.0003, + "loss": 11.1312, + "loss/aux_loss": 0.0480747552588582, + "loss/crossentropy": 2.6863146901130674, + "loss/logits": 0.8191409975290298, + "step": 51840 + }, + { + "epoch": 0.5185, + "grad_norm": 14.375, + "grad_norm_var": 0.418603515625, + "learning_rate": 0.0003, + "loss": 11.0812, + "loss/aux_loss": 0.048067673854529855, + "loss/crossentropy": 2.724157619476318, + "loss/logits": 0.8137524396181106, + "step": 51850 + }, + { + "epoch": 0.5186, + "grad_norm": 14.4375, + "grad_norm_var": 0.5212076822916667, + "learning_rate": 0.0003, + "loss": 10.9867, + "loss/aux_loss": 0.04808175042271614, + "loss/crossentropy": 2.7627050638198853, + "loss/logits": 0.8452069222927093, + "step": 51860 + }, + { + "epoch": 0.5187, + "grad_norm": 14.75, + "grad_norm_var": 0.8152180989583333, + "learning_rate": 0.0003, + "loss": 10.9896, + "loss/aux_loss": 0.048070698603987695, + "loss/crossentropy": 2.6193623900413514, + "loss/logits": 0.8333276480436325, + "step": 51870 + }, + { + "epoch": 0.5188, + "grad_norm": 14.1875, + "grad_norm_var": 0.5051432291666667, + "learning_rate": 0.0003, + "loss": 11.0474, + "loss/aux_loss": 0.04807662758976221, + "loss/crossentropy": 2.6479109644889833, + "loss/logits": 0.8272971555590629, + "step": 51880 + }, + { + "epoch": 0.5189, + "grad_norm": 14.9375, + "grad_norm_var": 0.41087239583333335, + "learning_rate": 0.0003, + "loss": 10.9789, + "loss/aux_loss": 0.048069034889340403, + "loss/crossentropy": 2.781693035364151, + "loss/logits": 0.7891067415475845, + "step": 51890 + }, + { + "epoch": 0.519, + "grad_norm": 14.8125, + "grad_norm_var": 0.46920572916666664, + "learning_rate": 0.0003, + "loss": 11.0487, + "loss/aux_loss": 0.04807375390082598, + "loss/crossentropy": 2.760209488868713, + "loss/logits": 0.8277522176504135, + "step": 51900 + }, + { + "epoch": 0.5191, + "grad_norm": 14.4375, + "grad_norm_var": 0.6077473958333334, + "learning_rate": 0.0003, + "loss": 11.1482, + "loss/aux_loss": 0.04807181041687727, + "loss/crossentropy": 2.803401565551758, + "loss/logits": 0.8468029230833054, + "step": 51910 + }, + { + "epoch": 0.5192, + "grad_norm": 13.6875, + "grad_norm_var": 0.8886555989583333, + "learning_rate": 0.0003, + "loss": 11.1805, + "loss/aux_loss": 0.048076972179114816, + "loss/crossentropy": 2.638881093263626, + "loss/logits": 0.8475582480430603, + "step": 51920 + }, + { + "epoch": 0.5193, + "grad_norm": 15.1875, + "grad_norm_var": 0.7930826822916667, + "learning_rate": 0.0003, + "loss": 11.147, + "loss/aux_loss": 0.048069387674331665, + "loss/crossentropy": 2.719567573070526, + "loss/logits": 0.8314665377140045, + "step": 51930 + }, + { + "epoch": 0.5194, + "grad_norm": 14.3125, + "grad_norm_var": 0.32916666666666666, + "learning_rate": 0.0003, + "loss": 11.2553, + "loss/aux_loss": 0.0480784498155117, + "loss/crossentropy": 2.7691810011863707, + "loss/logits": 0.8145269155502319, + "step": 51940 + }, + { + "epoch": 0.5195, + "grad_norm": 14.75, + "grad_norm_var": 0.9473307291666667, + "learning_rate": 0.0003, + "loss": 11.2179, + "loss/aux_loss": 0.04808474984019995, + "loss/crossentropy": 2.6275469183921816, + "loss/logits": 0.8544179648160934, + "step": 51950 + }, + { + "epoch": 0.5196, + "grad_norm": 14.5625, + "grad_norm_var": 0.9541666666666667, + "learning_rate": 0.0003, + "loss": 11.0911, + "loss/aux_loss": 0.04806585069745779, + "loss/crossentropy": 2.7665723621845246, + "loss/logits": 0.839416640996933, + "step": 51960 + }, + { + "epoch": 0.5197, + "grad_norm": 14.125, + "grad_norm_var": 0.9515625, + "learning_rate": 0.0003, + "loss": 11.0934, + "loss/aux_loss": 0.04807109031826258, + "loss/crossentropy": 2.7955354332923887, + "loss/logits": 0.8091706037521362, + "step": 51970 + }, + { + "epoch": 0.5198, + "grad_norm": 14.375, + "grad_norm_var": 0.790869140625, + "learning_rate": 0.0003, + "loss": 10.9905, + "loss/aux_loss": 0.04806825909763575, + "loss/crossentropy": 2.7529439866542815, + "loss/logits": 0.8357030868530273, + "step": 51980 + }, + { + "epoch": 0.5199, + "grad_norm": 13.8125, + "grad_norm_var": 0.36013997395833336, + "learning_rate": 0.0003, + "loss": 10.9674, + "loss/aux_loss": 0.048064617440104485, + "loss/crossentropy": 2.542526823282242, + "loss/logits": 0.7883663177490234, + "step": 51990 + }, + { + "epoch": 0.52, + "grad_norm": 14.5625, + "grad_norm_var": 0.5002604166666667, + "learning_rate": 0.0003, + "loss": 11.1598, + "loss/aux_loss": 0.048075161315500735, + "loss/crossentropy": 2.624401843547821, + "loss/logits": 0.8276967614889145, + "step": 52000 + }, + { + "epoch": 0.5201, + "grad_norm": 13.25, + "grad_norm_var": 0.3551432291666667, + "learning_rate": 0.0003, + "loss": 11.0968, + "loss/aux_loss": 0.048081882484257224, + "loss/crossentropy": 2.6817555725574493, + "loss/logits": 0.8237560451030731, + "step": 52010 + }, + { + "epoch": 0.5202, + "grad_norm": 14.4375, + "grad_norm_var": 0.49724934895833334, + "learning_rate": 0.0003, + "loss": 10.8817, + "loss/aux_loss": 0.0480725109577179, + "loss/crossentropy": 2.782274627685547, + "loss/logits": 0.8277134209871292, + "step": 52020 + }, + { + "epoch": 0.5203, + "grad_norm": 14.4375, + "grad_norm_var": 0.7645182291666667, + "learning_rate": 0.0003, + "loss": 11.0936, + "loss/aux_loss": 0.04807168003171682, + "loss/crossentropy": 2.7582414865493776, + "loss/logits": 0.825353017449379, + "step": 52030 + }, + { + "epoch": 0.5204, + "grad_norm": 16.0, + "grad_norm_var": 0.22120768229166668, + "learning_rate": 0.0003, + "loss": 11.1796, + "loss/aux_loss": 0.04807770941406488, + "loss/crossentropy": 2.7298890888690948, + "loss/logits": 0.8350825905799866, + "step": 52040 + }, + { + "epoch": 0.5205, + "grad_norm": 15.3125, + "grad_norm_var": 0.30857747395833335, + "learning_rate": 0.0003, + "loss": 11.1765, + "loss/aux_loss": 0.048072948679327966, + "loss/crossentropy": 2.7066911339759825, + "loss/logits": 0.79820456802845, + "step": 52050 + }, + { + "epoch": 0.5206, + "grad_norm": 15.1875, + "grad_norm_var": 0.22029622395833334, + "learning_rate": 0.0003, + "loss": 11.0453, + "loss/aux_loss": 0.04807343035936355, + "loss/crossentropy": 2.80033460855484, + "loss/logits": 0.8346506953239441, + "step": 52060 + }, + { + "epoch": 0.5207, + "grad_norm": 14.375, + "grad_norm_var": 0.9656087239583333, + "learning_rate": 0.0003, + "loss": 11.0548, + "loss/aux_loss": 0.048069695197045806, + "loss/crossentropy": 2.9161171913146973, + "loss/logits": 0.8307078570127487, + "step": 52070 + }, + { + "epoch": 0.5208, + "grad_norm": 13.4375, + "grad_norm_var": 1.5013020833333333, + "learning_rate": 0.0003, + "loss": 10.9583, + "loss/aux_loss": 0.048071876727044584, + "loss/crossentropy": 2.7011435866355895, + "loss/logits": 0.8153011113405227, + "step": 52080 + }, + { + "epoch": 0.5209, + "grad_norm": 13.875, + "grad_norm_var": 0.499853515625, + "learning_rate": 0.0003, + "loss": 11.1305, + "loss/aux_loss": 0.048079690895974636, + "loss/crossentropy": 2.848330098390579, + "loss/logits": 0.8801573872566223, + "step": 52090 + }, + { + "epoch": 0.521, + "grad_norm": 17.0, + "grad_norm_var": 0.8848307291666667, + "learning_rate": 0.0003, + "loss": 11.2081, + "loss/aux_loss": 0.04806297663599253, + "loss/crossentropy": 2.7441537618637084, + "loss/logits": 0.8512663036584854, + "step": 52100 + }, + { + "epoch": 0.5211, + "grad_norm": 13.75, + "grad_norm_var": 0.8066243489583333, + "learning_rate": 0.0003, + "loss": 11.0919, + "loss/aux_loss": 0.048075188882648945, + "loss/crossentropy": 2.779297721385956, + "loss/logits": 0.8286562114953995, + "step": 52110 + }, + { + "epoch": 0.5212, + "grad_norm": 14.875, + "grad_norm_var": 0.6700520833333333, + "learning_rate": 0.0003, + "loss": 11.0529, + "loss/aux_loss": 0.04808066878467798, + "loss/crossentropy": 2.675219976902008, + "loss/logits": 0.822179701924324, + "step": 52120 + }, + { + "epoch": 0.5213, + "grad_norm": 15.4375, + "grad_norm_var": 3.655143229166667, + "learning_rate": 0.0003, + "loss": 11.055, + "loss/aux_loss": 0.04806961119174957, + "loss/crossentropy": 2.6856570720672606, + "loss/logits": 0.8347759008407593, + "step": 52130 + }, + { + "epoch": 0.5214, + "grad_norm": 13.875, + "grad_norm_var": 1.320947265625, + "learning_rate": 0.0003, + "loss": 11.083, + "loss/aux_loss": 0.048076645098626615, + "loss/crossentropy": 2.7049236536026, + "loss/logits": 0.8613912463188171, + "step": 52140 + }, + { + "epoch": 0.5215, + "grad_norm": 15.8125, + "grad_norm_var": 1.0139973958333333, + "learning_rate": 0.0003, + "loss": 11.1862, + "loss/aux_loss": 0.04807023461908102, + "loss/crossentropy": 2.8382157564163206, + "loss/logits": 0.8311042636632919, + "step": 52150 + }, + { + "epoch": 0.5216, + "grad_norm": 14.5625, + "grad_norm_var": 0.28274739583333336, + "learning_rate": 0.0003, + "loss": 10.8209, + "loss/aux_loss": 0.04806796368211508, + "loss/crossentropy": 2.614904749393463, + "loss/logits": 0.820676788687706, + "step": 52160 + }, + { + "epoch": 0.5217, + "grad_norm": 17.375, + "grad_norm_var": 432.25670572916664, + "learning_rate": 0.0003, + "loss": 11.0011, + "loss/aux_loss": 0.04808936510235071, + "loss/crossentropy": 2.7092471361160277, + "loss/logits": 0.8464861899614334, + "step": 52170 + }, + { + "epoch": 0.5218, + "grad_norm": 14.75, + "grad_norm_var": 422.3980305989583, + "learning_rate": 0.0003, + "loss": 10.9375, + "loss/aux_loss": 0.0480745954439044, + "loss/crossentropy": 2.670332300662994, + "loss/logits": 0.8062131941318512, + "step": 52180 + }, + { + "epoch": 0.5219, + "grad_norm": 19.75, + "grad_norm_var": 66.319775390625, + "learning_rate": 0.0003, + "loss": 10.9526, + "loss/aux_loss": 0.04807987660169601, + "loss/crossentropy": 2.55713204741478, + "loss/logits": 0.7974629938602448, + "step": 52190 + }, + { + "epoch": 0.522, + "grad_norm": 16.0, + "grad_norm_var": 65.07649739583333, + "learning_rate": 0.0003, + "loss": 11.1056, + "loss/aux_loss": 0.04808241315186024, + "loss/crossentropy": 2.6287878811359406, + "loss/logits": 0.8144524425268174, + "step": 52200 + }, + { + "epoch": 0.5221, + "grad_norm": 14.875, + "grad_norm_var": 6.874983723958334, + "learning_rate": 0.0003, + "loss": 11.0077, + "loss/aux_loss": 0.04806690067052841, + "loss/crossentropy": 2.838245689868927, + "loss/logits": 0.8343179583549499, + "step": 52210 + }, + { + "epoch": 0.5222, + "grad_norm": 16.375, + "grad_norm_var": 173.25271809895833, + "learning_rate": 0.0003, + "loss": 11.0835, + "loss/aux_loss": 0.0480758348479867, + "loss/crossentropy": 2.793542319536209, + "loss/logits": 0.8210146844387054, + "step": 52220 + }, + { + "epoch": 0.5223, + "grad_norm": 16.625, + "grad_norm_var": 169.347119140625, + "learning_rate": 0.0003, + "loss": 11.2643, + "loss/aux_loss": 0.048066737875342366, + "loss/crossentropy": 2.8591265738010407, + "loss/logits": 0.8664484679698944, + "step": 52230 + }, + { + "epoch": 0.5224, + "grad_norm": 16.875, + "grad_norm_var": 1.5620930989583333, + "learning_rate": 0.0003, + "loss": 11.0528, + "loss/aux_loss": 0.048073151335120204, + "loss/crossentropy": 2.916612446308136, + "loss/logits": 0.8306013375520707, + "step": 52240 + }, + { + "epoch": 0.5225, + "grad_norm": 15.75, + "grad_norm_var": 0.9489583333333333, + "learning_rate": 0.0003, + "loss": 10.9975, + "loss/aux_loss": 0.048077428713440895, + "loss/crossentropy": 2.7287715911865233, + "loss/logits": 0.8001983672380447, + "step": 52250 + }, + { + "epoch": 0.5226, + "grad_norm": 15.3125, + "grad_norm_var": 1.1489583333333333, + "learning_rate": 0.0003, + "loss": 11.1489, + "loss/aux_loss": 0.04807046465575695, + "loss/crossentropy": 2.690057897567749, + "loss/logits": 0.8564533293247223, + "step": 52260 + }, + { + "epoch": 0.5227, + "grad_norm": 14.6875, + "grad_norm_var": 1.4202962239583334, + "learning_rate": 0.0003, + "loss": 11.1189, + "loss/aux_loss": 0.04805558752268553, + "loss/crossentropy": 2.7192665219306944, + "loss/logits": 0.8252882838249207, + "step": 52270 + }, + { + "epoch": 0.5228, + "grad_norm": 14.0625, + "grad_norm_var": 1.149853515625, + "learning_rate": 0.0003, + "loss": 11.0902, + "loss/aux_loss": 0.048078200593590734, + "loss/crossentropy": 2.6997458934783936, + "loss/logits": 0.8123593002557754, + "step": 52280 + }, + { + "epoch": 0.5229, + "grad_norm": 14.3125, + "grad_norm_var": 0.503759765625, + "learning_rate": 0.0003, + "loss": 11.0166, + "loss/aux_loss": 0.04806804172694683, + "loss/crossentropy": 2.7590546131134035, + "loss/logits": 0.8631124138832093, + "step": 52290 + }, + { + "epoch": 0.523, + "grad_norm": 15.125, + "grad_norm_var": 0.36744791666666665, + "learning_rate": 0.0003, + "loss": 11.065, + "loss/aux_loss": 0.048063941113650796, + "loss/crossentropy": 2.7753712356090547, + "loss/logits": 0.8426102191209793, + "step": 52300 + }, + { + "epoch": 0.5231, + "grad_norm": 14.875, + "grad_norm_var": 0.5161295572916667, + "learning_rate": 0.0003, + "loss": 11.0001, + "loss/aux_loss": 0.04806868564337492, + "loss/crossentropy": 2.659641718864441, + "loss/logits": 0.7856981217861175, + "step": 52310 + }, + { + "epoch": 0.5232, + "grad_norm": 15.5625, + "grad_norm_var": 0.32784830729166664, + "learning_rate": 0.0003, + "loss": 10.9511, + "loss/aux_loss": 0.04808425325900316, + "loss/crossentropy": 2.805215048789978, + "loss/logits": 0.8224257946014404, + "step": 52320 + }, + { + "epoch": 0.5233, + "grad_norm": 14.1875, + "grad_norm_var": 0.6854166666666667, + "learning_rate": 0.0003, + "loss": 11.0145, + "loss/aux_loss": 0.04806450065225363, + "loss/crossentropy": 2.7425873398780825, + "loss/logits": 0.8594145178794861, + "step": 52330 + }, + { + "epoch": 0.5234, + "grad_norm": 14.4375, + "grad_norm_var": 0.8325520833333333, + "learning_rate": 0.0003, + "loss": 10.9849, + "loss/aux_loss": 0.048070778325200084, + "loss/crossentropy": 2.6495799660682677, + "loss/logits": 0.8561073631048203, + "step": 52340 + }, + { + "epoch": 0.5235, + "grad_norm": 14.4375, + "grad_norm_var": 0.3733723958333333, + "learning_rate": 0.0003, + "loss": 11.1114, + "loss/aux_loss": 0.048080655932426455, + "loss/crossentropy": 2.7500119626522066, + "loss/logits": 0.8257864147424698, + "step": 52350 + }, + { + "epoch": 0.5236, + "grad_norm": 13.625, + "grad_norm_var": 0.33098958333333334, + "learning_rate": 0.0003, + "loss": 11.1795, + "loss/aux_loss": 0.04806145485490561, + "loss/crossentropy": 2.7029913425445558, + "loss/logits": 0.8572422236204147, + "step": 52360 + }, + { + "epoch": 0.5237, + "grad_norm": 14.75, + "grad_norm_var": 0.5702962239583333, + "learning_rate": 0.0003, + "loss": 10.9993, + "loss/aux_loss": 0.04806832876056433, + "loss/crossentropy": 2.8137829422950746, + "loss/logits": 0.8569367885589599, + "step": 52370 + }, + { + "epoch": 0.5238, + "grad_norm": 15.3125, + "grad_norm_var": 0.3067708333333333, + "learning_rate": 0.0003, + "loss": 11.19, + "loss/aux_loss": 0.04807415381073952, + "loss/crossentropy": 2.7384074926376343, + "loss/logits": 0.8470358967781066, + "step": 52380 + }, + { + "epoch": 0.5239, + "grad_norm": 15.0625, + "grad_norm_var": 0.51953125, + "learning_rate": 0.0003, + "loss": 11.1412, + "loss/aux_loss": 0.048076053522527216, + "loss/crossentropy": 2.646379691362381, + "loss/logits": 0.8290688633918762, + "step": 52390 + }, + { + "epoch": 0.524, + "grad_norm": 14.5, + "grad_norm_var": 1.0010416666666666, + "learning_rate": 0.0003, + "loss": 11.132, + "loss/aux_loss": 0.04807012863457203, + "loss/crossentropy": 2.5831472992897035, + "loss/logits": 0.8385468900203705, + "step": 52400 + }, + { + "epoch": 0.5241, + "grad_norm": 14.0625, + "grad_norm_var": 0.63046875, + "learning_rate": 0.0003, + "loss": 11.025, + "loss/aux_loss": 0.0480716010555625, + "loss/crossentropy": 2.7688582479953765, + "loss/logits": 0.81949682533741, + "step": 52410 + }, + { + "epoch": 0.5242, + "grad_norm": 14.875, + "grad_norm_var": 1.2026041666666667, + "learning_rate": 0.0003, + "loss": 11.0372, + "loss/aux_loss": 0.04806106220930815, + "loss/crossentropy": 2.662490212917328, + "loss/logits": 0.8316513210535049, + "step": 52420 + }, + { + "epoch": 0.5243, + "grad_norm": 14.5625, + "grad_norm_var": 0.7013020833333333, + "learning_rate": 0.0003, + "loss": 11.0859, + "loss/aux_loss": 0.04808855298906565, + "loss/crossentropy": 2.754934787750244, + "loss/logits": 0.8487011790275574, + "step": 52430 + }, + { + "epoch": 0.5244, + "grad_norm": 15.25, + "grad_norm_var": 0.5528645833333333, + "learning_rate": 0.0003, + "loss": 11.0998, + "loss/aux_loss": 0.04806870725005865, + "loss/crossentropy": 2.7495794236660003, + "loss/logits": 0.8377079129219055, + "step": 52440 + }, + { + "epoch": 0.5245, + "grad_norm": 14.4375, + "grad_norm_var": 0.6384765625, + "learning_rate": 0.0003, + "loss": 11.0026, + "loss/aux_loss": 0.04806353971362114, + "loss/crossentropy": 2.7716871798038483, + "loss/logits": 0.8267540782690048, + "step": 52450 + }, + { + "epoch": 0.5246, + "grad_norm": 15.6875, + "grad_norm_var": 0.8356608072916667, + "learning_rate": 0.0003, + "loss": 11.0121, + "loss/aux_loss": 0.048077926598489286, + "loss/crossentropy": 2.6334754884243012, + "loss/logits": 0.8087111979722976, + "step": 52460 + }, + { + "epoch": 0.5247, + "grad_norm": 14.375, + "grad_norm_var": 0.5186848958333333, + "learning_rate": 0.0003, + "loss": 11.1926, + "loss/aux_loss": 0.048081686906516555, + "loss/crossentropy": 2.869077742099762, + "loss/logits": 0.8490048706531524, + "step": 52470 + }, + { + "epoch": 0.5248, + "grad_norm": 58.5, + "grad_norm_var": 120.93463541666667, + "learning_rate": 0.0003, + "loss": 11.1115, + "loss/aux_loss": 0.04806402996182442, + "loss/crossentropy": 2.7439518332481385, + "loss/logits": 0.8263050705194473, + "step": 52480 + }, + { + "epoch": 0.5249, + "grad_norm": 13.75, + "grad_norm_var": 120.603369140625, + "learning_rate": 0.0003, + "loss": 11.1591, + "loss/aux_loss": 0.04807528704404831, + "loss/crossentropy": 2.756017154455185, + "loss/logits": 0.8585342705249787, + "step": 52490 + }, + { + "epoch": 0.525, + "grad_norm": 17.0, + "grad_norm_var": 0.8096354166666667, + "learning_rate": 0.0003, + "loss": 10.8914, + "loss/aux_loss": 0.04808173086494207, + "loss/crossentropy": 2.5397191107273103, + "loss/logits": 0.7909625247120857, + "step": 52500 + }, + { + "epoch": 0.5251, + "grad_norm": 15.0625, + "grad_norm_var": 0.6671223958333333, + "learning_rate": 0.0003, + "loss": 11.1209, + "loss/aux_loss": 0.04808140583336353, + "loss/crossentropy": 2.77059742808342, + "loss/logits": 0.8521383255720139, + "step": 52510 + }, + { + "epoch": 0.5252, + "grad_norm": 14.875, + "grad_norm_var": 0.5299479166666666, + "learning_rate": 0.0003, + "loss": 11.0222, + "loss/aux_loss": 0.048074524849653244, + "loss/crossentropy": 2.839930164813995, + "loss/logits": 0.8667486757040024, + "step": 52520 + }, + { + "epoch": 0.5253, + "grad_norm": 14.9375, + "grad_norm_var": 0.49347330729166666, + "learning_rate": 0.0003, + "loss": 11.1244, + "loss/aux_loss": 0.048075276613235476, + "loss/crossentropy": 2.7688077211380007, + "loss/logits": 0.8387952595949173, + "step": 52530 + }, + { + "epoch": 0.5254, + "grad_norm": 14.625, + "grad_norm_var": 0.20774739583333332, + "learning_rate": 0.0003, + "loss": 10.9861, + "loss/aux_loss": 0.048070489801466464, + "loss/crossentropy": 2.698532724380493, + "loss/logits": 0.8255835890769958, + "step": 52540 + }, + { + "epoch": 0.5255, + "grad_norm": 14.625, + "grad_norm_var": 0.23587239583333333, + "learning_rate": 0.0003, + "loss": 10.9713, + "loss/aux_loss": 0.048068863339722157, + "loss/crossentropy": 2.6561999797821043, + "loss/logits": 0.8281907647848129, + "step": 52550 + }, + { + "epoch": 0.5256, + "grad_norm": 14.9375, + "grad_norm_var": 0.49635416666666665, + "learning_rate": 0.0003, + "loss": 11.1839, + "loss/aux_loss": 0.04807021860033274, + "loss/crossentropy": 2.6908550024032594, + "loss/logits": 0.8255572736263275, + "step": 52560 + }, + { + "epoch": 0.5257, + "grad_norm": 14.875, + "grad_norm_var": 0.877978515625, + "learning_rate": 0.0003, + "loss": 11.1567, + "loss/aux_loss": 0.04808451551944017, + "loss/crossentropy": 2.6696718633174896, + "loss/logits": 0.8380467757582665, + "step": 52570 + }, + { + "epoch": 0.5258, + "grad_norm": 14.4375, + "grad_norm_var": 0.8130208333333333, + "learning_rate": 0.0003, + "loss": 11.2055, + "loss/aux_loss": 0.048067509196698666, + "loss/crossentropy": 2.7303407311439516, + "loss/logits": 0.8700813353061676, + "step": 52580 + }, + { + "epoch": 0.5259, + "grad_norm": 14.125, + "grad_norm_var": 0.3790201822916667, + "learning_rate": 0.0003, + "loss": 11.0618, + "loss/aux_loss": 0.04807270802557469, + "loss/crossentropy": 2.6694105565547943, + "loss/logits": 0.7754775255918502, + "step": 52590 + }, + { + "epoch": 0.526, + "grad_norm": 14.375, + "grad_norm_var": 0.6511555989583333, + "learning_rate": 0.0003, + "loss": 10.9769, + "loss/aux_loss": 0.048081257939338685, + "loss/crossentropy": 2.6837186098098753, + "loss/logits": 0.8063045144081116, + "step": 52600 + }, + { + "epoch": 0.5261, + "grad_norm": 14.0, + "grad_norm_var": 0.396728515625, + "learning_rate": 0.0003, + "loss": 11.0115, + "loss/aux_loss": 0.04805648773908615, + "loss/crossentropy": 2.7159491300582888, + "loss/logits": 0.8259630739688874, + "step": 52610 + }, + { + "epoch": 0.5262, + "grad_norm": 14.1875, + "grad_norm_var": 0.3636555989583333, + "learning_rate": 0.0003, + "loss": 11.1381, + "loss/aux_loss": 0.048068524710834025, + "loss/crossentropy": 2.7253064274787904, + "loss/logits": 0.8619579613208771, + "step": 52620 + }, + { + "epoch": 0.5263, + "grad_norm": 16.625, + "grad_norm_var": 0.5613932291666667, + "learning_rate": 0.0003, + "loss": 11.0564, + "loss/aux_loss": 0.048091832920908927, + "loss/crossentropy": 2.6312204539775848, + "loss/logits": 0.8314170449972152, + "step": 52630 + }, + { + "epoch": 0.5264, + "grad_norm": 15.6875, + "grad_norm_var": 0.6113118489583333, + "learning_rate": 0.0003, + "loss": 10.9914, + "loss/aux_loss": 0.04807051923125982, + "loss/crossentropy": 2.6146963119506834, + "loss/logits": 0.8094371676445007, + "step": 52640 + }, + { + "epoch": 0.5265, + "grad_norm": 14.625, + "grad_norm_var": 0.5033854166666667, + "learning_rate": 0.0003, + "loss": 11.0347, + "loss/aux_loss": 0.048073076829314235, + "loss/crossentropy": 2.6342477977275847, + "loss/logits": 0.8295989811420441, + "step": 52650 + }, + { + "epoch": 0.5266, + "grad_norm": 14.3125, + "grad_norm_var": 0.3854166666666667, + "learning_rate": 0.0003, + "loss": 11.14, + "loss/aux_loss": 0.04807663932442665, + "loss/crossentropy": 2.7020954489707947, + "loss/logits": 0.8581462055444717, + "step": 52660 + }, + { + "epoch": 0.5267, + "grad_norm": 14.125, + "grad_norm_var": 0.30831705729166664, + "learning_rate": 0.0003, + "loss": 11.1963, + "loss/aux_loss": 0.04808024019002914, + "loss/crossentropy": 2.7205568671226503, + "loss/logits": 0.8558012962341308, + "step": 52670 + }, + { + "epoch": 0.5268, + "grad_norm": 14.5625, + "grad_norm_var": 0.48483072916666664, + "learning_rate": 0.0003, + "loss": 11.2696, + "loss/aux_loss": 0.04806710928678513, + "loss/crossentropy": 2.6928990364074705, + "loss/logits": 0.8565327882766723, + "step": 52680 + }, + { + "epoch": 0.5269, + "grad_norm": 13.9375, + "grad_norm_var": 0.33326822916666665, + "learning_rate": 0.0003, + "loss": 11.0155, + "loss/aux_loss": 0.04807606115937233, + "loss/crossentropy": 2.7816842436790465, + "loss/logits": 0.8381152004003525, + "step": 52690 + }, + { + "epoch": 0.527, + "grad_norm": 14.3125, + "grad_norm_var": 0.5822265625, + "learning_rate": 0.0003, + "loss": 10.9005, + "loss/aux_loss": 0.04807661436498165, + "loss/crossentropy": 2.7815606117248537, + "loss/logits": 0.8124045938253402, + "step": 52700 + }, + { + "epoch": 0.5271, + "grad_norm": 19.125, + "grad_norm_var": 126.51925455729166, + "learning_rate": 0.0003, + "loss": 11.0246, + "loss/aux_loss": 0.04807195011526346, + "loss/crossentropy": 2.7958267748355867, + "loss/logits": 0.8013758540153504, + "step": 52710 + }, + { + "epoch": 0.5272, + "grad_norm": 15.5625, + "grad_norm_var": 1.7684895833333334, + "learning_rate": 0.0003, + "loss": 10.8688, + "loss/aux_loss": 0.04807553049176931, + "loss/crossentropy": 2.6325803816318514, + "loss/logits": 0.774325168132782, + "step": 52720 + }, + { + "epoch": 0.5273, + "grad_norm": 16.25, + "grad_norm_var": 0.6153483072916667, + "learning_rate": 0.0003, + "loss": 10.9838, + "loss/aux_loss": 0.048076539672911166, + "loss/crossentropy": 2.693459987640381, + "loss/logits": 0.8326463222503662, + "step": 52730 + }, + { + "epoch": 0.5274, + "grad_norm": 14.9375, + "grad_norm_var": 0.92890625, + "learning_rate": 0.0003, + "loss": 11.0022, + "loss/aux_loss": 0.048076620884239675, + "loss/crossentropy": 2.664745795726776, + "loss/logits": 0.7966249287128448, + "step": 52740 + }, + { + "epoch": 0.5275, + "grad_norm": 14.25, + "grad_norm_var": 0.7697265625, + "learning_rate": 0.0003, + "loss": 11.0997, + "loss/aux_loss": 0.048067349940538406, + "loss/crossentropy": 2.7236247181892397, + "loss/logits": 0.8551719903945922, + "step": 52750 + }, + { + "epoch": 0.5276, + "grad_norm": 16.125, + "grad_norm_var": 1.3202473958333334, + "learning_rate": 0.0003, + "loss": 11.1669, + "loss/aux_loss": 0.04806178268045187, + "loss/crossentropy": 2.6296289205551147, + "loss/logits": 0.8232845932245254, + "step": 52760 + }, + { + "epoch": 0.5277, + "grad_norm": 14.125, + "grad_norm_var": 1.2356608072916666, + "learning_rate": 0.0003, + "loss": 10.9658, + "loss/aux_loss": 0.048072172701358794, + "loss/crossentropy": 2.7454709470272065, + "loss/logits": 0.8835760146379471, + "step": 52770 + }, + { + "epoch": 0.5278, + "grad_norm": 13.8125, + "grad_norm_var": 0.46608072916666665, + "learning_rate": 0.0003, + "loss": 10.9155, + "loss/aux_loss": 0.04808282610028982, + "loss/crossentropy": 2.6181671559810638, + "loss/logits": 0.7878573626279831, + "step": 52780 + }, + { + "epoch": 0.5279, + "grad_norm": 14.0, + "grad_norm_var": 0.4754557291666667, + "learning_rate": 0.0003, + "loss": 11.1934, + "loss/aux_loss": 0.04805999156087637, + "loss/crossentropy": 2.5984963536262513, + "loss/logits": 0.827095377445221, + "step": 52790 + }, + { + "epoch": 0.528, + "grad_norm": 15.8125, + "grad_norm_var": 3.491650390625, + "learning_rate": 0.0003, + "loss": 10.891, + "loss/aux_loss": 0.048077487759292124, + "loss/crossentropy": 2.659852463006973, + "loss/logits": 0.8111390471458435, + "step": 52800 + }, + { + "epoch": 0.5281, + "grad_norm": 17.0, + "grad_norm_var": 1.5148274739583334, + "learning_rate": 0.0003, + "loss": 11.0344, + "loss/aux_loss": 0.048082652315497396, + "loss/crossentropy": 2.7374175548553468, + "loss/logits": 0.8587904393672943, + "step": 52810 + }, + { + "epoch": 0.5282, + "grad_norm": 15.0, + "grad_norm_var": 0.9212890625, + "learning_rate": 0.0003, + "loss": 11.0007, + "loss/aux_loss": 0.048067308217287066, + "loss/crossentropy": 2.6461476027965545, + "loss/logits": 0.8455465078353882, + "step": 52820 + }, + { + "epoch": 0.5283, + "grad_norm": 14.6875, + "grad_norm_var": 0.19791666666666666, + "learning_rate": 0.0003, + "loss": 11.1109, + "loss/aux_loss": 0.04806644786149263, + "loss/crossentropy": 2.6328052401542665, + "loss/logits": 0.8342522650957107, + "step": 52830 + }, + { + "epoch": 0.5284, + "grad_norm": 14.375, + "grad_norm_var": 0.9562337239583333, + "learning_rate": 0.0003, + "loss": 11.086, + "loss/aux_loss": 0.04808475598692894, + "loss/crossentropy": 2.6256862759590147, + "loss/logits": 0.7829153060913085, + "step": 52840 + }, + { + "epoch": 0.5285, + "grad_norm": 17.0, + "grad_norm_var": 1.2145670572916667, + "learning_rate": 0.0003, + "loss": 10.9496, + "loss/aux_loss": 0.04806415122002363, + "loss/crossentropy": 2.6609319686889648, + "loss/logits": 0.8291169613599777, + "step": 52850 + }, + { + "epoch": 0.5286, + "grad_norm": 15.25, + "grad_norm_var": 0.740478515625, + "learning_rate": 0.0003, + "loss": 11.0984, + "loss/aux_loss": 0.04807539191097021, + "loss/crossentropy": 2.7916451573371885, + "loss/logits": 0.8835980743169785, + "step": 52860 + }, + { + "epoch": 0.5287, + "grad_norm": 15.0625, + "grad_norm_var": 0.3244140625, + "learning_rate": 0.0003, + "loss": 11.1493, + "loss/aux_loss": 0.04807141460478306, + "loss/crossentropy": 2.6395734310150147, + "loss/logits": 0.8369731396436692, + "step": 52870 + }, + { + "epoch": 0.5288, + "grad_norm": 15.0625, + "grad_norm_var": 0.3999837239583333, + "learning_rate": 0.0003, + "loss": 11.1769, + "loss/aux_loss": 0.0480716809630394, + "loss/crossentropy": 2.646906042098999, + "loss/logits": 0.8252363950014114, + "step": 52880 + }, + { + "epoch": 0.5289, + "grad_norm": 14.6875, + "grad_norm_var": 0.53359375, + "learning_rate": 0.0003, + "loss": 10.8501, + "loss/aux_loss": 0.0480677381157875, + "loss/crossentropy": 2.6961144506931305, + "loss/logits": 0.8162854909896851, + "step": 52890 + }, + { + "epoch": 0.529, + "grad_norm": 14.3125, + "grad_norm_var": 0.9093098958333333, + "learning_rate": 0.0003, + "loss": 11.0589, + "loss/aux_loss": 0.04807200450450182, + "loss/crossentropy": 2.698295068740845, + "loss/logits": 0.8631285429000854, + "step": 52900 + }, + { + "epoch": 0.5291, + "grad_norm": 13.5, + "grad_norm_var": 0.7817708333333333, + "learning_rate": 0.0003, + "loss": 11.1834, + "loss/aux_loss": 0.048069717921316625, + "loss/crossentropy": 2.6712867975234986, + "loss/logits": 0.8359966963529587, + "step": 52910 + }, + { + "epoch": 0.5292, + "grad_norm": 14.75, + "grad_norm_var": 0.6963541666666667, + "learning_rate": 0.0003, + "loss": 11.0659, + "loss/aux_loss": 0.04808163065463304, + "loss/crossentropy": 2.8286949574947355, + "loss/logits": 0.8520474523305893, + "step": 52920 + }, + { + "epoch": 0.5293, + "grad_norm": 13.375, + "grad_norm_var": 218.29959309895833, + "learning_rate": 0.0003, + "loss": 11.0394, + "loss/aux_loss": 0.04807936865836382, + "loss/crossentropy": 2.6493199944496153, + "loss/logits": 0.8226024299860001, + "step": 52930 + }, + { + "epoch": 0.5294, + "grad_norm": 14.375, + "grad_norm_var": 1.834375, + "learning_rate": 0.0003, + "loss": 11.0009, + "loss/aux_loss": 0.04807878416031599, + "loss/crossentropy": 2.7264573156833647, + "loss/logits": 0.8180064380168914, + "step": 52940 + }, + { + "epoch": 0.5295, + "grad_norm": 17.875, + "grad_norm_var": 209.07029622395834, + "learning_rate": 0.0003, + "loss": 11.0623, + "loss/aux_loss": 0.04807962104678154, + "loss/crossentropy": 2.6582289695739747, + "loss/logits": 0.8249925941228866, + "step": 52950 + }, + { + "epoch": 0.5296, + "grad_norm": 16.125, + "grad_norm_var": 205.12589518229166, + "learning_rate": 0.0003, + "loss": 11.0608, + "loss/aux_loss": 0.048064501583576204, + "loss/crossentropy": 2.701625847816467, + "loss/logits": 0.8246585041284561, + "step": 52960 + }, + { + "epoch": 0.5297, + "grad_norm": 15.5625, + "grad_norm_var": 1.485400390625, + "learning_rate": 0.0003, + "loss": 11.118, + "loss/aux_loss": 0.04807926844805479, + "loss/crossentropy": 2.733833837509155, + "loss/logits": 0.8345234960317611, + "step": 52970 + }, + { + "epoch": 0.5298, + "grad_norm": 14.875, + "grad_norm_var": 10.382747395833333, + "learning_rate": 0.0003, + "loss": 11.1176, + "loss/aux_loss": 0.04808099288493395, + "loss/crossentropy": 2.7490680694580076, + "loss/logits": 0.838299173116684, + "step": 52980 + }, + { + "epoch": 0.5299, + "grad_norm": 13.625, + "grad_norm_var": 10.414827473958333, + "learning_rate": 0.0003, + "loss": 11.0292, + "loss/aux_loss": 0.048076806217432023, + "loss/crossentropy": 2.852311670780182, + "loss/logits": 0.8578749477863312, + "step": 52990 + }, + { + "epoch": 0.53, + "grad_norm": 14.5, + "grad_norm_var": 1.9286295572916667, + "learning_rate": 0.0003, + "loss": 11.0788, + "loss/aux_loss": 0.04808791987597942, + "loss/crossentropy": 2.48682958483696, + "loss/logits": 0.7837358355522156, + "step": 53000 + }, + { + "epoch": 0.5301, + "grad_norm": 15.25, + "grad_norm_var": 0.5119791666666667, + "learning_rate": 0.0003, + "loss": 11.0287, + "loss/aux_loss": 0.04806363768875599, + "loss/crossentropy": 2.7360428392887117, + "loss/logits": 0.8066596657037735, + "step": 53010 + }, + { + "epoch": 0.5302, + "grad_norm": 14.0625, + "grad_norm_var": 1.45078125, + "learning_rate": 0.0003, + "loss": 11.0826, + "loss/aux_loss": 0.048080742731690405, + "loss/crossentropy": 2.624992382526398, + "loss/logits": 0.8372664958238601, + "step": 53020 + }, + { + "epoch": 0.5303, + "grad_norm": 17.125, + "grad_norm_var": 1.5770833333333334, + "learning_rate": 0.0003, + "loss": 10.9728, + "loss/aux_loss": 0.0480709882453084, + "loss/crossentropy": 2.6210521042346953, + "loss/logits": 0.8308377593755722, + "step": 53030 + }, + { + "epoch": 0.5304, + "grad_norm": 14.75, + "grad_norm_var": 1.211962890625, + "learning_rate": 0.0003, + "loss": 11.0566, + "loss/aux_loss": 0.04806949980556965, + "loss/crossentropy": 2.679835093021393, + "loss/logits": 0.8029073655605317, + "step": 53040 + }, + { + "epoch": 0.5305, + "grad_norm": 15.3125, + "grad_norm_var": 0.3603515625, + "learning_rate": 0.0003, + "loss": 11.064, + "loss/aux_loss": 0.04807988815009594, + "loss/crossentropy": 2.649093449115753, + "loss/logits": 0.8222986310720444, + "step": 53050 + }, + { + "epoch": 0.5306, + "grad_norm": 15.1875, + "grad_norm_var": 0.5082682291666667, + "learning_rate": 0.0003, + "loss": 11.0953, + "loss/aux_loss": 0.04807553198188543, + "loss/crossentropy": 2.7045671463012697, + "loss/logits": 0.815049484372139, + "step": 53060 + }, + { + "epoch": 0.5307, + "grad_norm": 14.875, + "grad_norm_var": 0.33255208333333336, + "learning_rate": 0.0003, + "loss": 11.0753, + "loss/aux_loss": 0.04806431755423546, + "loss/crossentropy": 2.7456027269363403, + "loss/logits": 0.8385494530200959, + "step": 53070 + }, + { + "epoch": 0.5308, + "grad_norm": 14.3125, + "grad_norm_var": 0.260009765625, + "learning_rate": 0.0003, + "loss": 10.9512, + "loss/aux_loss": 0.04808401577174663, + "loss/crossentropy": 2.5376985907554626, + "loss/logits": 0.8007230907678604, + "step": 53080 + }, + { + "epoch": 0.5309, + "grad_norm": 15.0625, + "grad_norm_var": 3.8843098958333333, + "learning_rate": 0.0003, + "loss": 10.9647, + "loss/aux_loss": 0.04807156920433044, + "loss/crossentropy": 2.8426152527332307, + "loss/logits": 0.8263060122728347, + "step": 53090 + }, + { + "epoch": 0.531, + "grad_norm": 14.25, + "grad_norm_var": 3.9544270833333335, + "learning_rate": 0.0003, + "loss": 11.1311, + "loss/aux_loss": 0.048070313036441804, + "loss/crossentropy": 2.679327297210693, + "loss/logits": 0.8237017244100571, + "step": 53100 + }, + { + "epoch": 0.5311, + "grad_norm": 14.1875, + "grad_norm_var": 0.3791015625, + "learning_rate": 0.0003, + "loss": 11.1016, + "loss/aux_loss": 0.04808187987655401, + "loss/crossentropy": 2.571546399593353, + "loss/logits": 0.8291330844163894, + "step": 53110 + }, + { + "epoch": 0.5312, + "grad_norm": 14.25, + "grad_norm_var": 0.44244791666666666, + "learning_rate": 0.0003, + "loss": 11.0984, + "loss/aux_loss": 0.04806428123265505, + "loss/crossentropy": 2.682811915874481, + "loss/logits": 0.8410823673009873, + "step": 53120 + }, + { + "epoch": 0.5313, + "grad_norm": 14.0625, + "grad_norm_var": 0.443994140625, + "learning_rate": 0.0003, + "loss": 11.0374, + "loss/aux_loss": 0.04808186236768961, + "loss/crossentropy": 2.8723417639732363, + "loss/logits": 0.8567765563726425, + "step": 53130 + }, + { + "epoch": 0.5314, + "grad_norm": 14.0, + "grad_norm_var": 0.48943684895833334, + "learning_rate": 0.0003, + "loss": 11.0291, + "loss/aux_loss": 0.048074983060359955, + "loss/crossentropy": 2.634695219993591, + "loss/logits": 0.836324593424797, + "step": 53140 + }, + { + "epoch": 0.5315, + "grad_norm": 14.625, + "grad_norm_var": 0.8124837239583333, + "learning_rate": 0.0003, + "loss": 11.1395, + "loss/aux_loss": 0.04806621633470058, + "loss/crossentropy": 2.740517848730087, + "loss/logits": 0.8308217048645019, + "step": 53150 + }, + { + "epoch": 0.5316, + "grad_norm": 13.1875, + "grad_norm_var": 0.7023274739583333, + "learning_rate": 0.0003, + "loss": 11.0551, + "loss/aux_loss": 0.04807417429983616, + "loss/crossentropy": 2.7302970767021177, + "loss/logits": 0.8507604837417603, + "step": 53160 + }, + { + "epoch": 0.5317, + "grad_norm": 15.1875, + "grad_norm_var": 1.5652180989583333, + "learning_rate": 0.0003, + "loss": 11.0137, + "loss/aux_loss": 0.04807768277823925, + "loss/crossentropy": 2.676066642999649, + "loss/logits": 0.8240761816501617, + "step": 53170 + }, + { + "epoch": 0.5318, + "grad_norm": 14.3125, + "grad_norm_var": 0.4925618489583333, + "learning_rate": 0.0003, + "loss": 11.1415, + "loss/aux_loss": 0.04806169308722019, + "loss/crossentropy": 2.6013071119785307, + "loss/logits": 0.8342882752418518, + "step": 53180 + }, + { + "epoch": 0.5319, + "grad_norm": 14.625, + "grad_norm_var": 0.3927083333333333, + "learning_rate": 0.0003, + "loss": 11.129, + "loss/aux_loss": 0.04808182790875435, + "loss/crossentropy": 2.7180078864097594, + "loss/logits": 0.8595794111490249, + "step": 53190 + }, + { + "epoch": 0.532, + "grad_norm": 15.0625, + "grad_norm_var": 0.19212239583333332, + "learning_rate": 0.0003, + "loss": 11.0407, + "loss/aux_loss": 0.04806814473122358, + "loss/crossentropy": 2.6401141822338103, + "loss/logits": 0.7996371448040008, + "step": 53200 + }, + { + "epoch": 0.5321, + "grad_norm": 14.4375, + "grad_norm_var": 0.2530598958333333, + "learning_rate": 0.0003, + "loss": 11.0913, + "loss/aux_loss": 0.04807532113045454, + "loss/crossentropy": 2.7931397438049315, + "loss/logits": 0.7978598833084106, + "step": 53210 + }, + { + "epoch": 0.5322, + "grad_norm": 14.875, + "grad_norm_var": 0.28854166666666664, + "learning_rate": 0.0003, + "loss": 11.0714, + "loss/aux_loss": 0.04807628560811281, + "loss/crossentropy": 2.6044381737709044, + "loss/logits": 0.8384490758180618, + "step": 53220 + }, + { + "epoch": 0.5323, + "grad_norm": 15.0625, + "grad_norm_var": 0.3311848958333333, + "learning_rate": 0.0003, + "loss": 11.0452, + "loss/aux_loss": 0.048069493100047114, + "loss/crossentropy": 2.725711923837662, + "loss/logits": 0.8274922966957092, + "step": 53230 + }, + { + "epoch": 0.5324, + "grad_norm": 14.3125, + "grad_norm_var": 0.6875, + "learning_rate": 0.0003, + "loss": 11.0122, + "loss/aux_loss": 0.0480713777244091, + "loss/crossentropy": 2.7457379400730133, + "loss/logits": 0.8154986947774887, + "step": 53240 + }, + { + "epoch": 0.5325, + "grad_norm": 14.3125, + "grad_norm_var": 0.4058430989583333, + "learning_rate": 0.0003, + "loss": 11.0645, + "loss/aux_loss": 0.048076901398599145, + "loss/crossentropy": 2.644556438922882, + "loss/logits": 0.7974002599716187, + "step": 53250 + }, + { + "epoch": 0.5326, + "grad_norm": 14.5, + "grad_norm_var": 1.065478515625, + "learning_rate": 0.0003, + "loss": 11.1355, + "loss/aux_loss": 0.04806389715522528, + "loss/crossentropy": 2.6610435485839843, + "loss/logits": 0.8192419022321701, + "step": 53260 + }, + { + "epoch": 0.5327, + "grad_norm": 14.3125, + "grad_norm_var": 1.362353515625, + "learning_rate": 0.0003, + "loss": 11.0868, + "loss/aux_loss": 0.04807983003556728, + "loss/crossentropy": 2.726958382129669, + "loss/logits": 0.8118180692195892, + "step": 53270 + }, + { + "epoch": 0.5328, + "grad_norm": 15.5625, + "grad_norm_var": 0.5813639322916667, + "learning_rate": 0.0003, + "loss": 11.1905, + "loss/aux_loss": 0.048070806078612804, + "loss/crossentropy": 2.5736697733402254, + "loss/logits": 0.8248791873455048, + "step": 53280 + }, + { + "epoch": 0.5329, + "grad_norm": 14.3125, + "grad_norm_var": 0.5400390625, + "learning_rate": 0.0003, + "loss": 11.117, + "loss/aux_loss": 0.04807869885116815, + "loss/crossentropy": 2.789425587654114, + "loss/logits": 0.8584134370088577, + "step": 53290 + }, + { + "epoch": 0.533, + "grad_norm": 14.25, + "grad_norm_var": 0.3907389322916667, + "learning_rate": 0.0003, + "loss": 11.0787, + "loss/aux_loss": 0.048071909509599206, + "loss/crossentropy": 2.6537403225898744, + "loss/logits": 0.8524068266153335, + "step": 53300 + }, + { + "epoch": 0.5331, + "grad_norm": 16.25, + "grad_norm_var": 1.6032389322916667, + "learning_rate": 0.0003, + "loss": 11.2028, + "loss/aux_loss": 0.04807873219251633, + "loss/crossentropy": 2.86398446559906, + "loss/logits": 0.84943727850914, + "step": 53310 + }, + { + "epoch": 0.5332, + "grad_norm": 15.4375, + "grad_norm_var": 0.3870930989583333, + "learning_rate": 0.0003, + "loss": 11.0856, + "loss/aux_loss": 0.048069454915821555, + "loss/crossentropy": 2.6376845240592957, + "loss/logits": 0.845120832324028, + "step": 53320 + }, + { + "epoch": 0.5333, + "grad_norm": 14.0625, + "grad_norm_var": 0.5899576822916667, + "learning_rate": 0.0003, + "loss": 11.0704, + "loss/aux_loss": 0.04807726014405489, + "loss/crossentropy": 2.852811598777771, + "loss/logits": 0.808278375864029, + "step": 53330 + }, + { + "epoch": 0.5334, + "grad_norm": 15.0625, + "grad_norm_var": 0.6477701822916667, + "learning_rate": 0.0003, + "loss": 10.8892, + "loss/aux_loss": 0.04806978944689035, + "loss/crossentropy": 2.7639645457267763, + "loss/logits": 0.8498774021863937, + "step": 53340 + }, + { + "epoch": 0.5335, + "grad_norm": 15.125, + "grad_norm_var": 0.7806640625, + "learning_rate": 0.0003, + "loss": 10.8939, + "loss/aux_loss": 0.048068863339722157, + "loss/crossentropy": 2.472467356920242, + "loss/logits": 0.7923805028200149, + "step": 53350 + }, + { + "epoch": 0.5336, + "grad_norm": 14.4375, + "grad_norm_var": 0.6285807291666666, + "learning_rate": 0.0003, + "loss": 10.9191, + "loss/aux_loss": 0.04807096607983112, + "loss/crossentropy": 2.535427379608154, + "loss/logits": 0.8171293288469315, + "step": 53360 + }, + { + "epoch": 0.5337, + "grad_norm": 15.3125, + "grad_norm_var": 0.27029622395833336, + "learning_rate": 0.0003, + "loss": 10.9995, + "loss/aux_loss": 0.04807757455855608, + "loss/crossentropy": 2.699053144454956, + "loss/logits": 0.8358200043439865, + "step": 53370 + }, + { + "epoch": 0.5338, + "grad_norm": 14.5625, + "grad_norm_var": 0.37916666666666665, + "learning_rate": 0.0003, + "loss": 10.9628, + "loss/aux_loss": 0.04807063937187195, + "loss/crossentropy": 2.595501071214676, + "loss/logits": 0.8261586040258407, + "step": 53380 + }, + { + "epoch": 0.5339, + "grad_norm": 14.3125, + "grad_norm_var": 0.746875, + "learning_rate": 0.0003, + "loss": 11.0618, + "loss/aux_loss": 0.04807272292673588, + "loss/crossentropy": 2.6923603653907775, + "loss/logits": 0.8231137573719025, + "step": 53390 + }, + { + "epoch": 0.534, + "grad_norm": 15.1875, + "grad_norm_var": 0.44869791666666664, + "learning_rate": 0.0003, + "loss": 10.979, + "loss/aux_loss": 0.04807500522583723, + "loss/crossentropy": 2.829385429620743, + "loss/logits": 0.8415767669677734, + "step": 53400 + }, + { + "epoch": 0.5341, + "grad_norm": 14.125, + "grad_norm_var": 0.3702473958333333, + "learning_rate": 0.0003, + "loss": 10.9503, + "loss/aux_loss": 0.04806397818028927, + "loss/crossentropy": 2.4466098248958588, + "loss/logits": 0.7799597263336182, + "step": 53410 + }, + { + "epoch": 0.5342, + "grad_norm": 14.25, + "grad_norm_var": 0.5653483072916666, + "learning_rate": 0.0003, + "loss": 11.1548, + "loss/aux_loss": 0.04807454627007246, + "loss/crossentropy": 2.669411617517471, + "loss/logits": 0.8267213612794876, + "step": 53420 + }, + { + "epoch": 0.5343, + "grad_norm": 15.9375, + "grad_norm_var": 0.8238932291666666, + "learning_rate": 0.0003, + "loss": 10.9296, + "loss/aux_loss": 0.048065404407680035, + "loss/crossentropy": 2.78000248670578, + "loss/logits": 0.8359499126672745, + "step": 53430 + }, + { + "epoch": 0.5344, + "grad_norm": 14.4375, + "grad_norm_var": 0.276025390625, + "learning_rate": 0.0003, + "loss": 11.0951, + "loss/aux_loss": 0.04806756917387247, + "loss/crossentropy": 2.854048955440521, + "loss/logits": 0.8441928833723068, + "step": 53440 + }, + { + "epoch": 0.5345, + "grad_norm": 13.5, + "grad_norm_var": 0.8166015625, + "learning_rate": 0.0003, + "loss": 11.1276, + "loss/aux_loss": 0.04807045813649893, + "loss/crossentropy": 2.7718479573726653, + "loss/logits": 0.8208920061588287, + "step": 53450 + }, + { + "epoch": 0.5346, + "grad_norm": 15.0625, + "grad_norm_var": 1.2453125, + "learning_rate": 0.0003, + "loss": 11.1, + "loss/aux_loss": 0.048068010993301866, + "loss/crossentropy": 2.890912193059921, + "loss/logits": 0.812038055062294, + "step": 53460 + }, + { + "epoch": 0.5347, + "grad_norm": 15.875, + "grad_norm_var": 0.943994140625, + "learning_rate": 0.0003, + "loss": 11.0452, + "loss/aux_loss": 0.04808399192988873, + "loss/crossentropy": 2.8517406702041628, + "loss/logits": 0.8417524635791779, + "step": 53470 + }, + { + "epoch": 0.5348, + "grad_norm": 14.9375, + "grad_norm_var": 0.9677083333333333, + "learning_rate": 0.0003, + "loss": 10.964, + "loss/aux_loss": 0.048053649812936784, + "loss/crossentropy": 2.7846075654029847, + "loss/logits": 0.8391733020544052, + "step": 53480 + }, + { + "epoch": 0.5349, + "grad_norm": 15.75, + "grad_norm_var": 0.26171875, + "learning_rate": 0.0003, + "loss": 11.0427, + "loss/aux_loss": 0.04807787444442511, + "loss/crossentropy": 2.6856286406517027, + "loss/logits": 0.8094239175319672, + "step": 53490 + }, + { + "epoch": 0.535, + "grad_norm": 15.3125, + "grad_norm_var": 0.5763020833333333, + "learning_rate": 0.0003, + "loss": 11.0114, + "loss/aux_loss": 0.048055261932313444, + "loss/crossentropy": 2.517116981744766, + "loss/logits": 0.7816114693880081, + "step": 53500 + }, + { + "epoch": 0.5351, + "grad_norm": 13.6875, + "grad_norm_var": 0.5478515625, + "learning_rate": 0.0003, + "loss": 11.1496, + "loss/aux_loss": 0.0480762280523777, + "loss/crossentropy": 2.9342130780220033, + "loss/logits": 0.856569093465805, + "step": 53510 + }, + { + "epoch": 0.5352, + "grad_norm": 14.125, + "grad_norm_var": 5.563525390625, + "learning_rate": 0.0003, + "loss": 11.0715, + "loss/aux_loss": 0.04807373471558094, + "loss/crossentropy": 2.8552743911743166, + "loss/logits": 0.8635714590549469, + "step": 53520 + }, + { + "epoch": 0.5353, + "grad_norm": 14.4375, + "grad_norm_var": 80.24724934895833, + "learning_rate": 0.0003, + "loss": 11.065, + "loss/aux_loss": 0.04806609898805618, + "loss/crossentropy": 2.737239396572113, + "loss/logits": 0.8101500362157822, + "step": 53530 + }, + { + "epoch": 0.5354, + "grad_norm": 15.8125, + "grad_norm_var": 79.16443684895833, + "learning_rate": 0.0003, + "loss": 10.8988, + "loss/aux_loss": 0.04807798117399216, + "loss/crossentropy": 2.601036584377289, + "loss/logits": 0.8181641131639481, + "step": 53540 + }, + { + "epoch": 0.5355, + "grad_norm": 15.1875, + "grad_norm_var": 0.45779622395833336, + "learning_rate": 0.0003, + "loss": 11.0052, + "loss/aux_loss": 0.04806607346981764, + "loss/crossentropy": 2.7229600071907045, + "loss/logits": 0.8561374306678772, + "step": 53550 + }, + { + "epoch": 0.5356, + "grad_norm": 17.25, + "grad_norm_var": 1.2523274739583334, + "learning_rate": 0.0003, + "loss": 11.011, + "loss/aux_loss": 0.04806992299854755, + "loss/crossentropy": 2.877775228023529, + "loss/logits": 0.8536604076623917, + "step": 53560 + }, + { + "epoch": 0.5357, + "grad_norm": 14.125, + "grad_norm_var": 1.068994140625, + "learning_rate": 0.0003, + "loss": 10.9918, + "loss/aux_loss": 0.04807414263486862, + "loss/crossentropy": 2.705450266599655, + "loss/logits": 0.8226163148880005, + "step": 53570 + }, + { + "epoch": 0.5358, + "grad_norm": 15.8125, + "grad_norm_var": 0.403369140625, + "learning_rate": 0.0003, + "loss": 10.838, + "loss/aux_loss": 0.0480672538280487, + "loss/crossentropy": 2.744877350330353, + "loss/logits": 0.793895834684372, + "step": 53580 + }, + { + "epoch": 0.5359, + "grad_norm": 15.0625, + "grad_norm_var": 0.9878743489583334, + "learning_rate": 0.0003, + "loss": 11.1031, + "loss/aux_loss": 0.04807367753237486, + "loss/crossentropy": 2.871882838010788, + "loss/logits": 0.8179901033639908, + "step": 53590 + }, + { + "epoch": 0.536, + "grad_norm": 14.4375, + "grad_norm_var": 0.9676432291666667, + "learning_rate": 0.0003, + "loss": 10.9992, + "loss/aux_loss": 0.048075889237225056, + "loss/crossentropy": 2.8211424231529234, + "loss/logits": 0.8252835303544999, + "step": 53600 + }, + { + "epoch": 0.5361, + "grad_norm": 14.875, + "grad_norm_var": 0.7976399739583333, + "learning_rate": 0.0003, + "loss": 11.194, + "loss/aux_loss": 0.048059284873306754, + "loss/crossentropy": 2.791327440738678, + "loss/logits": 0.8332475572824478, + "step": 53610 + }, + { + "epoch": 0.5362, + "grad_norm": 13.6875, + "grad_norm_var": 1.3942057291666667, + "learning_rate": 0.0003, + "loss": 11.1379, + "loss/aux_loss": 0.04807420019060373, + "loss/crossentropy": 2.5513383507728578, + "loss/logits": 0.7817022532224656, + "step": 53620 + }, + { + "epoch": 0.5363, + "grad_norm": 14.625, + "grad_norm_var": 6.633317057291666, + "learning_rate": 0.0003, + "loss": 11.0451, + "loss/aux_loss": 0.04806692767888308, + "loss/crossentropy": 2.764022743701935, + "loss/logits": 0.8246651530265808, + "step": 53630 + }, + { + "epoch": 0.5364, + "grad_norm": 14.25, + "grad_norm_var": 6.286197916666667, + "learning_rate": 0.0003, + "loss": 10.9723, + "loss/aux_loss": 0.0480807863175869, + "loss/crossentropy": 2.5619694709777834, + "loss/logits": 0.8253705441951752, + "step": 53640 + }, + { + "epoch": 0.5365, + "grad_norm": 13.625, + "grad_norm_var": 0.6011555989583334, + "learning_rate": 0.0003, + "loss": 10.9507, + "loss/aux_loss": 0.04806951284408569, + "loss/crossentropy": 2.7298443794250487, + "loss/logits": 0.8315545797348023, + "step": 53650 + }, + { + "epoch": 0.5366, + "grad_norm": 14.0625, + "grad_norm_var": 0.804931640625, + "learning_rate": 0.0003, + "loss": 10.979, + "loss/aux_loss": 0.04807220809161663, + "loss/crossentropy": 2.6058572232723236, + "loss/logits": 0.7958117395639419, + "step": 53660 + }, + { + "epoch": 0.5367, + "grad_norm": 14.125, + "grad_norm_var": 0.602978515625, + "learning_rate": 0.0003, + "loss": 11.0157, + "loss/aux_loss": 0.04807724934071302, + "loss/crossentropy": 2.6349853515625, + "loss/logits": 0.8204777866601944, + "step": 53670 + }, + { + "epoch": 0.5368, + "grad_norm": 14.3125, + "grad_norm_var": 0.51015625, + "learning_rate": 0.0003, + "loss": 11.212, + "loss/aux_loss": 0.04807522259652615, + "loss/crossentropy": 2.769087851047516, + "loss/logits": 0.8173623502254486, + "step": 53680 + }, + { + "epoch": 0.5369, + "grad_norm": 15.375, + "grad_norm_var": 0.83671875, + "learning_rate": 0.0003, + "loss": 11.0955, + "loss/aux_loss": 0.04807520546019077, + "loss/crossentropy": 2.750330662727356, + "loss/logits": 0.8260623097419739, + "step": 53690 + }, + { + "epoch": 0.537, + "grad_norm": 14.75, + "grad_norm_var": 0.9067545572916667, + "learning_rate": 0.0003, + "loss": 11.0911, + "loss/aux_loss": 0.04807036910206079, + "loss/crossentropy": 2.819355845451355, + "loss/logits": 0.8768564403057099, + "step": 53700 + }, + { + "epoch": 0.5371, + "grad_norm": 14.5625, + "grad_norm_var": 1.2452473958333334, + "learning_rate": 0.0003, + "loss": 10.955, + "loss/aux_loss": 0.04807029124349356, + "loss/crossentropy": 2.6186971068382263, + "loss/logits": 0.8072131723165512, + "step": 53710 + }, + { + "epoch": 0.5372, + "grad_norm": 14.75, + "grad_norm_var": 3.1669270833333334, + "learning_rate": 0.0003, + "loss": 11.0975, + "loss/aux_loss": 0.04807576686143875, + "loss/crossentropy": 2.7736194491386414, + "loss/logits": 0.8309052169322968, + "step": 53720 + }, + { + "epoch": 0.5373, + "grad_norm": 15.625, + "grad_norm_var": 0.3016764322916667, + "learning_rate": 0.0003, + "loss": 11.0322, + "loss/aux_loss": 0.04806541334837675, + "loss/crossentropy": 2.686142373085022, + "loss/logits": 0.8141505211591721, + "step": 53730 + }, + { + "epoch": 0.5374, + "grad_norm": 15.6875, + "grad_norm_var": 0.4722493489583333, + "learning_rate": 0.0003, + "loss": 11.0185, + "loss/aux_loss": 0.04807978682219982, + "loss/crossentropy": 2.636608195304871, + "loss/logits": 0.8161023885011673, + "step": 53740 + }, + { + "epoch": 0.5375, + "grad_norm": 13.8125, + "grad_norm_var": 0.6322916666666667, + "learning_rate": 0.0003, + "loss": 10.986, + "loss/aux_loss": 0.04806647896766662, + "loss/crossentropy": 2.7851890683174134, + "loss/logits": 0.8317944526672363, + "step": 53750 + }, + { + "epoch": 0.5376, + "grad_norm": 15.0, + "grad_norm_var": 2.291520182291667, + "learning_rate": 0.0003, + "loss": 11.1272, + "loss/aux_loss": 0.04807963985949755, + "loss/crossentropy": 2.6976101815700533, + "loss/logits": 0.8801734536886215, + "step": 53760 + }, + { + "epoch": 0.5377, + "grad_norm": 15.125, + "grad_norm_var": 1.95234375, + "learning_rate": 0.0003, + "loss": 11.0244, + "loss/aux_loss": 0.048057077825069426, + "loss/crossentropy": 2.6501388132572172, + "loss/logits": 0.8309973537921905, + "step": 53770 + }, + { + "epoch": 0.5378, + "grad_norm": 15.0, + "grad_norm_var": 0.369384765625, + "learning_rate": 0.0003, + "loss": 10.8356, + "loss/aux_loss": 0.04807448796927929, + "loss/crossentropy": 2.7889013409614565, + "loss/logits": 0.8378504902124405, + "step": 53780 + }, + { + "epoch": 0.5379, + "grad_norm": 14.625, + "grad_norm_var": 0.5079264322916667, + "learning_rate": 0.0003, + "loss": 10.9582, + "loss/aux_loss": 0.04807044938206673, + "loss/crossentropy": 2.509946370124817, + "loss/logits": 0.8080231517553329, + "step": 53790 + }, + { + "epoch": 0.538, + "grad_norm": 15.3125, + "grad_norm_var": 1.011962890625, + "learning_rate": 0.0003, + "loss": 11.1271, + "loss/aux_loss": 0.04807122368365526, + "loss/crossentropy": 2.6480683028697967, + "loss/logits": 0.7914795011281968, + "step": 53800 + }, + { + "epoch": 0.5381, + "grad_norm": 14.8125, + "grad_norm_var": 0.6346354166666667, + "learning_rate": 0.0003, + "loss": 10.9294, + "loss/aux_loss": 0.048066381551325324, + "loss/crossentropy": 2.7862293422222137, + "loss/logits": 0.8505240023136139, + "step": 53810 + }, + { + "epoch": 0.5382, + "grad_norm": 14.9375, + "grad_norm_var": 0.29375, + "learning_rate": 0.0003, + "loss": 11.038, + "loss/aux_loss": 0.048070579580962655, + "loss/crossentropy": 2.6358683347702025, + "loss/logits": 0.8482916533946991, + "step": 53820 + }, + { + "epoch": 0.5383, + "grad_norm": 16.25, + "grad_norm_var": 20.885270182291666, + "learning_rate": 0.0003, + "loss": 11.0671, + "loss/aux_loss": 0.04806550685316324, + "loss/crossentropy": 2.6382621049880983, + "loss/logits": 0.8401525467634201, + "step": 53830 + }, + { + "epoch": 0.5384, + "grad_norm": 15.0, + "grad_norm_var": 0.3759765625, + "learning_rate": 0.0003, + "loss": 11.1593, + "loss/aux_loss": 0.04807865265756846, + "loss/crossentropy": 2.6878114998340608, + "loss/logits": 0.846782973408699, + "step": 53840 + }, + { + "epoch": 0.5385, + "grad_norm": 15.1875, + "grad_norm_var": 0.121875, + "learning_rate": 0.0003, + "loss": 11.0436, + "loss/aux_loss": 0.0480678740888834, + "loss/crossentropy": 2.6060830295085906, + "loss/logits": 0.8473536789417266, + "step": 53850 + }, + { + "epoch": 0.5386, + "grad_norm": 14.0625, + "grad_norm_var": 0.7895670572916667, + "learning_rate": 0.0003, + "loss": 11.094, + "loss/aux_loss": 0.04806717596948147, + "loss/crossentropy": 2.6601334273815156, + "loss/logits": 0.8278073340654373, + "step": 53860 + }, + { + "epoch": 0.5387, + "grad_norm": 13.75, + "grad_norm_var": 0.8055826822916666, + "learning_rate": 0.0003, + "loss": 10.9571, + "loss/aux_loss": 0.048082117736339566, + "loss/crossentropy": 2.7736589670181275, + "loss/logits": 0.7962304085493088, + "step": 53870 + }, + { + "epoch": 0.5388, + "grad_norm": 14.5, + "grad_norm_var": 0.513916015625, + "learning_rate": 0.0003, + "loss": 11.1215, + "loss/aux_loss": 0.04806166738271713, + "loss/crossentropy": 2.7752721309661865, + "loss/logits": 0.8360392391681671, + "step": 53880 + }, + { + "epoch": 0.5389, + "grad_norm": 14.1875, + "grad_norm_var": 0.7431640625, + "learning_rate": 0.0003, + "loss": 11.2056, + "loss/aux_loss": 0.04807320367544889, + "loss/crossentropy": 2.800821077823639, + "loss/logits": 0.8525734037160874, + "step": 53890 + }, + { + "epoch": 0.539, + "grad_norm": 13.9375, + "grad_norm_var": 0.7877604166666666, + "learning_rate": 0.0003, + "loss": 11.0453, + "loss/aux_loss": 0.048065982013940814, + "loss/crossentropy": 2.7617095947265624, + "loss/logits": 0.8081191062927247, + "step": 53900 + }, + { + "epoch": 0.5391, + "grad_norm": 14.5, + "grad_norm_var": 1.1628743489583333, + "learning_rate": 0.0003, + "loss": 10.8716, + "loss/aux_loss": 0.04807501696050167, + "loss/crossentropy": 2.5991472363471986, + "loss/logits": 0.8059735208749771, + "step": 53910 + }, + { + "epoch": 0.5392, + "grad_norm": 16.875, + "grad_norm_var": 1.0609212239583334, + "learning_rate": 0.0003, + "loss": 10.9493, + "loss/aux_loss": 0.048074368946254255, + "loss/crossentropy": 2.7167088091373444, + "loss/logits": 0.8214272201061249, + "step": 53920 + }, + { + "epoch": 0.5393, + "grad_norm": 15.75, + "grad_norm_var": 0.6758951822916667, + "learning_rate": 0.0003, + "loss": 11.0663, + "loss/aux_loss": 0.048074664548039436, + "loss/crossentropy": 2.6797154784202575, + "loss/logits": 0.8302604794502259, + "step": 53930 + }, + { + "epoch": 0.5394, + "grad_norm": 14.5625, + "grad_norm_var": 0.3733723958333333, + "learning_rate": 0.0003, + "loss": 11.1273, + "loss/aux_loss": 0.04806742053478956, + "loss/crossentropy": 2.7210973858833314, + "loss/logits": 0.8412497580051422, + "step": 53940 + }, + { + "epoch": 0.5395, + "grad_norm": 15.0, + "grad_norm_var": 0.466259765625, + "learning_rate": 0.0003, + "loss": 11.0095, + "loss/aux_loss": 0.048062196187675, + "loss/crossentropy": 2.655183678865433, + "loss/logits": 0.8122419893741608, + "step": 53950 + }, + { + "epoch": 0.5396, + "grad_norm": 15.3125, + "grad_norm_var": 0.3477701822916667, + "learning_rate": 0.0003, + "loss": 11.1166, + "loss/aux_loss": 0.04807857759296894, + "loss/crossentropy": 2.614110505580902, + "loss/logits": 0.8455834567546845, + "step": 53960 + }, + { + "epoch": 0.5397, + "grad_norm": 14.125, + "grad_norm_var": 0.5222493489583333, + "learning_rate": 0.0003, + "loss": 11.0279, + "loss/aux_loss": 0.04806742183864117, + "loss/crossentropy": 2.6582097709178925, + "loss/logits": 0.8321360021829605, + "step": 53970 + }, + { + "epoch": 0.5398, + "grad_norm": 15.0, + "grad_norm_var": 0.8941243489583334, + "learning_rate": 0.0003, + "loss": 11.0669, + "loss/aux_loss": 0.04807064067572355, + "loss/crossentropy": 2.8422482132911684, + "loss/logits": 0.8568818151950837, + "step": 53980 + }, + { + "epoch": 0.5399, + "grad_norm": 13.5625, + "grad_norm_var": 0.267041015625, + "learning_rate": 0.0003, + "loss": 10.8872, + "loss/aux_loss": 0.04806562513113022, + "loss/crossentropy": 2.64736921787262, + "loss/logits": 0.8495258182287216, + "step": 53990 + }, + { + "epoch": 0.54, + "grad_norm": 14.6875, + "grad_norm_var": 0.8567708333333334, + "learning_rate": 0.0003, + "loss": 11.0104, + "loss/aux_loss": 0.048078795336186884, + "loss/crossentropy": 2.7528712272644045, + "loss/logits": 0.830004358291626, + "step": 54000 + }, + { + "epoch": 0.5401, + "grad_norm": 14.25, + "grad_norm_var": 0.9296875, + "learning_rate": 0.0003, + "loss": 11.1183, + "loss/aux_loss": 0.04806767832487822, + "loss/crossentropy": 2.5062259435653687, + "loss/logits": 0.8438413232564926, + "step": 54010 + }, + { + "epoch": 0.5402, + "grad_norm": 14.9375, + "grad_norm_var": 0.5441243489583333, + "learning_rate": 0.0003, + "loss": 10.9066, + "loss/aux_loss": 0.048075572960078715, + "loss/crossentropy": 2.6077619075775145, + "loss/logits": 0.8247323483228683, + "step": 54020 + }, + { + "epoch": 0.5403, + "grad_norm": 14.1875, + "grad_norm_var": 0.43776041666666665, + "learning_rate": 0.0003, + "loss": 11.0432, + "loss/aux_loss": 0.04806858468800783, + "loss/crossentropy": 2.5667442798614504, + "loss/logits": 0.8107105433940888, + "step": 54030 + }, + { + "epoch": 0.5404, + "grad_norm": 16.375, + "grad_norm_var": 0.65859375, + "learning_rate": 0.0003, + "loss": 11.0633, + "loss/aux_loss": 0.048066397197544575, + "loss/crossentropy": 2.6910421431064604, + "loss/logits": 0.8274150729179383, + "step": 54040 + }, + { + "epoch": 0.5405, + "grad_norm": 20.25, + "grad_norm_var": 693.61171875, + "learning_rate": 0.0003, + "loss": 11.1854, + "loss/aux_loss": 0.04809403121471405, + "loss/crossentropy": 2.966396164894104, + "loss/logits": 0.8811068832874298, + "step": 54050 + }, + { + "epoch": 0.5406, + "grad_norm": 15.5625, + "grad_norm_var": 696.5848307291667, + "learning_rate": 0.0003, + "loss": 10.9561, + "loss/aux_loss": 0.048076951503753663, + "loss/crossentropy": 2.7103439450263975, + "loss/logits": 0.8318052858114242, + "step": 54060 + }, + { + "epoch": 0.5407, + "grad_norm": 13.9375, + "grad_norm_var": 0.6145670572916667, + "learning_rate": 0.0003, + "loss": 11.0968, + "loss/aux_loss": 0.048073592409491536, + "loss/crossentropy": 2.802405446767807, + "loss/logits": 0.807966560125351, + "step": 54070 + }, + { + "epoch": 0.5408, + "grad_norm": 14.625, + "grad_norm_var": 1.6135416666666667, + "learning_rate": 0.0003, + "loss": 10.9053, + "loss/aux_loss": 0.048079601302742955, + "loss/crossentropy": 2.689373391866684, + "loss/logits": 0.8208520948886872, + "step": 54080 + }, + { + "epoch": 0.5409, + "grad_norm": 14.0625, + "grad_norm_var": 1.5844889322916667, + "learning_rate": 0.0003, + "loss": 10.9493, + "loss/aux_loss": 0.04807072449475527, + "loss/crossentropy": 2.792709541320801, + "loss/logits": 0.8486111849546433, + "step": 54090 + }, + { + "epoch": 0.541, + "grad_norm": 15.6875, + "grad_norm_var": 0.526025390625, + "learning_rate": 0.0003, + "loss": 11.0094, + "loss/aux_loss": 0.0480718944221735, + "loss/crossentropy": 2.5616662383079527, + "loss/logits": 0.8192130953073502, + "step": 54100 + }, + { + "epoch": 0.5411, + "grad_norm": 15.25, + "grad_norm_var": 0.60546875, + "learning_rate": 0.0003, + "loss": 10.9774, + "loss/aux_loss": 0.04807149339467287, + "loss/crossentropy": 2.642424190044403, + "loss/logits": 0.8189570486545563, + "step": 54110 + }, + { + "epoch": 0.5412, + "grad_norm": 14.6875, + "grad_norm_var": 0.2639973958333333, + "learning_rate": 0.0003, + "loss": 10.9475, + "loss/aux_loss": 0.04808628931641579, + "loss/crossentropy": 2.6260932087898254, + "loss/logits": 0.8346313923597336, + "step": 54120 + }, + { + "epoch": 0.5413, + "grad_norm": 13.9375, + "grad_norm_var": 0.23170572916666668, + "learning_rate": 0.0003, + "loss": 11.0016, + "loss/aux_loss": 0.04805894047021866, + "loss/crossentropy": 2.718409872055054, + "loss/logits": 0.8037611931562424, + "step": 54130 + }, + { + "epoch": 0.5414, + "grad_norm": 16.0, + "grad_norm_var": 0.4911295572916667, + "learning_rate": 0.0003, + "loss": 11.0083, + "loss/aux_loss": 0.04807175807654858, + "loss/crossentropy": 2.665117746591568, + "loss/logits": 0.8265301376581192, + "step": 54140 + }, + { + "epoch": 0.5415, + "grad_norm": 16.125, + "grad_norm_var": 0.982275390625, + "learning_rate": 0.0003, + "loss": 10.9641, + "loss/aux_loss": 0.0480738976970315, + "loss/crossentropy": 2.575757938623428, + "loss/logits": 0.8114981263875961, + "step": 54150 + }, + { + "epoch": 0.5416, + "grad_norm": 14.5, + "grad_norm_var": 0.6212890625, + "learning_rate": 0.0003, + "loss": 11.0063, + "loss/aux_loss": 0.04806560389697552, + "loss/crossentropy": 2.744628429412842, + "loss/logits": 0.8167267292737961, + "step": 54160 + }, + { + "epoch": 0.5417, + "grad_norm": 14.375, + "grad_norm_var": 0.365869140625, + "learning_rate": 0.0003, + "loss": 10.8746, + "loss/aux_loss": 0.04807755947113037, + "loss/crossentropy": 2.729469120502472, + "loss/logits": 0.8389277517795563, + "step": 54170 + }, + { + "epoch": 0.5418, + "grad_norm": 15.6875, + "grad_norm_var": 0.6606608072916667, + "learning_rate": 0.0003, + "loss": 11.0025, + "loss/aux_loss": 0.04807156063616276, + "loss/crossentropy": 2.6684438705444338, + "loss/logits": 0.8205473870038986, + "step": 54180 + }, + { + "epoch": 0.5419, + "grad_norm": 15.875, + "grad_norm_var": 0.5477701822916666, + "learning_rate": 0.0003, + "loss": 10.9314, + "loss/aux_loss": 0.04807228222489357, + "loss/crossentropy": 2.6462816834449767, + "loss/logits": 0.8203870177268981, + "step": 54190 + }, + { + "epoch": 0.542, + "grad_norm": 13.75, + "grad_norm_var": 0.851025390625, + "learning_rate": 0.0003, + "loss": 11.1092, + "loss/aux_loss": 0.048070806078612804, + "loss/crossentropy": 2.653973418474197, + "loss/logits": 0.8173866599798203, + "step": 54200 + }, + { + "epoch": 0.5421, + "grad_norm": 15.25, + "grad_norm_var": 0.3895182291666667, + "learning_rate": 0.0003, + "loss": 11.0387, + "loss/aux_loss": 0.04807880613952875, + "loss/crossentropy": 2.8338425755500793, + "loss/logits": 0.8203609675168991, + "step": 54210 + }, + { + "epoch": 0.5422, + "grad_norm": 14.8125, + "grad_norm_var": 0.40167643229166666, + "learning_rate": 0.0003, + "loss": 11.0663, + "loss/aux_loss": 0.04807204119861126, + "loss/crossentropy": 2.8691022396087646, + "loss/logits": 0.8554022014141083, + "step": 54220 + }, + { + "epoch": 0.5423, + "grad_norm": 15.25, + "grad_norm_var": 41.256510416666664, + "learning_rate": 0.0003, + "loss": 10.8823, + "loss/aux_loss": 0.04807537421584129, + "loss/crossentropy": 2.7167785286903383, + "loss/logits": 0.8512430638074875, + "step": 54230 + }, + { + "epoch": 0.5424, + "grad_norm": 14.5625, + "grad_norm_var": 38.6275390625, + "learning_rate": 0.0003, + "loss": 10.9108, + "loss/aux_loss": 0.04806912895292044, + "loss/crossentropy": 2.7357052505016326, + "loss/logits": 0.8582530438899993, + "step": 54240 + }, + { + "epoch": 0.5425, + "grad_norm": 14.6875, + "grad_norm_var": 2.011979166666667, + "learning_rate": 0.0003, + "loss": 11.1132, + "loss/aux_loss": 0.048073516227304935, + "loss/crossentropy": 2.8648219108581543, + "loss/logits": 0.8151409834623337, + "step": 54250 + }, + { + "epoch": 0.5426, + "grad_norm": 15.0, + "grad_norm_var": 2.1770833333333335, + "learning_rate": 0.0003, + "loss": 11.1463, + "loss/aux_loss": 0.0480679165571928, + "loss/crossentropy": 2.6788148045539857, + "loss/logits": 0.8590417355298996, + "step": 54260 + }, + { + "epoch": 0.5427, + "grad_norm": 15.375, + "grad_norm_var": 0.44998372395833336, + "learning_rate": 0.0003, + "loss": 11.0944, + "loss/aux_loss": 0.04807999115437269, + "loss/crossentropy": 2.7842895090579987, + "loss/logits": 0.7997796133160591, + "step": 54270 + }, + { + "epoch": 0.5428, + "grad_norm": 15.9375, + "grad_norm_var": 1.0296223958333333, + "learning_rate": 0.0003, + "loss": 11.0713, + "loss/aux_loss": 0.04806948360055685, + "loss/crossentropy": 2.6741649389266966, + "loss/logits": 0.8174891114234925, + "step": 54280 + }, + { + "epoch": 0.5429, + "grad_norm": 14.5625, + "grad_norm_var": 1.2658854166666667, + "learning_rate": 0.0003, + "loss": 10.9228, + "loss/aux_loss": 0.04806262943893671, + "loss/crossentropy": 2.7555585384368895, + "loss/logits": 0.8146154910326004, + "step": 54290 + }, + { + "epoch": 0.543, + "grad_norm": 15.875, + "grad_norm_var": 1.404931640625, + "learning_rate": 0.0003, + "loss": 11.0579, + "loss/aux_loss": 0.048083382099866866, + "loss/crossentropy": 2.644556665420532, + "loss/logits": 0.8286139577627182, + "step": 54300 + }, + { + "epoch": 0.5431, + "grad_norm": 14.4375, + "grad_norm_var": 1.439306640625, + "learning_rate": 0.0003, + "loss": 11.0906, + "loss/aux_loss": 0.04806300513446331, + "loss/crossentropy": 2.77071772813797, + "loss/logits": 0.8199887424707413, + "step": 54310 + }, + { + "epoch": 0.5432, + "grad_norm": 14.625, + "grad_norm_var": 0.5684895833333333, + "learning_rate": 0.0003, + "loss": 10.952, + "loss/aux_loss": 0.04807317145168781, + "loss/crossentropy": 2.7126712799072266, + "loss/logits": 0.8230218112468719, + "step": 54320 + }, + { + "epoch": 0.5433, + "grad_norm": 15.3125, + "grad_norm_var": 0.5093587239583334, + "learning_rate": 0.0003, + "loss": 11.2114, + "loss/aux_loss": 0.048068922385573386, + "loss/crossentropy": 2.6962040960788727, + "loss/logits": 0.8404717385768891, + "step": 54330 + }, + { + "epoch": 0.5434, + "grad_norm": 13.625, + "grad_norm_var": 0.4540201822916667, + "learning_rate": 0.0003, + "loss": 11.0177, + "loss/aux_loss": 0.0480722289532423, + "loss/crossentropy": 2.549234163761139, + "loss/logits": 0.8165672957897187, + "step": 54340 + }, + { + "epoch": 0.5435, + "grad_norm": 13.6875, + "grad_norm_var": 1.0739420572916667, + "learning_rate": 0.0003, + "loss": 11.0096, + "loss/aux_loss": 0.04807580169290304, + "loss/crossentropy": 2.7629685401916504, + "loss/logits": 0.8371683716773987, + "step": 54350 + }, + { + "epoch": 0.5436, + "grad_norm": 15.0, + "grad_norm_var": 0.49073893229166665, + "learning_rate": 0.0003, + "loss": 11.0909, + "loss/aux_loss": 0.04806205108761787, + "loss/crossentropy": 2.7718708157539367, + "loss/logits": 0.8284541577100754, + "step": 54360 + }, + { + "epoch": 0.5437, + "grad_norm": 14.125, + "grad_norm_var": 0.41451822916666664, + "learning_rate": 0.0003, + "loss": 11.088, + "loss/aux_loss": 0.0480658633634448, + "loss/crossentropy": 2.6623750627040863, + "loss/logits": 0.7974577456712723, + "step": 54370 + }, + { + "epoch": 0.5438, + "grad_norm": 14.0, + "grad_norm_var": 0.5153483072916667, + "learning_rate": 0.0003, + "loss": 10.9292, + "loss/aux_loss": 0.048075420595705506, + "loss/crossentropy": 2.6294266045093537, + "loss/logits": 0.8390023171901703, + "step": 54380 + }, + { + "epoch": 0.5439, + "grad_norm": 15.4375, + "grad_norm_var": 0.738134765625, + "learning_rate": 0.0003, + "loss": 10.9715, + "loss/aux_loss": 0.04806606397032738, + "loss/crossentropy": 2.7456657886505127, + "loss/logits": 0.8316713005304337, + "step": 54390 + }, + { + "epoch": 0.544, + "grad_norm": 15.8125, + "grad_norm_var": 0.39191080729166666, + "learning_rate": 0.0003, + "loss": 11.0328, + "loss/aux_loss": 0.048073101229965685, + "loss/crossentropy": 2.708446371555328, + "loss/logits": 0.8340208530426025, + "step": 54400 + }, + { + "epoch": 0.5441, + "grad_norm": 17.375, + "grad_norm_var": 1.078369140625, + "learning_rate": 0.0003, + "loss": 10.9475, + "loss/aux_loss": 0.04807380214333534, + "loss/crossentropy": 2.5974309384822845, + "loss/logits": 0.7814567148685455, + "step": 54410 + }, + { + "epoch": 0.5442, + "grad_norm": 15.0, + "grad_norm_var": 1.2375, + "learning_rate": 0.0003, + "loss": 11.0752, + "loss/aux_loss": 0.04807327631860971, + "loss/crossentropy": 2.7958995699882507, + "loss/logits": 0.852491220831871, + "step": 54420 + }, + { + "epoch": 0.5443, + "grad_norm": 14.5625, + "grad_norm_var": 1.4841145833333333, + "learning_rate": 0.0003, + "loss": 11.0555, + "loss/aux_loss": 0.04806957729160786, + "loss/crossentropy": 2.706936830282211, + "loss/logits": 0.8366940230131149, + "step": 54430 + }, + { + "epoch": 0.5444, + "grad_norm": 13.5, + "grad_norm_var": 0.29108072916666666, + "learning_rate": 0.0003, + "loss": 11.0092, + "loss/aux_loss": 0.04806767478585243, + "loss/crossentropy": 2.5973580896854402, + "loss/logits": 0.8097669005393981, + "step": 54440 + }, + { + "epoch": 0.5445, + "grad_norm": 15.3125, + "grad_norm_var": 0.6947265625, + "learning_rate": 0.0003, + "loss": 10.9494, + "loss/aux_loss": 0.04808604661375284, + "loss/crossentropy": 2.7189027309417724, + "loss/logits": 0.8361554414033889, + "step": 54450 + }, + { + "epoch": 0.5446, + "grad_norm": 14.5, + "grad_norm_var": 2.285660807291667, + "learning_rate": 0.0003, + "loss": 10.9697, + "loss/aux_loss": 0.048063131235539916, + "loss/crossentropy": 2.6419017791748045, + "loss/logits": 0.8406848013401031, + "step": 54460 + }, + { + "epoch": 0.5447, + "grad_norm": 15.8125, + "grad_norm_var": 0.2712890625, + "learning_rate": 0.0003, + "loss": 11.0544, + "loss/aux_loss": 0.04808263406157494, + "loss/crossentropy": 2.768437546491623, + "loss/logits": 0.856353759765625, + "step": 54470 + }, + { + "epoch": 0.5448, + "grad_norm": 14.625, + "grad_norm_var": 0.688134765625, + "learning_rate": 0.0003, + "loss": 10.9724, + "loss/aux_loss": 0.04806215986609459, + "loss/crossentropy": 2.8292512774467466, + "loss/logits": 0.8433954983949661, + "step": 54480 + }, + { + "epoch": 0.5449, + "grad_norm": 15.625, + "grad_norm_var": 0.8994140625, + "learning_rate": 0.0003, + "loss": 10.8039, + "loss/aux_loss": 0.0480883689597249, + "loss/crossentropy": 2.797008693218231, + "loss/logits": 0.8281289517879487, + "step": 54490 + }, + { + "epoch": 0.545, + "grad_norm": 14.75, + "grad_norm_var": 0.6885416666666667, + "learning_rate": 0.0003, + "loss": 11.0977, + "loss/aux_loss": 0.048079678975045684, + "loss/crossentropy": 2.7917538404464723, + "loss/logits": 0.7895294100046157, + "step": 54500 + }, + { + "epoch": 0.5451, + "grad_norm": 14.0625, + "grad_norm_var": 0.8801432291666667, + "learning_rate": 0.0003, + "loss": 11.0203, + "loss/aux_loss": 0.04806761741638184, + "loss/crossentropy": 2.5670079469680784, + "loss/logits": 0.7952559888362885, + "step": 54510 + }, + { + "epoch": 0.5452, + "grad_norm": 15.4375, + "grad_norm_var": 0.3082682291666667, + "learning_rate": 0.0003, + "loss": 11.051, + "loss/aux_loss": 0.04807319939136505, + "loss/crossentropy": 2.767977863550186, + "loss/logits": 0.8227100253105164, + "step": 54520 + }, + { + "epoch": 0.5453, + "grad_norm": 15.1875, + "grad_norm_var": 0.7530598958333333, + "learning_rate": 0.0003, + "loss": 11.0836, + "loss/aux_loss": 0.04807401541620493, + "loss/crossentropy": 2.7236214160919188, + "loss/logits": 0.8225501179695129, + "step": 54530 + }, + { + "epoch": 0.5454, + "grad_norm": 13.75, + "grad_norm_var": 1.500634765625, + "learning_rate": 0.0003, + "loss": 10.9399, + "loss/aux_loss": 0.04807001277804375, + "loss/crossentropy": 2.706336867809296, + "loss/logits": 0.8362784326076508, + "step": 54540 + }, + { + "epoch": 0.5455, + "grad_norm": 15.3125, + "grad_norm_var": 1.7351399739583333, + "learning_rate": 0.0003, + "loss": 11.1466, + "loss/aux_loss": 0.04807394836097956, + "loss/crossentropy": 2.6270270586013793, + "loss/logits": 0.8055624902248383, + "step": 54550 + }, + { + "epoch": 0.5456, + "grad_norm": 14.5625, + "grad_norm_var": 1.8981608072916667, + "learning_rate": 0.0003, + "loss": 11.0043, + "loss/aux_loss": 0.04806969799101353, + "loss/crossentropy": 2.7887719571590424, + "loss/logits": 0.8097354710102082, + "step": 54560 + }, + { + "epoch": 0.5457, + "grad_norm": 16.25, + "grad_norm_var": 0.8872395833333333, + "learning_rate": 0.0003, + "loss": 10.9318, + "loss/aux_loss": 0.04807890877127648, + "loss/crossentropy": 2.632401758432388, + "loss/logits": 0.8157328695058823, + "step": 54570 + }, + { + "epoch": 0.5458, + "grad_norm": 16.25, + "grad_norm_var": 0.6329264322916667, + "learning_rate": 0.0003, + "loss": 11.043, + "loss/aux_loss": 0.04806562662124634, + "loss/crossentropy": 2.796691632270813, + "loss/logits": 0.8395285964012146, + "step": 54580 + }, + { + "epoch": 0.5459, + "grad_norm": 14.5625, + "grad_norm_var": 1.1449055989583334, + "learning_rate": 0.0003, + "loss": 10.998, + "loss/aux_loss": 0.04808074235916138, + "loss/crossentropy": 2.683507615327835, + "loss/logits": 0.8001698046922684, + "step": 54590 + }, + { + "epoch": 0.546, + "grad_norm": 15.875, + "grad_norm_var": 2.4734212239583333, + "learning_rate": 0.0003, + "loss": 11.0751, + "loss/aux_loss": 0.04807427860796452, + "loss/crossentropy": 2.66720929145813, + "loss/logits": 0.8314949810504914, + "step": 54600 + }, + { + "epoch": 0.5461, + "grad_norm": 14.9375, + "grad_norm_var": 1.858837890625, + "learning_rate": 0.0003, + "loss": 10.9659, + "loss/aux_loss": 0.0480766873806715, + "loss/crossentropy": 2.564626210927963, + "loss/logits": 0.7770012825727463, + "step": 54610 + }, + { + "epoch": 0.5462, + "grad_norm": 14.125, + "grad_norm_var": 0.8690104166666667, + "learning_rate": 0.0003, + "loss": 10.8087, + "loss/aux_loss": 0.04806546028703451, + "loss/crossentropy": 2.6509805858135223, + "loss/logits": 0.7985016733407975, + "step": 54620 + }, + { + "epoch": 0.5463, + "grad_norm": 15.0625, + "grad_norm_var": 1.0657389322916666, + "learning_rate": 0.0003, + "loss": 10.9941, + "loss/aux_loss": 0.048076178319752215, + "loss/crossentropy": 2.7503870487213136, + "loss/logits": 0.8250322550535202, + "step": 54630 + }, + { + "epoch": 0.5464, + "grad_norm": 14.3125, + "grad_norm_var": 0.5979166666666667, + "learning_rate": 0.0003, + "loss": 11.075, + "loss/aux_loss": 0.04807407483458519, + "loss/crossentropy": 2.688520979881287, + "loss/logits": 0.7950679957866669, + "step": 54640 + }, + { + "epoch": 0.5465, + "grad_norm": 20.25, + "grad_norm_var": 12.39765625, + "learning_rate": 0.0003, + "loss": 10.8744, + "loss/aux_loss": 0.04807685688138008, + "loss/crossentropy": 2.671421802043915, + "loss/logits": 0.8197872430086136, + "step": 54650 + }, + { + "epoch": 0.5466, + "grad_norm": 14.6875, + "grad_norm_var": 2.2609375, + "learning_rate": 0.0003, + "loss": 10.8834, + "loss/aux_loss": 0.048076017200946806, + "loss/crossentropy": 2.5632533609867094, + "loss/logits": 0.7861254096031189, + "step": 54660 + }, + { + "epoch": 0.5467, + "grad_norm": 15.1875, + "grad_norm_var": 0.87890625, + "learning_rate": 0.0003, + "loss": 10.9958, + "loss/aux_loss": 0.048075830191373826, + "loss/crossentropy": 2.669093906879425, + "loss/logits": 0.8262115895748139, + "step": 54670 + }, + { + "epoch": 0.5468, + "grad_norm": 16.5, + "grad_norm_var": 1.33984375, + "learning_rate": 0.0003, + "loss": 10.9333, + "loss/aux_loss": 0.04807015471160412, + "loss/crossentropy": 2.7327490568161013, + "loss/logits": 0.8197944283485412, + "step": 54680 + }, + { + "epoch": 0.5469, + "grad_norm": 15.6875, + "grad_norm_var": 1.074853515625, + "learning_rate": 0.0003, + "loss": 11.0295, + "loss/aux_loss": 0.04807836562395096, + "loss/crossentropy": 2.6087993323802947, + "loss/logits": 0.806918916106224, + "step": 54690 + }, + { + "epoch": 0.547, + "grad_norm": 15.125, + "grad_norm_var": 0.35930989583333334, + "learning_rate": 0.0003, + "loss": 11.0181, + "loss/aux_loss": 0.04807017408311367, + "loss/crossentropy": 2.839758336544037, + "loss/logits": 0.8093813061714172, + "step": 54700 + }, + { + "epoch": 0.5471, + "grad_norm": 15.6875, + "grad_norm_var": 0.43670247395833334, + "learning_rate": 0.0003, + "loss": 10.9142, + "loss/aux_loss": 0.04807600136846304, + "loss/crossentropy": 2.644270604848862, + "loss/logits": 0.7997170180082321, + "step": 54710 + }, + { + "epoch": 0.5472, + "grad_norm": 15.25, + "grad_norm_var": 0.309375, + "learning_rate": 0.0003, + "loss": 11.1508, + "loss/aux_loss": 0.04805862028151751, + "loss/crossentropy": 2.7486122012138368, + "loss/logits": 0.8468534052371979, + "step": 54720 + }, + { + "epoch": 0.5473, + "grad_norm": 13.8125, + "grad_norm_var": 0.5001139322916667, + "learning_rate": 0.0003, + "loss": 11.0571, + "loss/aux_loss": 0.04807792566716671, + "loss/crossentropy": 2.6369648575782776, + "loss/logits": 0.8063502162694931, + "step": 54730 + }, + { + "epoch": 0.5474, + "grad_norm": 15.25, + "grad_norm_var": 0.33984375, + "learning_rate": 0.0003, + "loss": 10.9084, + "loss/aux_loss": 0.048066607862710956, + "loss/crossentropy": 2.598222774267197, + "loss/logits": 0.8337789565324784, + "step": 54740 + }, + { + "epoch": 0.5475, + "grad_norm": 14.875, + "grad_norm_var": 146.435400390625, + "learning_rate": 0.0003, + "loss": 10.9427, + "loss/aux_loss": 0.04807037971913815, + "loss/crossentropy": 2.689501368999481, + "loss/logits": 0.8253029197454452, + "step": 54750 + }, + { + "epoch": 0.5476, + "grad_norm": 14.9375, + "grad_norm_var": 145.92447916666666, + "learning_rate": 0.0003, + "loss": 10.929, + "loss/aux_loss": 0.048080086894333365, + "loss/crossentropy": 2.7107209861278534, + "loss/logits": 0.8063658207654953, + "step": 54760 + }, + { + "epoch": 0.5477, + "grad_norm": 15.5, + "grad_norm_var": 0.3714680989583333, + "learning_rate": 0.0003, + "loss": 11.0567, + "loss/aux_loss": 0.048056581430137156, + "loss/crossentropy": 2.7536053538322447, + "loss/logits": 0.8437339574098587, + "step": 54770 + }, + { + "epoch": 0.5478, + "grad_norm": 14.125, + "grad_norm_var": 0.221728515625, + "learning_rate": 0.0003, + "loss": 11.0786, + "loss/aux_loss": 0.04807734116911888, + "loss/crossentropy": 2.624860906600952, + "loss/logits": 0.8355853497982025, + "step": 54780 + }, + { + "epoch": 0.5479, + "grad_norm": 14.75, + "grad_norm_var": 0.372119140625, + "learning_rate": 0.0003, + "loss": 11.0605, + "loss/aux_loss": 0.048068249225616456, + "loss/crossentropy": 2.6733208775520323, + "loss/logits": 0.8279371082782745, + "step": 54790 + }, + { + "epoch": 0.548, + "grad_norm": 14.1875, + "grad_norm_var": 0.5143229166666666, + "learning_rate": 0.0003, + "loss": 11.0193, + "loss/aux_loss": 0.04806514009833336, + "loss/crossentropy": 2.83870667219162, + "loss/logits": 0.8224406003952026, + "step": 54800 + }, + { + "epoch": 0.5481, + "grad_norm": 16.875, + "grad_norm_var": 0.6921223958333333, + "learning_rate": 0.0003, + "loss": 11.2115, + "loss/aux_loss": 0.048080642521381375, + "loss/crossentropy": 2.7003244876861574, + "loss/logits": 0.8368692755699157, + "step": 54810 + }, + { + "epoch": 0.5482, + "grad_norm": 14.5, + "grad_norm_var": 1.157275390625, + "learning_rate": 0.0003, + "loss": 11.0255, + "loss/aux_loss": 0.0480611115694046, + "loss/crossentropy": 2.7027824997901915, + "loss/logits": 0.8393090069293976, + "step": 54820 + }, + { + "epoch": 0.5483, + "grad_norm": 15.4375, + "grad_norm_var": 4.451285807291667, + "learning_rate": 0.0003, + "loss": 10.9206, + "loss/aux_loss": 0.04807600080966949, + "loss/crossentropy": 2.8590354561805724, + "loss/logits": 0.8446751832962036, + "step": 54830 + }, + { + "epoch": 0.5484, + "grad_norm": 15.0625, + "grad_norm_var": 3.78046875, + "learning_rate": 0.0003, + "loss": 11.0024, + "loss/aux_loss": 0.048070641607046126, + "loss/crossentropy": 2.7838706493377687, + "loss/logits": 0.8221488207578659, + "step": 54840 + }, + { + "epoch": 0.5485, + "grad_norm": 14.875, + "grad_norm_var": 23.503645833333334, + "learning_rate": 0.0003, + "loss": 10.9116, + "loss/aux_loss": 0.04807809256017208, + "loss/crossentropy": 2.7407828688621523, + "loss/logits": 0.8634491443634034, + "step": 54850 + }, + { + "epoch": 0.5486, + "grad_norm": 17.125, + "grad_norm_var": 0.9280598958333334, + "learning_rate": 0.0003, + "loss": 11.0951, + "loss/aux_loss": 0.04807487297803163, + "loss/crossentropy": 2.6179397821426393, + "loss/logits": 0.7829213112592697, + "step": 54860 + }, + { + "epoch": 0.5487, + "grad_norm": 16.375, + "grad_norm_var": 1.118994140625, + "learning_rate": 0.0003, + "loss": 11.1043, + "loss/aux_loss": 0.04806428123265505, + "loss/crossentropy": 2.738786405324936, + "loss/logits": 0.8392746210098266, + "step": 54870 + }, + { + "epoch": 0.5488, + "grad_norm": 16.625, + "grad_norm_var": 0.9288899739583333, + "learning_rate": 0.0003, + "loss": 11.0692, + "loss/aux_loss": 0.04807651992887259, + "loss/crossentropy": 2.739248037338257, + "loss/logits": 0.8410652667284012, + "step": 54880 + }, + { + "epoch": 0.5489, + "grad_norm": 15.125, + "grad_norm_var": 0.8481770833333333, + "learning_rate": 0.0003, + "loss": 11.0996, + "loss/aux_loss": 0.048066032119095325, + "loss/crossentropy": 2.841780698299408, + "loss/logits": 0.8673689156770706, + "step": 54890 + }, + { + "epoch": 0.549, + "grad_norm": 14.3125, + "grad_norm_var": 1.1091145833333333, + "learning_rate": 0.0003, + "loss": 10.9194, + "loss/aux_loss": 0.04806853048503399, + "loss/crossentropy": 2.7467273652553557, + "loss/logits": 0.8478365898132324, + "step": 54900 + }, + { + "epoch": 0.5491, + "grad_norm": 14.875, + "grad_norm_var": 0.23854166666666668, + "learning_rate": 0.0003, + "loss": 10.8423, + "loss/aux_loss": 0.04806788172572851, + "loss/crossentropy": 2.656290876865387, + "loss/logits": 0.8057217448949814, + "step": 54910 + }, + { + "epoch": 0.5492, + "grad_norm": 15.5625, + "grad_norm_var": 0.42421875, + "learning_rate": 0.0003, + "loss": 10.9517, + "loss/aux_loss": 0.04808411467820406, + "loss/crossentropy": 2.7929548025131226, + "loss/logits": 0.8252448886632919, + "step": 54920 + }, + { + "epoch": 0.5493, + "grad_norm": 15.5625, + "grad_norm_var": 0.4051920572916667, + "learning_rate": 0.0003, + "loss": 10.9819, + "loss/aux_loss": 0.0480615908280015, + "loss/crossentropy": 2.7801303029060365, + "loss/logits": 0.8024150758981705, + "step": 54930 + }, + { + "epoch": 0.5494, + "grad_norm": 14.125, + "grad_norm_var": 0.4627604166666667, + "learning_rate": 0.0003, + "loss": 10.9448, + "loss/aux_loss": 0.04807640910148621, + "loss/crossentropy": 2.7152156591415406, + "loss/logits": 0.8221557170152665, + "step": 54940 + }, + { + "epoch": 0.5495, + "grad_norm": 13.25, + "grad_norm_var": 0.5196451822916667, + "learning_rate": 0.0003, + "loss": 11.0243, + "loss/aux_loss": 0.04806815255433321, + "loss/crossentropy": 2.622380143404007, + "loss/logits": 0.8374263972043992, + "step": 54950 + }, + { + "epoch": 0.5496, + "grad_norm": 15.6875, + "grad_norm_var": 106.15402018229166, + "learning_rate": 0.0003, + "loss": 10.9103, + "loss/aux_loss": 0.04807578641921282, + "loss/crossentropy": 2.779756152629852, + "loss/logits": 0.8186855703592301, + "step": 54960 + }, + { + "epoch": 0.5497, + "grad_norm": 14.8125, + "grad_norm_var": 0.678125, + "learning_rate": 0.0003, + "loss": 11.0254, + "loss/aux_loss": 0.04808486998081207, + "loss/crossentropy": 2.5636366307735443, + "loss/logits": 0.8077445298433303, + "step": 54970 + }, + { + "epoch": 0.5498, + "grad_norm": 13.875, + "grad_norm_var": 0.73515625, + "learning_rate": 0.0003, + "loss": 11.012, + "loss/aux_loss": 0.04805983640253544, + "loss/crossentropy": 2.850145775079727, + "loss/logits": 0.8156041219830513, + "step": 54980 + }, + { + "epoch": 0.5499, + "grad_norm": 15.125, + "grad_norm_var": 68.479541015625, + "learning_rate": 0.0003, + "loss": 10.9674, + "loss/aux_loss": 0.0480882341042161, + "loss/crossentropy": 2.763186824321747, + "loss/logits": 0.8171787321567535, + "step": 54990 + }, + { + "epoch": 0.55, + "grad_norm": 14.8125, + "grad_norm_var": 1.0983723958333333, + "learning_rate": 0.0003, + "loss": 10.9881, + "loss/aux_loss": 0.048068669810891154, + "loss/crossentropy": 2.663837069272995, + "loss/logits": 0.8430362701416015, + "step": 55000 + }, + { + "epoch": 0.5501, + "grad_norm": 15.9375, + "grad_norm_var": 0.6478515625, + "learning_rate": 0.0003, + "loss": 11.0946, + "loss/aux_loss": 0.048068431206047535, + "loss/crossentropy": 2.65292683839798, + "loss/logits": 0.8245347827672959, + "step": 55010 + }, + { + "epoch": 0.5502, + "grad_norm": 15.1875, + "grad_norm_var": 0.368994140625, + "learning_rate": 0.0003, + "loss": 11.0463, + "loss/aux_loss": 0.04806177597492933, + "loss/crossentropy": 2.7886301994323732, + "loss/logits": 0.8723696410655976, + "step": 55020 + }, + { + "epoch": 0.5503, + "grad_norm": 14.4375, + "grad_norm_var": 1.080322265625, + "learning_rate": 0.0003, + "loss": 11.0383, + "loss/aux_loss": 0.04808465298265219, + "loss/crossentropy": 2.846861410140991, + "loss/logits": 0.8170645713806153, + "step": 55030 + }, + { + "epoch": 0.5504, + "grad_norm": 17.875, + "grad_norm_var": 1.7322265625, + "learning_rate": 0.0003, + "loss": 11.0391, + "loss/aux_loss": 0.04806484617292881, + "loss/crossentropy": 2.611069065332413, + "loss/logits": 0.7951992452144623, + "step": 55040 + }, + { + "epoch": 0.5505, + "grad_norm": 15.3125, + "grad_norm_var": 1.0645182291666666, + "learning_rate": 0.0003, + "loss": 11.0523, + "loss/aux_loss": 0.048074701242148875, + "loss/crossentropy": 2.811970281600952, + "loss/logits": 0.794430273771286, + "step": 55050 + }, + { + "epoch": 0.5506, + "grad_norm": 14.75, + "grad_norm_var": 0.6796875, + "learning_rate": 0.0003, + "loss": 10.9043, + "loss/aux_loss": 0.048068948276340964, + "loss/crossentropy": 2.664133369922638, + "loss/logits": 0.814795833826065, + "step": 55060 + }, + { + "epoch": 0.5507, + "grad_norm": 15.3125, + "grad_norm_var": 0.6744791666666666, + "learning_rate": 0.0003, + "loss": 11.0856, + "loss/aux_loss": 0.04806470796465874, + "loss/crossentropy": 2.8181748032569884, + "loss/logits": 0.8605926305055618, + "step": 55070 + }, + { + "epoch": 0.5508, + "grad_norm": 15.3125, + "grad_norm_var": 0.50078125, + "learning_rate": 0.0003, + "loss": 11.0929, + "loss/aux_loss": 0.048071750067174436, + "loss/crossentropy": 2.6172981381416323, + "loss/logits": 0.8222677648067475, + "step": 55080 + }, + { + "epoch": 0.5509, + "grad_norm": 18.125, + "grad_norm_var": 1.0407389322916667, + "learning_rate": 0.0003, + "loss": 10.8124, + "loss/aux_loss": 0.04807625990360975, + "loss/crossentropy": 2.448382931947708, + "loss/logits": 0.7904783099889755, + "step": 55090 + }, + { + "epoch": 0.551, + "grad_norm": 14.75, + "grad_norm_var": 3.9869140625, + "learning_rate": 0.0003, + "loss": 11.0645, + "loss/aux_loss": 0.04806983452290296, + "loss/crossentropy": 2.7576751828193666, + "loss/logits": 0.8664533495903015, + "step": 55100 + }, + { + "epoch": 0.5511, + "grad_norm": 15.875, + "grad_norm_var": 0.45818684895833334, + "learning_rate": 0.0003, + "loss": 10.8939, + "loss/aux_loss": 0.04806837178766728, + "loss/crossentropy": 2.7311266005039214, + "loss/logits": 0.8202107667922973, + "step": 55110 + }, + { + "epoch": 0.5512, + "grad_norm": 14.75, + "grad_norm_var": 0.746728515625, + "learning_rate": 0.0003, + "loss": 10.8522, + "loss/aux_loss": 0.04809461031109095, + "loss/crossentropy": 2.481432467699051, + "loss/logits": 0.7854719698429108, + "step": 55120 + }, + { + "epoch": 0.5513, + "grad_norm": 15.8125, + "grad_norm_var": 0.5721354166666667, + "learning_rate": 0.0003, + "loss": 10.9235, + "loss/aux_loss": 0.048076366260647777, + "loss/crossentropy": 2.6985863506793977, + "loss/logits": 0.7947354167699814, + "step": 55130 + }, + { + "epoch": 0.5514, + "grad_norm": 16.375, + "grad_norm_var": 0.8880208333333334, + "learning_rate": 0.0003, + "loss": 11.1427, + "loss/aux_loss": 0.048077909648418425, + "loss/crossentropy": 2.8347915410995483, + "loss/logits": 0.8606969892978669, + "step": 55140 + }, + { + "epoch": 0.5515, + "grad_norm": 14.8125, + "grad_norm_var": 0.5734375, + "learning_rate": 0.0003, + "loss": 11.0213, + "loss/aux_loss": 0.04807186853140592, + "loss/crossentropy": 2.677201247215271, + "loss/logits": 0.8178416341543198, + "step": 55150 + }, + { + "epoch": 0.5516, + "grad_norm": 14.6875, + "grad_norm_var": 0.47578125, + "learning_rate": 0.0003, + "loss": 11.1246, + "loss/aux_loss": 0.04808474499732256, + "loss/crossentropy": 2.7420763611793517, + "loss/logits": 0.8414195388555527, + "step": 55160 + }, + { + "epoch": 0.5517, + "grad_norm": 15.3125, + "grad_norm_var": 0.49933268229166666, + "learning_rate": 0.0003, + "loss": 11.0815, + "loss/aux_loss": 0.0480824813246727, + "loss/crossentropy": 2.7030155539512633, + "loss/logits": 0.8236714661121368, + "step": 55170 + }, + { + "epoch": 0.5518, + "grad_norm": 15.4375, + "grad_norm_var": 0.470166015625, + "learning_rate": 0.0003, + "loss": 11.1841, + "loss/aux_loss": 0.048062578216195105, + "loss/crossentropy": 2.6597807705402374, + "loss/logits": 0.8233522891998291, + "step": 55180 + }, + { + "epoch": 0.5519, + "grad_norm": 15.875, + "grad_norm_var": 1.1374837239583333, + "learning_rate": 0.0003, + "loss": 10.9552, + "loss/aux_loss": 0.048083331808447836, + "loss/crossentropy": 2.794076269865036, + "loss/logits": 0.8059151649475098, + "step": 55190 + }, + { + "epoch": 0.552, + "grad_norm": 14.6875, + "grad_norm_var": 0.7822265625, + "learning_rate": 0.0003, + "loss": 11.2436, + "loss/aux_loss": 0.048071344010531904, + "loss/crossentropy": 2.935925805568695, + "loss/logits": 0.8763310700654984, + "step": 55200 + }, + { + "epoch": 0.5521, + "grad_norm": 14.4375, + "grad_norm_var": 0.8796875, + "learning_rate": 0.0003, + "loss": 10.8792, + "loss/aux_loss": 0.04806312434375286, + "loss/crossentropy": 2.5801034331321717, + "loss/logits": 0.7766230911016464, + "step": 55210 + }, + { + "epoch": 0.5522, + "grad_norm": 16.0, + "grad_norm_var": 1.910009765625, + "learning_rate": 0.0003, + "loss": 11.0385, + "loss/aux_loss": 0.048060713522136214, + "loss/crossentropy": 2.7574446558952332, + "loss/logits": 0.844352638721466, + "step": 55220 + }, + { + "epoch": 0.5523, + "grad_norm": 14.1875, + "grad_norm_var": 1.689697265625, + "learning_rate": 0.0003, + "loss": 10.8802, + "loss/aux_loss": 0.048071770928800106, + "loss/crossentropy": 2.5709005653858186, + "loss/logits": 0.7922522544860839, + "step": 55230 + }, + { + "epoch": 0.5524, + "grad_norm": 15.125, + "grad_norm_var": 0.2791666666666667, + "learning_rate": 0.0003, + "loss": 11.0235, + "loss/aux_loss": 0.04807364530861378, + "loss/crossentropy": 2.7921212911605835, + "loss/logits": 0.8340934455394745, + "step": 55240 + }, + { + "epoch": 0.5525, + "grad_norm": 14.875, + "grad_norm_var": 0.3009765625, + "learning_rate": 0.0003, + "loss": 11.0513, + "loss/aux_loss": 0.04807345513254404, + "loss/crossentropy": 2.711283278465271, + "loss/logits": 0.8268725454807282, + "step": 55250 + }, + { + "epoch": 0.5526, + "grad_norm": 16.375, + "grad_norm_var": 0.49947916666666664, + "learning_rate": 0.0003, + "loss": 10.8667, + "loss/aux_loss": 0.048064783401787284, + "loss/crossentropy": 2.5558693051338195, + "loss/logits": 0.8117028713226319, + "step": 55260 + }, + { + "epoch": 0.5527, + "grad_norm": 15.625, + "grad_norm_var": 0.506884765625, + "learning_rate": 0.0003, + "loss": 11.0831, + "loss/aux_loss": 0.0480767959728837, + "loss/crossentropy": 2.6161147236824034, + "loss/logits": 0.8263348460197448, + "step": 55270 + }, + { + "epoch": 0.5528, + "grad_norm": 15.9375, + "grad_norm_var": 0.44453125, + "learning_rate": 0.0003, + "loss": 11.0553, + "loss/aux_loss": 0.04807081706821918, + "loss/crossentropy": 2.8348384737968444, + "loss/logits": 0.8327891290187835, + "step": 55280 + }, + { + "epoch": 0.5529, + "grad_norm": 13.75, + "grad_norm_var": 0.7016764322916667, + "learning_rate": 0.0003, + "loss": 11.0743, + "loss/aux_loss": 0.04807010293006897, + "loss/crossentropy": 2.8104523420333862, + "loss/logits": 0.8793832540512085, + "step": 55290 + }, + { + "epoch": 0.553, + "grad_norm": 15.125, + "grad_norm_var": 0.8228515625, + "learning_rate": 0.0003, + "loss": 10.9962, + "loss/aux_loss": 0.04807137455791235, + "loss/crossentropy": 2.6533069372177125, + "loss/logits": 0.831883293390274, + "step": 55300 + }, + { + "epoch": 0.5531, + "grad_norm": 15.125, + "grad_norm_var": 0.70390625, + "learning_rate": 0.0003, + "loss": 11.0324, + "loss/aux_loss": 0.04808204546570778, + "loss/crossentropy": 2.8105109453201296, + "loss/logits": 0.8209470868110657, + "step": 55310 + }, + { + "epoch": 0.5532, + "grad_norm": 14.0, + "grad_norm_var": 0.2744140625, + "learning_rate": 0.0003, + "loss": 11.1437, + "loss/aux_loss": 0.04806657768785953, + "loss/crossentropy": 2.852742946147919, + "loss/logits": 0.8403396517038345, + "step": 55320 + }, + { + "epoch": 0.5533, + "grad_norm": 14.4375, + "grad_norm_var": 0.42473958333333334, + "learning_rate": 0.0003, + "loss": 10.9109, + "loss/aux_loss": 0.04807582776993513, + "loss/crossentropy": 2.6064063906669617, + "loss/logits": 0.8204812169075012, + "step": 55330 + }, + { + "epoch": 0.5534, + "grad_norm": 17.25, + "grad_norm_var": 0.7844889322916667, + "learning_rate": 0.0003, + "loss": 10.8695, + "loss/aux_loss": 0.04807104654610157, + "loss/crossentropy": 2.8387674689292908, + "loss/logits": 0.8554284036159515, + "step": 55340 + }, + { + "epoch": 0.5535, + "grad_norm": 19.25, + "grad_norm_var": 1.8325358072916667, + "learning_rate": 0.0003, + "loss": 10.8966, + "loss/aux_loss": 0.048077484220266344, + "loss/crossentropy": 2.6154538214206697, + "loss/logits": 0.7829844743013382, + "step": 55350 + }, + { + "epoch": 0.5536, + "grad_norm": 39.5, + "grad_norm_var": 38.35558268229167, + "learning_rate": 0.0003, + "loss": 10.9473, + "loss/aux_loss": 0.04806430507451296, + "loss/crossentropy": 2.670952570438385, + "loss/logits": 0.8368105083703995, + "step": 55360 + }, + { + "epoch": 0.5537, + "grad_norm": 14.6875, + "grad_norm_var": 37.1072265625, + "learning_rate": 0.0003, + "loss": 11.1894, + "loss/aux_loss": 0.04807990249246359, + "loss/crossentropy": 2.566202479600906, + "loss/logits": 0.8429438978433609, + "step": 55370 + }, + { + "epoch": 0.5538, + "grad_norm": 15.0, + "grad_norm_var": 0.7909993489583333, + "learning_rate": 0.0003, + "loss": 11.0169, + "loss/aux_loss": 0.04807171169668436, + "loss/crossentropy": 2.6199849128723143, + "loss/logits": 0.7806309968233108, + "step": 55380 + }, + { + "epoch": 0.5539, + "grad_norm": 14.6875, + "grad_norm_var": 0.7916666666666666, + "learning_rate": 0.0003, + "loss": 11.1617, + "loss/aux_loss": 0.04807775840163231, + "loss/crossentropy": 2.8030936300754545, + "loss/logits": 0.8265924125909805, + "step": 55390 + }, + { + "epoch": 0.554, + "grad_norm": 14.875, + "grad_norm_var": 1.0254557291666666, + "learning_rate": 0.0003, + "loss": 10.9235, + "loss/aux_loss": 0.04806862715631723, + "loss/crossentropy": 2.6641399443149565, + "loss/logits": 0.8228778213262558, + "step": 55400 + }, + { + "epoch": 0.5541, + "grad_norm": 14.5, + "grad_norm_var": 0.5369140625, + "learning_rate": 0.0003, + "loss": 11.1513, + "loss/aux_loss": 0.04806708451360464, + "loss/crossentropy": 2.73874751329422, + "loss/logits": 0.8570866554975509, + "step": 55410 + }, + { + "epoch": 0.5542, + "grad_norm": 14.5, + "grad_norm_var": 0.27545572916666666, + "learning_rate": 0.0003, + "loss": 11.0742, + "loss/aux_loss": 0.048080652765929696, + "loss/crossentropy": 2.6296676993370056, + "loss/logits": 0.8333312928676605, + "step": 55420 + }, + { + "epoch": 0.5543, + "grad_norm": 15.0, + "grad_norm_var": 4.2884765625, + "learning_rate": 0.0003, + "loss": 10.7601, + "loss/aux_loss": 0.04808066971600056, + "loss/crossentropy": 2.6608549892902373, + "loss/logits": 0.7777025699615479, + "step": 55430 + }, + { + "epoch": 0.5544, + "grad_norm": 15.1875, + "grad_norm_var": 4.875764973958334, + "learning_rate": 0.0003, + "loss": 10.8756, + "loss/aux_loss": 0.04807064030319452, + "loss/crossentropy": 2.6703452289104463, + "loss/logits": 0.8353601545095444, + "step": 55440 + }, + { + "epoch": 0.5545, + "grad_norm": 13.5625, + "grad_norm_var": 1.1557291666666667, + "learning_rate": 0.0003, + "loss": 10.8567, + "loss/aux_loss": 0.04806992132216692, + "loss/crossentropy": 2.8200165271759032, + "loss/logits": 0.8091388493776321, + "step": 55450 + }, + { + "epoch": 0.5546, + "grad_norm": 15.0, + "grad_norm_var": 26.2869140625, + "learning_rate": 0.0003, + "loss": 10.9664, + "loss/aux_loss": 0.04806872811168432, + "loss/crossentropy": 2.758486533164978, + "loss/logits": 0.814564099907875, + "step": 55460 + }, + { + "epoch": 0.5547, + "grad_norm": 17.25, + "grad_norm_var": 24.257666015625, + "learning_rate": 0.0003, + "loss": 10.9383, + "loss/aux_loss": 0.048077772557735446, + "loss/crossentropy": 2.850853431224823, + "loss/logits": 0.8375491023063659, + "step": 55470 + }, + { + "epoch": 0.5548, + "grad_norm": 15.4375, + "grad_norm_var": 0.7166666666666667, + "learning_rate": 0.0003, + "loss": 10.9209, + "loss/aux_loss": 0.04806488305330277, + "loss/crossentropy": 2.7074069380760193, + "loss/logits": 0.809591680765152, + "step": 55480 + }, + { + "epoch": 0.5549, + "grad_norm": 18.25, + "grad_norm_var": 115.28430989583333, + "learning_rate": 0.0003, + "loss": 11.1249, + "loss/aux_loss": 0.048068816773593426, + "loss/crossentropy": 2.7336994290351866, + "loss/logits": 0.8458652794361115, + "step": 55490 + }, + { + "epoch": 0.555, + "grad_norm": 15.3125, + "grad_norm_var": 115.85260416666667, + "learning_rate": 0.0003, + "loss": 11.0102, + "loss/aux_loss": 0.04807518254965544, + "loss/crossentropy": 2.773914611339569, + "loss/logits": 0.8233480423688888, + "step": 55500 + }, + { + "epoch": 0.5551, + "grad_norm": 15.125, + "grad_norm_var": 2.074934895833333, + "learning_rate": 0.0003, + "loss": 11.0417, + "loss/aux_loss": 0.048073450662195684, + "loss/crossentropy": 2.622314327955246, + "loss/logits": 0.8316299766302109, + "step": 55510 + }, + { + "epoch": 0.5552, + "grad_norm": 15.8125, + "grad_norm_var": 2.1219889322916665, + "learning_rate": 0.0003, + "loss": 10.9537, + "loss/aux_loss": 0.04807238392531872, + "loss/crossentropy": 2.688526248931885, + "loss/logits": 0.8633313030004501, + "step": 55520 + }, + { + "epoch": 0.5553, + "grad_norm": 14.5, + "grad_norm_var": 0.8098795572916667, + "learning_rate": 0.0003, + "loss": 11.1069, + "loss/aux_loss": 0.048073047399520875, + "loss/crossentropy": 2.9057937622070313, + "loss/logits": 0.8318546801805496, + "step": 55530 + }, + { + "epoch": 0.5554, + "grad_norm": 15.3125, + "grad_norm_var": 0.7106608072916667, + "learning_rate": 0.0003, + "loss": 10.8562, + "loss/aux_loss": 0.04806810189038515, + "loss/crossentropy": 2.677476871013641, + "loss/logits": 0.786837711930275, + "step": 55540 + }, + { + "epoch": 0.5555, + "grad_norm": 14.6875, + "grad_norm_var": 0.3433430989583333, + "learning_rate": 0.0003, + "loss": 11.1194, + "loss/aux_loss": 0.048065618611872196, + "loss/crossentropy": 2.7434459567070006, + "loss/logits": 0.813059389591217, + "step": 55550 + }, + { + "epoch": 0.5556, + "grad_norm": 14.5625, + "grad_norm_var": 0.3681640625, + "learning_rate": 0.0003, + "loss": 10.8911, + "loss/aux_loss": 0.04808599632233381, + "loss/crossentropy": 2.6911366164684294, + "loss/logits": 0.8269895523786545, + "step": 55560 + }, + { + "epoch": 0.5557, + "grad_norm": 14.25, + "grad_norm_var": 0.372509765625, + "learning_rate": 0.0003, + "loss": 10.9757, + "loss/aux_loss": 0.04807158019393683, + "loss/crossentropy": 2.719972950220108, + "loss/logits": 0.8304482787847519, + "step": 55570 + }, + { + "epoch": 0.5558, + "grad_norm": 14.8125, + "grad_norm_var": 1.05, + "learning_rate": 0.0003, + "loss": 11.0324, + "loss/aux_loss": 0.048060659877955915, + "loss/crossentropy": 2.758992946147919, + "loss/logits": 0.8127716392278671, + "step": 55580 + }, + { + "epoch": 0.5559, + "grad_norm": 15.6875, + "grad_norm_var": 0.48020833333333335, + "learning_rate": 0.0003, + "loss": 11.1187, + "loss/aux_loss": 0.04808644969016314, + "loss/crossentropy": 2.6615478575229643, + "loss/logits": 0.8186038672924042, + "step": 55590 + }, + { + "epoch": 0.556, + "grad_norm": 15.0625, + "grad_norm_var": 0.7535807291666666, + "learning_rate": 0.0003, + "loss": 10.984, + "loss/aux_loss": 0.04806722085922956, + "loss/crossentropy": 2.8301248073577883, + "loss/logits": 0.8224711626768112, + "step": 55600 + }, + { + "epoch": 0.5561, + "grad_norm": 15.6875, + "grad_norm_var": 0.9426432291666667, + "learning_rate": 0.0003, + "loss": 11.0619, + "loss/aux_loss": 0.048068858496844766, + "loss/crossentropy": 2.652260237932205, + "loss/logits": 0.8152152061462402, + "step": 55610 + }, + { + "epoch": 0.5562, + "grad_norm": 15.9375, + "grad_norm_var": 0.6843098958333333, + "learning_rate": 0.0003, + "loss": 11.0558, + "loss/aux_loss": 0.048064957931637765, + "loss/crossentropy": 2.8026095151901247, + "loss/logits": 0.8398134261369705, + "step": 55620 + }, + { + "epoch": 0.5563, + "grad_norm": 14.0, + "grad_norm_var": 0.9531087239583333, + "learning_rate": 0.0003, + "loss": 10.9296, + "loss/aux_loss": 0.048068616352975366, + "loss/crossentropy": 2.6979903995990755, + "loss/logits": 0.8080804139375687, + "step": 55630 + }, + { + "epoch": 0.5564, + "grad_norm": 14.875, + "grad_norm_var": 0.63671875, + "learning_rate": 0.0003, + "loss": 10.895, + "loss/aux_loss": 0.04807308129966259, + "loss/crossentropy": 2.7710575222969056, + "loss/logits": 0.807817280292511, + "step": 55640 + }, + { + "epoch": 0.5565, + "grad_norm": 15.1875, + "grad_norm_var": 0.8942057291666666, + "learning_rate": 0.0003, + "loss": 10.9262, + "loss/aux_loss": 0.04807392563670874, + "loss/crossentropy": 2.736243361234665, + "loss/logits": 0.8000924259424209, + "step": 55650 + }, + { + "epoch": 0.5566, + "grad_norm": 14.875, + "grad_norm_var": 0.2556640625, + "learning_rate": 0.0003, + "loss": 10.9558, + "loss/aux_loss": 0.04807540960609913, + "loss/crossentropy": 2.8422346234321596, + "loss/logits": 0.830447968840599, + "step": 55660 + }, + { + "epoch": 0.5567, + "grad_norm": 15.3125, + "grad_norm_var": 5286.068473307291, + "learning_rate": 0.0003, + "loss": 11.0703, + "loss/aux_loss": 0.04807211048901081, + "loss/crossentropy": 2.746444511413574, + "loss/logits": 0.8070830225944519, + "step": 55670 + }, + { + "epoch": 0.5568, + "grad_norm": 13.75, + "grad_norm_var": 5261.000244140625, + "learning_rate": 0.0003, + "loss": 10.9093, + "loss/aux_loss": 0.048075992986559866, + "loss/crossentropy": 2.802550220489502, + "loss/logits": 0.8187012434005737, + "step": 55680 + }, + { + "epoch": 0.5569, + "grad_norm": 16.125, + "grad_norm_var": 2.572249348958333, + "learning_rate": 0.0003, + "loss": 11.0327, + "loss/aux_loss": 0.048064975813031194, + "loss/crossentropy": 2.5375086605548858, + "loss/logits": 0.8208780288696289, + "step": 55690 + }, + { + "epoch": 0.557, + "grad_norm": 14.5, + "grad_norm_var": 0.4688639322916667, + "learning_rate": 0.0003, + "loss": 11.0094, + "loss/aux_loss": 0.04807396829128265, + "loss/crossentropy": 2.7012075066566466, + "loss/logits": 0.8320501059293747, + "step": 55700 + }, + { + "epoch": 0.5571, + "grad_norm": 14.25, + "grad_norm_var": 0.24140625, + "learning_rate": 0.0003, + "loss": 11.1335, + "loss/aux_loss": 0.048072258941829205, + "loss/crossentropy": 2.685439348220825, + "loss/logits": 0.8327443897724152, + "step": 55710 + }, + { + "epoch": 0.5572, + "grad_norm": 14.9375, + "grad_norm_var": 0.24166666666666667, + "learning_rate": 0.0003, + "loss": 11.0821, + "loss/aux_loss": 0.0480724660679698, + "loss/crossentropy": 2.8291414260864256, + "loss/logits": 0.8656550794839859, + "step": 55720 + }, + { + "epoch": 0.5573, + "grad_norm": 15.0625, + "grad_norm_var": 15.460791015625, + "learning_rate": 0.0003, + "loss": 10.9168, + "loss/aux_loss": 0.04806995950639248, + "loss/crossentropy": 2.7315680921077727, + "loss/logits": 0.7951398670673371, + "step": 55730 + }, + { + "epoch": 0.5574, + "grad_norm": 14.5625, + "grad_norm_var": 0.6952473958333333, + "learning_rate": 0.0003, + "loss": 11.2082, + "loss/aux_loss": 0.048077480867505074, + "loss/crossentropy": 2.734374761581421, + "loss/logits": 0.8259120464324952, + "step": 55740 + }, + { + "epoch": 0.5575, + "grad_norm": 16.0, + "grad_norm_var": 1.183837890625, + "learning_rate": 0.0003, + "loss": 11.066, + "loss/aux_loss": 0.048074356466531756, + "loss/crossentropy": 2.626655274629593, + "loss/logits": 0.8067521005868912, + "step": 55750 + }, + { + "epoch": 0.5576, + "grad_norm": 15.3125, + "grad_norm_var": 0.7752604166666667, + "learning_rate": 0.0003, + "loss": 11.0163, + "loss/aux_loss": 0.048075728304684165, + "loss/crossentropy": 2.6281754672527313, + "loss/logits": 0.8328516259789467, + "step": 55760 + }, + { + "epoch": 0.5577, + "grad_norm": 15.0, + "grad_norm_var": 1.0020182291666666, + "learning_rate": 0.0003, + "loss": 10.9977, + "loss/aux_loss": 0.04805116355419159, + "loss/crossentropy": 2.502528029680252, + "loss/logits": 0.7761318385601044, + "step": 55770 + }, + { + "epoch": 0.5578, + "grad_norm": 13.875, + "grad_norm_var": 0.8640462239583333, + "learning_rate": 0.0003, + "loss": 11.0557, + "loss/aux_loss": 0.04808572828769684, + "loss/crossentropy": 2.792075717449188, + "loss/logits": 0.8123959988355637, + "step": 55780 + }, + { + "epoch": 0.5579, + "grad_norm": 15.4375, + "grad_norm_var": 0.5848307291666667, + "learning_rate": 0.0003, + "loss": 11.0302, + "loss/aux_loss": 0.04807162135839462, + "loss/crossentropy": 2.764680355787277, + "loss/logits": 0.8557851523160934, + "step": 55790 + }, + { + "epoch": 0.558, + "grad_norm": 14.1875, + "grad_norm_var": 0.627197265625, + "learning_rate": 0.0003, + "loss": 11.0062, + "loss/aux_loss": 0.04807290825992823, + "loss/crossentropy": 2.6959391951560976, + "loss/logits": 0.8017183929681778, + "step": 55800 + }, + { + "epoch": 0.5581, + "grad_norm": 14.1875, + "grad_norm_var": 0.4681640625, + "learning_rate": 0.0003, + "loss": 10.9007, + "loss/aux_loss": 0.04806526694446802, + "loss/crossentropy": 2.712996482849121, + "loss/logits": 0.8240525692701339, + "step": 55810 + }, + { + "epoch": 0.5582, + "grad_norm": 21.375, + "grad_norm_var": 3.1531087239583333, + "learning_rate": 0.0003, + "loss": 11.0358, + "loss/aux_loss": 0.048070162907242774, + "loss/crossentropy": 2.7307373881340027, + "loss/logits": 0.82339708507061, + "step": 55820 + }, + { + "epoch": 0.5583, + "grad_norm": 13.75, + "grad_norm_var": 2.9661458333333335, + "learning_rate": 0.0003, + "loss": 10.8756, + "loss/aux_loss": 0.04807821772992611, + "loss/crossentropy": 2.8782392740249634, + "loss/logits": 0.8153641313314438, + "step": 55830 + }, + { + "epoch": 0.5584, + "grad_norm": 14.8125, + "grad_norm_var": 0.42823893229166665, + "learning_rate": 0.0003, + "loss": 11.0125, + "loss/aux_loss": 0.04807160831987858, + "loss/crossentropy": 2.8019445538520813, + "loss/logits": 0.8658664226531982, + "step": 55840 + }, + { + "epoch": 0.5585, + "grad_norm": 15.1875, + "grad_norm_var": 0.6179524739583333, + "learning_rate": 0.0003, + "loss": 11.0465, + "loss/aux_loss": 0.0480682285502553, + "loss/crossentropy": 2.633160024881363, + "loss/logits": 0.8404253333806991, + "step": 55850 + }, + { + "epoch": 0.5586, + "grad_norm": 14.25, + "grad_norm_var": 1.3625, + "learning_rate": 0.0003, + "loss": 11.0788, + "loss/aux_loss": 0.048069980926811695, + "loss/crossentropy": 2.977233016490936, + "loss/logits": 0.8548012495040893, + "step": 55860 + }, + { + "epoch": 0.5587, + "grad_norm": 17.5, + "grad_norm_var": 1.658837890625, + "learning_rate": 0.0003, + "loss": 11.0906, + "loss/aux_loss": 0.0480623546987772, + "loss/crossentropy": 2.668596589565277, + "loss/logits": 0.8072352677583694, + "step": 55870 + }, + { + "epoch": 0.5588, + "grad_norm": 15.1875, + "grad_norm_var": 1.0363932291666667, + "learning_rate": 0.0003, + "loss": 10.9759, + "loss/aux_loss": 0.04808857198804617, + "loss/crossentropy": 2.6548173129558563, + "loss/logits": 0.832793864607811, + "step": 55880 + }, + { + "epoch": 0.5589, + "grad_norm": 15.1875, + "grad_norm_var": 0.35305989583333336, + "learning_rate": 0.0003, + "loss": 10.9757, + "loss/aux_loss": 0.04806632045656443, + "loss/crossentropy": 2.6122241616249084, + "loss/logits": 0.8252353370189667, + "step": 55890 + }, + { + "epoch": 0.559, + "grad_norm": 14.3125, + "grad_norm_var": 0.826025390625, + "learning_rate": 0.0003, + "loss": 10.9732, + "loss/aux_loss": 0.0480646962299943, + "loss/crossentropy": 2.8674940884113314, + "loss/logits": 0.8213084667921067, + "step": 55900 + }, + { + "epoch": 0.5591, + "grad_norm": 14.25, + "grad_norm_var": 0.79921875, + "learning_rate": 0.0003, + "loss": 11.0364, + "loss/aux_loss": 0.04807482287287712, + "loss/crossentropy": 2.709700071811676, + "loss/logits": 0.8265916168689728, + "step": 55910 + }, + { + "epoch": 0.5592, + "grad_norm": 15.0, + "grad_norm_var": 1.2120930989583334, + "learning_rate": 0.0003, + "loss": 11.0549, + "loss/aux_loss": 0.048078625462949276, + "loss/crossentropy": 2.725412595272064, + "loss/logits": 0.8201213121414185, + "step": 55920 + }, + { + "epoch": 0.5593, + "grad_norm": 15.4375, + "grad_norm_var": 1.197900390625, + "learning_rate": 0.0003, + "loss": 11.0721, + "loss/aux_loss": 0.04807287231087685, + "loss/crossentropy": 2.781103193759918, + "loss/logits": 0.8102922707796096, + "step": 55930 + }, + { + "epoch": 0.5594, + "grad_norm": 14.25, + "grad_norm_var": 0.6386555989583333, + "learning_rate": 0.0003, + "loss": 11.234, + "loss/aux_loss": 0.04807069655507803, + "loss/crossentropy": 2.869676959514618, + "loss/logits": 0.8445936232805252, + "step": 55940 + }, + { + "epoch": 0.5595, + "grad_norm": 14.3125, + "grad_norm_var": 0.4103515625, + "learning_rate": 0.0003, + "loss": 10.9058, + "loss/aux_loss": 0.048074489645659926, + "loss/crossentropy": 2.652887338399887, + "loss/logits": 0.8485498696565628, + "step": 55950 + }, + { + "epoch": 0.5596, + "grad_norm": 15.0625, + "grad_norm_var": 0.6891764322916667, + "learning_rate": 0.0003, + "loss": 10.8934, + "loss/aux_loss": 0.04807630702853203, + "loss/crossentropy": 2.62935094833374, + "loss/logits": 0.8135641008615494, + "step": 55960 + }, + { + "epoch": 0.5597, + "grad_norm": 15.6875, + "grad_norm_var": 0.805322265625, + "learning_rate": 0.0003, + "loss": 11.1182, + "loss/aux_loss": 0.04807244669646025, + "loss/crossentropy": 2.4817294061183928, + "loss/logits": 0.804246386885643, + "step": 55970 + }, + { + "epoch": 0.5598, + "grad_norm": 15.125, + "grad_norm_var": 0.7141764322916667, + "learning_rate": 0.0003, + "loss": 10.9826, + "loss/aux_loss": 0.048072634264826775, + "loss/crossentropy": 2.8035527586936952, + "loss/logits": 0.8370449364185333, + "step": 55980 + }, + { + "epoch": 0.5599, + "grad_norm": 15.6875, + "grad_norm_var": 0.5895670572916667, + "learning_rate": 0.0003, + "loss": 10.9758, + "loss/aux_loss": 0.048080057837069035, + "loss/crossentropy": 2.8196861863136293, + "loss/logits": 0.8604692459106446, + "step": 55990 + }, + { + "epoch": 0.56, + "grad_norm": 14.125, + "grad_norm_var": 0.3753743489583333, + "learning_rate": 0.0003, + "loss": 11.119, + "loss/aux_loss": 0.04806259609758854, + "loss/crossentropy": 2.770169770717621, + "loss/logits": 0.8338570713996887, + "step": 56000 + }, + { + "epoch": 0.5601, + "grad_norm": 16.375, + "grad_norm_var": 0.40078125, + "learning_rate": 0.0003, + "loss": 11.1225, + "loss/aux_loss": 0.04806851968169212, + "loss/crossentropy": 2.778761512041092, + "loss/logits": 0.8658297926187515, + "step": 56010 + }, + { + "epoch": 0.5602, + "grad_norm": 15.5625, + "grad_norm_var": 0.825, + "learning_rate": 0.0003, + "loss": 11.145, + "loss/aux_loss": 0.04807616528123617, + "loss/crossentropy": 2.830496996641159, + "loss/logits": 0.8245423913002015, + "step": 56020 + }, + { + "epoch": 0.5603, + "grad_norm": 15.0625, + "grad_norm_var": 0.638916015625, + "learning_rate": 0.0003, + "loss": 10.9945, + "loss/aux_loss": 0.04807240832597017, + "loss/crossentropy": 2.755543279647827, + "loss/logits": 0.8172949731349946, + "step": 56030 + }, + { + "epoch": 0.5604, + "grad_norm": 15.4375, + "grad_norm_var": 0.6825358072916666, + "learning_rate": 0.0003, + "loss": 10.9785, + "loss/aux_loss": 0.04805968664586544, + "loss/crossentropy": 2.717133402824402, + "loss/logits": 0.8595670849084854, + "step": 56040 + }, + { + "epoch": 0.5605, + "grad_norm": 14.25, + "grad_norm_var": 0.8062337239583334, + "learning_rate": 0.0003, + "loss": 10.9878, + "loss/aux_loss": 0.0480790127068758, + "loss/crossentropy": 2.7759326457977296, + "loss/logits": 0.8354659885168075, + "step": 56050 + }, + { + "epoch": 0.5606, + "grad_norm": 14.5625, + "grad_norm_var": 0.37381184895833336, + "learning_rate": 0.0003, + "loss": 10.8017, + "loss/aux_loss": 0.048070153221488, + "loss/crossentropy": 2.7013581454753877, + "loss/logits": 0.8085485100746155, + "step": 56060 + }, + { + "epoch": 0.5607, + "grad_norm": 13.9375, + "grad_norm_var": 0.7577473958333333, + "learning_rate": 0.0003, + "loss": 10.8808, + "loss/aux_loss": 0.04807856027036905, + "loss/crossentropy": 2.527774375677109, + "loss/logits": 0.8128434181213379, + "step": 56070 + }, + { + "epoch": 0.5608, + "grad_norm": 14.0, + "grad_norm_var": 0.3275390625, + "learning_rate": 0.0003, + "loss": 11.0026, + "loss/aux_loss": 0.04807322099804878, + "loss/crossentropy": 2.6217800080776215, + "loss/logits": 0.8302909851074218, + "step": 56080 + }, + { + "epoch": 0.5609, + "grad_norm": 15.0, + "grad_norm_var": 0.44803059895833336, + "learning_rate": 0.0003, + "loss": 10.8046, + "loss/aux_loss": 0.04807568434625864, + "loss/crossentropy": 2.5421776592731478, + "loss/logits": 0.7773946285247803, + "step": 56090 + }, + { + "epoch": 0.561, + "grad_norm": 15.0, + "grad_norm_var": 0.2843098958333333, + "learning_rate": 0.0003, + "loss": 11.0388, + "loss/aux_loss": 0.04807381071150303, + "loss/crossentropy": 2.7090347170829774, + "loss/logits": 0.8462309181690216, + "step": 56100 + }, + { + "epoch": 0.5611, + "grad_norm": 15.625, + "grad_norm_var": 0.26764322916666666, + "learning_rate": 0.0003, + "loss": 11.1147, + "loss/aux_loss": 0.04806612860411406, + "loss/crossentropy": 2.7417237401008605, + "loss/logits": 0.8153161972761154, + "step": 56110 + }, + { + "epoch": 0.5612, + "grad_norm": 15.9375, + "grad_norm_var": 0.493603515625, + "learning_rate": 0.0003, + "loss": 11.0104, + "loss/aux_loss": 0.048076927475631234, + "loss/crossentropy": 2.685434067249298, + "loss/logits": 0.8215235829353332, + "step": 56120 + }, + { + "epoch": 0.5613, + "grad_norm": 15.0625, + "grad_norm_var": 0.543994140625, + "learning_rate": 0.0003, + "loss": 10.9183, + "loss/aux_loss": 0.048070359975099564, + "loss/crossentropy": 2.7782336354255674, + "loss/logits": 0.8645920783281327, + "step": 56130 + }, + { + "epoch": 0.5614, + "grad_norm": 15.9375, + "grad_norm_var": 0.56875, + "learning_rate": 0.0003, + "loss": 10.9654, + "loss/aux_loss": 0.04806331600993872, + "loss/crossentropy": 2.7166395127773284, + "loss/logits": 0.835795333981514, + "step": 56140 + }, + { + "epoch": 0.5615, + "grad_norm": 14.625, + "grad_norm_var": 3.842431640625, + "learning_rate": 0.0003, + "loss": 11.0505, + "loss/aux_loss": 0.04807646721601486, + "loss/crossentropy": 2.7557631850242617, + "loss/logits": 0.8347720831632615, + "step": 56150 + }, + { + "epoch": 0.5616, + "grad_norm": 14.75, + "grad_norm_var": 0.269384765625, + "learning_rate": 0.0003, + "loss": 11.1227, + "loss/aux_loss": 0.04807073958218098, + "loss/crossentropy": 2.834517753124237, + "loss/logits": 0.8356727987527848, + "step": 56160 + }, + { + "epoch": 0.5617, + "grad_norm": 14.9375, + "grad_norm_var": 0.3465983072916667, + "learning_rate": 0.0003, + "loss": 10.8892, + "loss/aux_loss": 0.048071736469864845, + "loss/crossentropy": 2.6536025047302245, + "loss/logits": 0.8201006531715394, + "step": 56170 + }, + { + "epoch": 0.5618, + "grad_norm": 17.0, + "grad_norm_var": 0.6214680989583333, + "learning_rate": 0.0003, + "loss": 11.0144, + "loss/aux_loss": 0.048070326820015906, + "loss/crossentropy": 2.6506611943244933, + "loss/logits": 0.7980666756629944, + "step": 56180 + }, + { + "epoch": 0.5619, + "grad_norm": 21.5, + "grad_norm_var": 3.3739583333333334, + "learning_rate": 0.0003, + "loss": 11.0292, + "loss/aux_loss": 0.048073834739625454, + "loss/crossentropy": 2.757190352678299, + "loss/logits": 0.8351425707340241, + "step": 56190 + }, + { + "epoch": 0.562, + "grad_norm": 14.25, + "grad_norm_var": 3.118212890625, + "learning_rate": 0.0003, + "loss": 11.0093, + "loss/aux_loss": 0.048075878620147706, + "loss/crossentropy": 2.798985254764557, + "loss/logits": 0.8397725850343705, + "step": 56200 + }, + { + "epoch": 0.5621, + "grad_norm": 14.125, + "grad_norm_var": 1.3070149739583334, + "learning_rate": 0.0003, + "loss": 10.9908, + "loss/aux_loss": 0.048066343553364275, + "loss/crossentropy": 2.794421637058258, + "loss/logits": 0.8280640333890915, + "step": 56210 + }, + { + "epoch": 0.5622, + "grad_norm": 13.875, + "grad_norm_var": 0.6166015625, + "learning_rate": 0.0003, + "loss": 10.968, + "loss/aux_loss": 0.04806768260896206, + "loss/crossentropy": 2.646075713634491, + "loss/logits": 0.8375303894281387, + "step": 56220 + }, + { + "epoch": 0.5623, + "grad_norm": 15.125, + "grad_norm_var": 0.5046223958333333, + "learning_rate": 0.0003, + "loss": 11.0031, + "loss/aux_loss": 0.04807407818734646, + "loss/crossentropy": 2.8416967034339904, + "loss/logits": 0.8331740826368332, + "step": 56230 + }, + { + "epoch": 0.5624, + "grad_norm": 15.875, + "grad_norm_var": 0.5088541666666667, + "learning_rate": 0.0003, + "loss": 11.121, + "loss/aux_loss": 0.04808174110949039, + "loss/crossentropy": 2.7374016523361204, + "loss/logits": 0.8004465430974961, + "step": 56240 + }, + { + "epoch": 0.5625, + "grad_norm": 14.625, + "grad_norm_var": 0.8033854166666666, + "learning_rate": 0.0003, + "loss": 11.0857, + "loss/aux_loss": 0.048068351671099664, + "loss/crossentropy": 2.691207242012024, + "loss/logits": 0.8244406789541244, + "step": 56250 + }, + { + "epoch": 0.5626, + "grad_norm": 15.25, + "grad_norm_var": 0.5994140625, + "learning_rate": 0.0003, + "loss": 11.033, + "loss/aux_loss": 0.048069524578750134, + "loss/crossentropy": 2.655339479446411, + "loss/logits": 0.8004418700933457, + "step": 56260 + }, + { + "epoch": 0.5627, + "grad_norm": 16.375, + "grad_norm_var": 1.406103515625, + "learning_rate": 0.0003, + "loss": 10.9853, + "loss/aux_loss": 0.04807061068713665, + "loss/crossentropy": 2.562644922733307, + "loss/logits": 0.7774939149618149, + "step": 56270 + }, + { + "epoch": 0.5628, + "grad_norm": 14.5, + "grad_norm_var": 110.99542643229167, + "learning_rate": 0.0003, + "loss": 10.9302, + "loss/aux_loss": 0.04808583036065102, + "loss/crossentropy": 2.7587235629558564, + "loss/logits": 0.8342153191566467, + "step": 56280 + }, + { + "epoch": 0.5629, + "grad_norm": 14.6875, + "grad_norm_var": 0.5614420572916666, + "learning_rate": 0.0003, + "loss": 10.8492, + "loss/aux_loss": 0.048062351532280445, + "loss/crossentropy": 2.640738385915756, + "loss/logits": 0.7902368202805519, + "step": 56290 + }, + { + "epoch": 0.563, + "grad_norm": 13.9375, + "grad_norm_var": 0.862353515625, + "learning_rate": 0.0003, + "loss": 10.9259, + "loss/aux_loss": 0.04808336030691862, + "loss/crossentropy": 2.662439024448395, + "loss/logits": 0.8200179070234299, + "step": 56300 + }, + { + "epoch": 0.5631, + "grad_norm": 13.9375, + "grad_norm_var": 1.118212890625, + "learning_rate": 0.0003, + "loss": 11.1451, + "loss/aux_loss": 0.048071098141372205, + "loss/crossentropy": 2.7258577704429627, + "loss/logits": 0.8454255849123001, + "step": 56310 + }, + { + "epoch": 0.5632, + "grad_norm": 14.25, + "grad_norm_var": 0.7738118489583333, + "learning_rate": 0.0003, + "loss": 10.8717, + "loss/aux_loss": 0.048069667629897596, + "loss/crossentropy": 2.721261328458786, + "loss/logits": 0.8305182576179504, + "step": 56320 + }, + { + "epoch": 0.5633, + "grad_norm": 14.3125, + "grad_norm_var": 0.325, + "learning_rate": 0.0003, + "loss": 10.9718, + "loss/aux_loss": 0.04807541910558939, + "loss/crossentropy": 2.828034371137619, + "loss/logits": 0.8110349535942077, + "step": 56330 + }, + { + "epoch": 0.5634, + "grad_norm": 13.75, + "grad_norm_var": 0.1484375, + "learning_rate": 0.0003, + "loss": 10.8651, + "loss/aux_loss": 0.04806641507893801, + "loss/crossentropy": 2.6425141513347628, + "loss/logits": 0.7947331488132476, + "step": 56340 + }, + { + "epoch": 0.5635, + "grad_norm": 13.75, + "grad_norm_var": 13.531103515625, + "learning_rate": 0.0003, + "loss": 10.824, + "loss/aux_loss": 0.048073524795472625, + "loss/crossentropy": 2.704647868871689, + "loss/logits": 0.7970446825027466, + "step": 56350 + }, + { + "epoch": 0.5636, + "grad_norm": 14.4375, + "grad_norm_var": 0.30618489583333336, + "learning_rate": 0.0003, + "loss": 10.8263, + "loss/aux_loss": 0.04807068221271038, + "loss/crossentropy": 2.7439758598804476, + "loss/logits": 0.8404293477535247, + "step": 56360 + }, + { + "epoch": 0.5637, + "grad_norm": 14.125, + "grad_norm_var": 0.9722493489583334, + "learning_rate": 0.0003, + "loss": 11.0797, + "loss/aux_loss": 0.048076880536973475, + "loss/crossentropy": 2.848537635803223, + "loss/logits": 0.8467506438493728, + "step": 56370 + }, + { + "epoch": 0.5638, + "grad_norm": 15.75, + "grad_norm_var": 0.7387858072916667, + "learning_rate": 0.0003, + "loss": 11.0897, + "loss/aux_loss": 0.04807664547115564, + "loss/crossentropy": 2.8234737038612367, + "loss/logits": 0.8553803592920304, + "step": 56380 + }, + { + "epoch": 0.5639, + "grad_norm": 14.5, + "grad_norm_var": 0.567431640625, + "learning_rate": 0.0003, + "loss": 11.0941, + "loss/aux_loss": 0.048065589554607865, + "loss/crossentropy": 2.7501831650733948, + "loss/logits": 0.8351290255784989, + "step": 56390 + }, + { + "epoch": 0.564, + "grad_norm": 18.125, + "grad_norm_var": 1.2236979166666666, + "learning_rate": 0.0003, + "loss": 11.1896, + "loss/aux_loss": 0.04807354472577572, + "loss/crossentropy": 2.709191882610321, + "loss/logits": 0.848085030913353, + "step": 56400 + }, + { + "epoch": 0.5641, + "grad_norm": 15.875, + "grad_norm_var": 0.8799479166666667, + "learning_rate": 0.0003, + "loss": 10.8941, + "loss/aux_loss": 0.04807153381407261, + "loss/crossentropy": 2.7052394211292268, + "loss/logits": 0.8169119179248809, + "step": 56410 + }, + { + "epoch": 0.5642, + "grad_norm": 17.0, + "grad_norm_var": 2.4324055989583333, + "learning_rate": 0.0003, + "loss": 10.9911, + "loss/aux_loss": 0.04807809721678495, + "loss/crossentropy": 2.7568470358848574, + "loss/logits": 0.8337729841470718, + "step": 56420 + }, + { + "epoch": 0.5643, + "grad_norm": 14.1875, + "grad_norm_var": 3.5363118489583334, + "learning_rate": 0.0003, + "loss": 10.8475, + "loss/aux_loss": 0.04806798957288265, + "loss/crossentropy": 2.55110359787941, + "loss/logits": 0.7923869863152504, + "step": 56430 + }, + { + "epoch": 0.5644, + "grad_norm": 16.625, + "grad_norm_var": 3.0268229166666667, + "learning_rate": 0.0003, + "loss": 11.0289, + "loss/aux_loss": 0.048070752806961534, + "loss/crossentropy": 2.7474220752716065, + "loss/logits": 0.8456249058246612, + "step": 56440 + }, + { + "epoch": 0.5645, + "grad_norm": 15.75, + "grad_norm_var": 2.623893229166667, + "learning_rate": 0.0003, + "loss": 11.0403, + "loss/aux_loss": 0.04807250145822763, + "loss/crossentropy": 2.650127410888672, + "loss/logits": 0.8189398646354675, + "step": 56450 + }, + { + "epoch": 0.5646, + "grad_norm": 14.8125, + "grad_norm_var": 0.6281087239583333, + "learning_rate": 0.0003, + "loss": 10.9889, + "loss/aux_loss": 0.048068339750170705, + "loss/crossentropy": 2.6610859453678133, + "loss/logits": 0.7993488103151322, + "step": 56460 + }, + { + "epoch": 0.5647, + "grad_norm": 14.625, + "grad_norm_var": 0.537353515625, + "learning_rate": 0.0003, + "loss": 11.0154, + "loss/aux_loss": 0.04807689357548952, + "loss/crossentropy": 2.7062296152114866, + "loss/logits": 0.8192767605185509, + "step": 56470 + }, + { + "epoch": 0.5648, + "grad_norm": 15.5, + "grad_norm_var": 0.7515462239583334, + "learning_rate": 0.0003, + "loss": 11.0572, + "loss/aux_loss": 0.048062050342559816, + "loss/crossentropy": 2.8266763508319857, + "loss/logits": 0.8351973295211792, + "step": 56480 + }, + { + "epoch": 0.5649, + "grad_norm": 14.125, + "grad_norm_var": 0.8270182291666667, + "learning_rate": 0.0003, + "loss": 11.0404, + "loss/aux_loss": 0.04807958882302046, + "loss/crossentropy": 2.7233566522598265, + "loss/logits": 0.8333428800106049, + "step": 56490 + }, + { + "epoch": 0.565, + "grad_norm": 14.1875, + "grad_norm_var": 0.41456705729166665, + "learning_rate": 0.0003, + "loss": 11.015, + "loss/aux_loss": 0.04806462060660124, + "loss/crossentropy": 2.6568702876567842, + "loss/logits": 0.840363684296608, + "step": 56500 + }, + { + "epoch": 0.5651, + "grad_norm": 15.1875, + "grad_norm_var": 1.0598307291666667, + "learning_rate": 0.0003, + "loss": 11.1722, + "loss/aux_loss": 0.048068604059517385, + "loss/crossentropy": 2.863558900356293, + "loss/logits": 0.8404663354158401, + "step": 56510 + }, + { + "epoch": 0.5652, + "grad_norm": 14.0625, + "grad_norm_var": 1.366650390625, + "learning_rate": 0.0003, + "loss": 10.9801, + "loss/aux_loss": 0.048066033609211446, + "loss/crossentropy": 2.7827521324157716, + "loss/logits": 0.833331236243248, + "step": 56520 + }, + { + "epoch": 0.5653, + "grad_norm": 16.25, + "grad_norm_var": 0.7650390625, + "learning_rate": 0.0003, + "loss": 11.0661, + "loss/aux_loss": 0.04807133264839649, + "loss/crossentropy": 2.691127985715866, + "loss/logits": 0.8125263452529907, + "step": 56530 + }, + { + "epoch": 0.5654, + "grad_norm": 14.3125, + "grad_norm_var": 0.5718587239583334, + "learning_rate": 0.0003, + "loss": 11.1224, + "loss/aux_loss": 0.04806945119053126, + "loss/crossentropy": 2.7853653192520142, + "loss/logits": 0.8257082641124726, + "step": 56540 + }, + { + "epoch": 0.5655, + "grad_norm": 15.375, + "grad_norm_var": 0.29270833333333335, + "learning_rate": 0.0003, + "loss": 10.965, + "loss/aux_loss": 0.048084007762372497, + "loss/crossentropy": 2.676611590385437, + "loss/logits": 0.8134146988391876, + "step": 56550 + }, + { + "epoch": 0.5656, + "grad_norm": 14.3125, + "grad_norm_var": 2.330322265625, + "learning_rate": 0.0003, + "loss": 10.794, + "loss/aux_loss": 0.048067791387438774, + "loss/crossentropy": 2.7615798473358155, + "loss/logits": 0.8393901348114013, + "step": 56560 + }, + { + "epoch": 0.5657, + "grad_norm": 14.3125, + "grad_norm_var": 1.1707682291666666, + "learning_rate": 0.0003, + "loss": 10.9696, + "loss/aux_loss": 0.0480693681165576, + "loss/crossentropy": 2.7393906354904174, + "loss/logits": 0.8108802825212479, + "step": 56570 + }, + { + "epoch": 0.5658, + "grad_norm": 15.875, + "grad_norm_var": 1.6731770833333333, + "learning_rate": 0.0003, + "loss": 10.9976, + "loss/aux_loss": 0.04807717688381672, + "loss/crossentropy": 2.7219885349273683, + "loss/logits": 0.8048550575971604, + "step": 56580 + }, + { + "epoch": 0.5659, + "grad_norm": 14.875, + "grad_norm_var": 1.264306640625, + "learning_rate": 0.0003, + "loss": 11.093, + "loss/aux_loss": 0.04807864520698786, + "loss/crossentropy": 2.777973675727844, + "loss/logits": 0.8606278628110886, + "step": 56590 + }, + { + "epoch": 0.566, + "grad_norm": 14.125, + "grad_norm_var": 1.4953125, + "learning_rate": 0.0003, + "loss": 10.868, + "loss/aux_loss": 0.04806201551109553, + "loss/crossentropy": 2.5385043144226076, + "loss/logits": 0.7936165243387222, + "step": 56600 + }, + { + "epoch": 0.5661, + "grad_norm": 14.1875, + "grad_norm_var": 0.692041015625, + "learning_rate": 0.0003, + "loss": 10.9311, + "loss/aux_loss": 0.04808430373668671, + "loss/crossentropy": 2.6752517938613893, + "loss/logits": 0.794241589307785, + "step": 56610 + }, + { + "epoch": 0.5662, + "grad_norm": 14.375, + "grad_norm_var": 1.6005208333333334, + "learning_rate": 0.0003, + "loss": 10.7976, + "loss/aux_loss": 0.04808196313679218, + "loss/crossentropy": 2.622132194042206, + "loss/logits": 0.7990513414144516, + "step": 56620 + }, + { + "epoch": 0.5663, + "grad_norm": 15.25, + "grad_norm_var": 1.3752604166666667, + "learning_rate": 0.0003, + "loss": 10.9685, + "loss/aux_loss": 0.048063672706484796, + "loss/crossentropy": 2.659779739379883, + "loss/logits": 0.8321389853954315, + "step": 56630 + }, + { + "epoch": 0.5664, + "grad_norm": 16.0, + "grad_norm_var": 0.3322265625, + "learning_rate": 0.0003, + "loss": 11.0019, + "loss/aux_loss": 0.04806230738759041, + "loss/crossentropy": 2.7045338630676268, + "loss/logits": 0.8264268547296524, + "step": 56640 + }, + { + "epoch": 0.5665, + "grad_norm": 14.4375, + "grad_norm_var": 0.6581868489583333, + "learning_rate": 0.0003, + "loss": 11.0197, + "loss/aux_loss": 0.04807612672448158, + "loss/crossentropy": 2.6564504504203796, + "loss/logits": 0.7988866597414017, + "step": 56650 + }, + { + "epoch": 0.5666, + "grad_norm": 14.625, + "grad_norm_var": 0.8681640625, + "learning_rate": 0.0003, + "loss": 10.7875, + "loss/aux_loss": 0.04807793591171503, + "loss/crossentropy": 2.573316812515259, + "loss/logits": 0.7921032071113586, + "step": 56660 + }, + { + "epoch": 0.5667, + "grad_norm": 15.5625, + "grad_norm_var": 0.5376139322916667, + "learning_rate": 0.0003, + "loss": 10.9263, + "loss/aux_loss": 0.048057892732322215, + "loss/crossentropy": 2.730033391714096, + "loss/logits": 0.8079028069972992, + "step": 56670 + }, + { + "epoch": 0.5668, + "grad_norm": 15.0, + "grad_norm_var": 0.26536458333333335, + "learning_rate": 0.0003, + "loss": 11.0131, + "loss/aux_loss": 0.04807726927101612, + "loss/crossentropy": 2.7096143126487733, + "loss/logits": 0.8181146889925003, + "step": 56680 + }, + { + "epoch": 0.5669, + "grad_norm": 15.0625, + "grad_norm_var": 0.4228515625, + "learning_rate": 0.0003, + "loss": 11.098, + "loss/aux_loss": 0.04807647932320833, + "loss/crossentropy": 2.642156887054443, + "loss/logits": 0.8269279479980469, + "step": 56690 + }, + { + "epoch": 0.567, + "grad_norm": 13.9375, + "grad_norm_var": 0.33123372395833334, + "learning_rate": 0.0003, + "loss": 10.9675, + "loss/aux_loss": 0.04807464182376862, + "loss/crossentropy": 2.8210769176483153, + "loss/logits": 0.83407823741436, + "step": 56700 + }, + { + "epoch": 0.5671, + "grad_norm": 15.4375, + "grad_norm_var": 0.3556640625, + "learning_rate": 0.0003, + "loss": 11.1748, + "loss/aux_loss": 0.04806642550975084, + "loss/crossentropy": 2.749396449327469, + "loss/logits": 0.8253662884235382, + "step": 56710 + }, + { + "epoch": 0.5672, + "grad_norm": 14.75, + "grad_norm_var": 0.515087890625, + "learning_rate": 0.0003, + "loss": 10.9303, + "loss/aux_loss": 0.04807874038815498, + "loss/crossentropy": 2.8592591881752014, + "loss/logits": 0.8499416679143905, + "step": 56720 + }, + { + "epoch": 0.5673, + "grad_norm": 15.5, + "grad_norm_var": 0.5817545572916667, + "learning_rate": 0.0003, + "loss": 11.0301, + "loss/aux_loss": 0.048067976161837576, + "loss/crossentropy": 2.7235675573349, + "loss/logits": 0.8350800782442093, + "step": 56730 + }, + { + "epoch": 0.5674, + "grad_norm": 15.6875, + "grad_norm_var": 0.468212890625, + "learning_rate": 0.0003, + "loss": 10.9828, + "loss/aux_loss": 0.04807393439114094, + "loss/crossentropy": 2.7318145632743835, + "loss/logits": 0.8563040405511856, + "step": 56740 + }, + { + "epoch": 0.5675, + "grad_norm": 14.1875, + "grad_norm_var": 2.0502604166666667, + "learning_rate": 0.0003, + "loss": 10.915, + "loss/aux_loss": 0.04806574210524559, + "loss/crossentropy": 2.855338990688324, + "loss/logits": 0.851107832789421, + "step": 56750 + }, + { + "epoch": 0.5676, + "grad_norm": 16.625, + "grad_norm_var": 2.186181640625, + "learning_rate": 0.0003, + "loss": 11.0244, + "loss/aux_loss": 0.04807633981108665, + "loss/crossentropy": 2.7718964219093323, + "loss/logits": 0.8375044643878937, + "step": 56760 + }, + { + "epoch": 0.5677, + "grad_norm": 16.0, + "grad_norm_var": 2.228645833333333, + "learning_rate": 0.0003, + "loss": 11.0663, + "loss/aux_loss": 0.048062844574451445, + "loss/crossentropy": 2.8246702313423158, + "loss/logits": 0.8107487201690674, + "step": 56770 + }, + { + "epoch": 0.5678, + "grad_norm": 14.75, + "grad_norm_var": 0.7020670572916666, + "learning_rate": 0.0003, + "loss": 11.0445, + "loss/aux_loss": 0.0480653140693903, + "loss/crossentropy": 2.789826810359955, + "loss/logits": 0.8652868360280991, + "step": 56780 + }, + { + "epoch": 0.5679, + "grad_norm": 14.625, + "grad_norm_var": 0.5175618489583333, + "learning_rate": 0.0003, + "loss": 10.8587, + "loss/aux_loss": 0.048073895275592804, + "loss/crossentropy": 2.729355055093765, + "loss/logits": 0.7821523636579514, + "step": 56790 + }, + { + "epoch": 0.568, + "grad_norm": 14.8125, + "grad_norm_var": 0.551416015625, + "learning_rate": 0.0003, + "loss": 11.0657, + "loss/aux_loss": 0.04806792289018631, + "loss/crossentropy": 2.722504496574402, + "loss/logits": 0.8235153377056121, + "step": 56800 + }, + { + "epoch": 0.5681, + "grad_norm": 15.0, + "grad_norm_var": 0.220166015625, + "learning_rate": 0.0003, + "loss": 11.0569, + "loss/aux_loss": 0.04807557370513678, + "loss/crossentropy": 2.571262764930725, + "loss/logits": 0.8196977347135543, + "step": 56810 + }, + { + "epoch": 0.5682, + "grad_norm": 14.8125, + "grad_norm_var": 0.18292643229166666, + "learning_rate": 0.0003, + "loss": 10.9235, + "loss/aux_loss": 0.04806826990097761, + "loss/crossentropy": 2.7095551788806915, + "loss/logits": 0.8231742322444916, + "step": 56820 + }, + { + "epoch": 0.5683, + "grad_norm": 15.3125, + "grad_norm_var": 0.6325358072916667, + "learning_rate": 0.0003, + "loss": 11.0522, + "loss/aux_loss": 0.04808192327618599, + "loss/crossentropy": 2.724127823114395, + "loss/logits": 0.7891089856624603, + "step": 56830 + }, + { + "epoch": 0.5684, + "grad_norm": 15.375, + "grad_norm_var": 0.35859375, + "learning_rate": 0.0003, + "loss": 11.1465, + "loss/aux_loss": 0.04806146658957004, + "loss/crossentropy": 2.758617115020752, + "loss/logits": 0.8087145060300827, + "step": 56840 + }, + { + "epoch": 0.5685, + "grad_norm": 13.5625, + "grad_norm_var": 0.4038899739583333, + "learning_rate": 0.0003, + "loss": 10.8962, + "loss/aux_loss": 0.048076235502958295, + "loss/crossentropy": 2.6616262257099152, + "loss/logits": 0.8008765608072281, + "step": 56850 + }, + { + "epoch": 0.5686, + "grad_norm": 14.625, + "grad_norm_var": 0.36822916666666666, + "learning_rate": 0.0003, + "loss": 10.8288, + "loss/aux_loss": 0.04807582087814808, + "loss/crossentropy": 2.8255065202713014, + "loss/logits": 0.8107618898153305, + "step": 56860 + }, + { + "epoch": 0.5687, + "grad_norm": 14.8125, + "grad_norm_var": 0.33396809895833335, + "learning_rate": 0.0003, + "loss": 10.976, + "loss/aux_loss": 0.04806512389332056, + "loss/crossentropy": 2.8445683240890505, + "loss/logits": 0.8330892562866211, + "step": 56870 + }, + { + "epoch": 0.5688, + "grad_norm": 15.1875, + "grad_norm_var": 0.38795572916666665, + "learning_rate": 0.0003, + "loss": 11.077, + "loss/aux_loss": 0.04806285053491592, + "loss/crossentropy": 2.6184718787670134, + "loss/logits": 0.8130939185619355, + "step": 56880 + }, + { + "epoch": 0.5689, + "grad_norm": 14.8125, + "grad_norm_var": 0.265478515625, + "learning_rate": 0.0003, + "loss": 11.112, + "loss/aux_loss": 0.04806987438350916, + "loss/crossentropy": 2.666793406009674, + "loss/logits": 0.8218275606632233, + "step": 56890 + }, + { + "epoch": 0.569, + "grad_norm": 14.9375, + "grad_norm_var": 0.2703125, + "learning_rate": 0.0003, + "loss": 10.9284, + "loss/aux_loss": 0.0480709794908762, + "loss/crossentropy": 2.6845811307430267, + "loss/logits": 0.8159762293100357, + "step": 56900 + }, + { + "epoch": 0.5691, + "grad_norm": 14.4375, + "grad_norm_var": 0.38748372395833336, + "learning_rate": 0.0003, + "loss": 11.0256, + "loss/aux_loss": 0.04806566461920738, + "loss/crossentropy": 2.7073962688446045, + "loss/logits": 0.815559196472168, + "step": 56910 + }, + { + "epoch": 0.5692, + "grad_norm": 16.125, + "grad_norm_var": 0.6726399739583333, + "learning_rate": 0.0003, + "loss": 10.9962, + "loss/aux_loss": 0.04807677231729031, + "loss/crossentropy": 2.6567338943481444, + "loss/logits": 0.7852911531925202, + "step": 56920 + }, + { + "epoch": 0.5693, + "grad_norm": 13.25, + "grad_norm_var": 0.807275390625, + "learning_rate": 0.0003, + "loss": 10.8256, + "loss/aux_loss": 0.04806611649692059, + "loss/crossentropy": 2.900643491744995, + "loss/logits": 0.8667346566915513, + "step": 56930 + }, + { + "epoch": 0.5694, + "grad_norm": 14.25, + "grad_norm_var": 0.8233723958333333, + "learning_rate": 0.0003, + "loss": 10.9697, + "loss/aux_loss": 0.048079632222652435, + "loss/crossentropy": 2.892075502872467, + "loss/logits": 0.8571143001317978, + "step": 56940 + }, + { + "epoch": 0.5695, + "grad_norm": 14.625, + "grad_norm_var": 0.5994140625, + "learning_rate": 0.0003, + "loss": 10.9265, + "loss/aux_loss": 0.048065055161714554, + "loss/crossentropy": 2.6348765909671785, + "loss/logits": 0.8346069097518921, + "step": 56950 + }, + { + "epoch": 0.5696, + "grad_norm": 14.125, + "grad_norm_var": 0.25983072916666666, + "learning_rate": 0.0003, + "loss": 10.9799, + "loss/aux_loss": 0.04807092547416687, + "loss/crossentropy": 2.802864468097687, + "loss/logits": 0.8114332973957061, + "step": 56960 + }, + { + "epoch": 0.5697, + "grad_norm": 15.75, + "grad_norm_var": 0.6822265625, + "learning_rate": 0.0003, + "loss": 10.8788, + "loss/aux_loss": 0.04807887505739927, + "loss/crossentropy": 2.7381537735462187, + "loss/logits": 0.87372607588768, + "step": 56970 + }, + { + "epoch": 0.5698, + "grad_norm": 13.9375, + "grad_norm_var": 0.7661295572916667, + "learning_rate": 0.0003, + "loss": 10.9834, + "loss/aux_loss": 0.04806207437068224, + "loss/crossentropy": 2.7998313903808594, + "loss/logits": 0.8182542502880097, + "step": 56980 + }, + { + "epoch": 0.5699, + "grad_norm": 16.125, + "grad_norm_var": 0.699462890625, + "learning_rate": 0.0003, + "loss": 11.0665, + "loss/aux_loss": 0.048081908747553824, + "loss/crossentropy": 2.635771578550339, + "loss/logits": 0.7996685534715653, + "step": 56990 + }, + { + "epoch": 0.57, + "grad_norm": 13.9375, + "grad_norm_var": 0.766650390625, + "learning_rate": 0.0003, + "loss": 10.9328, + "loss/aux_loss": 0.04807494562119245, + "loss/crossentropy": 2.6950223565101625, + "loss/logits": 0.8105741649866104, + "step": 57000 + }, + { + "epoch": 0.5701, + "grad_norm": 15.75, + "grad_norm_var": 0.7155598958333333, + "learning_rate": 0.0003, + "loss": 11.0767, + "loss/aux_loss": 0.0480563260614872, + "loss/crossentropy": 2.711246186494827, + "loss/logits": 0.8292560011148453, + "step": 57010 + }, + { + "epoch": 0.5702, + "grad_norm": 15.5625, + "grad_norm_var": 0.5749348958333333, + "learning_rate": 0.0003, + "loss": 10.8529, + "loss/aux_loss": 0.048080886527895925, + "loss/crossentropy": 2.688188964128494, + "loss/logits": 0.8574258774518967, + "step": 57020 + }, + { + "epoch": 0.5703, + "grad_norm": 14.0625, + "grad_norm_var": 0.40545247395833334, + "learning_rate": 0.0003, + "loss": 10.9149, + "loss/aux_loss": 0.04806966222822666, + "loss/crossentropy": 2.668558394908905, + "loss/logits": 0.813389179110527, + "step": 57030 + }, + { + "epoch": 0.5704, + "grad_norm": 14.875, + "grad_norm_var": 0.4353515625, + "learning_rate": 0.0003, + "loss": 11.0002, + "loss/aux_loss": 0.04807461742311716, + "loss/crossentropy": 2.6643282949924467, + "loss/logits": 0.7867618024349212, + "step": 57040 + }, + { + "epoch": 0.5705, + "grad_norm": 15.9375, + "grad_norm_var": 0.31886393229166665, + "learning_rate": 0.0003, + "loss": 10.9509, + "loss/aux_loss": 0.04807612039148808, + "loss/crossentropy": 2.7182164669036863, + "loss/logits": 0.778824046254158, + "step": 57050 + }, + { + "epoch": 0.5706, + "grad_norm": 15.875, + "grad_norm_var": 0.5471354166666667, + "learning_rate": 0.0003, + "loss": 10.9733, + "loss/aux_loss": 0.04806810449808836, + "loss/crossentropy": 2.8615632176399233, + "loss/logits": 0.8570520609617234, + "step": 57060 + }, + { + "epoch": 0.5707, + "grad_norm": 14.625, + "grad_norm_var": 0.9356770833333333, + "learning_rate": 0.0003, + "loss": 10.8977, + "loss/aux_loss": 0.04807794988155365, + "loss/crossentropy": 2.7868527293205263, + "loss/logits": 0.8298997163772583, + "step": 57070 + }, + { + "epoch": 0.5708, + "grad_norm": 15.1875, + "grad_norm_var": 0.87734375, + "learning_rate": 0.0003, + "loss": 11.081, + "loss/aux_loss": 0.0480685269460082, + "loss/crossentropy": 2.646866476535797, + "loss/logits": 0.8291843563318253, + "step": 57080 + }, + { + "epoch": 0.5709, + "grad_norm": 14.6875, + "grad_norm_var": 0.625634765625, + "learning_rate": 0.0003, + "loss": 10.9265, + "loss/aux_loss": 0.0480671152472496, + "loss/crossentropy": 2.781124436855316, + "loss/logits": 0.8275115400552749, + "step": 57090 + }, + { + "epoch": 0.571, + "grad_norm": 15.1875, + "grad_norm_var": 1.0968098958333334, + "learning_rate": 0.0003, + "loss": 11.0086, + "loss/aux_loss": 0.04807842988520861, + "loss/crossentropy": 2.722087186574936, + "loss/logits": 0.8378143638372422, + "step": 57100 + }, + { + "epoch": 0.5711, + "grad_norm": 13.9375, + "grad_norm_var": 1.2960774739583334, + "learning_rate": 0.0003, + "loss": 10.8447, + "loss/aux_loss": 0.04806256033480168, + "loss/crossentropy": 2.69550861120224, + "loss/logits": 0.8196415692567826, + "step": 57110 + }, + { + "epoch": 0.5712, + "grad_norm": 15.125, + "grad_norm_var": 0.9403483072916666, + "learning_rate": 0.0003, + "loss": 10.9062, + "loss/aux_loss": 0.0480593366548419, + "loss/crossentropy": 2.659187990427017, + "loss/logits": 0.7925233572721482, + "step": 57120 + }, + { + "epoch": 0.5713, + "grad_norm": 15.3125, + "grad_norm_var": 0.5572265625, + "learning_rate": 0.0003, + "loss": 10.9287, + "loss/aux_loss": 0.04807809740304947, + "loss/crossentropy": 2.5628524363040923, + "loss/logits": 0.778043681383133, + "step": 57130 + }, + { + "epoch": 0.5714, + "grad_norm": 17.875, + "grad_norm_var": 0.793212890625, + "learning_rate": 0.0003, + "loss": 10.9902, + "loss/aux_loss": 0.04807734601199627, + "loss/crossentropy": 2.6852267503738405, + "loss/logits": 0.821107491850853, + "step": 57140 + }, + { + "epoch": 0.5715, + "grad_norm": 23.625, + "grad_norm_var": 5.084879557291667, + "learning_rate": 0.0003, + "loss": 10.9945, + "loss/aux_loss": 0.04806005675345659, + "loss/crossentropy": 2.670514500141144, + "loss/logits": 0.8100097209215165, + "step": 57150 + }, + { + "epoch": 0.5716, + "grad_norm": 14.4375, + "grad_norm_var": 4.920817057291667, + "learning_rate": 0.0003, + "loss": 11.2069, + "loss/aux_loss": 0.04807403068989515, + "loss/crossentropy": 2.765077519416809, + "loss/logits": 0.8135815739631653, + "step": 57160 + }, + { + "epoch": 0.5717, + "grad_norm": 15.0625, + "grad_norm_var": 2.030322265625, + "learning_rate": 0.0003, + "loss": 10.9361, + "loss/aux_loss": 0.04807031713426113, + "loss/crossentropy": 2.7000171720981596, + "loss/logits": 0.7982536077499389, + "step": 57170 + }, + { + "epoch": 0.5718, + "grad_norm": 14.8125, + "grad_norm_var": 8.3837890625, + "learning_rate": 0.0003, + "loss": 10.9202, + "loss/aux_loss": 0.04807793851941824, + "loss/crossentropy": 2.6715080082416534, + "loss/logits": 0.800389638543129, + "step": 57180 + }, + { + "epoch": 0.5719, + "grad_norm": 15.0625, + "grad_norm_var": 7.883968098958333, + "learning_rate": 0.0003, + "loss": 11.1442, + "loss/aux_loss": 0.04807151965796948, + "loss/crossentropy": 2.6992808401584627, + "loss/logits": 0.8161318123340606, + "step": 57190 + }, + { + "epoch": 0.572, + "grad_norm": 14.9375, + "grad_norm_var": 0.543603515625, + "learning_rate": 0.0003, + "loss": 11.0557, + "loss/aux_loss": 0.04805929586291313, + "loss/crossentropy": 2.7212381601333617, + "loss/logits": 0.8454442709684372, + "step": 57200 + }, + { + "epoch": 0.5721, + "grad_norm": 15.375, + "grad_norm_var": 0.40358072916666665, + "learning_rate": 0.0003, + "loss": 11.0428, + "loss/aux_loss": 0.04806900396943092, + "loss/crossentropy": 2.807091176509857, + "loss/logits": 0.84793541431427, + "step": 57210 + }, + { + "epoch": 0.5722, + "grad_norm": 15.0625, + "grad_norm_var": 0.598681640625, + "learning_rate": 0.0003, + "loss": 10.9167, + "loss/aux_loss": 0.04806927982717753, + "loss/crossentropy": 2.9018397092819215, + "loss/logits": 0.8026500940322876, + "step": 57220 + }, + { + "epoch": 0.5723, + "grad_norm": 15.8125, + "grad_norm_var": 0.5054524739583334, + "learning_rate": 0.0003, + "loss": 10.9284, + "loss/aux_loss": 0.04806585274636745, + "loss/crossentropy": 2.7474361181259157, + "loss/logits": 0.7843928277492523, + "step": 57230 + }, + { + "epoch": 0.5724, + "grad_norm": 14.0625, + "grad_norm_var": 0.6785807291666667, + "learning_rate": 0.0003, + "loss": 11.0105, + "loss/aux_loss": 0.04807022046297789, + "loss/crossentropy": 2.684603381156921, + "loss/logits": 0.8285898119211197, + "step": 57240 + }, + { + "epoch": 0.5725, + "grad_norm": 15.0625, + "grad_norm_var": 0.7791015625, + "learning_rate": 0.0003, + "loss": 10.8737, + "loss/aux_loss": 0.04808129519224167, + "loss/crossentropy": 2.704842007160187, + "loss/logits": 0.8189653396606446, + "step": 57250 + }, + { + "epoch": 0.5726, + "grad_norm": 15.625, + "grad_norm_var": 0.44264322916666665, + "learning_rate": 0.0003, + "loss": 11.11, + "loss/aux_loss": 0.048068471066653726, + "loss/crossentropy": 2.755719757080078, + "loss/logits": 0.8350825071334839, + "step": 57260 + }, + { + "epoch": 0.5727, + "grad_norm": 16.125, + "grad_norm_var": 0.5872395833333334, + "learning_rate": 0.0003, + "loss": 11.0595, + "loss/aux_loss": 0.04805658888071775, + "loss/crossentropy": 2.747038698196411, + "loss/logits": 0.8029045939445496, + "step": 57270 + }, + { + "epoch": 0.5728, + "grad_norm": 15.625, + "grad_norm_var": 0.9292805989583334, + "learning_rate": 0.0003, + "loss": 11.097, + "loss/aux_loss": 0.04807424712926149, + "loss/crossentropy": 2.8873910784721373, + "loss/logits": 0.8244173586368561, + "step": 57280 + }, + { + "epoch": 0.5729, + "grad_norm": 13.6875, + "grad_norm_var": 1.3277180989583333, + "learning_rate": 0.0003, + "loss": 10.763, + "loss/aux_loss": 0.04808657988905907, + "loss/crossentropy": 2.5384365618228912, + "loss/logits": 0.8188546657562256, + "step": 57290 + }, + { + "epoch": 0.573, + "grad_norm": 14.3125, + "grad_norm_var": 0.25310872395833334, + "learning_rate": 0.0003, + "loss": 10.909, + "loss/aux_loss": 0.04805942717939615, + "loss/crossentropy": 2.7145915269851684, + "loss/logits": 0.7983001649379731, + "step": 57300 + }, + { + "epoch": 0.5731, + "grad_norm": 16.5, + "grad_norm_var": 0.5681640625, + "learning_rate": 0.0003, + "loss": 10.949, + "loss/aux_loss": 0.04808266796171665, + "loss/crossentropy": 2.680216872692108, + "loss/logits": 0.8321155905723572, + "step": 57310 + }, + { + "epoch": 0.5732, + "grad_norm": 14.75, + "grad_norm_var": 0.4596354166666667, + "learning_rate": 0.0003, + "loss": 11.0807, + "loss/aux_loss": 0.0480640958994627, + "loss/crossentropy": 2.772093391418457, + "loss/logits": 0.8285915166139602, + "step": 57320 + }, + { + "epoch": 0.5733, + "grad_norm": 15.375, + "grad_norm_var": 0.4398274739583333, + "learning_rate": 0.0003, + "loss": 11.0067, + "loss/aux_loss": 0.048067055828869346, + "loss/crossentropy": 2.7100286722183227, + "loss/logits": 0.8241377651691437, + "step": 57330 + }, + { + "epoch": 0.5734, + "grad_norm": 14.75, + "grad_norm_var": 1.8723307291666667, + "learning_rate": 0.0003, + "loss": 10.9345, + "loss/aux_loss": 0.04808278437703848, + "loss/crossentropy": 2.6721641540527346, + "loss/logits": 0.8055184870958328, + "step": 57340 + }, + { + "epoch": 0.5735, + "grad_norm": 18.625, + "grad_norm_var": 2.445817057291667, + "learning_rate": 0.0003, + "loss": 10.886, + "loss/aux_loss": 0.04806376602500677, + "loss/crossentropy": 2.6517118215560913, + "loss/logits": 0.8045364022254944, + "step": 57350 + }, + { + "epoch": 0.5736, + "grad_norm": 13.875, + "grad_norm_var": 1.1486979166666667, + "learning_rate": 0.0003, + "loss": 10.9331, + "loss/aux_loss": 0.048079241439700125, + "loss/crossentropy": 2.7825541257858277, + "loss/logits": 0.8218899816274643, + "step": 57360 + }, + { + "epoch": 0.5737, + "grad_norm": 15.6875, + "grad_norm_var": 85.743212890625, + "learning_rate": 0.0003, + "loss": 10.9565, + "loss/aux_loss": 0.0480652479454875, + "loss/crossentropy": 2.707579892873764, + "loss/logits": 0.82884761095047, + "step": 57370 + }, + { + "epoch": 0.5738, + "grad_norm": 14.75, + "grad_norm_var": 91.01847330729167, + "learning_rate": 0.0003, + "loss": 11.0795, + "loss/aux_loss": 0.048074961826205256, + "loss/crossentropy": 2.7413926482200623, + "loss/logits": 0.7939124822616577, + "step": 57380 + }, + { + "epoch": 0.5739, + "grad_norm": 14.75, + "grad_norm_var": 2.615738932291667, + "learning_rate": 0.0003, + "loss": 10.9845, + "loss/aux_loss": 0.04806408416479826, + "loss/crossentropy": 2.707012790441513, + "loss/logits": 0.830555847287178, + "step": 57390 + }, + { + "epoch": 0.574, + "grad_norm": 13.875, + "grad_norm_var": 0.516259765625, + "learning_rate": 0.0003, + "loss": 11.0791, + "loss/aux_loss": 0.048079000785946846, + "loss/crossentropy": 2.5664061307907104, + "loss/logits": 0.7763120234012604, + "step": 57400 + }, + { + "epoch": 0.5741, + "grad_norm": 14.6875, + "grad_norm_var": 0.56484375, + "learning_rate": 0.0003, + "loss": 11.137, + "loss/aux_loss": 0.04807056300342083, + "loss/crossentropy": 2.758346974849701, + "loss/logits": 0.8242935687303543, + "step": 57410 + }, + { + "epoch": 0.5742, + "grad_norm": 14.6875, + "grad_norm_var": 0.2916666666666667, + "learning_rate": 0.0003, + "loss": 10.9833, + "loss/aux_loss": 0.04807440787553787, + "loss/crossentropy": 2.6768109679222105, + "loss/logits": 0.811660248041153, + "step": 57420 + }, + { + "epoch": 0.5743, + "grad_norm": 15.3125, + "grad_norm_var": 0.4571451822916667, + "learning_rate": 0.0003, + "loss": 11.1252, + "loss/aux_loss": 0.0480594988912344, + "loss/crossentropy": 2.7542240738868715, + "loss/logits": 0.8283898085355759, + "step": 57430 + }, + { + "epoch": 0.5744, + "grad_norm": 15.3125, + "grad_norm_var": 0.39576822916666665, + "learning_rate": 0.0003, + "loss": 10.9843, + "loss/aux_loss": 0.04808159470558167, + "loss/crossentropy": 2.7614540815353394, + "loss/logits": 0.8680014103651047, + "step": 57440 + }, + { + "epoch": 0.5745, + "grad_norm": 14.6875, + "grad_norm_var": 1.1231770833333334, + "learning_rate": 0.0003, + "loss": 10.8306, + "loss/aux_loss": 0.04807989429682493, + "loss/crossentropy": 2.7385359168052674, + "loss/logits": 0.8156585484743119, + "step": 57450 + }, + { + "epoch": 0.5746, + "grad_norm": 15.1875, + "grad_norm_var": 0.28326822916666666, + "learning_rate": 0.0003, + "loss": 10.9561, + "loss/aux_loss": 0.04805928226560354, + "loss/crossentropy": 2.669804847240448, + "loss/logits": 0.8256619513034821, + "step": 57460 + }, + { + "epoch": 0.5747, + "grad_norm": 14.9375, + "grad_norm_var": 0.14889322916666667, + "learning_rate": 0.0003, + "loss": 10.9695, + "loss/aux_loss": 0.04807625375688076, + "loss/crossentropy": 2.615860992670059, + "loss/logits": 0.8644401401281356, + "step": 57470 + }, + { + "epoch": 0.5748, + "grad_norm": 50.25, + "grad_norm_var": 77.43899739583334, + "learning_rate": 0.0003, + "loss": 11.2276, + "loss/aux_loss": 0.0480669941753149, + "loss/crossentropy": 2.826398515701294, + "loss/logits": 0.8551149964332581, + "step": 57480 + }, + { + "epoch": 0.5749, + "grad_norm": 14.8125, + "grad_norm_var": 76.96354166666667, + "learning_rate": 0.0003, + "loss": 11.0055, + "loss/aux_loss": 0.04806830994784832, + "loss/crossentropy": 2.7592093706130982, + "loss/logits": 0.8087594985961915, + "step": 57490 + }, + { + "epoch": 0.575, + "grad_norm": 14.8125, + "grad_norm_var": 0.399072265625, + "learning_rate": 0.0003, + "loss": 11.0602, + "loss/aux_loss": 0.04806661587208509, + "loss/crossentropy": 2.6397600889205934, + "loss/logits": 0.7925887256860733, + "step": 57500 + }, + { + "epoch": 0.5751, + "grad_norm": 15.25, + "grad_norm_var": 0.46990559895833334, + "learning_rate": 0.0003, + "loss": 11.14, + "loss/aux_loss": 0.04806430134922266, + "loss/crossentropy": 2.7477360010147094, + "loss/logits": 0.8559922903776169, + "step": 57510 + }, + { + "epoch": 0.5752, + "grad_norm": 14.375, + "grad_norm_var": 1.5181640625, + "learning_rate": 0.0003, + "loss": 11.0349, + "loss/aux_loss": 0.04808305911719799, + "loss/crossentropy": 2.5889110445976256, + "loss/logits": 0.8387425035238266, + "step": 57520 + }, + { + "epoch": 0.5753, + "grad_norm": 15.875, + "grad_norm_var": 1.505322265625, + "learning_rate": 0.0003, + "loss": 10.9867, + "loss/aux_loss": 0.048064771480858325, + "loss/crossentropy": 2.609954422712326, + "loss/logits": 0.8136496782302857, + "step": 57530 + }, + { + "epoch": 0.5754, + "grad_norm": 15.8125, + "grad_norm_var": 0.9311848958333333, + "learning_rate": 0.0003, + "loss": 11.0806, + "loss/aux_loss": 0.04807123206555843, + "loss/crossentropy": 2.7172460675239565, + "loss/logits": 0.8387424349784851, + "step": 57540 + }, + { + "epoch": 0.5755, + "grad_norm": 14.1875, + "grad_norm_var": 0.79453125, + "learning_rate": 0.0003, + "loss": 11.023, + "loss/aux_loss": 0.04807838406413793, + "loss/crossentropy": 2.702912151813507, + "loss/logits": 0.8372643262147903, + "step": 57550 + }, + { + "epoch": 0.5756, + "grad_norm": 15.8125, + "grad_norm_var": 0.3731770833333333, + "learning_rate": 0.0003, + "loss": 10.9375, + "loss/aux_loss": 0.04806549474596977, + "loss/crossentropy": 2.909746289253235, + "loss/logits": 0.8345662504434586, + "step": 57560 + }, + { + "epoch": 0.5757, + "grad_norm": 15.125, + "grad_norm_var": 0.7208333333333333, + "learning_rate": 0.0003, + "loss": 10.9115, + "loss/aux_loss": 0.04808139894157648, + "loss/crossentropy": 2.558688461780548, + "loss/logits": 0.8080873370170594, + "step": 57570 + }, + { + "epoch": 0.5758, + "grad_norm": 15.4375, + "grad_norm_var": 0.42355143229166664, + "learning_rate": 0.0003, + "loss": 10.9426, + "loss/aux_loss": 0.04807103350758553, + "loss/crossentropy": 2.7293295919895173, + "loss/logits": 0.8171383291482925, + "step": 57580 + }, + { + "epoch": 0.5759, + "grad_norm": 16.0, + "grad_norm_var": 1.1202473958333334, + "learning_rate": 0.0003, + "loss": 11.0332, + "loss/aux_loss": 0.04807331319898367, + "loss/crossentropy": 2.7175457954406737, + "loss/logits": 0.8126596748828888, + "step": 57590 + }, + { + "epoch": 0.576, + "grad_norm": 16.0, + "grad_norm_var": 1.250634765625, + "learning_rate": 0.0003, + "loss": 11.0771, + "loss/aux_loss": 0.04807139728218317, + "loss/crossentropy": 2.728803825378418, + "loss/logits": 0.8120762914419174, + "step": 57600 + }, + { + "epoch": 0.5761, + "grad_norm": 14.1875, + "grad_norm_var": 0.6979166666666666, + "learning_rate": 0.0003, + "loss": 10.973, + "loss/aux_loss": 0.04807168822735548, + "loss/crossentropy": 2.6952412009239195, + "loss/logits": 0.8491257846355438, + "step": 57610 + }, + { + "epoch": 0.5762, + "grad_norm": 15.125, + "grad_norm_var": 0.345556640625, + "learning_rate": 0.0003, + "loss": 10.8365, + "loss/aux_loss": 0.048068418726325036, + "loss/crossentropy": 2.651057040691376, + "loss/logits": 0.7869156956672668, + "step": 57620 + }, + { + "epoch": 0.5763, + "grad_norm": 22.0, + "grad_norm_var": 3.711962890625, + "learning_rate": 0.0003, + "loss": 11.1586, + "loss/aux_loss": 0.04807181134819984, + "loss/crossentropy": 2.6818241477012634, + "loss/logits": 0.8666929543018341, + "step": 57630 + }, + { + "epoch": 0.5764, + "grad_norm": 15.1875, + "grad_norm_var": 3.421875, + "learning_rate": 0.0003, + "loss": 11.0666, + "loss/aux_loss": 0.0480666371062398, + "loss/crossentropy": 2.6690493881702424, + "loss/logits": 0.8045336902141571, + "step": 57640 + }, + { + "epoch": 0.5765, + "grad_norm": 15.5, + "grad_norm_var": 0.502978515625, + "learning_rate": 0.0003, + "loss": 11.1792, + "loss/aux_loss": 0.04807822220027447, + "loss/crossentropy": 2.7619189381599427, + "loss/logits": 0.8294977605342865, + "step": 57650 + }, + { + "epoch": 0.5766, + "grad_norm": 13.875, + "grad_norm_var": 0.6133951822916667, + "learning_rate": 0.0003, + "loss": 10.9134, + "loss/aux_loss": 0.04808087293058634, + "loss/crossentropy": 2.7703096151351927, + "loss/logits": 0.7943071156740189, + "step": 57660 + }, + { + "epoch": 0.5767, + "grad_norm": 15.8125, + "grad_norm_var": 0.8546223958333333, + "learning_rate": 0.0003, + "loss": 10.9094, + "loss/aux_loss": 0.048060201853513715, + "loss/crossentropy": 2.537974363565445, + "loss/logits": 0.7927771121263504, + "step": 57670 + }, + { + "epoch": 0.5768, + "grad_norm": 15.875, + "grad_norm_var": 0.7843098958333333, + "learning_rate": 0.0003, + "loss": 10.9885, + "loss/aux_loss": 0.04805659111589193, + "loss/crossentropy": 2.7599482774734496, + "loss/logits": 0.8290839821100235, + "step": 57680 + }, + { + "epoch": 0.5769, + "grad_norm": 15.5, + "grad_norm_var": 92.83125, + "learning_rate": 0.0003, + "loss": 11.1651, + "loss/aux_loss": 0.04809475895017386, + "loss/crossentropy": 2.85399044752121, + "loss/logits": 0.8747380167245865, + "step": 57690 + }, + { + "epoch": 0.577, + "grad_norm": 16.375, + "grad_norm_var": 42.48951822916667, + "learning_rate": 0.0003, + "loss": 11.2927, + "loss/aux_loss": 0.048062573187053204, + "loss/crossentropy": 2.8536964416503907, + "loss/logits": 0.8355174720287323, + "step": 57700 + }, + { + "epoch": 0.5771, + "grad_norm": 15.4375, + "grad_norm_var": 0.37890625, + "learning_rate": 0.0003, + "loss": 11.0133, + "loss/aux_loss": 0.04806618671864271, + "loss/crossentropy": 2.658688408136368, + "loss/logits": 0.8183623373508453, + "step": 57710 + }, + { + "epoch": 0.5772, + "grad_norm": 15.0, + "grad_norm_var": 0.47076822916666666, + "learning_rate": 0.0003, + "loss": 10.9248, + "loss/aux_loss": 0.04806567393243313, + "loss/crossentropy": 2.682706815004349, + "loss/logits": 0.8114593774080276, + "step": 57720 + }, + { + "epoch": 0.5773, + "grad_norm": 14.9375, + "grad_norm_var": 0.7301432291666666, + "learning_rate": 0.0003, + "loss": 10.7614, + "loss/aux_loss": 0.04807316064834595, + "loss/crossentropy": 2.6830021500587464, + "loss/logits": 0.8217417180538178, + "step": 57730 + }, + { + "epoch": 0.5774, + "grad_norm": 14.375, + "grad_norm_var": 0.46848958333333335, + "learning_rate": 0.0003, + "loss": 10.8117, + "loss/aux_loss": 0.048069142177700995, + "loss/crossentropy": 2.680797153711319, + "loss/logits": 0.7941523939371109, + "step": 57740 + }, + { + "epoch": 0.5775, + "grad_norm": 14.8125, + "grad_norm_var": 0.4583333333333333, + "learning_rate": 0.0003, + "loss": 10.8973, + "loss/aux_loss": 0.048070931993424895, + "loss/crossentropy": 2.7513445258140563, + "loss/logits": 0.8205919414758682, + "step": 57750 + }, + { + "epoch": 0.5776, + "grad_norm": 14.5, + "grad_norm_var": 0.47537434895833336, + "learning_rate": 0.0003, + "loss": 11.0942, + "loss/aux_loss": 0.04808807913213968, + "loss/crossentropy": 2.708397227525711, + "loss/logits": 0.7808063089847564, + "step": 57760 + }, + { + "epoch": 0.5777, + "grad_norm": 15.3125, + "grad_norm_var": 0.385791015625, + "learning_rate": 0.0003, + "loss": 10.9057, + "loss/aux_loss": 0.04805845711380243, + "loss/crossentropy": 2.7761632323265077, + "loss/logits": 0.8206240832805634, + "step": 57770 + }, + { + "epoch": 0.5778, + "grad_norm": 15.3125, + "grad_norm_var": 0.806494140625, + "learning_rate": 0.0003, + "loss": 10.9635, + "loss/aux_loss": 0.048069931007921694, + "loss/crossentropy": 2.7127468466758726, + "loss/logits": 0.8321121394634247, + "step": 57780 + }, + { + "epoch": 0.5779, + "grad_norm": 15.3125, + "grad_norm_var": 0.6773274739583334, + "learning_rate": 0.0003, + "loss": 10.8796, + "loss/aux_loss": 0.048075957037508485, + "loss/crossentropy": 2.5557093918323517, + "loss/logits": 0.7811422199010849, + "step": 57790 + }, + { + "epoch": 0.578, + "grad_norm": 15.4375, + "grad_norm_var": 0.46087239583333334, + "learning_rate": 0.0003, + "loss": 11.0337, + "loss/aux_loss": 0.048079724051058294, + "loss/crossentropy": 2.7828991770744325, + "loss/logits": 0.8361575275659561, + "step": 57800 + }, + { + "epoch": 0.5781, + "grad_norm": 15.375, + "grad_norm_var": 0.5171223958333333, + "learning_rate": 0.0003, + "loss": 10.7984, + "loss/aux_loss": 0.04807369504123926, + "loss/crossentropy": 2.767413020133972, + "loss/logits": 0.7979224413633347, + "step": 57810 + }, + { + "epoch": 0.5782, + "grad_norm": 15.5625, + "grad_norm_var": 12.863997395833334, + "learning_rate": 0.0003, + "loss": 11.0843, + "loss/aux_loss": 0.04807633981108665, + "loss/crossentropy": 2.841790997982025, + "loss/logits": 0.8295446068048478, + "step": 57820 + }, + { + "epoch": 0.5783, + "grad_norm": 14.4375, + "grad_norm_var": 11.975374348958333, + "learning_rate": 0.0003, + "loss": 11.0327, + "loss/aux_loss": 0.04806151837110519, + "loss/crossentropy": 2.8679856061935425, + "loss/logits": 0.863958340883255, + "step": 57830 + }, + { + "epoch": 0.5784, + "grad_norm": 14.1875, + "grad_norm_var": 0.6630208333333333, + "learning_rate": 0.0003, + "loss": 10.7655, + "loss/aux_loss": 0.04806194268167019, + "loss/crossentropy": 2.6133798182010652, + "loss/logits": 0.8144498199224472, + "step": 57840 + }, + { + "epoch": 0.5785, + "grad_norm": 14.625, + "grad_norm_var": 0.43020833333333336, + "learning_rate": 0.0003, + "loss": 10.9158, + "loss/aux_loss": 0.04808299690485, + "loss/crossentropy": 2.5752854347229004, + "loss/logits": 0.7725576773285866, + "step": 57850 + }, + { + "epoch": 0.5786, + "grad_norm": 14.4375, + "grad_norm_var": 0.30078125, + "learning_rate": 0.0003, + "loss": 11.1358, + "loss/aux_loss": 0.04805977363139391, + "loss/crossentropy": 2.6507094621658327, + "loss/logits": 0.8261604458093643, + "step": 57860 + }, + { + "epoch": 0.5787, + "grad_norm": 14.6875, + "grad_norm_var": 0.7885416666666667, + "learning_rate": 0.0003, + "loss": 10.9286, + "loss/aux_loss": 0.04806450437754393, + "loss/crossentropy": 2.7155093371868135, + "loss/logits": 0.8360859841108322, + "step": 57870 + }, + { + "epoch": 0.5788, + "grad_norm": 14.75, + "grad_norm_var": 2.076546223958333, + "learning_rate": 0.0003, + "loss": 11.0901, + "loss/aux_loss": 0.048075138591229916, + "loss/crossentropy": 2.7140918552875517, + "loss/logits": 0.8228471457958222, + "step": 57880 + }, + { + "epoch": 0.5789, + "grad_norm": 14.75, + "grad_norm_var": 2.1946451822916666, + "learning_rate": 0.0003, + "loss": 11.0579, + "loss/aux_loss": 0.048060869611799714, + "loss/crossentropy": 2.7325907826423643, + "loss/logits": 0.8381363540887833, + "step": 57890 + }, + { + "epoch": 0.579, + "grad_norm": 14.75, + "grad_norm_var": 0.9061848958333333, + "learning_rate": 0.0003, + "loss": 10.9924, + "loss/aux_loss": 0.048078179731965064, + "loss/crossentropy": 2.738635867834091, + "loss/logits": 0.8476099342107772, + "step": 57900 + }, + { + "epoch": 0.5791, + "grad_norm": 15.0, + "grad_norm_var": 1.1124837239583334, + "learning_rate": 0.0003, + "loss": 10.9735, + "loss/aux_loss": 0.048069071024656296, + "loss/crossentropy": 2.8026002764701845, + "loss/logits": 0.8490731894969941, + "step": 57910 + }, + { + "epoch": 0.5792, + "grad_norm": 14.3125, + "grad_norm_var": 0.7389973958333333, + "learning_rate": 0.0003, + "loss": 11.034, + "loss/aux_loss": 0.04806493632495403, + "loss/crossentropy": 2.6233414888381956, + "loss/logits": 0.8417493313550949, + "step": 57920 + }, + { + "epoch": 0.5793, + "grad_norm": 15.9375, + "grad_norm_var": 0.7728515625, + "learning_rate": 0.0003, + "loss": 10.9387, + "loss/aux_loss": 0.04806803483515978, + "loss/crossentropy": 2.729952883720398, + "loss/logits": 0.8270312875509263, + "step": 57930 + }, + { + "epoch": 0.5794, + "grad_norm": 17.875, + "grad_norm_var": 2.8348795572916665, + "learning_rate": 0.0003, + "loss": 11.1579, + "loss/aux_loss": 0.04808615278452635, + "loss/crossentropy": 2.733921545743942, + "loss/logits": 0.8263924434781075, + "step": 57940 + }, + { + "epoch": 0.5795, + "grad_norm": 14.5, + "grad_norm_var": 0.916650390625, + "learning_rate": 0.0003, + "loss": 11.0477, + "loss/aux_loss": 0.04806936271488667, + "loss/crossentropy": 2.6277839660644533, + "loss/logits": 0.8033677011728286, + "step": 57950 + }, + { + "epoch": 0.5796, + "grad_norm": 15.125, + "grad_norm_var": 0.39837239583333334, + "learning_rate": 0.0003, + "loss": 10.9133, + "loss/aux_loss": 0.048060805164277555, + "loss/crossentropy": 2.694873237609863, + "loss/logits": 0.8217334061861038, + "step": 57960 + }, + { + "epoch": 0.5797, + "grad_norm": 14.125, + "grad_norm_var": 0.5139973958333334, + "learning_rate": 0.0003, + "loss": 10.9741, + "loss/aux_loss": 0.04808518867939711, + "loss/crossentropy": 2.6820975124835966, + "loss/logits": 0.8232692778110504, + "step": 57970 + }, + { + "epoch": 0.5798, + "grad_norm": 14.0, + "grad_norm_var": 2.2020833333333334, + "learning_rate": 0.0003, + "loss": 11.1821, + "loss/aux_loss": 0.04804998859763145, + "loss/crossentropy": 2.7280581176280974, + "loss/logits": 0.817136037349701, + "step": 57980 + }, + { + "epoch": 0.5799, + "grad_norm": 16.625, + "grad_norm_var": 3.3268229166666665, + "learning_rate": 0.0003, + "loss": 11.0193, + "loss/aux_loss": 0.048069480992853644, + "loss/crossentropy": 2.6904157042503356, + "loss/logits": 0.8245778560638428, + "step": 57990 + }, + { + "epoch": 0.58, + "grad_norm": 15.875, + "grad_norm_var": 0.616259765625, + "learning_rate": 0.0003, + "loss": 11.0416, + "loss/aux_loss": 0.048078110441565514, + "loss/crossentropy": 2.7497189164161684, + "loss/logits": 0.8141031920909881, + "step": 58000 + }, + { + "epoch": 0.5801, + "grad_norm": 14.6875, + "grad_norm_var": 0.6343587239583334, + "learning_rate": 0.0003, + "loss": 10.8494, + "loss/aux_loss": 0.048049984686076644, + "loss/crossentropy": 2.655291825532913, + "loss/logits": 0.7856258243322373, + "step": 58010 + }, + { + "epoch": 0.5802, + "grad_norm": 14.75, + "grad_norm_var": 2.2038899739583333, + "learning_rate": 0.0003, + "loss": 11.0884, + "loss/aux_loss": 0.048076943308115, + "loss/crossentropy": 2.606997859477997, + "loss/logits": 0.8418209999799728, + "step": 58020 + }, + { + "epoch": 0.5803, + "grad_norm": 14.875, + "grad_norm_var": 2.009228515625, + "learning_rate": 0.0003, + "loss": 11.0585, + "loss/aux_loss": 0.04807098638266325, + "loss/crossentropy": 2.7810503602027894, + "loss/logits": 0.8308149874210358, + "step": 58030 + }, + { + "epoch": 0.5804, + "grad_norm": 14.75, + "grad_norm_var": 1.2375, + "learning_rate": 0.0003, + "loss": 11.0071, + "loss/aux_loss": 0.048070183396339415, + "loss/crossentropy": 2.6167166888713838, + "loss/logits": 0.8119976550340653, + "step": 58040 + }, + { + "epoch": 0.5805, + "grad_norm": 13.4375, + "grad_norm_var": 12.139957682291667, + "learning_rate": 0.0003, + "loss": 10.9504, + "loss/aux_loss": 0.04809025507420302, + "loss/crossentropy": 2.7787895798683167, + "loss/logits": 0.8444702595472335, + "step": 58050 + }, + { + "epoch": 0.5806, + "grad_norm": 15.625, + "grad_norm_var": 0.7994140625, + "learning_rate": 0.0003, + "loss": 11.1034, + "loss/aux_loss": 0.04807179775089025, + "loss/crossentropy": 2.671674072742462, + "loss/logits": 0.815391731262207, + "step": 58060 + }, + { + "epoch": 0.5807, + "grad_norm": 15.5, + "grad_norm_var": 0.4554524739583333, + "learning_rate": 0.0003, + "loss": 11.0592, + "loss/aux_loss": 0.048074791021645066, + "loss/crossentropy": 2.6951618790626526, + "loss/logits": 0.856848555803299, + "step": 58070 + }, + { + "epoch": 0.5808, + "grad_norm": 14.25, + "grad_norm_var": 1.3780598958333334, + "learning_rate": 0.0003, + "loss": 10.9668, + "loss/aux_loss": 0.048078453540802, + "loss/crossentropy": 2.5765260636806486, + "loss/logits": 0.8218820422887803, + "step": 58080 + }, + { + "epoch": 0.5809, + "grad_norm": 15.9375, + "grad_norm_var": 0.4432291666666667, + "learning_rate": 0.0003, + "loss": 10.936, + "loss/aux_loss": 0.04806906320154667, + "loss/crossentropy": 2.7392422437667845, + "loss/logits": 0.7855724722146988, + "step": 58090 + }, + { + "epoch": 0.581, + "grad_norm": 14.6875, + "grad_norm_var": 0.762744140625, + "learning_rate": 0.0003, + "loss": 10.7023, + "loss/aux_loss": 0.04807261452078819, + "loss/crossentropy": 2.508842921257019, + "loss/logits": 0.7834379196166992, + "step": 58100 + }, + { + "epoch": 0.5811, + "grad_norm": 16.125, + "grad_norm_var": 0.4356770833333333, + "learning_rate": 0.0003, + "loss": 11.0333, + "loss/aux_loss": 0.0480639960616827, + "loss/crossentropy": 2.576325136423111, + "loss/logits": 0.7919742912054062, + "step": 58110 + }, + { + "epoch": 0.5812, + "grad_norm": 16.0, + "grad_norm_var": 1.0244140625, + "learning_rate": 0.0003, + "loss": 11.0291, + "loss/aux_loss": 0.04808232747018337, + "loss/crossentropy": 2.7864030063152314, + "loss/logits": 0.8283389776945114, + "step": 58120 + }, + { + "epoch": 0.5813, + "grad_norm": 17.75, + "grad_norm_var": 1.7234375, + "learning_rate": 0.0003, + "loss": 11.0488, + "loss/aux_loss": 0.04805902913212776, + "loss/crossentropy": 2.7792890667915344, + "loss/logits": 0.8295496284961701, + "step": 58130 + }, + { + "epoch": 0.5814, + "grad_norm": 14.8125, + "grad_norm_var": 1.323681640625, + "learning_rate": 0.0003, + "loss": 11.0617, + "loss/aux_loss": 0.04808140993118286, + "loss/crossentropy": 2.7483465135097505, + "loss/logits": 0.8322425484657288, + "step": 58140 + }, + { + "epoch": 0.5815, + "grad_norm": 14.8125, + "grad_norm_var": 0.75703125, + "learning_rate": 0.0003, + "loss": 11.0982, + "loss/aux_loss": 0.048070278204977515, + "loss/crossentropy": 2.9067394614219664, + "loss/logits": 0.8337443679571152, + "step": 58150 + }, + { + "epoch": 0.5816, + "grad_norm": 14.75, + "grad_norm_var": 0.5151041666666667, + "learning_rate": 0.0003, + "loss": 10.9214, + "loss/aux_loss": 0.0480665884912014, + "loss/crossentropy": 2.706578928232193, + "loss/logits": 0.8104908049106598, + "step": 58160 + }, + { + "epoch": 0.5817, + "grad_norm": 16.375, + "grad_norm_var": 1.6416015625, + "learning_rate": 0.0003, + "loss": 11.0962, + "loss/aux_loss": 0.048067183792591096, + "loss/crossentropy": 2.7630359292030335, + "loss/logits": 0.8151145994663238, + "step": 58170 + }, + { + "epoch": 0.5818, + "grad_norm": 13.625, + "grad_norm_var": 2.781233723958333, + "learning_rate": 0.0003, + "loss": 10.9718, + "loss/aux_loss": 0.048072848655283454, + "loss/crossentropy": 2.6488034069538116, + "loss/logits": 0.8012841731309891, + "step": 58180 + }, + { + "epoch": 0.5819, + "grad_norm": 14.5625, + "grad_norm_var": 1.9555826822916667, + "learning_rate": 0.0003, + "loss": 10.9685, + "loss/aux_loss": 0.0480765612795949, + "loss/crossentropy": 2.7052852630615236, + "loss/logits": 0.837667453289032, + "step": 58190 + }, + { + "epoch": 0.582, + "grad_norm": 14.5625, + "grad_norm_var": 0.8645182291666667, + "learning_rate": 0.0003, + "loss": 11.076, + "loss/aux_loss": 0.048066786117851736, + "loss/crossentropy": 2.683205193281174, + "loss/logits": 0.8363191336393356, + "step": 58200 + }, + { + "epoch": 0.5821, + "grad_norm": 15.0, + "grad_norm_var": 0.5692545572916666, + "learning_rate": 0.0003, + "loss": 10.8792, + "loss/aux_loss": 0.04808394853025675, + "loss/crossentropy": 2.706932079792023, + "loss/logits": 0.8160331755876541, + "step": 58210 + }, + { + "epoch": 0.5822, + "grad_norm": 15.5625, + "grad_norm_var": 0.7785807291666667, + "learning_rate": 0.0003, + "loss": 11.039, + "loss/aux_loss": 0.04806770384311676, + "loss/crossentropy": 2.781590723991394, + "loss/logits": 0.8191991955041885, + "step": 58220 + }, + { + "epoch": 0.5823, + "grad_norm": 15.5, + "grad_norm_var": 0.8033854166666666, + "learning_rate": 0.0003, + "loss": 10.9915, + "loss/aux_loss": 0.048082100600004195, + "loss/crossentropy": 2.827639192342758, + "loss/logits": 0.8229591697454453, + "step": 58230 + }, + { + "epoch": 0.5824, + "grad_norm": 13.3125, + "grad_norm_var": 4.969384765625, + "learning_rate": 0.0003, + "loss": 10.998, + "loss/aux_loss": 0.0480662764981389, + "loss/crossentropy": 2.6668840289115905, + "loss/logits": 0.8254508256912232, + "step": 58240 + }, + { + "epoch": 0.5825, + "grad_norm": 14.3125, + "grad_norm_var": 1.2660807291666667, + "learning_rate": 0.0003, + "loss": 10.9292, + "loss/aux_loss": 0.04806844256818295, + "loss/crossentropy": 2.786569392681122, + "loss/logits": 0.831071189045906, + "step": 58250 + }, + { + "epoch": 0.5826, + "grad_norm": 14.375, + "grad_norm_var": 1.1525390625, + "learning_rate": 0.0003, + "loss": 11.0841, + "loss/aux_loss": 0.048065231554210185, + "loss/crossentropy": 2.6059127330780028, + "loss/logits": 0.8094421774148941, + "step": 58260 + }, + { + "epoch": 0.5827, + "grad_norm": 14.3125, + "grad_norm_var": 0.690087890625, + "learning_rate": 0.0003, + "loss": 10.9705, + "loss/aux_loss": 0.048076927289366723, + "loss/crossentropy": 2.7309110164642334, + "loss/logits": 0.8249422818422317, + "step": 58270 + }, + { + "epoch": 0.5828, + "grad_norm": 16.5, + "grad_norm_var": 2.154150390625, + "learning_rate": 0.0003, + "loss": 11.0905, + "loss/aux_loss": 0.04807541277259588, + "loss/crossentropy": 2.6696152329444884, + "loss/logits": 0.8134458005428314, + "step": 58280 + }, + { + "epoch": 0.5829, + "grad_norm": 13.9375, + "grad_norm_var": 2.66875, + "learning_rate": 0.0003, + "loss": 10.864, + "loss/aux_loss": 0.048075484298169616, + "loss/crossentropy": 2.60534029006958, + "loss/logits": 0.8042290031909942, + "step": 58290 + }, + { + "epoch": 0.583, + "grad_norm": 14.6875, + "grad_norm_var": 1.1087890625, + "learning_rate": 0.0003, + "loss": 11.0752, + "loss/aux_loss": 0.04807412791997194, + "loss/crossentropy": 2.669729250669479, + "loss/logits": 0.8465682655572891, + "step": 58300 + }, + { + "epoch": 0.5831, + "grad_norm": 14.875, + "grad_norm_var": 0.42337239583333336, + "learning_rate": 0.0003, + "loss": 10.924, + "loss/aux_loss": 0.04807706717401743, + "loss/crossentropy": 2.641595256328583, + "loss/logits": 0.8296791315078735, + "step": 58310 + }, + { + "epoch": 0.5832, + "grad_norm": 15.25, + "grad_norm_var": 0.48828125, + "learning_rate": 0.0003, + "loss": 11.086, + "loss/aux_loss": 0.048067683912813665, + "loss/crossentropy": 2.602385413646698, + "loss/logits": 0.8127113878726959, + "step": 58320 + }, + { + "epoch": 0.5833, + "grad_norm": 15.25, + "grad_norm_var": 0.3636555989583333, + "learning_rate": 0.0003, + "loss": 11.0658, + "loss/aux_loss": 0.04806897640228271, + "loss/crossentropy": 2.675559568405151, + "loss/logits": 0.8271180987358093, + "step": 58330 + }, + { + "epoch": 0.5834, + "grad_norm": 15.0, + "grad_norm_var": 0.08854166666666667, + "learning_rate": 0.0003, + "loss": 11.045, + "loss/aux_loss": 0.04805998243391514, + "loss/crossentropy": 2.7416910886764527, + "loss/logits": 0.8429529070854187, + "step": 58340 + }, + { + "epoch": 0.5835, + "grad_norm": 14.75, + "grad_norm_var": 0.07810872395833333, + "learning_rate": 0.0003, + "loss": 10.9301, + "loss/aux_loss": 0.048085011541843414, + "loss/crossentropy": 2.5801856577396394, + "loss/logits": 0.7824599385261536, + "step": 58350 + }, + { + "epoch": 0.5836, + "grad_norm": 13.6875, + "grad_norm_var": 0.5702473958333333, + "learning_rate": 0.0003, + "loss": 10.9614, + "loss/aux_loss": 0.04806315153837204, + "loss/crossentropy": 2.7932356715202333, + "loss/logits": 0.8210479527711868, + "step": 58360 + }, + { + "epoch": 0.5837, + "grad_norm": 14.8125, + "grad_norm_var": 0.73828125, + "learning_rate": 0.0003, + "loss": 10.8114, + "loss/aux_loss": 0.04807087611407042, + "loss/crossentropy": 2.8822829246521, + "loss/logits": 0.8095810860395432, + "step": 58370 + }, + { + "epoch": 0.5838, + "grad_norm": 15.8125, + "grad_norm_var": 0.8700358072916666, + "learning_rate": 0.0003, + "loss": 10.9375, + "loss/aux_loss": 0.04807416722178459, + "loss/crossentropy": 2.850358772277832, + "loss/logits": 0.849553844332695, + "step": 58380 + }, + { + "epoch": 0.5839, + "grad_norm": 15.9375, + "grad_norm_var": 0.8473795572916667, + "learning_rate": 0.0003, + "loss": 10.9638, + "loss/aux_loss": 0.04806542359292507, + "loss/crossentropy": 2.8250136971473694, + "loss/logits": 0.8444699108600616, + "step": 58390 + }, + { + "epoch": 0.584, + "grad_norm": 16.25, + "grad_norm_var": 3.767041015625, + "learning_rate": 0.0003, + "loss": 10.9255, + "loss/aux_loss": 0.0480630787089467, + "loss/crossentropy": 2.770486330986023, + "loss/logits": 0.8150721251964569, + "step": 58400 + }, + { + "epoch": 0.5841, + "grad_norm": 13.9375, + "grad_norm_var": 3.655143229166667, + "learning_rate": 0.0003, + "loss": 10.9237, + "loss/aux_loss": 0.048080130480229855, + "loss/crossentropy": 2.77328075170517, + "loss/logits": 0.8458864361047744, + "step": 58410 + }, + { + "epoch": 0.5842, + "grad_norm": 15.5, + "grad_norm_var": 0.5301432291666667, + "learning_rate": 0.0003, + "loss": 11.0029, + "loss/aux_loss": 0.04805634468793869, + "loss/crossentropy": 2.744527643918991, + "loss/logits": 0.8247867822647095, + "step": 58420 + }, + { + "epoch": 0.5843, + "grad_norm": 14.0625, + "grad_norm_var": 0.5108723958333333, + "learning_rate": 0.0003, + "loss": 10.9075, + "loss/aux_loss": 0.04806449562311173, + "loss/crossentropy": 2.7784756422042847, + "loss/logits": 0.8371286004781723, + "step": 58430 + }, + { + "epoch": 0.5844, + "grad_norm": 13.8125, + "grad_norm_var": 1.1895833333333334, + "learning_rate": 0.0003, + "loss": 10.8034, + "loss/aux_loss": 0.048080151155591014, + "loss/crossentropy": 2.7118197083473206, + "loss/logits": 0.8134770125150681, + "step": 58440 + }, + { + "epoch": 0.5845, + "grad_norm": 14.375, + "grad_norm_var": 1.2567057291666666, + "learning_rate": 0.0003, + "loss": 10.9606, + "loss/aux_loss": 0.04806223269551992, + "loss/crossentropy": 2.8520585894584656, + "loss/logits": 0.8356281250715256, + "step": 58450 + }, + { + "epoch": 0.5846, + "grad_norm": 16.5, + "grad_norm_var": 0.9921712239583333, + "learning_rate": 0.0003, + "loss": 10.991, + "loss/aux_loss": 0.04806670732796192, + "loss/crossentropy": 2.7324011504650114, + "loss/logits": 0.8266925632953643, + "step": 58460 + }, + { + "epoch": 0.5847, + "grad_norm": 15.4375, + "grad_norm_var": 0.8575520833333333, + "learning_rate": 0.0003, + "loss": 10.9843, + "loss/aux_loss": 0.04807538501918316, + "loss/crossentropy": 2.647887235879898, + "loss/logits": 0.820940124988556, + "step": 58470 + }, + { + "epoch": 0.5848, + "grad_norm": 13.3125, + "grad_norm_var": 1.0228515625, + "learning_rate": 0.0003, + "loss": 10.9996, + "loss/aux_loss": 0.04807217437773943, + "loss/crossentropy": 2.680449867248535, + "loss/logits": 0.8432391703128814, + "step": 58480 + }, + { + "epoch": 0.5849, + "grad_norm": 16.25, + "grad_norm_var": 1.27421875, + "learning_rate": 0.0003, + "loss": 11.031, + "loss/aux_loss": 0.04805920589715242, + "loss/crossentropy": 2.7809171319007873, + "loss/logits": 0.8145634055137634, + "step": 58490 + }, + { + "epoch": 0.585, + "grad_norm": 15.5625, + "grad_norm_var": 0.3885416666666667, + "learning_rate": 0.0003, + "loss": 10.9707, + "loss/aux_loss": 0.048078577220439914, + "loss/crossentropy": 2.741816544532776, + "loss/logits": 0.8448689103126525, + "step": 58500 + }, + { + "epoch": 0.5851, + "grad_norm": 15.1875, + "grad_norm_var": 0.46295572916666666, + "learning_rate": 0.0003, + "loss": 10.9568, + "loss/aux_loss": 0.048060483485460284, + "loss/crossentropy": 2.691603738069534, + "loss/logits": 0.8457825213670731, + "step": 58510 + }, + { + "epoch": 0.5852, + "grad_norm": 14.8125, + "grad_norm_var": 1.128759765625, + "learning_rate": 0.0003, + "loss": 10.9338, + "loss/aux_loss": 0.04807424061000347, + "loss/crossentropy": 2.6941749453544617, + "loss/logits": 0.8350166887044906, + "step": 58520 + }, + { + "epoch": 0.5853, + "grad_norm": 16.625, + "grad_norm_var": 0.451806640625, + "learning_rate": 0.0003, + "loss": 11.0623, + "loss/aux_loss": 0.04807107653468847, + "loss/crossentropy": 2.8114802479743957, + "loss/logits": 0.8427129536867142, + "step": 58530 + }, + { + "epoch": 0.5854, + "grad_norm": 14.0625, + "grad_norm_var": 0.9030598958333333, + "learning_rate": 0.0003, + "loss": 11.0062, + "loss/aux_loss": 0.04807313997298479, + "loss/crossentropy": 2.673390966653824, + "loss/logits": 0.8236012995243073, + "step": 58540 + }, + { + "epoch": 0.5855, + "grad_norm": 14.6875, + "grad_norm_var": 1.0176432291666666, + "learning_rate": 0.0003, + "loss": 10.9709, + "loss/aux_loss": 0.048067517951130866, + "loss/crossentropy": 2.7362454771995544, + "loss/logits": 0.8251888632774353, + "step": 58550 + }, + { + "epoch": 0.5856, + "grad_norm": 16.625, + "grad_norm_var": 1.235791015625, + "learning_rate": 0.0003, + "loss": 10.9647, + "loss/aux_loss": 0.04807721339166164, + "loss/crossentropy": 2.751077103614807, + "loss/logits": 0.8136387556791306, + "step": 58560 + }, + { + "epoch": 0.5857, + "grad_norm": 15.875, + "grad_norm_var": 0.832666015625, + "learning_rate": 0.0003, + "loss": 11.0542, + "loss/aux_loss": 0.048059957846999166, + "loss/crossentropy": 2.695993906259537, + "loss/logits": 0.8578321129083634, + "step": 58570 + }, + { + "epoch": 0.5858, + "grad_norm": 13.6875, + "grad_norm_var": 0.3734375, + "learning_rate": 0.0003, + "loss": 10.9334, + "loss/aux_loss": 0.0480723824352026, + "loss/crossentropy": 2.636319124698639, + "loss/logits": 0.8126176208257675, + "step": 58580 + }, + { + "epoch": 0.5859, + "grad_norm": 15.6875, + "grad_norm_var": 0.6244140625, + "learning_rate": 0.0003, + "loss": 10.9793, + "loss/aux_loss": 0.04807424917817116, + "loss/crossentropy": 2.633167880773544, + "loss/logits": 0.8149242758750915, + "step": 58590 + }, + { + "epoch": 0.586, + "grad_norm": 13.4375, + "grad_norm_var": 0.7452473958333333, + "learning_rate": 0.0003, + "loss": 10.7418, + "loss/aux_loss": 0.04807692859321833, + "loss/crossentropy": 2.4629740476608277, + "loss/logits": 0.7668499648571014, + "step": 58600 + }, + { + "epoch": 0.5861, + "grad_norm": 15.4375, + "grad_norm_var": 0.6527180989583333, + "learning_rate": 0.0003, + "loss": 10.9786, + "loss/aux_loss": 0.04806172419339418, + "loss/crossentropy": 2.8479265451431273, + "loss/logits": 0.8407178670167923, + "step": 58610 + }, + { + "epoch": 0.5862, + "grad_norm": 15.125, + "grad_norm_var": 0.5923014322916667, + "learning_rate": 0.0003, + "loss": 10.8859, + "loss/aux_loss": 0.04807059057056904, + "loss/crossentropy": 2.6659990191459655, + "loss/logits": 0.814807391166687, + "step": 58620 + }, + { + "epoch": 0.5863, + "grad_norm": 15.5625, + "grad_norm_var": 0.2994791666666667, + "learning_rate": 0.0003, + "loss": 10.9763, + "loss/aux_loss": 0.0480600368231535, + "loss/crossentropy": 2.7559759974479676, + "loss/logits": 0.8363692253828049, + "step": 58630 + }, + { + "epoch": 0.5864, + "grad_norm": 14.8125, + "grad_norm_var": 0.354541015625, + "learning_rate": 0.0003, + "loss": 10.9895, + "loss/aux_loss": 0.048074362054467204, + "loss/crossentropy": 2.8443562030792235, + "loss/logits": 0.8306858450174331, + "step": 58640 + }, + { + "epoch": 0.5865, + "grad_norm": 15.0, + "grad_norm_var": 0.24777018229166667, + "learning_rate": 0.0003, + "loss": 10.9248, + "loss/aux_loss": 0.04806084036827087, + "loss/crossentropy": 2.7475152254104613, + "loss/logits": 0.7995836168527604, + "step": 58650 + }, + { + "epoch": 0.5866, + "grad_norm": 14.4375, + "grad_norm_var": 0.5483723958333333, + "learning_rate": 0.0003, + "loss": 10.921, + "loss/aux_loss": 0.04807322192937136, + "loss/crossentropy": 2.705971562862396, + "loss/logits": 0.8364428788423538, + "step": 58660 + }, + { + "epoch": 0.5867, + "grad_norm": 15.0625, + "grad_norm_var": 0.5433430989583333, + "learning_rate": 0.0003, + "loss": 10.9895, + "loss/aux_loss": 0.04806621428579092, + "loss/crossentropy": 2.723879784345627, + "loss/logits": 0.8121023416519165, + "step": 58670 + }, + { + "epoch": 0.5868, + "grad_norm": 15.5, + "grad_norm_var": 0.8768229166666667, + "learning_rate": 0.0003, + "loss": 10.9435, + "loss/aux_loss": 0.048067286051809786, + "loss/crossentropy": 2.8236024498939516, + "loss/logits": 0.8061568111181259, + "step": 58680 + }, + { + "epoch": 0.5869, + "grad_norm": 14.9375, + "grad_norm_var": 0.5377604166666666, + "learning_rate": 0.0003, + "loss": 11.0705, + "loss/aux_loss": 0.04807589165866375, + "loss/crossentropy": 2.6578650951385496, + "loss/logits": 0.822320407629013, + "step": 58690 + }, + { + "epoch": 0.587, + "grad_norm": 15.25, + "grad_norm_var": 3.528059895833333, + "learning_rate": 0.0003, + "loss": 10.9717, + "loss/aux_loss": 0.04807302374392748, + "loss/crossentropy": 2.7690295398235323, + "loss/logits": 0.8380024790763855, + "step": 58700 + }, + { + "epoch": 0.5871, + "grad_norm": 14.375, + "grad_norm_var": 1.2687337239583334, + "learning_rate": 0.0003, + "loss": 10.9628, + "loss/aux_loss": 0.04807158224284649, + "loss/crossentropy": 2.642226552963257, + "loss/logits": 0.8027304679155349, + "step": 58710 + }, + { + "epoch": 0.5872, + "grad_norm": 14.1875, + "grad_norm_var": 0.4669270833333333, + "learning_rate": 0.0003, + "loss": 10.9551, + "loss/aux_loss": 0.048065906204283235, + "loss/crossentropy": 2.667159843444824, + "loss/logits": 0.837336790561676, + "step": 58720 + }, + { + "epoch": 0.5873, + "grad_norm": 15.25, + "grad_norm_var": 0.6577473958333333, + "learning_rate": 0.0003, + "loss": 10.9683, + "loss/aux_loss": 0.048073652759194375, + "loss/crossentropy": 2.667749172449112, + "loss/logits": 0.8197858512401581, + "step": 58730 + }, + { + "epoch": 0.5874, + "grad_norm": 14.9375, + "grad_norm_var": 6.6400390625, + "learning_rate": 0.0003, + "loss": 11.0102, + "loss/aux_loss": 0.04808447286486626, + "loss/crossentropy": 2.632328379154205, + "loss/logits": 0.8462556928396225, + "step": 58740 + }, + { + "epoch": 0.5875, + "grad_norm": 16.625, + "grad_norm_var": 0.4176432291666667, + "learning_rate": 0.0003, + "loss": 10.9477, + "loss/aux_loss": 0.048075118102133274, + "loss/crossentropy": 2.7823431193828583, + "loss/logits": 0.8039717346429824, + "step": 58750 + }, + { + "epoch": 0.5876, + "grad_norm": 13.75, + "grad_norm_var": 0.5447265625, + "learning_rate": 0.0003, + "loss": 10.968, + "loss/aux_loss": 0.04806168247014284, + "loss/crossentropy": 2.774631363153458, + "loss/logits": 0.82295723259449, + "step": 58760 + }, + { + "epoch": 0.5877, + "grad_norm": 14.0625, + "grad_norm_var": 4.605582682291667, + "learning_rate": 0.0003, + "loss": 10.8307, + "loss/aux_loss": 0.04806800838559866, + "loss/crossentropy": 2.820412439107895, + "loss/logits": 0.8312744557857513, + "step": 58770 + }, + { + "epoch": 0.5878, + "grad_norm": 14.125, + "grad_norm_var": 0.3238118489583333, + "learning_rate": 0.0003, + "loss": 10.9226, + "loss/aux_loss": 0.04807846397161484, + "loss/crossentropy": 2.5340620458126066, + "loss/logits": 0.8130956321954728, + "step": 58780 + }, + { + "epoch": 0.5879, + "grad_norm": 14.625, + "grad_norm_var": 0.2986979166666667, + "learning_rate": 0.0003, + "loss": 10.9796, + "loss/aux_loss": 0.048070663772523406, + "loss/crossentropy": 2.617660069465637, + "loss/logits": 0.7849185347557068, + "step": 58790 + }, + { + "epoch": 0.588, + "grad_norm": 15.4375, + "grad_norm_var": 0.9128743489583333, + "learning_rate": 0.0003, + "loss": 11.0414, + "loss/aux_loss": 0.048075301200151445, + "loss/crossentropy": 2.7353998363018035, + "loss/logits": 0.8277339696884155, + "step": 58800 + }, + { + "epoch": 0.5881, + "grad_norm": 15.6875, + "grad_norm_var": 0.8166015625, + "learning_rate": 0.0003, + "loss": 10.9593, + "loss/aux_loss": 0.04806892182677984, + "loss/crossentropy": 2.720021104812622, + "loss/logits": 0.8124103635549546, + "step": 58810 + }, + { + "epoch": 0.5882, + "grad_norm": 16.25, + "grad_norm_var": 0.705322265625, + "learning_rate": 0.0003, + "loss": 10.8535, + "loss/aux_loss": 0.048067253082990646, + "loss/crossentropy": 2.6595967948436736, + "loss/logits": 0.7839356884360313, + "step": 58820 + }, + { + "epoch": 0.5883, + "grad_norm": 15.0625, + "grad_norm_var": 0.604931640625, + "learning_rate": 0.0003, + "loss": 10.8844, + "loss/aux_loss": 0.048070118948817256, + "loss/crossentropy": 2.7247247993946075, + "loss/logits": 0.7789757996797562, + "step": 58830 + }, + { + "epoch": 0.5884, + "grad_norm": 15.5625, + "grad_norm_var": 1.5989420572916666, + "learning_rate": 0.0003, + "loss": 10.8944, + "loss/aux_loss": 0.04808255434036255, + "loss/crossentropy": 2.4786873877048494, + "loss/logits": 0.7954070687294006, + "step": 58840 + }, + { + "epoch": 0.5885, + "grad_norm": 14.0625, + "grad_norm_var": 1.150634765625, + "learning_rate": 0.0003, + "loss": 10.7977, + "loss/aux_loss": 0.04806720409542322, + "loss/crossentropy": 2.6936080753803253, + "loss/logits": 0.8249445348978043, + "step": 58850 + }, + { + "epoch": 0.5886, + "grad_norm": 14.1875, + "grad_norm_var": 0.6049479166666667, + "learning_rate": 0.0003, + "loss": 10.92, + "loss/aux_loss": 0.04806901291012764, + "loss/crossentropy": 2.748962438106537, + "loss/logits": 0.840557438135147, + "step": 58860 + }, + { + "epoch": 0.5887, + "grad_norm": 16.0, + "grad_norm_var": 0.7219889322916667, + "learning_rate": 0.0003, + "loss": 10.963, + "loss/aux_loss": 0.04807457271963358, + "loss/crossentropy": 2.7299853801727294, + "loss/logits": 0.8568669199943543, + "step": 58870 + }, + { + "epoch": 0.5888, + "grad_norm": 14.4375, + "grad_norm_var": 1.1009765625, + "learning_rate": 0.0003, + "loss": 10.9833, + "loss/aux_loss": 0.0480708921328187, + "loss/crossentropy": 2.6630140364170076, + "loss/logits": 0.8391565322875977, + "step": 58880 + }, + { + "epoch": 0.5889, + "grad_norm": 15.6875, + "grad_norm_var": 0.5079264322916667, + "learning_rate": 0.0003, + "loss": 10.7613, + "loss/aux_loss": 0.0480732224881649, + "loss/crossentropy": 2.6498291552066804, + "loss/logits": 0.8057867288589478, + "step": 58890 + }, + { + "epoch": 0.589, + "grad_norm": 14.5625, + "grad_norm_var": 1.2921712239583334, + "learning_rate": 0.0003, + "loss": 10.9393, + "loss/aux_loss": 0.04806382786482573, + "loss/crossentropy": 2.706892067193985, + "loss/logits": 0.8239340364933014, + "step": 58900 + }, + { + "epoch": 0.5891, + "grad_norm": 15.25, + "grad_norm_var": 0.4327473958333333, + "learning_rate": 0.0003, + "loss": 11.2523, + "loss/aux_loss": 0.04806530307978392, + "loss/crossentropy": 2.5392131090164183, + "loss/logits": 0.802097937464714, + "step": 58910 + }, + { + "epoch": 0.5892, + "grad_norm": 16.0, + "grad_norm_var": 103.838525390625, + "learning_rate": 0.0003, + "loss": 11.1054, + "loss/aux_loss": 0.048078567162156104, + "loss/crossentropy": 2.7394894659519196, + "loss/logits": 0.8429781794548035, + "step": 58920 + }, + { + "epoch": 0.5893, + "grad_norm": 16.875, + "grad_norm_var": 101.1806640625, + "learning_rate": 0.0003, + "loss": 11.0567, + "loss/aux_loss": 0.04806540366262198, + "loss/crossentropy": 2.717589294910431, + "loss/logits": 0.8201171487569809, + "step": 58930 + }, + { + "epoch": 0.5894, + "grad_norm": 16.0, + "grad_norm_var": 1.0885416666666667, + "learning_rate": 0.0003, + "loss": 10.9065, + "loss/aux_loss": 0.04806985668838024, + "loss/crossentropy": 2.721989232301712, + "loss/logits": 0.8106504052877426, + "step": 58940 + }, + { + "epoch": 0.5895, + "grad_norm": 14.375, + "grad_norm_var": 0.3611979166666667, + "learning_rate": 0.0003, + "loss": 11.0415, + "loss/aux_loss": 0.04806645512580872, + "loss/crossentropy": 2.7016734063625334, + "loss/logits": 0.8152611821889877, + "step": 58950 + }, + { + "epoch": 0.5896, + "grad_norm": 15.8125, + "grad_norm_var": 0.5468098958333333, + "learning_rate": 0.0003, + "loss": 10.9485, + "loss/aux_loss": 0.04806891251355409, + "loss/crossentropy": 2.62559455037117, + "loss/logits": 0.8271364778280258, + "step": 58960 + }, + { + "epoch": 0.5897, + "grad_norm": 15.3125, + "grad_norm_var": 0.3322265625, + "learning_rate": 0.0003, + "loss": 11.052, + "loss/aux_loss": 0.04806600380688906, + "loss/crossentropy": 2.737446331977844, + "loss/logits": 0.8194819182157517, + "step": 58970 + }, + { + "epoch": 0.5898, + "grad_norm": 16.375, + "grad_norm_var": 0.439697265625, + "learning_rate": 0.0003, + "loss": 11.0957, + "loss/aux_loss": 0.04807234760373831, + "loss/crossentropy": 2.760717141628265, + "loss/logits": 0.8376249551773072, + "step": 58980 + }, + { + "epoch": 0.5899, + "grad_norm": 14.875, + "grad_norm_var": 1.3390462239583334, + "learning_rate": 0.0003, + "loss": 11.0902, + "loss/aux_loss": 0.04807633645832539, + "loss/crossentropy": 2.7784714460372926, + "loss/logits": 0.8507242858409881, + "step": 58990 + }, + { + "epoch": 0.59, + "grad_norm": 15.8125, + "grad_norm_var": 1.2333170572916667, + "learning_rate": 0.0003, + "loss": 10.9738, + "loss/aux_loss": 0.048075743950903416, + "loss/crossentropy": 2.634059315919876, + "loss/logits": 0.8007471144199372, + "step": 59000 + }, + { + "epoch": 0.5901, + "grad_norm": 15.0625, + "grad_norm_var": 0.501416015625, + "learning_rate": 0.0003, + "loss": 10.8689, + "loss/aux_loss": 0.048068515583872796, + "loss/crossentropy": 2.646367919445038, + "loss/logits": 0.7871147692203522, + "step": 59010 + }, + { + "epoch": 0.5902, + "grad_norm": 14.3125, + "grad_norm_var": 0.6884765625, + "learning_rate": 0.0003, + "loss": 11.0153, + "loss/aux_loss": 0.04807483684271574, + "loss/crossentropy": 2.8454954862594604, + "loss/logits": 0.8256110936403275, + "step": 59020 + }, + { + "epoch": 0.5903, + "grad_norm": 13.8125, + "grad_norm_var": 0.595947265625, + "learning_rate": 0.0003, + "loss": 10.7911, + "loss/aux_loss": 0.04806120917201042, + "loss/crossentropy": 2.7482463240623476, + "loss/logits": 0.8132666110992431, + "step": 59030 + }, + { + "epoch": 0.5904, + "grad_norm": 15.1875, + "grad_norm_var": 0.685400390625, + "learning_rate": 0.0003, + "loss": 10.9159, + "loss/aux_loss": 0.04806686472147703, + "loss/crossentropy": 2.742703366279602, + "loss/logits": 0.8217089116573334, + "step": 59040 + }, + { + "epoch": 0.5905, + "grad_norm": 14.25, + "grad_norm_var": 0.917431640625, + "learning_rate": 0.0003, + "loss": 10.9354, + "loss/aux_loss": 0.048080751299858095, + "loss/crossentropy": 2.7598276495933534, + "loss/logits": 0.8276279777288437, + "step": 59050 + }, + { + "epoch": 0.5906, + "grad_norm": 16.0, + "grad_norm_var": 0.8372233072916667, + "learning_rate": 0.0003, + "loss": 11.1481, + "loss/aux_loss": 0.048068036511540416, + "loss/crossentropy": 2.5435283482074738, + "loss/logits": 0.8292164400219917, + "step": 59060 + }, + { + "epoch": 0.5907, + "grad_norm": 15.5, + "grad_norm_var": 0.3714680989583333, + "learning_rate": 0.0003, + "loss": 11.13, + "loss/aux_loss": 0.048066299967467786, + "loss/crossentropy": 2.531124544143677, + "loss/logits": 0.8264132618904114, + "step": 59070 + }, + { + "epoch": 0.5908, + "grad_norm": 16.125, + "grad_norm_var": 0.4212890625, + "learning_rate": 0.0003, + "loss": 11.0672, + "loss/aux_loss": 0.0480706337839365, + "loss/crossentropy": 2.7756115198135376, + "loss/logits": 0.8168632984161377, + "step": 59080 + }, + { + "epoch": 0.5909, + "grad_norm": 15.3125, + "grad_norm_var": 1.2400390625, + "learning_rate": 0.0003, + "loss": 10.966, + "loss/aux_loss": 0.048072228021919725, + "loss/crossentropy": 2.5596219480037687, + "loss/logits": 0.8085451662540436, + "step": 59090 + }, + { + "epoch": 0.591, + "grad_norm": 14.1875, + "grad_norm_var": 0.41795247395833335, + "learning_rate": 0.0003, + "loss": 11.0411, + "loss/aux_loss": 0.04807391669601202, + "loss/crossentropy": 2.7747272551059723, + "loss/logits": 0.8068033158779144, + "step": 59100 + }, + { + "epoch": 0.5911, + "grad_norm": 16.625, + "grad_norm_var": 0.5026041666666666, + "learning_rate": 0.0003, + "loss": 11.0439, + "loss/aux_loss": 0.048057069256901744, + "loss/crossentropy": 2.8529131174087525, + "loss/logits": 0.8054678052663803, + "step": 59110 + }, + { + "epoch": 0.5912, + "grad_norm": 15.9375, + "grad_norm_var": 0.3973795572916667, + "learning_rate": 0.0003, + "loss": 10.8576, + "loss/aux_loss": 0.04808417148888111, + "loss/crossentropy": 2.5729918599128725, + "loss/logits": 0.7877096027135849, + "step": 59120 + }, + { + "epoch": 0.5913, + "grad_norm": 15.125, + "grad_norm_var": 0.5738118489583334, + "learning_rate": 0.0003, + "loss": 11.054, + "loss/aux_loss": 0.048090359196066856, + "loss/crossentropy": 2.69580699801445, + "loss/logits": 0.8433273226022721, + "step": 59130 + }, + { + "epoch": 0.5914, + "grad_norm": 15.625, + "grad_norm_var": 0.3301920572916667, + "learning_rate": 0.0003, + "loss": 10.8815, + "loss/aux_loss": 0.04807056300342083, + "loss/crossentropy": 2.615113401412964, + "loss/logits": 0.8006851255893708, + "step": 59140 + }, + { + "epoch": 0.5915, + "grad_norm": 15.0, + "grad_norm_var": 0.3577473958333333, + "learning_rate": 0.0003, + "loss": 10.8658, + "loss/aux_loss": 0.04805748388171196, + "loss/crossentropy": 2.775008863210678, + "loss/logits": 0.803268751502037, + "step": 59150 + }, + { + "epoch": 0.5916, + "grad_norm": 15.75, + "grad_norm_var": 0.7499837239583333, + "learning_rate": 0.0003, + "loss": 10.9147, + "loss/aux_loss": 0.04807665143162012, + "loss/crossentropy": 2.7489787578582763, + "loss/logits": 0.8320010215044021, + "step": 59160 + }, + { + "epoch": 0.5917, + "grad_norm": 14.1875, + "grad_norm_var": 1.4718587239583334, + "learning_rate": 0.0003, + "loss": 11.1376, + "loss/aux_loss": 0.048068560846149924, + "loss/crossentropy": 2.8063846230506897, + "loss/logits": 0.8416935801506042, + "step": 59170 + }, + { + "epoch": 0.5918, + "grad_norm": 14.6875, + "grad_norm_var": 1.0505045572916667, + "learning_rate": 0.0003, + "loss": 11.1433, + "loss/aux_loss": 0.04806650020182133, + "loss/crossentropy": 2.7465264439582824, + "loss/logits": 0.8265712201595307, + "step": 59180 + }, + { + "epoch": 0.5919, + "grad_norm": 14.625, + "grad_norm_var": 0.20859375, + "learning_rate": 0.0003, + "loss": 11.0092, + "loss/aux_loss": 0.04806997440755367, + "loss/crossentropy": 2.7317902624607084, + "loss/logits": 0.829322350025177, + "step": 59190 + }, + { + "epoch": 0.592, + "grad_norm": 15.75, + "grad_norm_var": 0.5913899739583334, + "learning_rate": 0.0003, + "loss": 11.1233, + "loss/aux_loss": 0.048072732985019684, + "loss/crossentropy": 2.736423373222351, + "loss/logits": 0.8223551511764526, + "step": 59200 + }, + { + "epoch": 0.5921, + "grad_norm": 13.9375, + "grad_norm_var": 0.7077473958333333, + "learning_rate": 0.0003, + "loss": 10.9564, + "loss/aux_loss": 0.04806751888245344, + "loss/crossentropy": 2.719071865081787, + "loss/logits": 0.8144007086753845, + "step": 59210 + }, + { + "epoch": 0.5922, + "grad_norm": 14.5, + "grad_norm_var": 0.599072265625, + "learning_rate": 0.0003, + "loss": 11.0542, + "loss/aux_loss": 0.04806646332144737, + "loss/crossentropy": 2.6371989250183105, + "loss/logits": 0.8028866291046143, + "step": 59220 + }, + { + "epoch": 0.5923, + "grad_norm": 14.625, + "grad_norm_var": 0.6218587239583333, + "learning_rate": 0.0003, + "loss": 11.05, + "loss/aux_loss": 0.048065362870693205, + "loss/crossentropy": 2.739983332157135, + "loss/logits": 0.8592475086450577, + "step": 59230 + }, + { + "epoch": 0.5924, + "grad_norm": 14.3125, + "grad_norm_var": 5.140348307291666, + "learning_rate": 0.0003, + "loss": 11.0626, + "loss/aux_loss": 0.04808344319462776, + "loss/crossentropy": 2.6779512405395507, + "loss/logits": 0.8342467457056045, + "step": 59240 + }, + { + "epoch": 0.5925, + "grad_norm": 15.625, + "grad_norm_var": 0.465087890625, + "learning_rate": 0.0003, + "loss": 10.6864, + "loss/aux_loss": 0.04806926678866148, + "loss/crossentropy": 2.6925257742404938, + "loss/logits": 0.7975740045309067, + "step": 59250 + }, + { + "epoch": 0.5926, + "grad_norm": 14.9375, + "grad_norm_var": 0.4261555989583333, + "learning_rate": 0.0003, + "loss": 10.9838, + "loss/aux_loss": 0.04807133283466101, + "loss/crossentropy": 2.6862433731555937, + "loss/logits": 0.830639323592186, + "step": 59260 + }, + { + "epoch": 0.5927, + "grad_norm": 15.0625, + "grad_norm_var": 0.607666015625, + "learning_rate": 0.0003, + "loss": 10.9802, + "loss/aux_loss": 0.048060860484838486, + "loss/crossentropy": 2.6332711696624758, + "loss/logits": 0.8140772134065628, + "step": 59270 + }, + { + "epoch": 0.5928, + "grad_norm": 15.5, + "grad_norm_var": 0.642431640625, + "learning_rate": 0.0003, + "loss": 10.9915, + "loss/aux_loss": 0.048063439317047596, + "loss/crossentropy": 2.647914093732834, + "loss/logits": 0.8220590710639953, + "step": 59280 + }, + { + "epoch": 0.5929, + "grad_norm": 15.375, + "grad_norm_var": 2.2627604166666666, + "learning_rate": 0.0003, + "loss": 11.0782, + "loss/aux_loss": 0.048079627007246016, + "loss/crossentropy": 2.739602434635162, + "loss/logits": 0.846561822295189, + "step": 59290 + }, + { + "epoch": 0.593, + "grad_norm": 16.25, + "grad_norm_var": 2.2085774739583335, + "learning_rate": 0.0003, + "loss": 10.979, + "loss/aux_loss": 0.04806938376277685, + "loss/crossentropy": 2.6843549072742463, + "loss/logits": 0.7924599975347519, + "step": 59300 + }, + { + "epoch": 0.5931, + "grad_norm": 33.25, + "grad_norm_var": 21.649462890625, + "learning_rate": 0.0003, + "loss": 10.8709, + "loss/aux_loss": 0.04807360116392374, + "loss/crossentropy": 2.6454947888851166, + "loss/logits": 0.7973768830299377, + "step": 59310 + }, + { + "epoch": 0.5932, + "grad_norm": 14.8125, + "grad_norm_var": 21.169775390625, + "learning_rate": 0.0003, + "loss": 10.9825, + "loss/aux_loss": 0.048061652667820455, + "loss/crossentropy": 2.8345079243183138, + "loss/logits": 0.8396099478006362, + "step": 59320 + }, + { + "epoch": 0.5933, + "grad_norm": 13.875, + "grad_norm_var": 0.815625, + "learning_rate": 0.0003, + "loss": 10.9579, + "loss/aux_loss": 0.04807075336575508, + "loss/crossentropy": 2.7278249740600584, + "loss/logits": 0.8078439980745316, + "step": 59330 + }, + { + "epoch": 0.5934, + "grad_norm": 15.0, + "grad_norm_var": 0.859375, + "learning_rate": 0.0003, + "loss": 10.9258, + "loss/aux_loss": 0.04808927923440933, + "loss/crossentropy": 2.647115594148636, + "loss/logits": 0.8381874442100525, + "step": 59340 + }, + { + "epoch": 0.5935, + "grad_norm": 15.5625, + "grad_norm_var": 1.0649576822916667, + "learning_rate": 0.0003, + "loss": 10.9276, + "loss/aux_loss": 0.04806491620838642, + "loss/crossentropy": 2.760968017578125, + "loss/logits": 0.813837793469429, + "step": 59350 + }, + { + "epoch": 0.5936, + "grad_norm": 14.125, + "grad_norm_var": 0.940087890625, + "learning_rate": 0.0003, + "loss": 10.8929, + "loss/aux_loss": 0.048059547506272796, + "loss/crossentropy": 2.728524845838547, + "loss/logits": 0.816395303606987, + "step": 59360 + }, + { + "epoch": 0.5937, + "grad_norm": 15.75, + "grad_norm_var": 0.6927083333333334, + "learning_rate": 0.0003, + "loss": 11.1686, + "loss/aux_loss": 0.04809366017580032, + "loss/crossentropy": 2.576079845428467, + "loss/logits": 0.8472390443086624, + "step": 59370 + }, + { + "epoch": 0.5938, + "grad_norm": 14.25, + "grad_norm_var": 0.6554524739583333, + "learning_rate": 0.0003, + "loss": 10.8931, + "loss/aux_loss": 0.048058745451271534, + "loss/crossentropy": 2.5764957904815673, + "loss/logits": 0.7882023543119431, + "step": 59380 + }, + { + "epoch": 0.5939, + "grad_norm": 14.25, + "grad_norm_var": 0.5186848958333333, + "learning_rate": 0.0003, + "loss": 10.8687, + "loss/aux_loss": 0.04806121941655874, + "loss/crossentropy": 2.4873786866664886, + "loss/logits": 0.8160700887441635, + "step": 59390 + }, + { + "epoch": 0.594, + "grad_norm": 14.625, + "grad_norm_var": 0.5051432291666667, + "learning_rate": 0.0003, + "loss": 10.8354, + "loss/aux_loss": 0.04808267336338758, + "loss/crossentropy": 2.715548413991928, + "loss/logits": 0.8030152201652527, + "step": 59400 + }, + { + "epoch": 0.5941, + "grad_norm": 15.0625, + "grad_norm_var": 0.8432291666666667, + "learning_rate": 0.0003, + "loss": 10.7494, + "loss/aux_loss": 0.04806729760020971, + "loss/crossentropy": 2.5783600986003874, + "loss/logits": 0.775664460659027, + "step": 59410 + }, + { + "epoch": 0.5942, + "grad_norm": 15.375, + "grad_norm_var": 0.728125, + "learning_rate": 0.0003, + "loss": 10.9819, + "loss/aux_loss": 0.04807338900864124, + "loss/crossentropy": 2.7407156348228456, + "loss/logits": 0.8106876760721207, + "step": 59420 + }, + { + "epoch": 0.5943, + "grad_norm": 16.75, + "grad_norm_var": 0.7608723958333333, + "learning_rate": 0.0003, + "loss": 11.1629, + "loss/aux_loss": 0.048071779869496824, + "loss/crossentropy": 2.7402497112751005, + "loss/logits": 0.8036000728607178, + "step": 59430 + }, + { + "epoch": 0.5944, + "grad_norm": 15.125, + "grad_norm_var": 1.547509765625, + "learning_rate": 0.0003, + "loss": 10.9178, + "loss/aux_loss": 0.04806051217019558, + "loss/crossentropy": 2.725063371658325, + "loss/logits": 0.8170418709516525, + "step": 59440 + }, + { + "epoch": 0.5945, + "grad_norm": 16.125, + "grad_norm_var": 0.7936848958333333, + "learning_rate": 0.0003, + "loss": 11.0313, + "loss/aux_loss": 0.04807902853935957, + "loss/crossentropy": 2.6812859654426573, + "loss/logits": 0.8411592811346054, + "step": 59450 + }, + { + "epoch": 0.5946, + "grad_norm": 15.375, + "grad_norm_var": 0.5328125, + "learning_rate": 0.0003, + "loss": 10.8673, + "loss/aux_loss": 0.04806883670389652, + "loss/crossentropy": 2.526155251264572, + "loss/logits": 0.7790047436952591, + "step": 59460 + }, + { + "epoch": 0.5947, + "grad_norm": 14.5625, + "grad_norm_var": 0.6650390625, + "learning_rate": 0.0003, + "loss": 11.0563, + "loss/aux_loss": 0.04805390052497387, + "loss/crossentropy": 2.714101165533066, + "loss/logits": 0.8301648050546646, + "step": 59470 + }, + { + "epoch": 0.5948, + "grad_norm": 15.5625, + "grad_norm_var": 0.21555989583333332, + "learning_rate": 0.0003, + "loss": 11.0729, + "loss/aux_loss": 0.04807809926569462, + "loss/crossentropy": 2.582643520832062, + "loss/logits": 0.8132951408624649, + "step": 59480 + }, + { + "epoch": 0.5949, + "grad_norm": 15.625, + "grad_norm_var": 0.6773274739583334, + "learning_rate": 0.0003, + "loss": 11.2003, + "loss/aux_loss": 0.04807939510792494, + "loss/crossentropy": 2.8899078488349916, + "loss/logits": 0.8984211206436157, + "step": 59490 + }, + { + "epoch": 0.595, + "grad_norm": 14.5625, + "grad_norm_var": 0.4994140625, + "learning_rate": 0.0003, + "loss": 10.8003, + "loss/aux_loss": 0.048056123591959474, + "loss/crossentropy": 2.657677114009857, + "loss/logits": 0.8166221141815185, + "step": 59500 + }, + { + "epoch": 0.5951, + "grad_norm": 15.0, + "grad_norm_var": 1.293212890625, + "learning_rate": 0.0003, + "loss": 10.9127, + "loss/aux_loss": 0.04807478673756123, + "loss/crossentropy": 2.618022048473358, + "loss/logits": 0.829986622929573, + "step": 59510 + }, + { + "epoch": 0.5952, + "grad_norm": 14.5, + "grad_norm_var": 0.6301920572916667, + "learning_rate": 0.0003, + "loss": 10.9695, + "loss/aux_loss": 0.0480778394266963, + "loss/crossentropy": 2.6219813764095306, + "loss/logits": 0.8168375045061111, + "step": 59520 + }, + { + "epoch": 0.5953, + "grad_norm": 14.6875, + "grad_norm_var": 14.586393229166667, + "learning_rate": 0.0003, + "loss": 11.0956, + "loss/aux_loss": 0.0480570949614048, + "loss/crossentropy": 2.6230372488498688, + "loss/logits": 0.7977444887161255, + "step": 59530 + }, + { + "epoch": 0.5954, + "grad_norm": 13.875, + "grad_norm_var": 0.9593098958333334, + "learning_rate": 0.0003, + "loss": 10.9874, + "loss/aux_loss": 0.04807024523615837, + "loss/crossentropy": 2.7767493963241576, + "loss/logits": 0.8549789160490036, + "step": 59540 + }, + { + "epoch": 0.5955, + "grad_norm": 14.75, + "grad_norm_var": 0.9890625, + "learning_rate": 0.0003, + "loss": 10.9328, + "loss/aux_loss": 0.048075405322015284, + "loss/crossentropy": 2.8718122959136965, + "loss/logits": 0.8272106260061264, + "step": 59550 + }, + { + "epoch": 0.5956, + "grad_norm": 15.5625, + "grad_norm_var": 0.9208333333333333, + "learning_rate": 0.0003, + "loss": 10.7519, + "loss/aux_loss": 0.04805839378386736, + "loss/crossentropy": 2.621040326356888, + "loss/logits": 0.8079787522554398, + "step": 59560 + }, + { + "epoch": 0.5957, + "grad_norm": 14.5625, + "grad_norm_var": 0.563916015625, + "learning_rate": 0.0003, + "loss": 10.9413, + "loss/aux_loss": 0.048068532906472686, + "loss/crossentropy": 2.7320153057575225, + "loss/logits": 0.8291085928678512, + "step": 59570 + }, + { + "epoch": 0.5958, + "grad_norm": 15.3125, + "grad_norm_var": 0.435400390625, + "learning_rate": 0.0003, + "loss": 11.059, + "loss/aux_loss": 0.048074721731245516, + "loss/crossentropy": 2.596713310480118, + "loss/logits": 0.8409205973148346, + "step": 59580 + }, + { + "epoch": 0.5959, + "grad_norm": 14.4375, + "grad_norm_var": 0.33513997395833334, + "learning_rate": 0.0003, + "loss": 11.0557, + "loss/aux_loss": 0.04807321783155203, + "loss/crossentropy": 2.7819936752319334, + "loss/logits": 0.835452938079834, + "step": 59590 + }, + { + "epoch": 0.596, + "grad_norm": 15.4375, + "grad_norm_var": 2.527604166666667, + "learning_rate": 0.0003, + "loss": 10.9209, + "loss/aux_loss": 0.04806660022586584, + "loss/crossentropy": 2.7568194687366487, + "loss/logits": 0.8260948032140731, + "step": 59600 + }, + { + "epoch": 0.5961, + "grad_norm": 14.625, + "grad_norm_var": 0.7150390625, + "learning_rate": 0.0003, + "loss": 11.0103, + "loss/aux_loss": 0.04807510152459145, + "loss/crossentropy": 2.743026089668274, + "loss/logits": 0.8484435856342316, + "step": 59610 + }, + { + "epoch": 0.5962, + "grad_norm": 15.5, + "grad_norm_var": 0.7692057291666666, + "learning_rate": 0.0003, + "loss": 10.9734, + "loss/aux_loss": 0.04806904960423708, + "loss/crossentropy": 2.553094118833542, + "loss/logits": 0.7921933591365814, + "step": 59620 + }, + { + "epoch": 0.5963, + "grad_norm": 13.8125, + "grad_norm_var": 0.4197265625, + "learning_rate": 0.0003, + "loss": 10.8507, + "loss/aux_loss": 0.048070012219250204, + "loss/crossentropy": 2.620549178123474, + "loss/logits": 0.8065012693405151, + "step": 59630 + }, + { + "epoch": 0.5964, + "grad_norm": 14.8125, + "grad_norm_var": 0.51875, + "learning_rate": 0.0003, + "loss": 11.0665, + "loss/aux_loss": 0.048063849285244944, + "loss/crossentropy": 2.7524186074733734, + "loss/logits": 0.8217565357685089, + "step": 59640 + }, + { + "epoch": 0.5965, + "grad_norm": 16.625, + "grad_norm_var": 0.3515625, + "learning_rate": 0.0003, + "loss": 10.9902, + "loss/aux_loss": 0.04806663002818823, + "loss/crossentropy": 2.8480568647384645, + "loss/logits": 0.8349178761243821, + "step": 59650 + }, + { + "epoch": 0.5966, + "grad_norm": 15.5625, + "grad_norm_var": 0.368603515625, + "learning_rate": 0.0003, + "loss": 11.0885, + "loss/aux_loss": 0.04807380642741919, + "loss/crossentropy": 2.7424690067768096, + "loss/logits": 0.8472974270582199, + "step": 59660 + }, + { + "epoch": 0.5967, + "grad_norm": 15.375, + "grad_norm_var": 0.24869791666666666, + "learning_rate": 0.0003, + "loss": 11.0736, + "loss/aux_loss": 0.04807511363178492, + "loss/crossentropy": 2.7169342398643495, + "loss/logits": 0.8267938494682312, + "step": 59670 + }, + { + "epoch": 0.5968, + "grad_norm": 15.5, + "grad_norm_var": 0.6700520833333333, + "learning_rate": 0.0003, + "loss": 11.1724, + "loss/aux_loss": 0.048063908331096174, + "loss/crossentropy": 2.7714505553245545, + "loss/logits": 0.8535281270742416, + "step": 59680 + }, + { + "epoch": 0.5969, + "grad_norm": 14.25, + "grad_norm_var": 0.5930826822916667, + "learning_rate": 0.0003, + "loss": 10.8832, + "loss/aux_loss": 0.048075766302645206, + "loss/crossentropy": 2.607689690589905, + "loss/logits": 0.8187483072280883, + "step": 59690 + }, + { + "epoch": 0.597, + "grad_norm": 16.375, + "grad_norm_var": 0.46243489583333336, + "learning_rate": 0.0003, + "loss": 11.0849, + "loss/aux_loss": 0.048065418377518654, + "loss/crossentropy": 2.779401385784149, + "loss/logits": 0.8177706062793731, + "step": 59700 + }, + { + "epoch": 0.5971, + "grad_norm": 16.0, + "grad_norm_var": 0.4337890625, + "learning_rate": 0.0003, + "loss": 11.0411, + "loss/aux_loss": 0.04806800279766321, + "loss/crossentropy": 2.6941158711910247, + "loss/logits": 0.8276149153709411, + "step": 59710 + }, + { + "epoch": 0.5972, + "grad_norm": 14.75, + "grad_norm_var": 0.475, + "learning_rate": 0.0003, + "loss": 11.0057, + "loss/aux_loss": 0.04807844683527947, + "loss/crossentropy": 2.7109430134296417, + "loss/logits": 0.8097800493240357, + "step": 59720 + }, + { + "epoch": 0.5973, + "grad_norm": 14.6875, + "grad_norm_var": 0.7171223958333334, + "learning_rate": 0.0003, + "loss": 10.9658, + "loss/aux_loss": 0.0480660380795598, + "loss/crossentropy": 2.6846647441387175, + "loss/logits": 0.8089656233787537, + "step": 59730 + }, + { + "epoch": 0.5974, + "grad_norm": 15.6875, + "grad_norm_var": 0.8059895833333334, + "learning_rate": 0.0003, + "loss": 10.9255, + "loss/aux_loss": 0.04807507041841745, + "loss/crossentropy": 2.693540346622467, + "loss/logits": 0.835108283162117, + "step": 59740 + }, + { + "epoch": 0.5975, + "grad_norm": 16.5, + "grad_norm_var": 0.428125, + "learning_rate": 0.0003, + "loss": 10.8601, + "loss/aux_loss": 0.048072214052081105, + "loss/crossentropy": 2.6454875826835633, + "loss/logits": 0.8485147625207901, + "step": 59750 + }, + { + "epoch": 0.5976, + "grad_norm": 15.1875, + "grad_norm_var": 0.9978515625, + "learning_rate": 0.0003, + "loss": 11.0293, + "loss/aux_loss": 0.04806381613016129, + "loss/crossentropy": 2.784002923965454, + "loss/logits": 0.823991322517395, + "step": 59760 + }, + { + "epoch": 0.5977, + "grad_norm": 15.0625, + "grad_norm_var": 0.25857747395833336, + "learning_rate": 0.0003, + "loss": 10.8259, + "loss/aux_loss": 0.04807265438139439, + "loss/crossentropy": 2.801326608657837, + "loss/logits": 0.8200320184230805, + "step": 59770 + }, + { + "epoch": 0.5978, + "grad_norm": 16.125, + "grad_norm_var": 0.47146809895833336, + "learning_rate": 0.0003, + "loss": 10.9737, + "loss/aux_loss": 0.04808081742376089, + "loss/crossentropy": 2.856343114376068, + "loss/logits": 0.8455061435699462, + "step": 59780 + }, + { + "epoch": 0.5979, + "grad_norm": 18.125, + "grad_norm_var": 1.2893229166666667, + "learning_rate": 0.0003, + "loss": 10.998, + "loss/aux_loss": 0.048059838637709615, + "loss/crossentropy": 2.553582340478897, + "loss/logits": 0.7986224472522736, + "step": 59790 + }, + { + "epoch": 0.598, + "grad_norm": 14.875, + "grad_norm_var": 0.927978515625, + "learning_rate": 0.0003, + "loss": 11.0234, + "loss/aux_loss": 0.048071127571165564, + "loss/crossentropy": 2.8242238759994507, + "loss/logits": 0.8418795853853226, + "step": 59800 + }, + { + "epoch": 0.5981, + "grad_norm": 13.6875, + "grad_norm_var": 0.2869140625, + "learning_rate": 0.0003, + "loss": 11.054, + "loss/aux_loss": 0.04806619361042976, + "loss/crossentropy": 2.7367840886116026, + "loss/logits": 0.8270297706127167, + "step": 59810 + }, + { + "epoch": 0.5982, + "grad_norm": 16.0, + "grad_norm_var": 0.9035807291666667, + "learning_rate": 0.0003, + "loss": 10.8816, + "loss/aux_loss": 0.048074010014534, + "loss/crossentropy": 2.5995913684368133, + "loss/logits": 0.843637329339981, + "step": 59820 + }, + { + "epoch": 0.5983, + "grad_norm": 16.125, + "grad_norm_var": 0.842431640625, + "learning_rate": 0.0003, + "loss": 10.9971, + "loss/aux_loss": 0.04806699063628912, + "loss/crossentropy": 2.66332545876503, + "loss/logits": 0.7995132386684418, + "step": 59830 + }, + { + "epoch": 0.5984, + "grad_norm": 14.0, + "grad_norm_var": 1.2426432291666667, + "learning_rate": 0.0003, + "loss": 10.9088, + "loss/aux_loss": 0.048076143860816954, + "loss/crossentropy": 2.6590377569198607, + "loss/logits": 0.8228894799947739, + "step": 59840 + }, + { + "epoch": 0.5985, + "grad_norm": 14.3125, + "grad_norm_var": 0.6864420572916666, + "learning_rate": 0.0003, + "loss": 11.0308, + "loss/aux_loss": 0.04806055724620819, + "loss/crossentropy": 2.734611225128174, + "loss/logits": 0.8257469087839127, + "step": 59850 + }, + { + "epoch": 0.5986, + "grad_norm": 13.6875, + "grad_norm_var": 0.5270670572916667, + "learning_rate": 0.0003, + "loss": 10.9778, + "loss/aux_loss": 0.048079499416053294, + "loss/crossentropy": 2.8724292278289796, + "loss/logits": 0.8287100523710251, + "step": 59860 + }, + { + "epoch": 0.5987, + "grad_norm": 14.9375, + "grad_norm_var": 2.584619140625, + "learning_rate": 0.0003, + "loss": 10.8757, + "loss/aux_loss": 0.04806960113346577, + "loss/crossentropy": 2.723463845252991, + "loss/logits": 0.834690722823143, + "step": 59870 + }, + { + "epoch": 0.5988, + "grad_norm": 16.625, + "grad_norm_var": 1.8973307291666666, + "learning_rate": 0.0003, + "loss": 10.9234, + "loss/aux_loss": 0.04806802999228239, + "loss/crossentropy": 2.785686802864075, + "loss/logits": 0.8314568728208542, + "step": 59880 + }, + { + "epoch": 0.5989, + "grad_norm": 16.375, + "grad_norm_var": 0.5518229166666667, + "learning_rate": 0.0003, + "loss": 10.9717, + "loss/aux_loss": 0.04806967880576849, + "loss/crossentropy": 2.7029653549194337, + "loss/logits": 0.8201411485671997, + "step": 59890 + }, + { + "epoch": 0.599, + "grad_norm": 16.125, + "grad_norm_var": 0.5994140625, + "learning_rate": 0.0003, + "loss": 11.0052, + "loss/aux_loss": 0.048081204667687415, + "loss/crossentropy": 2.783577024936676, + "loss/logits": 0.8226085513830185, + "step": 59900 + }, + { + "epoch": 0.5991, + "grad_norm": 15.25, + "grad_norm_var": 0.7863118489583333, + "learning_rate": 0.0003, + "loss": 11.0765, + "loss/aux_loss": 0.0480691323056817, + "loss/crossentropy": 2.6739238142967223, + "loss/logits": 0.8000189930200576, + "step": 59910 + }, + { + "epoch": 0.5992, + "grad_norm": 14.625, + "grad_norm_var": 0.5018229166666667, + "learning_rate": 0.0003, + "loss": 10.948, + "loss/aux_loss": 0.04806508179754019, + "loss/crossentropy": 2.732639729976654, + "loss/logits": 0.8182629913091659, + "step": 59920 + }, + { + "epoch": 0.5993, + "grad_norm": 14.625, + "grad_norm_var": 0.42233072916666664, + "learning_rate": 0.0003, + "loss": 10.9416, + "loss/aux_loss": 0.04808191284537315, + "loss/crossentropy": 2.773883467912674, + "loss/logits": 0.8407058566808701, + "step": 59930 + }, + { + "epoch": 0.5994, + "grad_norm": 15.625, + "grad_norm_var": 0.33904622395833334, + "learning_rate": 0.0003, + "loss": 10.982, + "loss/aux_loss": 0.04807113688439131, + "loss/crossentropy": 2.7120142698287966, + "loss/logits": 0.8165002167224884, + "step": 59940 + }, + { + "epoch": 0.5995, + "grad_norm": 15.75, + "grad_norm_var": 0.39993489583333336, + "learning_rate": 0.0003, + "loss": 10.9997, + "loss/aux_loss": 0.04807593729346991, + "loss/crossentropy": 2.646089047193527, + "loss/logits": 0.8073794126510621, + "step": 59950 + }, + { + "epoch": 0.5996, + "grad_norm": 15.3125, + "grad_norm_var": 0.32928059895833334, + "learning_rate": 0.0003, + "loss": 11.0826, + "loss/aux_loss": 0.048062044009566304, + "loss/crossentropy": 2.8690964460372923, + "loss/logits": 0.8507906019687652, + "step": 59960 + }, + { + "epoch": 0.5997, + "grad_norm": 15.25, + "grad_norm_var": 0.365869140625, + "learning_rate": 0.0003, + "loss": 10.9892, + "loss/aux_loss": 0.048076673224568364, + "loss/crossentropy": 2.638966166973114, + "loss/logits": 0.8349994659423828, + "step": 59970 + }, + { + "epoch": 0.5998, + "grad_norm": 14.3125, + "grad_norm_var": 0.5264973958333333, + "learning_rate": 0.0003, + "loss": 10.9127, + "loss/aux_loss": 0.0480753380805254, + "loss/crossentropy": 2.7292301952838898, + "loss/logits": 0.8326119810342789, + "step": 59980 + }, + { + "epoch": 0.5999, + "grad_norm": 15.0, + "grad_norm_var": 0.39479166666666665, + "learning_rate": 0.0003, + "loss": 11.0337, + "loss/aux_loss": 0.04805723261088133, + "loss/crossentropy": 2.7440689623355867, + "loss/logits": 0.8415668040513993, + "step": 59990 + }, + { + "epoch": 0.6, + "grad_norm": 14.5625, + "grad_norm_var": 0.41847330729166665, + "learning_rate": 0.0003, + "loss": 11.1506, + "loss/aux_loss": 0.04806195814162493, + "loss/crossentropy": 2.7931241512298586, + "loss/logits": 0.8480271577835083, + "step": 60000 + }, + { + "epoch": 0.6001, + "grad_norm": 14.6875, + "grad_norm_var": 3.018684895833333, + "learning_rate": 0.0003, + "loss": 10.8942, + "loss/aux_loss": 0.04808544497936964, + "loss/crossentropy": 2.8031197428703307, + "loss/logits": 0.8436632961034775, + "step": 60010 + }, + { + "epoch": 0.6002, + "grad_norm": 14.375, + "grad_norm_var": 0.6575358072916667, + "learning_rate": 0.0003, + "loss": 11.009, + "loss/aux_loss": 0.048070221580564974, + "loss/crossentropy": 2.835988187789917, + "loss/logits": 0.8101363390684128, + "step": 60020 + }, + { + "epoch": 0.6003, + "grad_norm": 17.625, + "grad_norm_var": 0.9681640625, + "learning_rate": 0.0003, + "loss": 11.0825, + "loss/aux_loss": 0.04806274306029081, + "loss/crossentropy": 2.634697949886322, + "loss/logits": 0.8300037115812302, + "step": 60030 + }, + { + "epoch": 0.6004, + "grad_norm": 15.4375, + "grad_norm_var": 1.369775390625, + "learning_rate": 0.0003, + "loss": 10.8961, + "loss/aux_loss": 0.04807508382946253, + "loss/crossentropy": 2.7980096697807313, + "loss/logits": 0.8517778217792511, + "step": 60040 + }, + { + "epoch": 0.6005, + "grad_norm": 15.6875, + "grad_norm_var": 1.1749348958333334, + "learning_rate": 0.0003, + "loss": 10.8726, + "loss/aux_loss": 0.048068697564303875, + "loss/crossentropy": 2.5798544883728027, + "loss/logits": 0.7742179721593857, + "step": 60050 + }, + { + "epoch": 0.6006, + "grad_norm": 16.625, + "grad_norm_var": 1.3494791666666666, + "learning_rate": 0.0003, + "loss": 11.0209, + "loss/aux_loss": 0.0480728967115283, + "loss/crossentropy": 2.7089627504348757, + "loss/logits": 0.8144480526447296, + "step": 60060 + }, + { + "epoch": 0.6007, + "grad_norm": 15.25, + "grad_norm_var": 1.4589680989583333, + "learning_rate": 0.0003, + "loss": 10.7924, + "loss/aux_loss": 0.04806405883282423, + "loss/crossentropy": 2.63613708615303, + "loss/logits": 0.82077776491642, + "step": 60070 + }, + { + "epoch": 0.6008, + "grad_norm": 14.625, + "grad_norm_var": 0.13214518229166666, + "learning_rate": 0.0003, + "loss": 10.8491, + "loss/aux_loss": 0.04806582164019346, + "loss/crossentropy": 2.709311383962631, + "loss/logits": 0.8013584047555924, + "step": 60080 + }, + { + "epoch": 0.6009, + "grad_norm": 14.75, + "grad_norm_var": 0.383837890625, + "learning_rate": 0.0003, + "loss": 11.0591, + "loss/aux_loss": 0.04806982241570949, + "loss/crossentropy": 2.7307440638542175, + "loss/logits": 0.8089520663022995, + "step": 60090 + }, + { + "epoch": 0.601, + "grad_norm": 14.6875, + "grad_norm_var": 0.47552083333333334, + "learning_rate": 0.0003, + "loss": 11.0038, + "loss/aux_loss": 0.04807723425328732, + "loss/crossentropy": 2.713645851612091, + "loss/logits": 0.8089924275875091, + "step": 60100 + }, + { + "epoch": 0.6011, + "grad_norm": 15.3125, + "grad_norm_var": 1.0103515625, + "learning_rate": 0.0003, + "loss": 10.8574, + "loss/aux_loss": 0.0480644965544343, + "loss/crossentropy": 2.7398121774196627, + "loss/logits": 0.8082275360822677, + "step": 60110 + }, + { + "epoch": 0.6012, + "grad_norm": 15.375, + "grad_norm_var": 0.6893229166666667, + "learning_rate": 0.0003, + "loss": 10.923, + "loss/aux_loss": 0.04806880187243223, + "loss/crossentropy": 2.7813449084758757, + "loss/logits": 0.8685790807008743, + "step": 60120 + }, + { + "epoch": 0.6013, + "grad_norm": 14.25, + "grad_norm_var": 0.377587890625, + "learning_rate": 0.0003, + "loss": 11.167, + "loss/aux_loss": 0.04805507734417915, + "loss/crossentropy": 2.8507793068885805, + "loss/logits": 0.8408539682626724, + "step": 60130 + }, + { + "epoch": 0.6014, + "grad_norm": 16.25, + "grad_norm_var": 0.5222493489583333, + "learning_rate": 0.0003, + "loss": 10.9877, + "loss/aux_loss": 0.048066430166363715, + "loss/crossentropy": 2.662059265375137, + "loss/logits": 0.8354754239320755, + "step": 60140 + }, + { + "epoch": 0.6015, + "grad_norm": 14.875, + "grad_norm_var": 1.5430826822916666, + "learning_rate": 0.0003, + "loss": 11.0714, + "loss/aux_loss": 0.04806972537189722, + "loss/crossentropy": 2.728497040271759, + "loss/logits": 0.8564503043889999, + "step": 60150 + }, + { + "epoch": 0.6016, + "grad_norm": 15.0625, + "grad_norm_var": 1.3777180989583333, + "learning_rate": 0.0003, + "loss": 10.8885, + "loss/aux_loss": 0.0480747090652585, + "loss/crossentropy": 2.6242256700992583, + "loss/logits": 0.8221401393413543, + "step": 60160 + }, + { + "epoch": 0.6017, + "grad_norm": 14.3125, + "grad_norm_var": 0.440869140625, + "learning_rate": 0.0003, + "loss": 10.9851, + "loss/aux_loss": 0.04808126259595156, + "loss/crossentropy": 2.571116214990616, + "loss/logits": 0.8445754140615463, + "step": 60170 + }, + { + "epoch": 0.6018, + "grad_norm": 15.125, + "grad_norm_var": 0.6421875, + "learning_rate": 0.0003, + "loss": 11.1588, + "loss/aux_loss": 0.0480703953653574, + "loss/crossentropy": 2.6313143491744997, + "loss/logits": 0.8429572701454162, + "step": 60180 + }, + { + "epoch": 0.6019, + "grad_norm": 15.0625, + "grad_norm_var": 0.4825358072916667, + "learning_rate": 0.0003, + "loss": 10.9319, + "loss/aux_loss": 0.048066435009241106, + "loss/crossentropy": 2.470182943344116, + "loss/logits": 0.7674608916044235, + "step": 60190 + }, + { + "epoch": 0.602, + "grad_norm": 14.5625, + "grad_norm_var": 0.5676920572916667, + "learning_rate": 0.0003, + "loss": 10.865, + "loss/aux_loss": 0.048062361776828766, + "loss/crossentropy": 2.652854871749878, + "loss/logits": 0.8476852804422379, + "step": 60200 + }, + { + "epoch": 0.6021, + "grad_norm": 15.75, + "grad_norm_var": 0.644775390625, + "learning_rate": 0.0003, + "loss": 10.9103, + "loss/aux_loss": 0.04806744400411844, + "loss/crossentropy": 2.886264109611511, + "loss/logits": 0.8491258502006531, + "step": 60210 + }, + { + "epoch": 0.6022, + "grad_norm": 16.25, + "grad_norm_var": 15.070947265625, + "learning_rate": 0.0003, + "loss": 10.9318, + "loss/aux_loss": 0.048081173188984395, + "loss/crossentropy": 2.7197977185249327, + "loss/logits": 0.8118861824274063, + "step": 60220 + }, + { + "epoch": 0.6023, + "grad_norm": 16.875, + "grad_norm_var": 15.35703125, + "learning_rate": 0.0003, + "loss": 10.8425, + "loss/aux_loss": 0.04807243477553129, + "loss/crossentropy": 2.675804728269577, + "loss/logits": 0.7933743417263031, + "step": 60230 + }, + { + "epoch": 0.6024, + "grad_norm": 15.6875, + "grad_norm_var": 0.8639973958333333, + "learning_rate": 0.0003, + "loss": 10.9663, + "loss/aux_loss": 0.048066473379731176, + "loss/crossentropy": 2.5603831708431244, + "loss/logits": 0.8213741898536682, + "step": 60240 + }, + { + "epoch": 0.6025, + "grad_norm": 14.75, + "grad_norm_var": 0.6863932291666667, + "learning_rate": 0.0003, + "loss": 11.0783, + "loss/aux_loss": 0.04807442165911198, + "loss/crossentropy": 2.759167742729187, + "loss/logits": 0.8260885119438172, + "step": 60250 + }, + { + "epoch": 0.6026, + "grad_norm": 15.625, + "grad_norm_var": 0.44724934895833335, + "learning_rate": 0.0003, + "loss": 10.9391, + "loss/aux_loss": 0.048061834275722505, + "loss/crossentropy": 2.830324959754944, + "loss/logits": 0.8341899156570435, + "step": 60260 + }, + { + "epoch": 0.6027, + "grad_norm": 14.125, + "grad_norm_var": 0.5426432291666666, + "learning_rate": 0.0003, + "loss": 10.9735, + "loss/aux_loss": 0.04807800371199846, + "loss/crossentropy": 2.710289627313614, + "loss/logits": 0.8235593348741531, + "step": 60270 + }, + { + "epoch": 0.6028, + "grad_norm": 12.9375, + "grad_norm_var": 0.8927083333333333, + "learning_rate": 0.0003, + "loss": 10.8861, + "loss/aux_loss": 0.04806134235113859, + "loss/crossentropy": 2.702167409658432, + "loss/logits": 0.8340632915496826, + "step": 60280 + }, + { + "epoch": 0.6029, + "grad_norm": 15.875, + "grad_norm_var": 1.0098307291666666, + "learning_rate": 0.0003, + "loss": 10.9902, + "loss/aux_loss": 0.04806969091296196, + "loss/crossentropy": 2.7321683406829833, + "loss/logits": 0.834402334690094, + "step": 60290 + }, + { + "epoch": 0.603, + "grad_norm": 15.625, + "grad_norm_var": 0.6902180989583333, + "learning_rate": 0.0003, + "loss": 10.9249, + "loss/aux_loss": 0.048069990053772924, + "loss/crossentropy": 2.5171724021434785, + "loss/logits": 0.7993605226278305, + "step": 60300 + }, + { + "epoch": 0.6031, + "grad_norm": 14.0, + "grad_norm_var": 0.43605143229166665, + "learning_rate": 0.0003, + "loss": 10.9595, + "loss/aux_loss": 0.04806102756410837, + "loss/crossentropy": 2.707091200351715, + "loss/logits": 0.8481060534715652, + "step": 60310 + }, + { + "epoch": 0.6032, + "grad_norm": 14.5625, + "grad_norm_var": 0.2884765625, + "learning_rate": 0.0003, + "loss": 10.9526, + "loss/aux_loss": 0.04808544833213091, + "loss/crossentropy": 2.7442154586315155, + "loss/logits": 0.8329938769340515, + "step": 60320 + }, + { + "epoch": 0.6033, + "grad_norm": 17.125, + "grad_norm_var": 0.6363118489583334, + "learning_rate": 0.0003, + "loss": 10.8661, + "loss/aux_loss": 0.04806406293064356, + "loss/crossentropy": 2.511446052789688, + "loss/logits": 0.8104943811893464, + "step": 60330 + }, + { + "epoch": 0.6034, + "grad_norm": 15.375, + "grad_norm_var": 0.9984375, + "learning_rate": 0.0003, + "loss": 10.7618, + "loss/aux_loss": 0.04807962328195572, + "loss/crossentropy": 2.5813129425048826, + "loss/logits": 0.8011042684316635, + "step": 60340 + }, + { + "epoch": 0.6035, + "grad_norm": 16.0, + "grad_norm_var": 0.7387858072916667, + "learning_rate": 0.0003, + "loss": 11.0578, + "loss/aux_loss": 0.04808678813278675, + "loss/crossentropy": 2.71219407916069, + "loss/logits": 0.8353963553905487, + "step": 60350 + }, + { + "epoch": 0.6036, + "grad_norm": 14.4375, + "grad_norm_var": 0.5567708333333333, + "learning_rate": 0.0003, + "loss": 10.9285, + "loss/aux_loss": 0.048061441816389563, + "loss/crossentropy": 2.757435607910156, + "loss/logits": 0.8533197224140168, + "step": 60360 + }, + { + "epoch": 0.6037, + "grad_norm": 15.0, + "grad_norm_var": 0.30514322916666664, + "learning_rate": 0.0003, + "loss": 10.9645, + "loss/aux_loss": 0.04807481989264488, + "loss/crossentropy": 2.685833466053009, + "loss/logits": 0.8134330004453659, + "step": 60370 + }, + { + "epoch": 0.6038, + "grad_norm": 15.25, + "grad_norm_var": 0.3848795572916667, + "learning_rate": 0.0003, + "loss": 11.053, + "loss/aux_loss": 0.04806959424167871, + "loss/crossentropy": 2.602872520685196, + "loss/logits": 0.8216569721698761, + "step": 60380 + }, + { + "epoch": 0.6039, + "grad_norm": 14.125, + "grad_norm_var": 0.6390462239583333, + "learning_rate": 0.0003, + "loss": 10.9354, + "loss/aux_loss": 0.04806573148816824, + "loss/crossentropy": 2.7883040606975555, + "loss/logits": 0.8433178305625916, + "step": 60390 + }, + { + "epoch": 0.604, + "grad_norm": 15.625, + "grad_norm_var": 0.424853515625, + "learning_rate": 0.0003, + "loss": 10.9932, + "loss/aux_loss": 0.0480732886120677, + "loss/crossentropy": 2.6341083645820618, + "loss/logits": 0.8201987504959106, + "step": 60400 + }, + { + "epoch": 0.6041, + "grad_norm": 15.4375, + "grad_norm_var": 4.114176432291667, + "learning_rate": 0.0003, + "loss": 10.8744, + "loss/aux_loss": 0.048076138645410535, + "loss/crossentropy": 2.8022005796432494, + "loss/logits": 0.8076352566480637, + "step": 60410 + }, + { + "epoch": 0.6042, + "grad_norm": 16.0, + "grad_norm_var": 5.117708333333334, + "learning_rate": 0.0003, + "loss": 10.7988, + "loss/aux_loss": 0.048061057738959787, + "loss/crossentropy": 2.6338140249252318, + "loss/logits": 0.7957394987344741, + "step": 60420 + }, + { + "epoch": 0.6043, + "grad_norm": 14.8125, + "grad_norm_var": 4.590869140625, + "learning_rate": 0.0003, + "loss": 11.0153, + "loss/aux_loss": 0.04807674996554852, + "loss/crossentropy": 2.7275028109550474, + "loss/logits": 0.8166062444448471, + "step": 60430 + }, + { + "epoch": 0.6044, + "grad_norm": 14.875, + "grad_norm_var": 0.3009765625, + "learning_rate": 0.0003, + "loss": 10.9633, + "loss/aux_loss": 0.04805723633617163, + "loss/crossentropy": 2.7013749897480013, + "loss/logits": 0.8536836624145507, + "step": 60440 + }, + { + "epoch": 0.6045, + "grad_norm": 14.3125, + "grad_norm_var": 0.5106770833333333, + "learning_rate": 0.0003, + "loss": 10.8347, + "loss/aux_loss": 0.04807833768427372, + "loss/crossentropy": 2.5935844779014587, + "loss/logits": 0.8088052183389663, + "step": 60450 + }, + { + "epoch": 0.6046, + "grad_norm": 15.0, + "grad_norm_var": 0.478125, + "learning_rate": 0.0003, + "loss": 10.9532, + "loss/aux_loss": 0.048071070946753024, + "loss/crossentropy": 2.699798661470413, + "loss/logits": 0.824173653125763, + "step": 60460 + }, + { + "epoch": 0.6047, + "grad_norm": 16.875, + "grad_norm_var": 0.451416015625, + "learning_rate": 0.0003, + "loss": 11.0419, + "loss/aux_loss": 0.048078490793704985, + "loss/crossentropy": 2.6727247834205627, + "loss/logits": 0.8337906152009964, + "step": 60470 + }, + { + "epoch": 0.6048, + "grad_norm": 14.8125, + "grad_norm_var": 0.66875, + "learning_rate": 0.0003, + "loss": 10.9652, + "loss/aux_loss": 0.048067685589194296, + "loss/crossentropy": 2.5421866893768312, + "loss/logits": 0.8169512122869491, + "step": 60480 + }, + { + "epoch": 0.6049, + "grad_norm": 14.3125, + "grad_norm_var": 0.5202473958333333, + "learning_rate": 0.0003, + "loss": 10.943, + "loss/aux_loss": 0.04807629156857729, + "loss/crossentropy": 2.7076221227645876, + "loss/logits": 0.8126240253448487, + "step": 60490 + }, + { + "epoch": 0.605, + "grad_norm": 16.375, + "grad_norm_var": 0.4942708333333333, + "learning_rate": 0.0003, + "loss": 10.9824, + "loss/aux_loss": 0.04806436561048031, + "loss/crossentropy": 2.716249758005142, + "loss/logits": 0.8121423751115799, + "step": 60500 + }, + { + "epoch": 0.6051, + "grad_norm": 14.6875, + "grad_norm_var": 0.33203125, + "learning_rate": 0.0003, + "loss": 10.8244, + "loss/aux_loss": 0.048078736290335655, + "loss/crossentropy": 2.5074705123901366, + "loss/logits": 0.7894505262374878, + "step": 60510 + }, + { + "epoch": 0.6052, + "grad_norm": 14.75, + "grad_norm_var": 0.6769368489583333, + "learning_rate": 0.0003, + "loss": 10.8273, + "loss/aux_loss": 0.04807204809039831, + "loss/crossentropy": 2.6767341911792757, + "loss/logits": 0.8195008933544159, + "step": 60520 + }, + { + "epoch": 0.6053, + "grad_norm": 14.0625, + "grad_norm_var": 0.9587076822916667, + "learning_rate": 0.0003, + "loss": 10.85, + "loss/aux_loss": 0.04806831441819668, + "loss/crossentropy": 2.8315181374549865, + "loss/logits": 0.8047915935516358, + "step": 60530 + }, + { + "epoch": 0.6054, + "grad_norm": 14.4375, + "grad_norm_var": 0.4041015625, + "learning_rate": 0.0003, + "loss": 10.8398, + "loss/aux_loss": 0.048069612868130204, + "loss/crossentropy": 2.657101058959961, + "loss/logits": 0.8052042782306671, + "step": 60540 + }, + { + "epoch": 0.6055, + "grad_norm": 14.625, + "grad_norm_var": 0.5738932291666666, + "learning_rate": 0.0003, + "loss": 10.8523, + "loss/aux_loss": 0.04806829355657101, + "loss/crossentropy": 2.584134030342102, + "loss/logits": 0.7640033394098282, + "step": 60550 + }, + { + "epoch": 0.6056, + "grad_norm": 15.25, + "grad_norm_var": 0.375244140625, + "learning_rate": 0.0003, + "loss": 10.9682, + "loss/aux_loss": 0.04807130675762892, + "loss/crossentropy": 2.680067926645279, + "loss/logits": 0.8243839502334595, + "step": 60560 + }, + { + "epoch": 0.6057, + "grad_norm": 15.0625, + "grad_norm_var": 0.43697916666666664, + "learning_rate": 0.0003, + "loss": 10.912, + "loss/aux_loss": 0.0480777146294713, + "loss/crossentropy": 2.6226901173591615, + "loss/logits": 0.7878055989742279, + "step": 60570 + }, + { + "epoch": 0.6058, + "grad_norm": 15.0625, + "grad_norm_var": 0.3062337239583333, + "learning_rate": 0.0003, + "loss": 10.9453, + "loss/aux_loss": 0.048065618798136714, + "loss/crossentropy": 2.6935440480709074, + "loss/logits": 0.8214478433132172, + "step": 60580 + }, + { + "epoch": 0.6059, + "grad_norm": 15.6875, + "grad_norm_var": 1.1348795572916666, + "learning_rate": 0.0003, + "loss": 10.9734, + "loss/aux_loss": 0.0480723200365901, + "loss/crossentropy": 2.7571221947669984, + "loss/logits": 0.8032363146543503, + "step": 60590 + }, + { + "epoch": 0.606, + "grad_norm": 14.5625, + "grad_norm_var": 1.2327473958333333, + "learning_rate": 0.0003, + "loss": 10.88, + "loss/aux_loss": 0.04806821886450052, + "loss/crossentropy": 2.661402940750122, + "loss/logits": 0.7886179000139236, + "step": 60600 + }, + { + "epoch": 0.6061, + "grad_norm": 15.0, + "grad_norm_var": 0.5450520833333333, + "learning_rate": 0.0003, + "loss": 10.8687, + "loss/aux_loss": 0.048080092296004295, + "loss/crossentropy": 2.4920336484909056, + "loss/logits": 0.7764117568731308, + "step": 60610 + }, + { + "epoch": 0.6062, + "grad_norm": 15.3125, + "grad_norm_var": 0.428125, + "learning_rate": 0.0003, + "loss": 11.0299, + "loss/aux_loss": 0.04806080795824528, + "loss/crossentropy": 2.8082796573638915, + "loss/logits": 0.8537631243467331, + "step": 60620 + }, + { + "epoch": 0.6063, + "grad_norm": 16.25, + "grad_norm_var": 0.29921875, + "learning_rate": 0.0003, + "loss": 11.0316, + "loss/aux_loss": 0.04806738365441561, + "loss/crossentropy": 2.7339873909950256, + "loss/logits": 0.7961423873901368, + "step": 60630 + }, + { + "epoch": 0.6064, + "grad_norm": 15.0625, + "grad_norm_var": 0.390625, + "learning_rate": 0.0003, + "loss": 10.8029, + "loss/aux_loss": 0.04806647207587957, + "loss/crossentropy": 2.7505694150924684, + "loss/logits": 0.8106872260570526, + "step": 60640 + }, + { + "epoch": 0.6065, + "grad_norm": 18.5, + "grad_norm_var": 0.990478515625, + "learning_rate": 0.0003, + "loss": 10.9839, + "loss/aux_loss": 0.0480714239180088, + "loss/crossentropy": 2.696449559926987, + "loss/logits": 0.8294881820678711, + "step": 60650 + }, + { + "epoch": 0.6066, + "grad_norm": 16.625, + "grad_norm_var": 1.271728515625, + "learning_rate": 0.0003, + "loss": 10.7504, + "loss/aux_loss": 0.04806580077856779, + "loss/crossentropy": 2.5379061937332152, + "loss/logits": 0.7808073431253433, + "step": 60660 + }, + { + "epoch": 0.6067, + "grad_norm": 15.125, + "grad_norm_var": 0.8075358072916666, + "learning_rate": 0.0003, + "loss": 11.0137, + "loss/aux_loss": 0.04806665126234293, + "loss/crossentropy": 2.7637165009975435, + "loss/logits": 0.8278068244457245, + "step": 60670 + }, + { + "epoch": 0.6068, + "grad_norm": 14.75, + "grad_norm_var": 0.43917643229166664, + "learning_rate": 0.0003, + "loss": 10.8688, + "loss/aux_loss": 0.04807304907590151, + "loss/crossentropy": 2.606863057613373, + "loss/logits": 0.8165640115737915, + "step": 60680 + }, + { + "epoch": 0.6069, + "grad_norm": 15.0625, + "grad_norm_var": 0.3447265625, + "learning_rate": 0.0003, + "loss": 10.9712, + "loss/aux_loss": 0.04807915091514588, + "loss/crossentropy": 2.7368947744369505, + "loss/logits": 0.8139879643917084, + "step": 60690 + }, + { + "epoch": 0.607, + "grad_norm": 13.4375, + "grad_norm_var": 0.539697265625, + "learning_rate": 0.0003, + "loss": 10.8863, + "loss/aux_loss": 0.04805984944105148, + "loss/crossentropy": 2.561430436372757, + "loss/logits": 0.7969212979078293, + "step": 60700 + }, + { + "epoch": 0.6071, + "grad_norm": 16.125, + "grad_norm_var": 0.8063639322916667, + "learning_rate": 0.0003, + "loss": 11.1143, + "loss/aux_loss": 0.048070601746439935, + "loss/crossentropy": 2.724953460693359, + "loss/logits": 0.8761008381843567, + "step": 60710 + }, + { + "epoch": 0.6072, + "grad_norm": 15.8125, + "grad_norm_var": 0.5405598958333333, + "learning_rate": 0.0003, + "loss": 11.067, + "loss/aux_loss": 0.04806268252432346, + "loss/crossentropy": 2.722470408678055, + "loss/logits": 0.8286719590425491, + "step": 60720 + }, + { + "epoch": 0.6073, + "grad_norm": 16.75, + "grad_norm_var": 0.48292643229166665, + "learning_rate": 0.0003, + "loss": 10.9106, + "loss/aux_loss": 0.048081399872899055, + "loss/crossentropy": 2.7040891528129576, + "loss/logits": 0.8151125907897949, + "step": 60730 + }, + { + "epoch": 0.6074, + "grad_norm": 14.9375, + "grad_norm_var": 0.6304524739583334, + "learning_rate": 0.0003, + "loss": 11.0008, + "loss/aux_loss": 0.04806943740695715, + "loss/crossentropy": 2.750800085067749, + "loss/logits": 0.811496239900589, + "step": 60740 + }, + { + "epoch": 0.6075, + "grad_norm": 14.875, + "grad_norm_var": 0.5072265625, + "learning_rate": 0.0003, + "loss": 10.9407, + "loss/aux_loss": 0.048066843301057816, + "loss/crossentropy": 2.5790812611579894, + "loss/logits": 0.8062135219573975, + "step": 60750 + }, + { + "epoch": 0.6076, + "grad_norm": 15.125, + "grad_norm_var": 0.4410807291666667, + "learning_rate": 0.0003, + "loss": 10.938, + "loss/aux_loss": 0.04806975163519382, + "loss/crossentropy": 2.669712710380554, + "loss/logits": 0.8050375521183014, + "step": 60760 + }, + { + "epoch": 0.6077, + "grad_norm": 16.5, + "grad_norm_var": 0.5805826822916667, + "learning_rate": 0.0003, + "loss": 10.9754, + "loss/aux_loss": 0.04807074461132288, + "loss/crossentropy": 2.7768321573734283, + "loss/logits": 0.8277645260095596, + "step": 60770 + }, + { + "epoch": 0.6078, + "grad_norm": 14.375, + "grad_norm_var": 0.8393229166666667, + "learning_rate": 0.0003, + "loss": 10.9727, + "loss/aux_loss": 0.04806243553757668, + "loss/crossentropy": 2.7435842633247374, + "loss/logits": 0.8338838994503022, + "step": 60780 + }, + { + "epoch": 0.6079, + "grad_norm": 14.9375, + "grad_norm_var": 0.7704264322916666, + "learning_rate": 0.0003, + "loss": 10.8381, + "loss/aux_loss": 0.048067687265574935, + "loss/crossentropy": 2.695614975690842, + "loss/logits": 0.7965478479862214, + "step": 60790 + }, + { + "epoch": 0.608, + "grad_norm": 16.375, + "grad_norm_var": 1.0828125, + "learning_rate": 0.0003, + "loss": 10.9719, + "loss/aux_loss": 0.04806322492659092, + "loss/crossentropy": 2.747288691997528, + "loss/logits": 0.8433381974697113, + "step": 60800 + }, + { + "epoch": 0.6081, + "grad_norm": 14.9375, + "grad_norm_var": 0.7783854166666667, + "learning_rate": 0.0003, + "loss": 10.9004, + "loss/aux_loss": 0.04807988330721855, + "loss/crossentropy": 2.7076495826244353, + "loss/logits": 0.83790722489357, + "step": 60810 + }, + { + "epoch": 0.6082, + "grad_norm": 15.9375, + "grad_norm_var": 0.40305989583333335, + "learning_rate": 0.0003, + "loss": 11.0397, + "loss/aux_loss": 0.0480671776458621, + "loss/crossentropy": 2.6762712955474854, + "loss/logits": 0.8186611771583557, + "step": 60820 + }, + { + "epoch": 0.6083, + "grad_norm": 15.5625, + "grad_norm_var": 0.33307291666666666, + "learning_rate": 0.0003, + "loss": 11.0564, + "loss/aux_loss": 0.04807759691029787, + "loss/crossentropy": 2.673157799243927, + "loss/logits": 0.8050933957099915, + "step": 60830 + }, + { + "epoch": 0.6084, + "grad_norm": 15.125, + "grad_norm_var": 0.8190104166666666, + "learning_rate": 0.0003, + "loss": 10.8996, + "loss/aux_loss": 0.048066492564976214, + "loss/crossentropy": 2.7351067125797273, + "loss/logits": 0.8209027826786042, + "step": 60840 + }, + { + "epoch": 0.6085, + "grad_norm": 15.3125, + "grad_norm_var": 0.1978515625, + "learning_rate": 0.0003, + "loss": 11.0685, + "loss/aux_loss": 0.04806740805506706, + "loss/crossentropy": 2.7408780336380003, + "loss/logits": 0.8286194503307343, + "step": 60850 + }, + { + "epoch": 0.6086, + "grad_norm": 14.25, + "grad_norm_var": 0.15636393229166667, + "learning_rate": 0.0003, + "loss": 10.9519, + "loss/aux_loss": 0.048057135008275506, + "loss/crossentropy": 2.708062160015106, + "loss/logits": 0.8348789572715759, + "step": 60860 + }, + { + "epoch": 0.6087, + "grad_norm": 15.0625, + "grad_norm_var": 0.245556640625, + "learning_rate": 0.0003, + "loss": 10.9066, + "loss/aux_loss": 0.04808010403066874, + "loss/crossentropy": 2.7931439101696016, + "loss/logits": 0.823428162932396, + "step": 60870 + }, + { + "epoch": 0.6088, + "grad_norm": 14.0625, + "grad_norm_var": 0.3282389322916667, + "learning_rate": 0.0003, + "loss": 11.0119, + "loss/aux_loss": 0.04807362128049135, + "loss/crossentropy": 2.803303599357605, + "loss/logits": 0.8527332812547683, + "step": 60880 + }, + { + "epoch": 0.6089, + "grad_norm": 14.625, + "grad_norm_var": 0.40208333333333335, + "learning_rate": 0.0003, + "loss": 10.9771, + "loss/aux_loss": 0.04806482549756765, + "loss/crossentropy": 2.5410806000232697, + "loss/logits": 0.8482417315244675, + "step": 60890 + }, + { + "epoch": 0.609, + "grad_norm": 15.125, + "grad_norm_var": 0.276025390625, + "learning_rate": 0.0003, + "loss": 11.0426, + "loss/aux_loss": 0.04806209746748209, + "loss/crossentropy": 2.8440731525421143, + "loss/logits": 0.8651130110025406, + "step": 60900 + }, + { + "epoch": 0.6091, + "grad_norm": 15.0625, + "grad_norm_var": 0.162744140625, + "learning_rate": 0.0003, + "loss": 10.8486, + "loss/aux_loss": 0.048080362193286416, + "loss/crossentropy": 2.608972841501236, + "loss/logits": 0.8105991780757904, + "step": 60910 + }, + { + "epoch": 0.6092, + "grad_norm": 15.6875, + "grad_norm_var": 0.20807291666666666, + "learning_rate": 0.0003, + "loss": 10.9675, + "loss/aux_loss": 0.04806051570922136, + "loss/crossentropy": 2.6756951212882996, + "loss/logits": 0.7980956196784973, + "step": 60920 + }, + { + "epoch": 0.6093, + "grad_norm": 16.625, + "grad_norm_var": 0.2994140625, + "learning_rate": 0.0003, + "loss": 11.0392, + "loss/aux_loss": 0.048062234185636044, + "loss/crossentropy": 2.738618332147598, + "loss/logits": 0.8416084438562393, + "step": 60930 + }, + { + "epoch": 0.6094, + "grad_norm": 16.0, + "grad_norm_var": 0.4228515625, + "learning_rate": 0.0003, + "loss": 11.0543, + "loss/aux_loss": 0.04806976187974214, + "loss/crossentropy": 2.698890858888626, + "loss/logits": 0.8149118602275849, + "step": 60940 + }, + { + "epoch": 0.6095, + "grad_norm": 14.3125, + "grad_norm_var": 0.46144205729166665, + "learning_rate": 0.0003, + "loss": 10.9536, + "loss/aux_loss": 0.0480699822306633, + "loss/crossentropy": 2.839012861251831, + "loss/logits": 0.8258768379688263, + "step": 60950 + }, + { + "epoch": 0.6096, + "grad_norm": 15.0, + "grad_norm_var": 0.5419108072916666, + "learning_rate": 0.0003, + "loss": 10.9647, + "loss/aux_loss": 0.048070922307670114, + "loss/crossentropy": 2.738613134622574, + "loss/logits": 0.8129228353500366, + "step": 60960 + }, + { + "epoch": 0.6097, + "grad_norm": 17.625, + "grad_norm_var": 0.9274576822916667, + "learning_rate": 0.0003, + "loss": 10.9578, + "loss/aux_loss": 0.04805801305919886, + "loss/crossentropy": 2.5094609320163728, + "loss/logits": 0.8016722679138184, + "step": 60970 + }, + { + "epoch": 0.6098, + "grad_norm": 15.8125, + "grad_norm_var": 0.7968098958333333, + "learning_rate": 0.0003, + "loss": 10.9871, + "loss/aux_loss": 0.048079372942447664, + "loss/crossentropy": 2.664980614185333, + "loss/logits": 0.8202985137701034, + "step": 60980 + }, + { + "epoch": 0.6099, + "grad_norm": 15.875, + "grad_norm_var": 0.5782389322916667, + "learning_rate": 0.0003, + "loss": 11.0115, + "loss/aux_loss": 0.04807392340153456, + "loss/crossentropy": 2.7239008784294128, + "loss/logits": 0.831571900844574, + "step": 60990 + }, + { + "epoch": 0.61, + "grad_norm": 13.5, + "grad_norm_var": 0.58828125, + "learning_rate": 0.0003, + "loss": 10.8612, + "loss/aux_loss": 0.04806995764374733, + "loss/crossentropy": 2.6622074127197264, + "loss/logits": 0.7961272418498992, + "step": 61000 + }, + { + "epoch": 0.6101, + "grad_norm": 16.0, + "grad_norm_var": 0.6364420572916667, + "learning_rate": 0.0003, + "loss": 10.8923, + "loss/aux_loss": 0.04806650709360838, + "loss/crossentropy": 2.7559936583042144, + "loss/logits": 0.8345979481935502, + "step": 61010 + }, + { + "epoch": 0.6102, + "grad_norm": 15.5625, + "grad_norm_var": 2.0155598958333334, + "learning_rate": 0.0003, + "loss": 10.8705, + "loss/aux_loss": 0.04807249642908573, + "loss/crossentropy": 2.591252303123474, + "loss/logits": 0.7885099232196808, + "step": 61020 + }, + { + "epoch": 0.6103, + "grad_norm": 14.9375, + "grad_norm_var": 2.109114583333333, + "learning_rate": 0.0003, + "loss": 10.9392, + "loss/aux_loss": 0.04807036854326725, + "loss/crossentropy": 2.685466194152832, + "loss/logits": 0.8492469847202301, + "step": 61030 + }, + { + "epoch": 0.6104, + "grad_norm": 14.4375, + "grad_norm_var": 0.6700520833333333, + "learning_rate": 0.0003, + "loss": 10.9431, + "loss/aux_loss": 0.04806849993765354, + "loss/crossentropy": 2.6850741684436796, + "loss/logits": 0.8018898099660874, + "step": 61040 + }, + { + "epoch": 0.6105, + "grad_norm": 14.5, + "grad_norm_var": 0.3611979166666667, + "learning_rate": 0.0003, + "loss": 10.8835, + "loss/aux_loss": 0.04807262290269136, + "loss/crossentropy": 2.704542863368988, + "loss/logits": 0.834966391324997, + "step": 61050 + }, + { + "epoch": 0.6106, + "grad_norm": 15.5, + "grad_norm_var": 0.350244140625, + "learning_rate": 0.0003, + "loss": 10.9628, + "loss/aux_loss": 0.04807685576379299, + "loss/crossentropy": 2.6365856409072874, + "loss/logits": 0.8094233006238938, + "step": 61060 + }, + { + "epoch": 0.6107, + "grad_norm": 14.8125, + "grad_norm_var": 0.3317057291666667, + "learning_rate": 0.0003, + "loss": 11.0145, + "loss/aux_loss": 0.048073244467377665, + "loss/crossentropy": 2.844811725616455, + "loss/logits": 0.836044305562973, + "step": 61070 + }, + { + "epoch": 0.6108, + "grad_norm": 16.5, + "grad_norm_var": 1.5063639322916667, + "learning_rate": 0.0003, + "loss": 10.8371, + "loss/aux_loss": 0.04806085731834173, + "loss/crossentropy": 2.7075256764888764, + "loss/logits": 0.8152056097984314, + "step": 61080 + }, + { + "epoch": 0.6109, + "grad_norm": 17.0, + "grad_norm_var": 0.8766764322916667, + "learning_rate": 0.0003, + "loss": 10.8974, + "loss/aux_loss": 0.04807128459215164, + "loss/crossentropy": 2.675181972980499, + "loss/logits": 0.8163933247327805, + "step": 61090 + }, + { + "epoch": 0.611, + "grad_norm": 15.625, + "grad_norm_var": 0.8407389322916666, + "learning_rate": 0.0003, + "loss": 10.8567, + "loss/aux_loss": 0.04806245286017656, + "loss/crossentropy": 2.7227718472480773, + "loss/logits": 0.8572416335344315, + "step": 61100 + }, + { + "epoch": 0.6111, + "grad_norm": 17.0, + "grad_norm_var": 0.7202473958333333, + "learning_rate": 0.0003, + "loss": 10.8851, + "loss/aux_loss": 0.048066180758178235, + "loss/crossentropy": 2.744245910644531, + "loss/logits": 0.8281572759151459, + "step": 61110 + }, + { + "epoch": 0.6112, + "grad_norm": 13.875, + "grad_norm_var": 0.7058430989583333, + "learning_rate": 0.0003, + "loss": 10.974, + "loss/aux_loss": 0.04806941282004118, + "loss/crossentropy": 2.788532388210297, + "loss/logits": 0.8086911767721177, + "step": 61120 + }, + { + "epoch": 0.6113, + "grad_norm": 13.9375, + "grad_norm_var": 2.058854166666667, + "learning_rate": 0.0003, + "loss": 10.9633, + "loss/aux_loss": 0.048071177862584594, + "loss/crossentropy": 2.5584035396575926, + "loss/logits": 0.7963971257209778, + "step": 61130 + }, + { + "epoch": 0.6114, + "grad_norm": 15.5, + "grad_norm_var": 2.2919270833333334, + "learning_rate": 0.0003, + "loss": 11.0252, + "loss/aux_loss": 0.048073113150894645, + "loss/crossentropy": 2.816734492778778, + "loss/logits": 0.8178564816713333, + "step": 61140 + }, + { + "epoch": 0.6115, + "grad_norm": 14.9375, + "grad_norm_var": 1.1398274739583334, + "learning_rate": 0.0003, + "loss": 11.1053, + "loss/aux_loss": 0.04807866048067808, + "loss/crossentropy": 2.7244319319725037, + "loss/logits": 0.8512472093105317, + "step": 61150 + }, + { + "epoch": 0.6116, + "grad_norm": 14.5625, + "grad_norm_var": 0.8706868489583334, + "learning_rate": 0.0003, + "loss": 11.0246, + "loss/aux_loss": 0.04805517755448818, + "loss/crossentropy": 2.801200783252716, + "loss/logits": 0.8401286274194717, + "step": 61160 + }, + { + "epoch": 0.6117, + "grad_norm": 13.9375, + "grad_norm_var": 0.4552083333333333, + "learning_rate": 0.0003, + "loss": 11.0931, + "loss/aux_loss": 0.04807036258280277, + "loss/crossentropy": 2.772087001800537, + "loss/logits": 0.8375712424516678, + "step": 61170 + }, + { + "epoch": 0.6118, + "grad_norm": 15.875, + "grad_norm_var": 0.84375, + "learning_rate": 0.0003, + "loss": 10.9609, + "loss/aux_loss": 0.04807903449982405, + "loss/crossentropy": 2.622360199689865, + "loss/logits": 0.8055921524763108, + "step": 61180 + }, + { + "epoch": 0.6119, + "grad_norm": 15.4375, + "grad_norm_var": 0.5645670572916667, + "learning_rate": 0.0003, + "loss": 11.0062, + "loss/aux_loss": 0.04806223157793284, + "loss/crossentropy": 2.681737995147705, + "loss/logits": 0.813096073269844, + "step": 61190 + }, + { + "epoch": 0.612, + "grad_norm": 15.9375, + "grad_norm_var": 0.445166015625, + "learning_rate": 0.0003, + "loss": 10.9057, + "loss/aux_loss": 0.048072535917162894, + "loss/crossentropy": 2.6666407227516173, + "loss/logits": 0.8438924968242645, + "step": 61200 + }, + { + "epoch": 0.6121, + "grad_norm": 16.375, + "grad_norm_var": 0.6181640625, + "learning_rate": 0.0003, + "loss": 10.8865, + "loss/aux_loss": 0.04807114116847515, + "loss/crossentropy": 2.698245918750763, + "loss/logits": 0.8058286488056183, + "step": 61210 + }, + { + "epoch": 0.6122, + "grad_norm": 16.125, + "grad_norm_var": 0.8072265625, + "learning_rate": 0.0003, + "loss": 10.9027, + "loss/aux_loss": 0.048066765256226066, + "loss/crossentropy": 2.682483744621277, + "loss/logits": 0.7972517877817153, + "step": 61220 + }, + { + "epoch": 0.6123, + "grad_norm": 14.5, + "grad_norm_var": 1.0152180989583333, + "learning_rate": 0.0003, + "loss": 10.9902, + "loss/aux_loss": 0.04806827660650015, + "loss/crossentropy": 2.8098564445972443, + "loss/logits": 0.8148068457841873, + "step": 61230 + }, + { + "epoch": 0.6124, + "grad_norm": 15.125, + "grad_norm_var": 0.787744140625, + "learning_rate": 0.0003, + "loss": 10.925, + "loss/aux_loss": 0.04808166231960058, + "loss/crossentropy": 2.5316755414009093, + "loss/logits": 0.8108256548643112, + "step": 61240 + }, + { + "epoch": 0.6125, + "grad_norm": 13.5, + "grad_norm_var": 0.662353515625, + "learning_rate": 0.0003, + "loss": 10.8142, + "loss/aux_loss": 0.048060395009815696, + "loss/crossentropy": 2.6689065754413606, + "loss/logits": 0.7798559069633484, + "step": 61250 + }, + { + "epoch": 0.6126, + "grad_norm": 15.0625, + "grad_norm_var": 0.5609212239583333, + "learning_rate": 0.0003, + "loss": 10.9542, + "loss/aux_loss": 0.04807155355811119, + "loss/crossentropy": 2.671508860588074, + "loss/logits": 0.8162847578525543, + "step": 61260 + }, + { + "epoch": 0.6127, + "grad_norm": 15.125, + "grad_norm_var": 0.26848958333333334, + "learning_rate": 0.0003, + "loss": 10.9077, + "loss/aux_loss": 0.04807188101112843, + "loss/crossentropy": 2.8237990200519563, + "loss/logits": 0.8153227150440217, + "step": 61270 + }, + { + "epoch": 0.6128, + "grad_norm": 14.5, + "grad_norm_var": 22.748372395833332, + "learning_rate": 0.0003, + "loss": 11.0002, + "loss/aux_loss": 0.048073857091367245, + "loss/crossentropy": 2.720522928237915, + "loss/logits": 0.8327284932136536, + "step": 61280 + }, + { + "epoch": 0.6129, + "grad_norm": 15.6875, + "grad_norm_var": 20.959358723958335, + "learning_rate": 0.0003, + "loss": 10.8401, + "loss/aux_loss": 0.0480800049379468, + "loss/crossentropy": 2.8034905910491945, + "loss/logits": 0.7961880445480347, + "step": 61290 + }, + { + "epoch": 0.613, + "grad_norm": 16.125, + "grad_norm_var": 0.31378580729166666, + "learning_rate": 0.0003, + "loss": 10.9138, + "loss/aux_loss": 0.04807343352586031, + "loss/crossentropy": 2.59561088681221, + "loss/logits": 0.8048137962818146, + "step": 61300 + }, + { + "epoch": 0.6131, + "grad_norm": 14.875, + "grad_norm_var": 0.700634765625, + "learning_rate": 0.0003, + "loss": 10.9678, + "loss/aux_loss": 0.04806978832930327, + "loss/crossentropy": 2.6666161894798277, + "loss/logits": 0.8165017098188401, + "step": 61310 + }, + { + "epoch": 0.6132, + "grad_norm": 14.5, + "grad_norm_var": 0.784228515625, + "learning_rate": 0.0003, + "loss": 10.8918, + "loss/aux_loss": 0.0480765713378787, + "loss/crossentropy": 2.6589391052722933, + "loss/logits": 0.8012593746185303, + "step": 61320 + }, + { + "epoch": 0.6133, + "grad_norm": 15.625, + "grad_norm_var": 1.0489583333333334, + "learning_rate": 0.0003, + "loss": 10.9852, + "loss/aux_loss": 0.04807199165225029, + "loss/crossentropy": 2.744808477163315, + "loss/logits": 0.8064806133508682, + "step": 61330 + }, + { + "epoch": 0.6134, + "grad_norm": 18.0, + "grad_norm_var": 1.1325520833333333, + "learning_rate": 0.0003, + "loss": 10.9129, + "loss/aux_loss": 0.04806118700653315, + "loss/crossentropy": 2.7831350564956665, + "loss/logits": 0.8111140578985214, + "step": 61340 + }, + { + "epoch": 0.6135, + "grad_norm": 15.1875, + "grad_norm_var": 1.3180826822916667, + "learning_rate": 0.0003, + "loss": 10.9594, + "loss/aux_loss": 0.04806741625070572, + "loss/crossentropy": 2.66780064702034, + "loss/logits": 0.8168701589107513, + "step": 61350 + }, + { + "epoch": 0.6136, + "grad_norm": 14.0625, + "grad_norm_var": 0.9478515625, + "learning_rate": 0.0003, + "loss": 11.0428, + "loss/aux_loss": 0.04806680958718061, + "loss/crossentropy": 2.756870436668396, + "loss/logits": 0.8362319558858872, + "step": 61360 + }, + { + "epoch": 0.6137, + "grad_norm": 14.625, + "grad_norm_var": 0.949853515625, + "learning_rate": 0.0003, + "loss": 11.0459, + "loss/aux_loss": 0.04806111045181751, + "loss/crossentropy": 2.7263152480125425, + "loss/logits": 0.8313703805208206, + "step": 61370 + }, + { + "epoch": 0.6138, + "grad_norm": 14.5625, + "grad_norm_var": 0.459375, + "learning_rate": 0.0003, + "loss": 11.0347, + "loss/aux_loss": 0.04806870762258768, + "loss/crossentropy": 2.6985792994499205, + "loss/logits": 0.8102845966815948, + "step": 61380 + }, + { + "epoch": 0.6139, + "grad_norm": 15.1875, + "grad_norm_var": 0.8634765625, + "learning_rate": 0.0003, + "loss": 11.1736, + "loss/aux_loss": 0.048068526200950146, + "loss/crossentropy": 2.7508439660072326, + "loss/logits": 0.8183762282133102, + "step": 61390 + }, + { + "epoch": 0.614, + "grad_norm": 15.125, + "grad_norm_var": 0.3478515625, + "learning_rate": 0.0003, + "loss": 10.8252, + "loss/aux_loss": 0.04806302357465029, + "loss/crossentropy": 2.672939831018448, + "loss/logits": 0.8024695843458176, + "step": 61400 + }, + { + "epoch": 0.6141, + "grad_norm": 14.3125, + "grad_norm_var": 0.3848958333333333, + "learning_rate": 0.0003, + "loss": 11.0337, + "loss/aux_loss": 0.048076591454446316, + "loss/crossentropy": 2.83030418753624, + "loss/logits": 0.8267215609550476, + "step": 61410 + }, + { + "epoch": 0.6142, + "grad_norm": 14.4375, + "grad_norm_var": 0.6458333333333334, + "learning_rate": 0.0003, + "loss": 10.9174, + "loss/aux_loss": 0.04806251674890518, + "loss/crossentropy": 2.7849998474121094, + "loss/logits": 0.8149536848068237, + "step": 61420 + }, + { + "epoch": 0.6143, + "grad_norm": 15.1875, + "grad_norm_var": 126.11222330729167, + "learning_rate": 0.0003, + "loss": 11.0026, + "loss/aux_loss": 0.048078781180083754, + "loss/crossentropy": 2.8044037342071535, + "loss/logits": 0.8208670258522034, + "step": 61430 + }, + { + "epoch": 0.6144, + "grad_norm": 14.1875, + "grad_norm_var": 1.9354166666666666, + "learning_rate": 0.0003, + "loss": 10.9765, + "loss/aux_loss": 0.0480761431157589, + "loss/crossentropy": 2.7967730283737184, + "loss/logits": 0.8021587640047073, + "step": 61440 + }, + { + "epoch": 0.6145, + "grad_norm": 15.625, + "grad_norm_var": 0.901416015625, + "learning_rate": 0.0003, + "loss": 11.0474, + "loss/aux_loss": 0.048061800003051755, + "loss/crossentropy": 2.6379260659217834, + "loss/logits": 0.8099302232265473, + "step": 61450 + }, + { + "epoch": 0.6146, + "grad_norm": 15.0, + "grad_norm_var": 0.5874348958333333, + "learning_rate": 0.0003, + "loss": 11.0572, + "loss/aux_loss": 0.048064269311726096, + "loss/crossentropy": 2.754233205318451, + "loss/logits": 0.8168322265148162, + "step": 61460 + }, + { + "epoch": 0.6147, + "grad_norm": 15.0, + "grad_norm_var": 0.5436848958333333, + "learning_rate": 0.0003, + "loss": 10.9424, + "loss/aux_loss": 0.04808512944728136, + "loss/crossentropy": 2.7634010910987854, + "loss/logits": 0.8455385863780975, + "step": 61470 + }, + { + "epoch": 0.6148, + "grad_norm": 15.75, + "grad_norm_var": 1.7707682291666667, + "learning_rate": 0.0003, + "loss": 11.1005, + "loss/aux_loss": 0.048066995665431024, + "loss/crossentropy": 2.7025927007198334, + "loss/logits": 0.8244033396244049, + "step": 61480 + }, + { + "epoch": 0.6149, + "grad_norm": 15.8125, + "grad_norm_var": 0.7311848958333333, + "learning_rate": 0.0003, + "loss": 10.7938, + "loss/aux_loss": 0.048072151467204095, + "loss/crossentropy": 2.481117475032806, + "loss/logits": 0.8055284798145295, + "step": 61490 + }, + { + "epoch": 0.615, + "grad_norm": 16.125, + "grad_norm_var": 0.6077962239583333, + "learning_rate": 0.0003, + "loss": 10.9174, + "loss/aux_loss": 0.04806630816310644, + "loss/crossentropy": 2.6769744515419007, + "loss/logits": 0.8111788332462311, + "step": 61500 + }, + { + "epoch": 0.6151, + "grad_norm": 15.25, + "grad_norm_var": 0.5186848958333333, + "learning_rate": 0.0003, + "loss": 10.9967, + "loss/aux_loss": 0.04806443694978953, + "loss/crossentropy": 2.683100473880768, + "loss/logits": 0.8171298623085022, + "step": 61510 + }, + { + "epoch": 0.6152, + "grad_norm": 15.4375, + "grad_norm_var": 0.744384765625, + "learning_rate": 0.0003, + "loss": 10.8982, + "loss/aux_loss": 0.04807937704026699, + "loss/crossentropy": 2.7236180365085603, + "loss/logits": 0.821711191534996, + "step": 61520 + }, + { + "epoch": 0.6153, + "grad_norm": 17.25, + "grad_norm_var": 1.3526041666666666, + "learning_rate": 0.0003, + "loss": 11.0416, + "loss/aux_loss": 0.0480710020288825, + "loss/crossentropy": 2.7528501987457275, + "loss/logits": 0.8094364821910858, + "step": 61530 + }, + { + "epoch": 0.6154, + "grad_norm": 16.25, + "grad_norm_var": 1.1885416666666666, + "learning_rate": 0.0003, + "loss": 11.0722, + "loss/aux_loss": 0.04807053804397583, + "loss/crossentropy": 2.767429292201996, + "loss/logits": 0.8756510764360428, + "step": 61540 + }, + { + "epoch": 0.6155, + "grad_norm": 14.5625, + "grad_norm_var": 0.620556640625, + "learning_rate": 0.0003, + "loss": 10.8218, + "loss/aux_loss": 0.04807050470262766, + "loss/crossentropy": 2.697913628816605, + "loss/logits": 0.7820253252983094, + "step": 61550 + }, + { + "epoch": 0.6156, + "grad_norm": 14.5625, + "grad_norm_var": 0.38084309895833335, + "learning_rate": 0.0003, + "loss": 10.9104, + "loss/aux_loss": 0.0480689549818635, + "loss/crossentropy": 2.6347146034240723, + "loss/logits": 0.7996633857488632, + "step": 61560 + }, + { + "epoch": 0.6157, + "grad_norm": 14.375, + "grad_norm_var": 0.5408854166666667, + "learning_rate": 0.0003, + "loss": 10.8944, + "loss/aux_loss": 0.04806984327733517, + "loss/crossentropy": 2.7262151658535005, + "loss/logits": 0.8081013143062592, + "step": 61570 + }, + { + "epoch": 0.6158, + "grad_norm": 16.125, + "grad_norm_var": 0.8464680989583333, + "learning_rate": 0.0003, + "loss": 10.8768, + "loss/aux_loss": 0.04807692188769579, + "loss/crossentropy": 2.697046458721161, + "loss/logits": 0.7776322573423385, + "step": 61580 + }, + { + "epoch": 0.6159, + "grad_norm": 14.0625, + "grad_norm_var": 0.794775390625, + "learning_rate": 0.0003, + "loss": 10.9726, + "loss/aux_loss": 0.04806830957531929, + "loss/crossentropy": 2.7502528548240663, + "loss/logits": 0.8258104085922241, + "step": 61590 + }, + { + "epoch": 0.616, + "grad_norm": 15.5625, + "grad_norm_var": 0.4962890625, + "learning_rate": 0.0003, + "loss": 10.886, + "loss/aux_loss": 0.04806863311678171, + "loss/crossentropy": 2.702814507484436, + "loss/logits": 0.8230858445167542, + "step": 61600 + }, + { + "epoch": 0.6161, + "grad_norm": 15.3125, + "grad_norm_var": 0.359228515625, + "learning_rate": 0.0003, + "loss": 10.9529, + "loss/aux_loss": 0.048067998327314856, + "loss/crossentropy": 2.7368035674095155, + "loss/logits": 0.8276433378458024, + "step": 61610 + }, + { + "epoch": 0.6162, + "grad_norm": 15.125, + "grad_norm_var": 1.8343098958333333, + "learning_rate": 0.0003, + "loss": 10.9874, + "loss/aux_loss": 0.048063565976917744, + "loss/crossentropy": 2.6927200853824615, + "loss/logits": 0.8120525509119034, + "step": 61620 + }, + { + "epoch": 0.6163, + "grad_norm": 15.4375, + "grad_norm_var": 1.581884765625, + "learning_rate": 0.0003, + "loss": 10.9596, + "loss/aux_loss": 0.04806598611176014, + "loss/crossentropy": 2.6300831198692323, + "loss/logits": 0.7872573018074036, + "step": 61630 + }, + { + "epoch": 0.6164, + "grad_norm": 14.5, + "grad_norm_var": 0.5002604166666667, + "learning_rate": 0.0003, + "loss": 10.8337, + "loss/aux_loss": 0.04807138796895742, + "loss/crossentropy": 2.746791756153107, + "loss/logits": 0.796841761469841, + "step": 61640 + }, + { + "epoch": 0.6165, + "grad_norm": 15.125, + "grad_norm_var": 0.915087890625, + "learning_rate": 0.0003, + "loss": 10.81, + "loss/aux_loss": 0.04806817434728146, + "loss/crossentropy": 2.5922399282455446, + "loss/logits": 0.8125677675008773, + "step": 61650 + }, + { + "epoch": 0.6166, + "grad_norm": 15.0625, + "grad_norm_var": 0.43045247395833336, + "learning_rate": 0.0003, + "loss": 10.8958, + "loss/aux_loss": 0.048080663196742535, + "loss/crossentropy": 2.6909705996513367, + "loss/logits": 0.8360078364610672, + "step": 61660 + }, + { + "epoch": 0.6167, + "grad_norm": 15.3125, + "grad_norm_var": 3.886962890625, + "learning_rate": 0.0003, + "loss": 10.7584, + "loss/aux_loss": 0.04806402511894703, + "loss/crossentropy": 2.5941467702388765, + "loss/logits": 0.8068960756063461, + "step": 61670 + }, + { + "epoch": 0.6168, + "grad_norm": 14.1875, + "grad_norm_var": 0.5759765625, + "learning_rate": 0.0003, + "loss": 10.9424, + "loss/aux_loss": 0.048076951317489146, + "loss/crossentropy": 2.7264155983924865, + "loss/logits": 0.8287631750106812, + "step": 61680 + }, + { + "epoch": 0.6169, + "grad_norm": 14.25, + "grad_norm_var": 0.5332682291666667, + "learning_rate": 0.0003, + "loss": 10.8441, + "loss/aux_loss": 0.04805428683757782, + "loss/crossentropy": 2.745866870880127, + "loss/logits": 0.7957580178976059, + "step": 61690 + }, + { + "epoch": 0.617, + "grad_norm": 14.75, + "grad_norm_var": 0.39140625, + "learning_rate": 0.0003, + "loss": 10.9403, + "loss/aux_loss": 0.04806794375181198, + "loss/crossentropy": 2.6823421716690063, + "loss/logits": 0.816996818780899, + "step": 61700 + }, + { + "epoch": 0.6171, + "grad_norm": 14.625, + "grad_norm_var": 0.6180826822916666, + "learning_rate": 0.0003, + "loss": 10.8513, + "loss/aux_loss": 0.048078464530408385, + "loss/crossentropy": 2.8029967546463013, + "loss/logits": 0.8454837918281555, + "step": 61710 + }, + { + "epoch": 0.6172, + "grad_norm": 15.4375, + "grad_norm_var": 0.3431640625, + "learning_rate": 0.0003, + "loss": 10.8751, + "loss/aux_loss": 0.04807190615683794, + "loss/crossentropy": 2.6858488082885743, + "loss/logits": 0.8107406437397003, + "step": 61720 + }, + { + "epoch": 0.6173, + "grad_norm": 13.375, + "grad_norm_var": 0.8051432291666667, + "learning_rate": 0.0003, + "loss": 10.9275, + "loss/aux_loss": 0.048067220486700535, + "loss/crossentropy": 2.6807423889636994, + "loss/logits": 0.7955632448196411, + "step": 61730 + }, + { + "epoch": 0.6174, + "grad_norm": 14.4375, + "grad_norm_var": 1.01875, + "learning_rate": 0.0003, + "loss": 10.8206, + "loss/aux_loss": 0.04806277137249708, + "loss/crossentropy": 2.6485226929187773, + "loss/logits": 0.7749884635210037, + "step": 61740 + }, + { + "epoch": 0.6175, + "grad_norm": 14.75, + "grad_norm_var": 0.29973958333333334, + "learning_rate": 0.0003, + "loss": 11.0639, + "loss/aux_loss": 0.04807302244007587, + "loss/crossentropy": 2.501930046081543, + "loss/logits": 0.8085714936256408, + "step": 61750 + }, + { + "epoch": 0.6176, + "grad_norm": 17.0, + "grad_norm_var": 29.426416015625, + "learning_rate": 0.0003, + "loss": 10.9392, + "loss/aux_loss": 0.048070022463798524, + "loss/crossentropy": 2.56240091919899, + "loss/logits": 0.8246626138687134, + "step": 61760 + }, + { + "epoch": 0.6177, + "grad_norm": 14.25, + "grad_norm_var": 29.176155598958335, + "learning_rate": 0.0003, + "loss": 10.8047, + "loss/aux_loss": 0.048060801811516285, + "loss/crossentropy": 2.7177587747573853, + "loss/logits": 0.8236128687858582, + "step": 61770 + }, + { + "epoch": 0.6178, + "grad_norm": 15.0, + "grad_norm_var": 0.9817057291666667, + "learning_rate": 0.0003, + "loss": 11.0162, + "loss/aux_loss": 0.048079765401780605, + "loss/crossentropy": 2.6734387814998626, + "loss/logits": 0.8322317689657212, + "step": 61780 + }, + { + "epoch": 0.6179, + "grad_norm": 14.1875, + "grad_norm_var": 0.7858723958333333, + "learning_rate": 0.0003, + "loss": 11.0301, + "loss/aux_loss": 0.04804958906024694, + "loss/crossentropy": 2.667484325170517, + "loss/logits": 0.7962237685918808, + "step": 61790 + }, + { + "epoch": 0.618, + "grad_norm": 17.375, + "grad_norm_var": 1.3082682291666667, + "learning_rate": 0.0003, + "loss": 10.8701, + "loss/aux_loss": 0.048083477467298505, + "loss/crossentropy": 2.6046105325222015, + "loss/logits": 0.8223717421293258, + "step": 61800 + }, + { + "epoch": 0.6181, + "grad_norm": 15.0, + "grad_norm_var": 1.2212890625, + "learning_rate": 0.0003, + "loss": 10.7544, + "loss/aux_loss": 0.04806771744042635, + "loss/crossentropy": 2.703940987586975, + "loss/logits": 0.8195635229349136, + "step": 61810 + }, + { + "epoch": 0.6182, + "grad_norm": 15.875, + "grad_norm_var": 0.5813639322916667, + "learning_rate": 0.0003, + "loss": 10.8849, + "loss/aux_loss": 0.048071309179067614, + "loss/crossentropy": 2.7586312294006348, + "loss/logits": 0.8400523275136947, + "step": 61820 + }, + { + "epoch": 0.6183, + "grad_norm": 14.3125, + "grad_norm_var": 1.1192057291666666, + "learning_rate": 0.0003, + "loss": 10.9316, + "loss/aux_loss": 0.04807364828884601, + "loss/crossentropy": 2.8496673822402956, + "loss/logits": 0.8168164789676666, + "step": 61830 + }, + { + "epoch": 0.6184, + "grad_norm": 16.375, + "grad_norm_var": 4.017822265625, + "learning_rate": 0.0003, + "loss": 10.9409, + "loss/aux_loss": 0.048056223429739475, + "loss/crossentropy": 2.573849785327911, + "loss/logits": 0.816324171423912, + "step": 61840 + }, + { + "epoch": 0.6185, + "grad_norm": 14.4375, + "grad_norm_var": 3.4400390625, + "learning_rate": 0.0003, + "loss": 10.9165, + "loss/aux_loss": 0.04806674625724554, + "loss/crossentropy": 2.757099586725235, + "loss/logits": 0.8031830161809921, + "step": 61850 + }, + { + "epoch": 0.6186, + "grad_norm": 15.25, + "grad_norm_var": 0.35618489583333335, + "learning_rate": 0.0003, + "loss": 11.0356, + "loss/aux_loss": 0.04808044787496328, + "loss/crossentropy": 2.7227927923202513, + "loss/logits": 0.817890202999115, + "step": 61860 + }, + { + "epoch": 0.6187, + "grad_norm": 14.125, + "grad_norm_var": 0.23326822916666667, + "learning_rate": 0.0003, + "loss": 10.7683, + "loss/aux_loss": 0.048065101355314256, + "loss/crossentropy": 2.5788680493831633, + "loss/logits": 0.7917226999998093, + "step": 61870 + }, + { + "epoch": 0.6188, + "grad_norm": 15.8125, + "grad_norm_var": 0.42578125, + "learning_rate": 0.0003, + "loss": 10.8498, + "loss/aux_loss": 0.04807348400354385, + "loss/crossentropy": 2.8568573355674745, + "loss/logits": 0.831426665186882, + "step": 61880 + }, + { + "epoch": 0.6189, + "grad_norm": 14.375, + "grad_norm_var": 0.7421223958333333, + "learning_rate": 0.0003, + "loss": 10.9383, + "loss/aux_loss": 0.048065127618610856, + "loss/crossentropy": 2.7187650322914125, + "loss/logits": 0.7684122264385224, + "step": 61890 + }, + { + "epoch": 0.619, + "grad_norm": 14.625, + "grad_norm_var": 0.6462890625, + "learning_rate": 0.0003, + "loss": 10.8742, + "loss/aux_loss": 0.048072330094873904, + "loss/crossentropy": 2.7928129851818086, + "loss/logits": 0.840417456626892, + "step": 61900 + }, + { + "epoch": 0.6191, + "grad_norm": 17.375, + "grad_norm_var": 0.6249348958333333, + "learning_rate": 0.0003, + "loss": 10.95, + "loss/aux_loss": 0.048061074875295165, + "loss/crossentropy": 2.827212655544281, + "loss/logits": 0.851186552643776, + "step": 61910 + }, + { + "epoch": 0.6192, + "grad_norm": 15.3125, + "grad_norm_var": 0.8921223958333333, + "learning_rate": 0.0003, + "loss": 10.9378, + "loss/aux_loss": 0.04807654786854983, + "loss/crossentropy": 2.660733711719513, + "loss/logits": 0.8380063980817795, + "step": 61920 + }, + { + "epoch": 0.6193, + "grad_norm": 14.8125, + "grad_norm_var": 0.570556640625, + "learning_rate": 0.0003, + "loss": 10.9107, + "loss/aux_loss": 0.04806284811347723, + "loss/crossentropy": 2.6565398812294005, + "loss/logits": 0.8210236459970475, + "step": 61930 + }, + { + "epoch": 0.6194, + "grad_norm": 15.125, + "grad_norm_var": 0.33787434895833335, + "learning_rate": 0.0003, + "loss": 10.842, + "loss/aux_loss": 0.04807104039937258, + "loss/crossentropy": 2.5869544565677645, + "loss/logits": 0.8126054167747497, + "step": 61940 + }, + { + "epoch": 0.6195, + "grad_norm": 14.0625, + "grad_norm_var": 0.4328125, + "learning_rate": 0.0003, + "loss": 10.8834, + "loss/aux_loss": 0.048072011955082414, + "loss/crossentropy": 2.839719223976135, + "loss/logits": 0.823541471362114, + "step": 61950 + }, + { + "epoch": 0.6196, + "grad_norm": 14.5625, + "grad_norm_var": 0.3223795572916667, + "learning_rate": 0.0003, + "loss": 10.8743, + "loss/aux_loss": 0.04806395042687654, + "loss/crossentropy": 2.798982226848602, + "loss/logits": 0.7807911396026611, + "step": 61960 + }, + { + "epoch": 0.6197, + "grad_norm": 14.5, + "grad_norm_var": 1.5895182291666667, + "learning_rate": 0.0003, + "loss": 10.7799, + "loss/aux_loss": 0.04806208536028862, + "loss/crossentropy": 2.6087071001529694, + "loss/logits": 0.8195267915725708, + "step": 61970 + }, + { + "epoch": 0.6198, + "grad_norm": 15.125, + "grad_norm_var": 0.6953125, + "learning_rate": 0.0003, + "loss": 11.043, + "loss/aux_loss": 0.048067536950111386, + "loss/crossentropy": 2.7437108635902403, + "loss/logits": 0.8448336660861969, + "step": 61980 + }, + { + "epoch": 0.6199, + "grad_norm": 17.625, + "grad_norm_var": 0.860791015625, + "learning_rate": 0.0003, + "loss": 10.9815, + "loss/aux_loss": 0.04807477705180645, + "loss/crossentropy": 2.70632341504097, + "loss/logits": 0.8352989315986633, + "step": 61990 + }, + { + "epoch": 0.62, + "grad_norm": 14.6875, + "grad_norm_var": 1.3466145833333334, + "learning_rate": 0.0003, + "loss": 10.9892, + "loss/aux_loss": 0.04806019198149443, + "loss/crossentropy": 2.664289927482605, + "loss/logits": 0.828738734126091, + "step": 62000 + }, + { + "epoch": 0.6201, + "grad_norm": 15.1875, + "grad_norm_var": 0.3690104166666667, + "learning_rate": 0.0003, + "loss": 10.9197, + "loss/aux_loss": 0.04807502832263708, + "loss/crossentropy": 2.728136438131332, + "loss/logits": 0.8290602266788483, + "step": 62010 + }, + { + "epoch": 0.6202, + "grad_norm": 14.5, + "grad_norm_var": 0.5070149739583333, + "learning_rate": 0.0003, + "loss": 10.8922, + "loss/aux_loss": 0.048071499727666375, + "loss/crossentropy": 2.74332879781723, + "loss/logits": 0.8003027319908143, + "step": 62020 + }, + { + "epoch": 0.6203, + "grad_norm": 16.875, + "grad_norm_var": 0.5157389322916667, + "learning_rate": 0.0003, + "loss": 11.0849, + "loss/aux_loss": 0.048065542615950106, + "loss/crossentropy": 2.7377222657203673, + "loss/logits": 0.8492092847824096, + "step": 62030 + }, + { + "epoch": 0.6204, + "grad_norm": 14.8125, + "grad_norm_var": 1.1666015625, + "learning_rate": 0.0003, + "loss": 10.8011, + "loss/aux_loss": 0.04808056894689798, + "loss/crossentropy": 2.632717180252075, + "loss/logits": 0.7819905787706375, + "step": 62040 + }, + { + "epoch": 0.6205, + "grad_norm": 14.9375, + "grad_norm_var": 0.558447265625, + "learning_rate": 0.0003, + "loss": 10.883, + "loss/aux_loss": 0.04806170351803303, + "loss/crossentropy": 2.7062652587890623, + "loss/logits": 0.7980828583240509, + "step": 62050 + }, + { + "epoch": 0.6206, + "grad_norm": 14.6875, + "grad_norm_var": 1.1997233072916667, + "learning_rate": 0.0003, + "loss": 10.8167, + "loss/aux_loss": 0.04807770270854235, + "loss/crossentropy": 2.474736750125885, + "loss/logits": 0.7879143923521041, + "step": 62060 + }, + { + "epoch": 0.6207, + "grad_norm": 14.3125, + "grad_norm_var": 0.5335774739583333, + "learning_rate": 0.0003, + "loss": 11.0472, + "loss/aux_loss": 0.04806693401187658, + "loss/crossentropy": 2.7242776453495026, + "loss/logits": 0.8014493867754936, + "step": 62070 + }, + { + "epoch": 0.6208, + "grad_norm": 15.9375, + "grad_norm_var": 0.8457682291666667, + "learning_rate": 0.0003, + "loss": 10.9555, + "loss/aux_loss": 0.048064196668565276, + "loss/crossentropy": 2.8615013003349303, + "loss/logits": 0.8332249820232391, + "step": 62080 + }, + { + "epoch": 0.6209, + "grad_norm": 18.0, + "grad_norm_var": 1.15078125, + "learning_rate": 0.0003, + "loss": 10.8982, + "loss/aux_loss": 0.04807161465287209, + "loss/crossentropy": 2.7590698480606077, + "loss/logits": 0.828876069188118, + "step": 62090 + }, + { + "epoch": 0.621, + "grad_norm": 16.125, + "grad_norm_var": 0.6268229166666667, + "learning_rate": 0.0003, + "loss": 10.952, + "loss/aux_loss": 0.048064458556473254, + "loss/crossentropy": 2.8559590697288515, + "loss/logits": 0.8306620687246322, + "step": 62100 + }, + { + "epoch": 0.6211, + "grad_norm": 14.375, + "grad_norm_var": 0.65, + "learning_rate": 0.0003, + "loss": 10.8323, + "loss/aux_loss": 0.04808349907398224, + "loss/crossentropy": 2.6381156027317045, + "loss/logits": 0.829769441485405, + "step": 62110 + }, + { + "epoch": 0.6212, + "grad_norm": 14.5625, + "grad_norm_var": 0.40208333333333335, + "learning_rate": 0.0003, + "loss": 10.9283, + "loss/aux_loss": 0.048080073297023775, + "loss/crossentropy": 2.7890799164772035, + "loss/logits": 0.8134836733341217, + "step": 62120 + }, + { + "epoch": 0.6213, + "grad_norm": 14.9375, + "grad_norm_var": 35.38430989583333, + "learning_rate": 0.0003, + "loss": 10.7432, + "loss/aux_loss": 0.048065336607396605, + "loss/crossentropy": 2.44475519657135, + "loss/logits": 0.7698864176869392, + "step": 62130 + }, + { + "epoch": 0.6214, + "grad_norm": 14.5, + "grad_norm_var": 0.5469889322916667, + "learning_rate": 0.0003, + "loss": 10.9544, + "loss/aux_loss": 0.04806721787899733, + "loss/crossentropy": 2.5549269795417784, + "loss/logits": 0.8181776434183121, + "step": 62140 + }, + { + "epoch": 0.6215, + "grad_norm": 15.125, + "grad_norm_var": 0.447900390625, + "learning_rate": 0.0003, + "loss": 10.8531, + "loss/aux_loss": 0.048075252957642076, + "loss/crossentropy": 2.8422864854335783, + "loss/logits": 0.8431739717721939, + "step": 62150 + }, + { + "epoch": 0.6216, + "grad_norm": 16.875, + "grad_norm_var": 0.6466145833333333, + "learning_rate": 0.0003, + "loss": 10.9505, + "loss/aux_loss": 0.048076402954757215, + "loss/crossentropy": 2.8633032202720643, + "loss/logits": 0.8444990605115891, + "step": 62160 + }, + { + "epoch": 0.6217, + "grad_norm": 14.9375, + "grad_norm_var": 1.3199055989583333, + "learning_rate": 0.0003, + "loss": 10.8851, + "loss/aux_loss": 0.04805946424603462, + "loss/crossentropy": 2.5738767266273497, + "loss/logits": 0.7897825837135315, + "step": 62170 + }, + { + "epoch": 0.6218, + "grad_norm": 14.4375, + "grad_norm_var": 1.0264973958333334, + "learning_rate": 0.0003, + "loss": 10.9777, + "loss/aux_loss": 0.04807272329926491, + "loss/crossentropy": 2.559682661294937, + "loss/logits": 0.8130290180444717, + "step": 62180 + }, + { + "epoch": 0.6219, + "grad_norm": 15.75, + "grad_norm_var": 1.2301432291666667, + "learning_rate": 0.0003, + "loss": 10.8184, + "loss/aux_loss": 0.04808760862797499, + "loss/crossentropy": 2.65628005862236, + "loss/logits": 0.7793559074401856, + "step": 62190 + }, + { + "epoch": 0.622, + "grad_norm": 27.75, + "grad_norm_var": 10.504280598958333, + "learning_rate": 0.0003, + "loss": 10.9073, + "loss/aux_loss": 0.04806331116706133, + "loss/crossentropy": 2.751385676860809, + "loss/logits": 0.8148221343755722, + "step": 62200 + }, + { + "epoch": 0.6221, + "grad_norm": 14.8125, + "grad_norm_var": 10.069205729166667, + "learning_rate": 0.0003, + "loss": 10.8365, + "loss/aux_loss": 0.048077536001801494, + "loss/crossentropy": 2.76932435631752, + "loss/logits": 0.7825021982192993, + "step": 62210 + }, + { + "epoch": 0.6222, + "grad_norm": 14.5625, + "grad_norm_var": 0.34661458333333334, + "learning_rate": 0.0003, + "loss": 10.9128, + "loss/aux_loss": 0.048069404624402526, + "loss/crossentropy": 2.6793047428131103, + "loss/logits": 0.7982782870531082, + "step": 62220 + }, + { + "epoch": 0.6223, + "grad_norm": 15.5625, + "grad_norm_var": 51.823893229166664, + "learning_rate": 0.0003, + "loss": 11.0149, + "loss/aux_loss": 0.04806563127785921, + "loss/crossentropy": 2.742257535457611, + "loss/logits": 0.8206332385540008, + "step": 62230 + }, + { + "epoch": 0.6224, + "grad_norm": 15.25, + "grad_norm_var": 49.48118489583333, + "learning_rate": 0.0003, + "loss": 10.9483, + "loss/aux_loss": 0.04807669762521982, + "loss/crossentropy": 2.7733189463615417, + "loss/logits": 0.8633380651473999, + "step": 62240 + }, + { + "epoch": 0.6225, + "grad_norm": 15.625, + "grad_norm_var": 1.4867024739583334, + "learning_rate": 0.0003, + "loss": 10.8326, + "loss/aux_loss": 0.04806031119078398, + "loss/crossentropy": 2.7665975272655485, + "loss/logits": 0.825995746254921, + "step": 62250 + }, + { + "epoch": 0.6226, + "grad_norm": 17.375, + "grad_norm_var": 31.684619140625, + "learning_rate": 0.0003, + "loss": 10.9359, + "loss/aux_loss": 0.048060914315283296, + "loss/crossentropy": 2.7299613773822786, + "loss/logits": 0.7919081568717956, + "step": 62260 + }, + { + "epoch": 0.6227, + "grad_norm": 16.0, + "grad_norm_var": 32.25402018229167, + "learning_rate": 0.0003, + "loss": 10.9558, + "loss/aux_loss": 0.048073223978281024, + "loss/crossentropy": 2.854112696647644, + "loss/logits": 0.8261337369680405, + "step": 62270 + }, + { + "epoch": 0.6228, + "grad_norm": 16.25, + "grad_norm_var": 0.37604166666666666, + "learning_rate": 0.0003, + "loss": 10.9295, + "loss/aux_loss": 0.04806328769773245, + "loss/crossentropy": 2.6099496364593504, + "loss/logits": 0.7905628532171249, + "step": 62280 + }, + { + "epoch": 0.6229, + "grad_norm": 15.0, + "grad_norm_var": 0.8378743489583333, + "learning_rate": 0.0003, + "loss": 11.1089, + "loss/aux_loss": 0.048074782267212866, + "loss/crossentropy": 2.856866729259491, + "loss/logits": 0.8273001462221146, + "step": 62290 + }, + { + "epoch": 0.623, + "grad_norm": 15.375, + "grad_norm_var": 0.5945149739583333, + "learning_rate": 0.0003, + "loss": 11.0759, + "loss/aux_loss": 0.048062778823077676, + "loss/crossentropy": 2.8540167093276976, + "loss/logits": 0.8591305077075958, + "step": 62300 + }, + { + "epoch": 0.6231, + "grad_norm": 14.6875, + "grad_norm_var": 1.1102701822916667, + "learning_rate": 0.0003, + "loss": 11.0353, + "loss/aux_loss": 0.048072639107704165, + "loss/crossentropy": 2.670693778991699, + "loss/logits": 0.8144174665212631, + "step": 62310 + }, + { + "epoch": 0.6232, + "grad_norm": 14.5625, + "grad_norm_var": 0.4649576822916667, + "learning_rate": 0.0003, + "loss": 10.8635, + "loss/aux_loss": 0.04807703364640474, + "loss/crossentropy": 2.6906314373016356, + "loss/logits": 0.7975012451410294, + "step": 62320 + }, + { + "epoch": 0.6233, + "grad_norm": 14.1875, + "grad_norm_var": 0.6809733072916667, + "learning_rate": 0.0003, + "loss": 10.9574, + "loss/aux_loss": 0.04804598540067673, + "loss/crossentropy": 2.7562792539596557, + "loss/logits": 0.8315281063318253, + "step": 62330 + }, + { + "epoch": 0.6234, + "grad_norm": 18.5, + "grad_norm_var": 1.3674479166666667, + "learning_rate": 0.0003, + "loss": 11.0711, + "loss/aux_loss": 0.04807227849960327, + "loss/crossentropy": 2.699070680141449, + "loss/logits": 0.8405324429273605, + "step": 62340 + }, + { + "epoch": 0.6235, + "grad_norm": 15.375, + "grad_norm_var": 48.110660807291666, + "learning_rate": 0.0003, + "loss": 10.9377, + "loss/aux_loss": 0.04808774162083864, + "loss/crossentropy": 2.8322594940662382, + "loss/logits": 0.8086932510137558, + "step": 62350 + }, + { + "epoch": 0.6236, + "grad_norm": 15.75, + "grad_norm_var": 1.492041015625, + "learning_rate": 0.0003, + "loss": 10.8616, + "loss/aux_loss": 0.04806898422539234, + "loss/crossentropy": 2.5754688024520873, + "loss/logits": 0.7867444813251495, + "step": 62360 + }, + { + "epoch": 0.6237, + "grad_norm": 15.3125, + "grad_norm_var": 0.5119140625, + "learning_rate": 0.0003, + "loss": 10.9684, + "loss/aux_loss": 0.04807077944278717, + "loss/crossentropy": 2.6778059184551237, + "loss/logits": 0.8169686466455459, + "step": 62370 + }, + { + "epoch": 0.6238, + "grad_norm": 16.125, + "grad_norm_var": 0.6291015625, + "learning_rate": 0.0003, + "loss": 10.9766, + "loss/aux_loss": 0.04807309564203024, + "loss/crossentropy": 2.6437867105007173, + "loss/logits": 0.823999360203743, + "step": 62380 + }, + { + "epoch": 0.6239, + "grad_norm": 16.25, + "grad_norm_var": 0.7306640625, + "learning_rate": 0.0003, + "loss": 10.8911, + "loss/aux_loss": 0.04806749243289232, + "loss/crossentropy": 2.6562050104141237, + "loss/logits": 0.8306028187274933, + "step": 62390 + }, + { + "epoch": 0.624, + "grad_norm": 15.5, + "grad_norm_var": 0.3, + "learning_rate": 0.0003, + "loss": 11.061, + "loss/aux_loss": 0.04807000830769539, + "loss/crossentropy": 2.786527621746063, + "loss/logits": 0.8158238917589188, + "step": 62400 + }, + { + "epoch": 0.6241, + "grad_norm": 16.0, + "grad_norm_var": 0.3340983072916667, + "learning_rate": 0.0003, + "loss": 11.0293, + "loss/aux_loss": 0.04807388223707676, + "loss/crossentropy": 2.7131783425807954, + "loss/logits": 0.8209474682807922, + "step": 62410 + }, + { + "epoch": 0.6242, + "grad_norm": 14.5, + "grad_norm_var": 0.22109375, + "learning_rate": 0.0003, + "loss": 10.9286, + "loss/aux_loss": 0.04806325174868107, + "loss/crossentropy": 2.6579012751579283, + "loss/logits": 0.8379012405872345, + "step": 62420 + }, + { + "epoch": 0.6243, + "grad_norm": 15.25, + "grad_norm_var": 0.3780598958333333, + "learning_rate": 0.0003, + "loss": 10.8599, + "loss/aux_loss": 0.048072734661400315, + "loss/crossentropy": 2.668544816970825, + "loss/logits": 0.8266326695680618, + "step": 62430 + }, + { + "epoch": 0.6244, + "grad_norm": 14.5625, + "grad_norm_var": 0.6106608072916667, + "learning_rate": 0.0003, + "loss": 10.8735, + "loss/aux_loss": 0.048060843162238596, + "loss/crossentropy": 2.692414402961731, + "loss/logits": 0.8246606469154358, + "step": 62440 + }, + { + "epoch": 0.6245, + "grad_norm": 14.9375, + "grad_norm_var": 1.0809733072916667, + "learning_rate": 0.0003, + "loss": 10.9023, + "loss/aux_loss": 0.04808099400252104, + "loss/crossentropy": 2.865987467765808, + "loss/logits": 0.8370956897735595, + "step": 62450 + }, + { + "epoch": 0.6246, + "grad_norm": 14.625, + "grad_norm_var": 0.835791015625, + "learning_rate": 0.0003, + "loss": 10.8651, + "loss/aux_loss": 0.04806222338229418, + "loss/crossentropy": 2.5183817207813264, + "loss/logits": 0.7817242562770843, + "step": 62460 + }, + { + "epoch": 0.6247, + "grad_norm": 14.25, + "grad_norm_var": 2.8445149739583333, + "learning_rate": 0.0003, + "loss": 10.8275, + "loss/aux_loss": 0.04807907696813345, + "loss/crossentropy": 2.5393788814544678, + "loss/logits": 0.7698053836822509, + "step": 62470 + }, + { + "epoch": 0.6248, + "grad_norm": 14.75, + "grad_norm_var": 0.21964518229166666, + "learning_rate": 0.0003, + "loss": 10.9445, + "loss/aux_loss": 0.04807716142386198, + "loss/crossentropy": 2.879362916946411, + "loss/logits": 0.8249101668596268, + "step": 62480 + }, + { + "epoch": 0.6249, + "grad_norm": 15.5, + "grad_norm_var": 0.3416666666666667, + "learning_rate": 0.0003, + "loss": 10.8909, + "loss/aux_loss": 0.04806219730526209, + "loss/crossentropy": 2.6007526874542237, + "loss/logits": 0.8132065325975418, + "step": 62490 + }, + { + "epoch": 0.625, + "grad_norm": 13.9375, + "grad_norm_var": 0.5270670572916667, + "learning_rate": 0.0003, + "loss": 11.0119, + "loss/aux_loss": 0.04807214587926865, + "loss/crossentropy": 2.636463737487793, + "loss/logits": 0.7892209351062774, + "step": 62500 + }, + { + "epoch": 0.6251, + "grad_norm": 14.5, + "grad_norm_var": 1.1276041666666667, + "learning_rate": 0.0003, + "loss": 10.9606, + "loss/aux_loss": 0.04807151295244694, + "loss/crossentropy": 2.6543030560016634, + "loss/logits": 0.8350883662700653, + "step": 62510 + }, + { + "epoch": 0.6252, + "grad_norm": 14.875, + "grad_norm_var": 0.31354166666666666, + "learning_rate": 0.0003, + "loss": 10.8797, + "loss/aux_loss": 0.04806313067674637, + "loss/crossentropy": 2.7374988555908204, + "loss/logits": 0.8234562575817108, + "step": 62520 + }, + { + "epoch": 0.6253, + "grad_norm": 14.125, + "grad_norm_var": 0.9880208333333333, + "learning_rate": 0.0003, + "loss": 11.2108, + "loss/aux_loss": 0.048078093118965624, + "loss/crossentropy": 2.745024061203003, + "loss/logits": 0.8560461461544037, + "step": 62530 + }, + { + "epoch": 0.6254, + "grad_norm": 15.0, + "grad_norm_var": 0.7145182291666666, + "learning_rate": 0.0003, + "loss": 10.987, + "loss/aux_loss": 0.04806946255266666, + "loss/crossentropy": 2.651322227716446, + "loss/logits": 0.8073702841997147, + "step": 62540 + }, + { + "epoch": 0.6255, + "grad_norm": 15.75, + "grad_norm_var": 0.2684733072916667, + "learning_rate": 0.0003, + "loss": 10.8866, + "loss/aux_loss": 0.048070280253887175, + "loss/crossentropy": 2.6663502156734467, + "loss/logits": 0.823998111486435, + "step": 62550 + }, + { + "epoch": 0.6256, + "grad_norm": 72.5, + "grad_norm_var": 205.42701822916666, + "learning_rate": 0.0003, + "loss": 10.9477, + "loss/aux_loss": 0.04807772878557444, + "loss/crossentropy": 2.639860916137695, + "loss/logits": 0.8050632417201996, + "step": 62560 + }, + { + "epoch": 0.6257, + "grad_norm": 18.0, + "grad_norm_var": 201.77667643229168, + "learning_rate": 0.0003, + "loss": 10.9815, + "loss/aux_loss": 0.048079793155193326, + "loss/crossentropy": 2.8103298008441926, + "loss/logits": 0.8344414174556732, + "step": 62570 + }, + { + "epoch": 0.6258, + "grad_norm": 15.8125, + "grad_norm_var": 1.1109212239583333, + "learning_rate": 0.0003, + "loss": 10.9126, + "loss/aux_loss": 0.04806471895426512, + "loss/crossentropy": 2.6820645689964295, + "loss/logits": 0.8160485446453094, + "step": 62580 + }, + { + "epoch": 0.6259, + "grad_norm": 17.0, + "grad_norm_var": 0.9077473958333333, + "learning_rate": 0.0003, + "loss": 10.8826, + "loss/aux_loss": 0.048061324283480644, + "loss/crossentropy": 2.5685730636119843, + "loss/logits": 0.7997966796159744, + "step": 62590 + }, + { + "epoch": 0.626, + "grad_norm": 15.25, + "grad_norm_var": 1.097900390625, + "learning_rate": 0.0003, + "loss": 10.8269, + "loss/aux_loss": 0.04806754421442747, + "loss/crossentropy": 2.563853049278259, + "loss/logits": 0.7696677416563034, + "step": 62600 + }, + { + "epoch": 0.6261, + "grad_norm": 15.75, + "grad_norm_var": 1.073681640625, + "learning_rate": 0.0003, + "loss": 10.7331, + "loss/aux_loss": 0.04807633589953184, + "loss/crossentropy": 2.7461453557014464, + "loss/logits": 0.8335110425949097, + "step": 62610 + }, + { + "epoch": 0.6262, + "grad_norm": 15.5, + "grad_norm_var": 0.9044108072916667, + "learning_rate": 0.0003, + "loss": 10.9591, + "loss/aux_loss": 0.04808416999876499, + "loss/crossentropy": 2.652498370409012, + "loss/logits": 0.7771803379058838, + "step": 62620 + }, + { + "epoch": 0.6263, + "grad_norm": 15.125, + "grad_norm_var": 0.30597330729166666, + "learning_rate": 0.0003, + "loss": 10.9905, + "loss/aux_loss": 0.048068183846771716, + "loss/crossentropy": 2.7204720437526704, + "loss/logits": 0.8556759804487228, + "step": 62630 + }, + { + "epoch": 0.6264, + "grad_norm": 14.625, + "grad_norm_var": 0.813134765625, + "learning_rate": 0.0003, + "loss": 10.7167, + "loss/aux_loss": 0.048065423220396045, + "loss/crossentropy": 2.565179693698883, + "loss/logits": 0.8034500062465668, + "step": 62640 + }, + { + "epoch": 0.6265, + "grad_norm": 15.0625, + "grad_norm_var": 13.729166666666666, + "learning_rate": 0.0003, + "loss": 10.8313, + "loss/aux_loss": 0.048078210465610026, + "loss/crossentropy": 2.7543790459632875, + "loss/logits": 0.8006289631128312, + "step": 62650 + }, + { + "epoch": 0.6266, + "grad_norm": 16.0, + "grad_norm_var": 0.9395833333333333, + "learning_rate": 0.0003, + "loss": 10.9459, + "loss/aux_loss": 0.04806745704263449, + "loss/crossentropy": 2.822178053855896, + "loss/logits": 0.8426857680082321, + "step": 62660 + }, + { + "epoch": 0.6267, + "grad_norm": 15.0, + "grad_norm_var": 0.7831868489583333, + "learning_rate": 0.0003, + "loss": 10.9596, + "loss/aux_loss": 0.04807326514273882, + "loss/crossentropy": 2.694602167606354, + "loss/logits": 0.8324411004781723, + "step": 62670 + }, + { + "epoch": 0.6268, + "grad_norm": 15.5625, + "grad_norm_var": 86.090869140625, + "learning_rate": 0.0003, + "loss": 11.0313, + "loss/aux_loss": 0.04808346442878246, + "loss/crossentropy": 2.6940039336681365, + "loss/logits": 0.8582751452922821, + "step": 62680 + }, + { + "epoch": 0.6269, + "grad_norm": 17.0, + "grad_norm_var": 84.3541015625, + "learning_rate": 0.0003, + "loss": 11.0566, + "loss/aux_loss": 0.04807037822902203, + "loss/crossentropy": 2.7277477622032165, + "loss/logits": 0.8286092817783356, + "step": 62690 + }, + { + "epoch": 0.627, + "grad_norm": 15.0, + "grad_norm_var": 1.0344889322916666, + "learning_rate": 0.0003, + "loss": 10.897, + "loss/aux_loss": 0.048066021874547005, + "loss/crossentropy": 2.6495929658412933, + "loss/logits": 0.7907640814781189, + "step": 62700 + }, + { + "epoch": 0.6271, + "grad_norm": 14.8125, + "grad_norm_var": 1.0577962239583334, + "learning_rate": 0.0003, + "loss": 11.0379, + "loss/aux_loss": 0.04806892462074756, + "loss/crossentropy": 2.6996763944625854, + "loss/logits": 0.8464843809604645, + "step": 62710 + }, + { + "epoch": 0.6272, + "grad_norm": 14.4375, + "grad_norm_var": 0.22701822916666667, + "learning_rate": 0.0003, + "loss": 10.8966, + "loss/aux_loss": 0.04808309208601713, + "loss/crossentropy": 2.7269538223743437, + "loss/logits": 0.797277769446373, + "step": 62720 + }, + { + "epoch": 0.6273, + "grad_norm": 16.25, + "grad_norm_var": 0.9309895833333334, + "learning_rate": 0.0003, + "loss": 10.9687, + "loss/aux_loss": 0.04806727655231953, + "loss/crossentropy": 2.64298877120018, + "loss/logits": 0.7983599692583084, + "step": 62730 + }, + { + "epoch": 0.6274, + "grad_norm": 15.5625, + "grad_norm_var": 0.6864420572916666, + "learning_rate": 0.0003, + "loss": 11.0072, + "loss/aux_loss": 0.04806558284908533, + "loss/crossentropy": 2.675836908817291, + "loss/logits": 0.8062641978263855, + "step": 62740 + }, + { + "epoch": 0.6275, + "grad_norm": 16.25, + "grad_norm_var": 0.7261555989583334, + "learning_rate": 0.0003, + "loss": 10.8478, + "loss/aux_loss": 0.04808370973914862, + "loss/crossentropy": 2.6298579216003417, + "loss/logits": 0.8091606229543686, + "step": 62750 + }, + { + "epoch": 0.6276, + "grad_norm": 15.6875, + "grad_norm_var": 0.6263020833333334, + "learning_rate": 0.0003, + "loss": 10.9488, + "loss/aux_loss": 0.048074055649340156, + "loss/crossentropy": 2.684184890985489, + "loss/logits": 0.8185036033391953, + "step": 62760 + }, + { + "epoch": 0.6277, + "grad_norm": 15.0, + "grad_norm_var": 0.7436848958333333, + "learning_rate": 0.0003, + "loss": 10.9509, + "loss/aux_loss": 0.04805560186505318, + "loss/crossentropy": 2.7323277831077575, + "loss/logits": 0.8214883893728256, + "step": 62770 + }, + { + "epoch": 0.6278, + "grad_norm": 16.375, + "grad_norm_var": 0.6624837239583333, + "learning_rate": 0.0003, + "loss": 10.9952, + "loss/aux_loss": 0.04808086436241865, + "loss/crossentropy": 2.7757518172264097, + "loss/logits": 0.8345901370048523, + "step": 62780 + }, + { + "epoch": 0.6279, + "grad_norm": 15.8125, + "grad_norm_var": 0.7156087239583333, + "learning_rate": 0.0003, + "loss": 10.8995, + "loss/aux_loss": 0.04808343015611172, + "loss/crossentropy": 2.809204262495041, + "loss/logits": 0.8202072083950043, + "step": 62790 + }, + { + "epoch": 0.628, + "grad_norm": 14.9375, + "grad_norm_var": 0.37701822916666666, + "learning_rate": 0.0003, + "loss": 10.8546, + "loss/aux_loss": 0.048064966686069965, + "loss/crossentropy": 2.7755215167999268, + "loss/logits": 0.8226945012807846, + "step": 62800 + }, + { + "epoch": 0.6281, + "grad_norm": 14.5625, + "grad_norm_var": 0.3634765625, + "learning_rate": 0.0003, + "loss": 10.8531, + "loss/aux_loss": 0.04806779157370329, + "loss/crossentropy": 2.684882569313049, + "loss/logits": 0.82384153008461, + "step": 62810 + }, + { + "epoch": 0.6282, + "grad_norm": 14.25, + "grad_norm_var": 0.4567057291666667, + "learning_rate": 0.0003, + "loss": 10.8806, + "loss/aux_loss": 0.048072170466184616, + "loss/crossentropy": 2.750021505355835, + "loss/logits": 0.7991617172956467, + "step": 62820 + }, + { + "epoch": 0.6283, + "grad_norm": 14.8125, + "grad_norm_var": 0.2503743489583333, + "learning_rate": 0.0003, + "loss": 10.9362, + "loss/aux_loss": 0.04807143583893776, + "loss/crossentropy": 2.749982488155365, + "loss/logits": 0.8353980958461762, + "step": 62830 + }, + { + "epoch": 0.6284, + "grad_norm": 14.25, + "grad_norm_var": 0.517041015625, + "learning_rate": 0.0003, + "loss": 10.9349, + "loss/aux_loss": 0.048057589866220954, + "loss/crossentropy": 2.761917233467102, + "loss/logits": 0.8192497193813324, + "step": 62840 + }, + { + "epoch": 0.6285, + "grad_norm": 14.9375, + "grad_norm_var": 0.5915201822916667, + "learning_rate": 0.0003, + "loss": 10.7967, + "loss/aux_loss": 0.04808051139116287, + "loss/crossentropy": 2.4791474997997285, + "loss/logits": 0.8087964832782746, + "step": 62850 + }, + { + "epoch": 0.6286, + "grad_norm": 15.6875, + "grad_norm_var": 0.5188639322916667, + "learning_rate": 0.0003, + "loss": 11.022, + "loss/aux_loss": 0.048080192692577836, + "loss/crossentropy": 2.8895226955413817, + "loss/logits": 0.8381287634372712, + "step": 62860 + }, + { + "epoch": 0.6287, + "grad_norm": 15.5, + "grad_norm_var": 0.5929524739583333, + "learning_rate": 0.0003, + "loss": 10.908, + "loss/aux_loss": 0.04806222971528769, + "loss/crossentropy": 2.747699362039566, + "loss/logits": 0.8261835396289825, + "step": 62870 + }, + { + "epoch": 0.6288, + "grad_norm": 15.375, + "grad_norm_var": 1.2239420572916666, + "learning_rate": 0.0003, + "loss": 10.9217, + "loss/aux_loss": 0.048062012530863285, + "loss/crossentropy": 2.6717309832572935, + "loss/logits": 0.8120827436447143, + "step": 62880 + }, + { + "epoch": 0.6289, + "grad_norm": 13.9375, + "grad_norm_var": 1.063525390625, + "learning_rate": 0.0003, + "loss": 10.7873, + "loss/aux_loss": 0.048078616708517076, + "loss/crossentropy": 2.6327943921089174, + "loss/logits": 0.7977903634309769, + "step": 62890 + }, + { + "epoch": 0.629, + "grad_norm": 16.0, + "grad_norm_var": 0.4988932291666667, + "learning_rate": 0.0003, + "loss": 10.948, + "loss/aux_loss": 0.04807599578052759, + "loss/crossentropy": 2.727197366952896, + "loss/logits": 0.8394771188497543, + "step": 62900 + }, + { + "epoch": 0.6291, + "grad_norm": 15.375, + "grad_norm_var": 0.3973307291666667, + "learning_rate": 0.0003, + "loss": 10.902, + "loss/aux_loss": 0.04805990718305111, + "loss/crossentropy": 2.791462790966034, + "loss/logits": 0.8735287189483643, + "step": 62910 + }, + { + "epoch": 0.6292, + "grad_norm": 14.6875, + "grad_norm_var": 1.3952473958333333, + "learning_rate": 0.0003, + "loss": 10.9858, + "loss/aux_loss": 0.04808229636400938, + "loss/crossentropy": 2.740624117851257, + "loss/logits": 0.7988775402307511, + "step": 62920 + }, + { + "epoch": 0.6293, + "grad_norm": 14.9375, + "grad_norm_var": 0.968212890625, + "learning_rate": 0.0003, + "loss": 10.8832, + "loss/aux_loss": 0.0480513833463192, + "loss/crossentropy": 2.785011887550354, + "loss/logits": 0.8363262772560119, + "step": 62930 + }, + { + "epoch": 0.6294, + "grad_norm": 16.25, + "grad_norm_var": 0.5317708333333333, + "learning_rate": 0.0003, + "loss": 10.8444, + "loss/aux_loss": 0.04807438999414444, + "loss/crossentropy": 2.6182597100734712, + "loss/logits": 0.7917582601308822, + "step": 62940 + }, + { + "epoch": 0.6295, + "grad_norm": 14.375, + "grad_norm_var": 0.4007649739583333, + "learning_rate": 0.0003, + "loss": 10.8933, + "loss/aux_loss": 0.048065260984003544, + "loss/crossentropy": 2.6386733055114746, + "loss/logits": 0.8080016434192657, + "step": 62950 + }, + { + "epoch": 0.6296, + "grad_norm": 14.5625, + "grad_norm_var": 4.309358723958334, + "learning_rate": 0.0003, + "loss": 10.9506, + "loss/aux_loss": 0.04806258585304022, + "loss/crossentropy": 2.6573799908161164, + "loss/logits": 0.797971498966217, + "step": 62960 + }, + { + "epoch": 0.6297, + "grad_norm": 14.1875, + "grad_norm_var": 0.6682291666666667, + "learning_rate": 0.0003, + "loss": 10.8314, + "loss/aux_loss": 0.04807965587824583, + "loss/crossentropy": 2.614883852005005, + "loss/logits": 0.7928066223859787, + "step": 62970 + }, + { + "epoch": 0.6298, + "grad_norm": 15.1875, + "grad_norm_var": 0.452587890625, + "learning_rate": 0.0003, + "loss": 11.0064, + "loss/aux_loss": 0.048063984513282774, + "loss/crossentropy": 2.7016715586185454, + "loss/logits": 0.8424327522516251, + "step": 62980 + }, + { + "epoch": 0.6299, + "grad_norm": 15.0, + "grad_norm_var": 0.3759765625, + "learning_rate": 0.0003, + "loss": 10.818, + "loss/aux_loss": 0.04806662555783987, + "loss/crossentropy": 2.7241014719009398, + "loss/logits": 0.8228190451860428, + "step": 62990 + }, + { + "epoch": 0.63, + "grad_norm": 15.625, + "grad_norm_var": 0.3109212239583333, + "learning_rate": 0.0003, + "loss": 10.8628, + "loss/aux_loss": 0.048064174503087996, + "loss/crossentropy": 2.558414030075073, + "loss/logits": 0.8088842839002609, + "step": 63000 + }, + { + "epoch": 0.6301, + "grad_norm": 15.125, + "grad_norm_var": 0.5624348958333333, + "learning_rate": 0.0003, + "loss": 11.0454, + "loss/aux_loss": 0.04806617610156536, + "loss/crossentropy": 2.8958580434322356, + "loss/logits": 0.8480107396841049, + "step": 63010 + }, + { + "epoch": 0.6302, + "grad_norm": 15.375, + "grad_norm_var": 0.42185872395833335, + "learning_rate": 0.0003, + "loss": 10.8501, + "loss/aux_loss": 0.04805904608219862, + "loss/crossentropy": 2.912857186794281, + "loss/logits": 0.8123132467269898, + "step": 63020 + }, + { + "epoch": 0.6303, + "grad_norm": 15.1875, + "grad_norm_var": 0.6196451822916667, + "learning_rate": 0.0003, + "loss": 10.6561, + "loss/aux_loss": 0.04808768462389708, + "loss/crossentropy": 2.677269399166107, + "loss/logits": 0.8051055639982223, + "step": 63030 + }, + { + "epoch": 0.6304, + "grad_norm": 15.0625, + "grad_norm_var": 0.6065104166666667, + "learning_rate": 0.0003, + "loss": 10.784, + "loss/aux_loss": 0.04805712196975946, + "loss/crossentropy": 2.9472272872924803, + "loss/logits": 0.8176089495420455, + "step": 63040 + }, + { + "epoch": 0.6305, + "grad_norm": 15.25, + "grad_norm_var": 0.51953125, + "learning_rate": 0.0003, + "loss": 10.8028, + "loss/aux_loss": 0.04806398153305054, + "loss/crossentropy": 2.6117322742938995, + "loss/logits": 0.8162994027137757, + "step": 63050 + }, + { + "epoch": 0.6306, + "grad_norm": 15.0625, + "grad_norm_var": 1.9244140625, + "learning_rate": 0.0003, + "loss": 10.9476, + "loss/aux_loss": 0.04808130543678999, + "loss/crossentropy": 2.674235236644745, + "loss/logits": 0.8312394112348557, + "step": 63060 + }, + { + "epoch": 0.6307, + "grad_norm": 15.5, + "grad_norm_var": 0.9276041666666667, + "learning_rate": 0.0003, + "loss": 11.0446, + "loss/aux_loss": 0.048066967912018296, + "loss/crossentropy": 2.751528322696686, + "loss/logits": 0.8499272048473359, + "step": 63070 + }, + { + "epoch": 0.6308, + "grad_norm": 14.4375, + "grad_norm_var": 1.1157389322916667, + "learning_rate": 0.0003, + "loss": 10.9221, + "loss/aux_loss": 0.04807424377650023, + "loss/crossentropy": 2.756152319908142, + "loss/logits": 0.8447676509618759, + "step": 63080 + }, + { + "epoch": 0.6309, + "grad_norm": 15.3125, + "grad_norm_var": 0.7453125, + "learning_rate": 0.0003, + "loss": 10.8395, + "loss/aux_loss": 0.048067429848015306, + "loss/crossentropy": 2.895149755477905, + "loss/logits": 0.8409383088350296, + "step": 63090 + }, + { + "epoch": 0.631, + "grad_norm": 14.0, + "grad_norm_var": 0.9541666666666667, + "learning_rate": 0.0003, + "loss": 11.026, + "loss/aux_loss": 0.04806844405829906, + "loss/crossentropy": 2.747306799888611, + "loss/logits": 0.8143503844738007, + "step": 63100 + }, + { + "epoch": 0.6311, + "grad_norm": 15.125, + "grad_norm_var": 4.254671223958334, + "learning_rate": 0.0003, + "loss": 10.9367, + "loss/aux_loss": 0.04806724544614553, + "loss/crossentropy": 2.741036427021027, + "loss/logits": 0.8391730457544326, + "step": 63110 + }, + { + "epoch": 0.6312, + "grad_norm": 14.1875, + "grad_norm_var": 1.7449055989583333, + "learning_rate": 0.0003, + "loss": 11.0246, + "loss/aux_loss": 0.04807869717478752, + "loss/crossentropy": 2.748949956893921, + "loss/logits": 0.8150217235088348, + "step": 63120 + }, + { + "epoch": 0.6313, + "grad_norm": 15.0, + "grad_norm_var": 1.1575520833333333, + "learning_rate": 0.0003, + "loss": 10.8627, + "loss/aux_loss": 0.048072864301502705, + "loss/crossentropy": 2.5791411340236663, + "loss/logits": 0.7991462841629982, + "step": 63130 + }, + { + "epoch": 0.6314, + "grad_norm": 15.25, + "grad_norm_var": 1.3233723958333334, + "learning_rate": 0.0003, + "loss": 10.8953, + "loss/aux_loss": 0.04805458467453718, + "loss/crossentropy": 2.609523779153824, + "loss/logits": 0.8489516407251358, + "step": 63140 + }, + { + "epoch": 0.6315, + "grad_norm": 14.125, + "grad_norm_var": 0.6304524739583334, + "learning_rate": 0.0003, + "loss": 10.934, + "loss/aux_loss": 0.0480818985030055, + "loss/crossentropy": 2.618219095468521, + "loss/logits": 0.8005220651626587, + "step": 63150 + }, + { + "epoch": 0.6316, + "grad_norm": 17.125, + "grad_norm_var": 0.9078125, + "learning_rate": 0.0003, + "loss": 10.9303, + "loss/aux_loss": 0.04806983359158039, + "loss/crossentropy": 2.686432045698166, + "loss/logits": 0.8284903228282928, + "step": 63160 + }, + { + "epoch": 0.6317, + "grad_norm": 14.3125, + "grad_norm_var": 1.4434895833333334, + "learning_rate": 0.0003, + "loss": 10.9313, + "loss/aux_loss": 0.04806499164551496, + "loss/crossentropy": 2.7233918964862824, + "loss/logits": 0.8174569487571717, + "step": 63170 + }, + { + "epoch": 0.6318, + "grad_norm": 16.0, + "grad_norm_var": 169.39816080729167, + "learning_rate": 0.0003, + "loss": 10.8758, + "loss/aux_loss": 0.048072904162108895, + "loss/crossentropy": 2.759899604320526, + "loss/logits": 0.8145837306976318, + "step": 63180 + }, + { + "epoch": 0.6319, + "grad_norm": 15.75, + "grad_norm_var": 164.56328125, + "learning_rate": 0.0003, + "loss": 10.949, + "loss/aux_loss": 0.048071921803057194, + "loss/crossentropy": 2.555657994747162, + "loss/logits": 0.793342587351799, + "step": 63190 + }, + { + "epoch": 0.632, + "grad_norm": 14.375, + "grad_norm_var": 0.3575358072916667, + "learning_rate": 0.0003, + "loss": 10.9903, + "loss/aux_loss": 0.04807258564978838, + "loss/crossentropy": 2.7257075905799866, + "loss/logits": 0.8129594385623932, + "step": 63200 + }, + { + "epoch": 0.6321, + "grad_norm": 14.0, + "grad_norm_var": 0.46295572916666666, + "learning_rate": 0.0003, + "loss": 10.8189, + "loss/aux_loss": 0.048059084080159666, + "loss/crossentropy": 2.704482650756836, + "loss/logits": 0.8401682913303375, + "step": 63210 + }, + { + "epoch": 0.6322, + "grad_norm": 14.75, + "grad_norm_var": 0.42337239583333336, + "learning_rate": 0.0003, + "loss": 10.9708, + "loss/aux_loss": 0.04807595741003752, + "loss/crossentropy": 2.6612473666667937, + "loss/logits": 0.8036254912614822, + "step": 63220 + }, + { + "epoch": 0.6323, + "grad_norm": 15.1875, + "grad_norm_var": 0.4369140625, + "learning_rate": 0.0003, + "loss": 10.8829, + "loss/aux_loss": 0.04806538727134466, + "loss/crossentropy": 2.730154258012772, + "loss/logits": 0.8400603294372558, + "step": 63230 + }, + { + "epoch": 0.6324, + "grad_norm": 15.3125, + "grad_norm_var": 3.090559895833333, + "learning_rate": 0.0003, + "loss": 11.0641, + "loss/aux_loss": 0.048070489801466464, + "loss/crossentropy": 2.8468264818191527, + "loss/logits": 0.838485524058342, + "step": 63240 + }, + { + "epoch": 0.6325, + "grad_norm": 14.6875, + "grad_norm_var": 3.206770833333333, + "learning_rate": 0.0003, + "loss": 11.0373, + "loss/aux_loss": 0.04807015266269445, + "loss/crossentropy": 2.78941011428833, + "loss/logits": 0.8361764669418335, + "step": 63250 + }, + { + "epoch": 0.6326, + "grad_norm": 14.75, + "grad_norm_var": 29.7541015625, + "learning_rate": 0.0003, + "loss": 11.0234, + "loss/aux_loss": 0.04807714801281691, + "loss/crossentropy": 2.579346811771393, + "loss/logits": 0.8216876536607742, + "step": 63260 + }, + { + "epoch": 0.6327, + "grad_norm": 16.25, + "grad_norm_var": 73.72159830729167, + "learning_rate": 0.0003, + "loss": 10.9517, + "loss/aux_loss": 0.04807807840406895, + "loss/crossentropy": 2.7519372761249543, + "loss/logits": 0.8636613190174103, + "step": 63270 + }, + { + "epoch": 0.6328, + "grad_norm": 17.0, + "grad_norm_var": 0.8097493489583333, + "learning_rate": 0.0003, + "loss": 10.835, + "loss/aux_loss": 0.04807962123304606, + "loss/crossentropy": 2.561204981803894, + "loss/logits": 0.7664595246315002, + "step": 63280 + }, + { + "epoch": 0.6329, + "grad_norm": 14.3125, + "grad_norm_var": 0.8628743489583334, + "learning_rate": 0.0003, + "loss": 10.8773, + "loss/aux_loss": 0.04806769024580717, + "loss/crossentropy": 2.592850297689438, + "loss/logits": 0.8082565724849701, + "step": 63290 + }, + { + "epoch": 0.633, + "grad_norm": 15.6875, + "grad_norm_var": 0.5722493489583333, + "learning_rate": 0.0003, + "loss": 11.072, + "loss/aux_loss": 0.048081412352621554, + "loss/crossentropy": 2.8436991333961488, + "loss/logits": 0.8220110654830932, + "step": 63300 + }, + { + "epoch": 0.6331, + "grad_norm": 15.5, + "grad_norm_var": 0.4627604166666667, + "learning_rate": 0.0003, + "loss": 10.9644, + "loss/aux_loss": 0.04806417748332024, + "loss/crossentropy": 2.746978682279587, + "loss/logits": 0.8097629576921463, + "step": 63310 + }, + { + "epoch": 0.6332, + "grad_norm": 14.125, + "grad_norm_var": 1.061572265625, + "learning_rate": 0.0003, + "loss": 10.7537, + "loss/aux_loss": 0.04806242845952511, + "loss/crossentropy": 2.6207932353019716, + "loss/logits": 0.8020304828882218, + "step": 63320 + }, + { + "epoch": 0.6333, + "grad_norm": 14.375, + "grad_norm_var": 0.3042805989583333, + "learning_rate": 0.0003, + "loss": 10.9099, + "loss/aux_loss": 0.04806431401520968, + "loss/crossentropy": 2.7724860310554504, + "loss/logits": 0.812814936041832, + "step": 63330 + }, + { + "epoch": 0.6334, + "grad_norm": 14.8125, + "grad_norm_var": 3.206103515625, + "learning_rate": 0.0003, + "loss": 11.0152, + "loss/aux_loss": 0.04808428026735782, + "loss/crossentropy": 2.6957937836647035, + "loss/logits": 0.8201660066843033, + "step": 63340 + }, + { + "epoch": 0.6335, + "grad_norm": 18.625, + "grad_norm_var": 4.637744140625, + "learning_rate": 0.0003, + "loss": 11.0486, + "loss/aux_loss": 0.04807388950139284, + "loss/crossentropy": 2.706111788749695, + "loss/logits": 0.8277266383171081, + "step": 63350 + }, + { + "epoch": 0.6336, + "grad_norm": 13.8125, + "grad_norm_var": 1.4661458333333333, + "learning_rate": 0.0003, + "loss": 10.6886, + "loss/aux_loss": 0.04806654676795006, + "loss/crossentropy": 2.594613701105118, + "loss/logits": 0.787641778588295, + "step": 63360 + }, + { + "epoch": 0.6337, + "grad_norm": 16.875, + "grad_norm_var": 0.5614420572916666, + "learning_rate": 0.0003, + "loss": 10.9344, + "loss/aux_loss": 0.04806236121803522, + "loss/crossentropy": 2.695681321620941, + "loss/logits": 0.8070782214403153, + "step": 63370 + }, + { + "epoch": 0.6338, + "grad_norm": 16.125, + "grad_norm_var": 0.5291666666666667, + "learning_rate": 0.0003, + "loss": 10.9202, + "loss/aux_loss": 0.04807351212948561, + "loss/crossentropy": 2.5633945643901823, + "loss/logits": 0.770171768963337, + "step": 63380 + }, + { + "epoch": 0.6339, + "grad_norm": 15.0, + "grad_norm_var": 0.3859375, + "learning_rate": 0.0003, + "loss": 10.9848, + "loss/aux_loss": 0.04807458482682705, + "loss/crossentropy": 2.643518990278244, + "loss/logits": 0.8119804114103317, + "step": 63390 + }, + { + "epoch": 0.634, + "grad_norm": 15.625, + "grad_norm_var": 0.79609375, + "learning_rate": 0.0003, + "loss": 10.9034, + "loss/aux_loss": 0.04807896073907614, + "loss/crossentropy": 2.580906796455383, + "loss/logits": 0.7909560561180115, + "step": 63400 + }, + { + "epoch": 0.6341, + "grad_norm": 15.375, + "grad_norm_var": 0.6630045572916666, + "learning_rate": 0.0003, + "loss": 10.8749, + "loss/aux_loss": 0.04806428924202919, + "loss/crossentropy": 2.6164624214172365, + "loss/logits": 0.8118317008018494, + "step": 63410 + }, + { + "epoch": 0.6342, + "grad_norm": 16.375, + "grad_norm_var": 9.6837890625, + "learning_rate": 0.0003, + "loss": 10.9738, + "loss/aux_loss": 0.0480818934738636, + "loss/crossentropy": 2.6591660141944886, + "loss/logits": 0.8064444810152054, + "step": 63420 + }, + { + "epoch": 0.6343, + "grad_norm": 16.5, + "grad_norm_var": 10.77578125, + "learning_rate": 0.0003, + "loss": 11.0038, + "loss/aux_loss": 0.04806845411658287, + "loss/crossentropy": 2.8173150777816773, + "loss/logits": 0.839667072892189, + "step": 63430 + }, + { + "epoch": 0.6344, + "grad_norm": 15.125, + "grad_norm_var": 0.5028483072916666, + "learning_rate": 0.0003, + "loss": 10.8932, + "loss/aux_loss": 0.04805213697254658, + "loss/crossentropy": 2.636157047748566, + "loss/logits": 0.792957991361618, + "step": 63440 + }, + { + "epoch": 0.6345, + "grad_norm": 16.5, + "grad_norm_var": 0.7291015625, + "learning_rate": 0.0003, + "loss": 10.8117, + "loss/aux_loss": 0.048077466525137426, + "loss/crossentropy": 2.6534729659557343, + "loss/logits": 0.8073912143707276, + "step": 63450 + }, + { + "epoch": 0.6346, + "grad_norm": 14.6875, + "grad_norm_var": 0.768994140625, + "learning_rate": 0.0003, + "loss": 10.7352, + "loss/aux_loss": 0.04807067047804594, + "loss/crossentropy": 2.716173303127289, + "loss/logits": 0.8196294963359833, + "step": 63460 + }, + { + "epoch": 0.6347, + "grad_norm": 14.875, + "grad_norm_var": 0.490869140625, + "learning_rate": 0.0003, + "loss": 10.8034, + "loss/aux_loss": 0.04808057863265276, + "loss/crossentropy": 2.6360740780830385, + "loss/logits": 0.7901035279035569, + "step": 63470 + }, + { + "epoch": 0.6348, + "grad_norm": 15.5, + "grad_norm_var": 0.7166015625, + "learning_rate": 0.0003, + "loss": 10.8123, + "loss/aux_loss": 0.048066640831530096, + "loss/crossentropy": 2.6251393437385557, + "loss/logits": 0.7680756062269211, + "step": 63480 + }, + { + "epoch": 0.6349, + "grad_norm": 16.5, + "grad_norm_var": 2.012744140625, + "learning_rate": 0.0003, + "loss": 10.8699, + "loss/aux_loss": 0.048078449070453645, + "loss/crossentropy": 2.72235426902771, + "loss/logits": 0.8374341070652008, + "step": 63490 + }, + { + "epoch": 0.635, + "grad_norm": 14.9375, + "grad_norm_var": 2.2, + "learning_rate": 0.0003, + "loss": 10.886, + "loss/aux_loss": 0.04805882424116135, + "loss/crossentropy": 2.775067353248596, + "loss/logits": 0.8465858489274979, + "step": 63500 + }, + { + "epoch": 0.6351, + "grad_norm": 15.25, + "grad_norm_var": 0.1453125, + "learning_rate": 0.0003, + "loss": 10.8996, + "loss/aux_loss": 0.048069519177079204, + "loss/crossentropy": 2.8206980526447296, + "loss/logits": 0.8248533338308335, + "step": 63510 + }, + { + "epoch": 0.6352, + "grad_norm": 15.125, + "grad_norm_var": 0.60625, + "learning_rate": 0.0003, + "loss": 10.8778, + "loss/aux_loss": 0.04807919226586819, + "loss/crossentropy": 2.8253234326839447, + "loss/logits": 0.8029891848564148, + "step": 63520 + }, + { + "epoch": 0.6353, + "grad_norm": 15.75, + "grad_norm_var": 0.830712890625, + "learning_rate": 0.0003, + "loss": 10.8775, + "loss/aux_loss": 0.048057425394654275, + "loss/crossentropy": 2.6366010308265686, + "loss/logits": 0.8082585781812668, + "step": 63530 + }, + { + "epoch": 0.6354, + "grad_norm": 15.9375, + "grad_norm_var": 1.255322265625, + "learning_rate": 0.0003, + "loss": 10.7782, + "loss/aux_loss": 0.048088168166577815, + "loss/crossentropy": 2.5838097631931305, + "loss/logits": 0.7870519459247589, + "step": 63540 + }, + { + "epoch": 0.6355, + "grad_norm": 14.875, + "grad_norm_var": 1.31015625, + "learning_rate": 0.0003, + "loss": 10.8468, + "loss/aux_loss": 0.04806020092219114, + "loss/crossentropy": 2.711851143836975, + "loss/logits": 0.8394827723503113, + "step": 63550 + }, + { + "epoch": 0.6356, + "grad_norm": 16.375, + "grad_norm_var": 0.5389973958333333, + "learning_rate": 0.0003, + "loss": 10.8928, + "loss/aux_loss": 0.048066692799329756, + "loss/crossentropy": 2.7870961904525755, + "loss/logits": 0.8038838863372803, + "step": 63560 + }, + { + "epoch": 0.6357, + "grad_norm": 15.1875, + "grad_norm_var": 0.41015625, + "learning_rate": 0.0003, + "loss": 10.9666, + "loss/aux_loss": 0.04806511420756578, + "loss/crossentropy": 2.641837865114212, + "loss/logits": 0.8012272834777832, + "step": 63570 + }, + { + "epoch": 0.6358, + "grad_norm": 14.0625, + "grad_norm_var": 0.35625, + "learning_rate": 0.0003, + "loss": 10.8966, + "loss/aux_loss": 0.0480894086882472, + "loss/crossentropy": 2.6827530384063722, + "loss/logits": 0.7748128771781921, + "step": 63580 + }, + { + "epoch": 0.6359, + "grad_norm": 14.75, + "grad_norm_var": 0.496337890625, + "learning_rate": 0.0003, + "loss": 10.8975, + "loss/aux_loss": 0.048070944286882876, + "loss/crossentropy": 2.441706246137619, + "loss/logits": 0.8006876438856125, + "step": 63590 + }, + { + "epoch": 0.636, + "grad_norm": 15.9375, + "grad_norm_var": 0.2744140625, + "learning_rate": 0.0003, + "loss": 10.9769, + "loss/aux_loss": 0.04805447738617659, + "loss/crossentropy": 2.801032680273056, + "loss/logits": 0.8238119214773179, + "step": 63600 + }, + { + "epoch": 0.6361, + "grad_norm": 15.4375, + "grad_norm_var": 0.48359375, + "learning_rate": 0.0003, + "loss": 10.8361, + "loss/aux_loss": 0.04808128047734499, + "loss/crossentropy": 2.6498410642147063, + "loss/logits": 0.7981126606464386, + "step": 63610 + }, + { + "epoch": 0.6362, + "grad_norm": 14.375, + "grad_norm_var": 0.28046875, + "learning_rate": 0.0003, + "loss": 10.9804, + "loss/aux_loss": 0.04807472582906484, + "loss/crossentropy": 2.6279158115386965, + "loss/logits": 0.8193077623844147, + "step": 63620 + }, + { + "epoch": 0.6363, + "grad_norm": 15.0625, + "grad_norm_var": 0.19568684895833333, + "learning_rate": 0.0003, + "loss": 10.7969, + "loss/aux_loss": 0.048069669492542745, + "loss/crossentropy": 2.8434048295021057, + "loss/logits": 0.8453941226005555, + "step": 63630 + }, + { + "epoch": 0.6364, + "grad_norm": 15.0, + "grad_norm_var": 1.27265625, + "learning_rate": 0.0003, + "loss": 10.9003, + "loss/aux_loss": 0.0480651993304491, + "loss/crossentropy": 2.757810640335083, + "loss/logits": 0.8333944648504257, + "step": 63640 + }, + { + "epoch": 0.6365, + "grad_norm": 15.375, + "grad_norm_var": 1.1129557291666667, + "learning_rate": 0.0003, + "loss": 11.0213, + "loss/aux_loss": 0.04808567147701979, + "loss/crossentropy": 2.717263233661652, + "loss/logits": 0.862332072854042, + "step": 63650 + }, + { + "epoch": 0.6366, + "grad_norm": 14.625, + "grad_norm_var": 0.43743489583333334, + "learning_rate": 0.0003, + "loss": 11.0693, + "loss/aux_loss": 0.04806395098567009, + "loss/crossentropy": 2.705856317281723, + "loss/logits": 0.7979970872402191, + "step": 63660 + }, + { + "epoch": 0.6367, + "grad_norm": 16.125, + "grad_norm_var": 0.489697265625, + "learning_rate": 0.0003, + "loss": 10.872, + "loss/aux_loss": 0.048058228194713594, + "loss/crossentropy": 2.552783203125, + "loss/logits": 0.8018041133880616, + "step": 63670 + }, + { + "epoch": 0.6368, + "grad_norm": 15.125, + "grad_norm_var": 0.5059895833333333, + "learning_rate": 0.0003, + "loss": 10.7682, + "loss/aux_loss": 0.048081094212830064, + "loss/crossentropy": 2.757079029083252, + "loss/logits": 0.8517242342233657, + "step": 63680 + }, + { + "epoch": 0.6369, + "grad_norm": 15.875, + "grad_norm_var": 0.4239420572916667, + "learning_rate": 0.0003, + "loss": 10.9756, + "loss/aux_loss": 0.04806440509855747, + "loss/crossentropy": 2.679883936047554, + "loss/logits": 0.8242890566587449, + "step": 63690 + }, + { + "epoch": 0.637, + "grad_norm": 15.0625, + "grad_norm_var": 0.792431640625, + "learning_rate": 0.0003, + "loss": 10.6549, + "loss/aux_loss": 0.04806427750736475, + "loss/crossentropy": 2.6404387235641478, + "loss/logits": 0.8252344757318497, + "step": 63700 + }, + { + "epoch": 0.6371, + "grad_norm": 15.125, + "grad_norm_var": 0.6921223958333333, + "learning_rate": 0.0003, + "loss": 10.952, + "loss/aux_loss": 0.04807978924363852, + "loss/crossentropy": 2.7173975467681886, + "loss/logits": 0.8082460671663284, + "step": 63710 + }, + { + "epoch": 0.6372, + "grad_norm": 16.5, + "grad_norm_var": 0.566650390625, + "learning_rate": 0.0003, + "loss": 10.7532, + "loss/aux_loss": 0.04806181751191616, + "loss/crossentropy": 2.790048438310623, + "loss/logits": 0.8141476571559906, + "step": 63720 + }, + { + "epoch": 0.6373, + "grad_norm": 16.125, + "grad_norm_var": 0.7513020833333334, + "learning_rate": 0.0003, + "loss": 10.9906, + "loss/aux_loss": 0.048070454970002174, + "loss/crossentropy": 2.7498911917209625, + "loss/logits": 0.8331282079219818, + "step": 63730 + }, + { + "epoch": 0.6374, + "grad_norm": 16.75, + "grad_norm_var": 0.954931640625, + "learning_rate": 0.0003, + "loss": 10.76, + "loss/aux_loss": 0.04806820340454578, + "loss/crossentropy": 2.685372221469879, + "loss/logits": 0.8030792355537415, + "step": 63740 + }, + { + "epoch": 0.6375, + "grad_norm": 15.25, + "grad_norm_var": 0.9230305989583333, + "learning_rate": 0.0003, + "loss": 10.9593, + "loss/aux_loss": 0.048071705549955365, + "loss/crossentropy": 2.553539252281189, + "loss/logits": 0.8266629427671432, + "step": 63750 + }, + { + "epoch": 0.6376, + "grad_norm": 14.6875, + "grad_norm_var": 0.6065104166666667, + "learning_rate": 0.0003, + "loss": 10.8365, + "loss/aux_loss": 0.04807527456432581, + "loss/crossentropy": 2.7262804925441744, + "loss/logits": 0.801684433221817, + "step": 63760 + }, + { + "epoch": 0.6377, + "grad_norm": 16.625, + "grad_norm_var": 0.5832682291666667, + "learning_rate": 0.0003, + "loss": 10.9552, + "loss/aux_loss": 0.04806269612163305, + "loss/crossentropy": 2.7759795606136324, + "loss/logits": 0.8241303324699402, + "step": 63770 + }, + { + "epoch": 0.6378, + "grad_norm": 14.3125, + "grad_norm_var": 1.2555826822916667, + "learning_rate": 0.0003, + "loss": 10.8722, + "loss/aux_loss": 0.04808154441416264, + "loss/crossentropy": 2.723454737663269, + "loss/logits": 0.7963994681835175, + "step": 63780 + }, + { + "epoch": 0.6379, + "grad_norm": 15.0625, + "grad_norm_var": 1.0734375, + "learning_rate": 0.0003, + "loss": 10.9994, + "loss/aux_loss": 0.04806034788489342, + "loss/crossentropy": 2.787865138053894, + "loss/logits": 0.8507139623165131, + "step": 63790 + }, + { + "epoch": 0.638, + "grad_norm": 14.625, + "grad_norm_var": 1.4801432291666667, + "learning_rate": 0.0003, + "loss": 10.9361, + "loss/aux_loss": 0.048068071529269216, + "loss/crossentropy": 2.6697638273239135, + "loss/logits": 0.8172403901815415, + "step": 63800 + }, + { + "epoch": 0.6381, + "grad_norm": 16.25, + "grad_norm_var": 0.7082682291666667, + "learning_rate": 0.0003, + "loss": 10.862, + "loss/aux_loss": 0.048073775880038735, + "loss/crossentropy": 2.798969733715057, + "loss/logits": 0.8269981414079666, + "step": 63810 + }, + { + "epoch": 0.6382, + "grad_norm": 15.4375, + "grad_norm_var": 0.2596354166666667, + "learning_rate": 0.0003, + "loss": 10.9564, + "loss/aux_loss": 0.04805988427251577, + "loss/crossentropy": 2.732244443893433, + "loss/logits": 0.8475836634635925, + "step": 63820 + }, + { + "epoch": 0.6383, + "grad_norm": 14.4375, + "grad_norm_var": 0.482666015625, + "learning_rate": 0.0003, + "loss": 10.9159, + "loss/aux_loss": 0.04807668384164572, + "loss/crossentropy": 2.782858157157898, + "loss/logits": 0.7943806976079941, + "step": 63830 + }, + { + "epoch": 0.6384, + "grad_norm": 16.25, + "grad_norm_var": 0.328369140625, + "learning_rate": 0.0003, + "loss": 11.0277, + "loss/aux_loss": 0.0480587437748909, + "loss/crossentropy": 2.840771198272705, + "loss/logits": 0.8426368027925492, + "step": 63840 + }, + { + "epoch": 0.6385, + "grad_norm": 16.0, + "grad_norm_var": 0.5332682291666667, + "learning_rate": 0.0003, + "loss": 10.9838, + "loss/aux_loss": 0.04806698095053434, + "loss/crossentropy": 2.771585577726364, + "loss/logits": 0.8477590322494507, + "step": 63850 + }, + { + "epoch": 0.6386, + "grad_norm": 15.875, + "grad_norm_var": 0.3931640625, + "learning_rate": 0.0003, + "loss": 10.7073, + "loss/aux_loss": 0.04806830566376448, + "loss/crossentropy": 2.6541366040706635, + "loss/logits": 0.8144404917955399, + "step": 63860 + }, + { + "epoch": 0.6387, + "grad_norm": 15.875, + "grad_norm_var": 0.6769368489583333, + "learning_rate": 0.0003, + "loss": 10.8338, + "loss/aux_loss": 0.048068562522530556, + "loss/crossentropy": 2.7276011228561403, + "loss/logits": 0.7975292503833771, + "step": 63870 + }, + { + "epoch": 0.6388, + "grad_norm": 14.0625, + "grad_norm_var": 1.0471354166666667, + "learning_rate": 0.0003, + "loss": 10.9173, + "loss/aux_loss": 0.04807192627340555, + "loss/crossentropy": 2.703585624694824, + "loss/logits": 0.8457301408052444, + "step": 63880 + }, + { + "epoch": 0.6389, + "grad_norm": 15.8125, + "grad_norm_var": 18.350455729166665, + "learning_rate": 0.0003, + "loss": 10.986, + "loss/aux_loss": 0.04805469363927841, + "loss/crossentropy": 2.7493717789649965, + "loss/logits": 0.7949663013219833, + "step": 63890 + }, + { + "epoch": 0.639, + "grad_norm": 15.375, + "grad_norm_var": 17.873030598958334, + "learning_rate": 0.0003, + "loss": 10.8859, + "loss/aux_loss": 0.048075062409043315, + "loss/crossentropy": 2.909009563922882, + "loss/logits": 0.8053748130798339, + "step": 63900 + }, + { + "epoch": 0.6391, + "grad_norm": 15.6875, + "grad_norm_var": 0.364306640625, + "learning_rate": 0.0003, + "loss": 10.9087, + "loss/aux_loss": 0.048067734017968176, + "loss/crossentropy": 2.9225926876068113, + "loss/logits": 0.835890656709671, + "step": 63910 + }, + { + "epoch": 0.6392, + "grad_norm": 15.75, + "grad_norm_var": 3.35234375, + "learning_rate": 0.0003, + "loss": 10.8436, + "loss/aux_loss": 0.048074383102357385, + "loss/crossentropy": 2.780510759353638, + "loss/logits": 0.8370512515306473, + "step": 63920 + }, + { + "epoch": 0.6393, + "grad_norm": 14.6875, + "grad_norm_var": 3.1134765625, + "learning_rate": 0.0003, + "loss": 10.8743, + "loss/aux_loss": 0.048071058467030525, + "loss/crossentropy": 2.6825768053531647, + "loss/logits": 0.789939995110035, + "step": 63930 + }, + { + "epoch": 0.6394, + "grad_norm": 15.4375, + "grad_norm_var": 0.42967122395833335, + "learning_rate": 0.0003, + "loss": 11.0965, + "loss/aux_loss": 0.048058373667299745, + "loss/crossentropy": 2.78253173828125, + "loss/logits": 0.8522822350263596, + "step": 63940 + }, + { + "epoch": 0.6395, + "grad_norm": 15.0625, + "grad_norm_var": 0.4110514322916667, + "learning_rate": 0.0003, + "loss": 10.8844, + "loss/aux_loss": 0.04807241130620241, + "loss/crossentropy": 2.608461046218872, + "loss/logits": 0.8056067079305649, + "step": 63950 + }, + { + "epoch": 0.6396, + "grad_norm": 14.375, + "grad_norm_var": 0.46432291666666664, + "learning_rate": 0.0003, + "loss": 10.9174, + "loss/aux_loss": 0.048062784038484095, + "loss/crossentropy": 2.6751578748226166, + "loss/logits": 0.8276819512248039, + "step": 63960 + }, + { + "epoch": 0.6397, + "grad_norm": 15.625, + "grad_norm_var": 0.788134765625, + "learning_rate": 0.0003, + "loss": 11.0461, + "loss/aux_loss": 0.04806781299412251, + "loss/crossentropy": 2.871128559112549, + "loss/logits": 0.846360245347023, + "step": 63970 + }, + { + "epoch": 0.6398, + "grad_norm": 15.75, + "grad_norm_var": 73.32381184895833, + "learning_rate": 0.0003, + "loss": 11.1256, + "loss/aux_loss": 0.04808484613895416, + "loss/crossentropy": 2.7596030294895173, + "loss/logits": 0.8853773176670074, + "step": 63980 + }, + { + "epoch": 0.6399, + "grad_norm": 15.4375, + "grad_norm_var": 73.80618489583334, + "learning_rate": 0.0003, + "loss": 10.8477, + "loss/aux_loss": 0.04804834388196468, + "loss/crossentropy": 2.660778295993805, + "loss/logits": 0.8173895359039307, + "step": 63990 + }, + { + "epoch": 0.64, + "grad_norm": 15.125, + "grad_norm_var": 0.9999348958333333, + "learning_rate": 0.0003, + "loss": 10.8996, + "loss/aux_loss": 0.048082450218498704, + "loss/crossentropy": 2.6331078112125397, + "loss/logits": 0.8155697345733642, + "step": 64000 + }, + { + "epoch": 0.6401, + "grad_norm": 15.0625, + "grad_norm_var": 0.8478515625, + "learning_rate": 0.0003, + "loss": 10.8079, + "loss/aux_loss": 0.04807614423334598, + "loss/crossentropy": 2.6761899530887603, + "loss/logits": 0.7967321127653122, + "step": 64010 + }, + { + "epoch": 0.6402, + "grad_norm": 14.25, + "grad_norm_var": 0.3965983072916667, + "learning_rate": 0.0003, + "loss": 10.8051, + "loss/aux_loss": 0.048058840073645116, + "loss/crossentropy": 2.688008636236191, + "loss/logits": 0.7768597364425659, + "step": 64020 + }, + { + "epoch": 0.6403, + "grad_norm": 14.375, + "grad_norm_var": 0.4049479166666667, + "learning_rate": 0.0003, + "loss": 10.8649, + "loss/aux_loss": 0.048065507970750335, + "loss/crossentropy": 2.7258352994918824, + "loss/logits": 0.8018498718738556, + "step": 64030 + }, + { + "epoch": 0.6404, + "grad_norm": 13.875, + "grad_norm_var": 0.6686848958333333, + "learning_rate": 0.0003, + "loss": 10.6553, + "loss/aux_loss": 0.04808259606361389, + "loss/crossentropy": 2.684278553724289, + "loss/logits": 0.7884336978197097, + "step": 64040 + }, + { + "epoch": 0.6405, + "grad_norm": 14.25, + "grad_norm_var": 0.86640625, + "learning_rate": 0.0003, + "loss": 10.8835, + "loss/aux_loss": 0.04807013440877199, + "loss/crossentropy": 2.6649328231811524, + "loss/logits": 0.8022267431020736, + "step": 64050 + }, + { + "epoch": 0.6406, + "grad_norm": 16.125, + "grad_norm_var": 0.6298014322916666, + "learning_rate": 0.0003, + "loss": 10.8014, + "loss/aux_loss": 0.04807114787399769, + "loss/crossentropy": 2.7330439388751984, + "loss/logits": 0.8096017986536026, + "step": 64060 + }, + { + "epoch": 0.6407, + "grad_norm": 14.875, + "grad_norm_var": 0.4886555989583333, + "learning_rate": 0.0003, + "loss": 10.8495, + "loss/aux_loss": 0.048064742051064965, + "loss/crossentropy": 2.6271802723407744, + "loss/logits": 0.8042764306068421, + "step": 64070 + }, + { + "epoch": 0.6408, + "grad_norm": 14.25, + "grad_norm_var": 0.4212890625, + "learning_rate": 0.0003, + "loss": 10.9443, + "loss/aux_loss": 0.048074343241751194, + "loss/crossentropy": 2.651158905029297, + "loss/logits": 0.8192477524280548, + "step": 64080 + }, + { + "epoch": 0.6409, + "grad_norm": 16.0, + "grad_norm_var": 0.5291015625, + "learning_rate": 0.0003, + "loss": 10.8708, + "loss/aux_loss": 0.04805680923163891, + "loss/crossentropy": 2.740699511766434, + "loss/logits": 0.8247522652149201, + "step": 64090 + }, + { + "epoch": 0.641, + "grad_norm": 16.875, + "grad_norm_var": 1.893603515625, + "learning_rate": 0.0003, + "loss": 10.7972, + "loss/aux_loss": 0.0480952775105834, + "loss/crossentropy": 2.6142990469932554, + "loss/logits": 0.7922994047403336, + "step": 64100 + }, + { + "epoch": 0.6411, + "grad_norm": 14.5, + "grad_norm_var": 0.7624837239583333, + "learning_rate": 0.0003, + "loss": 11.0196, + "loss/aux_loss": 0.04806794486939907, + "loss/crossentropy": 2.6602042615413666, + "loss/logits": 0.8153569340705872, + "step": 64110 + }, + { + "epoch": 0.6412, + "grad_norm": 14.875, + "grad_norm_var": 0.4712890625, + "learning_rate": 0.0003, + "loss": 10.9019, + "loss/aux_loss": 0.04807754717767239, + "loss/crossentropy": 2.6273086309432983, + "loss/logits": 0.7900595605373383, + "step": 64120 + }, + { + "epoch": 0.6413, + "grad_norm": 15.125, + "grad_norm_var": 1.4556640625, + "learning_rate": 0.0003, + "loss": 11.0146, + "loss/aux_loss": 0.048073111660778524, + "loss/crossentropy": 2.8401905834674834, + "loss/logits": 0.8362075448036194, + "step": 64130 + }, + { + "epoch": 0.6414, + "grad_norm": 16.0, + "grad_norm_var": 1.1136555989583334, + "learning_rate": 0.0003, + "loss": 11.0899, + "loss/aux_loss": 0.04805966299027205, + "loss/crossentropy": 2.7619579434394836, + "loss/logits": 0.806191298365593, + "step": 64140 + }, + { + "epoch": 0.6415, + "grad_norm": 14.375, + "grad_norm_var": 0.8770182291666667, + "learning_rate": 0.0003, + "loss": 10.9026, + "loss/aux_loss": 0.0480696702376008, + "loss/crossentropy": 2.6707617938518524, + "loss/logits": 0.8110317856073379, + "step": 64150 + }, + { + "epoch": 0.6416, + "grad_norm": 16.125, + "grad_norm_var": 0.59609375, + "learning_rate": 0.0003, + "loss": 10.9761, + "loss/aux_loss": 0.04806564971804619, + "loss/crossentropy": 2.679551374912262, + "loss/logits": 0.8288383305072784, + "step": 64160 + }, + { + "epoch": 0.6417, + "grad_norm": 14.8125, + "grad_norm_var": 0.6447265625, + "learning_rate": 0.0003, + "loss": 10.7706, + "loss/aux_loss": 0.048071674257516864, + "loss/crossentropy": 2.720490908622742, + "loss/logits": 0.8180992275476455, + "step": 64170 + }, + { + "epoch": 0.6418, + "grad_norm": 66.5, + "grad_norm_var": 244.53170572916667, + "learning_rate": 0.0003, + "loss": 10.8762, + "loss/aux_loss": 0.048073908500373366, + "loss/crossentropy": 2.6952185809612272, + "loss/logits": 0.7881834089756012, + "step": 64180 + }, + { + "epoch": 0.6419, + "grad_norm": 18.375, + "grad_norm_var": 226.25494791666668, + "learning_rate": 0.0003, + "loss": 10.8772, + "loss/aux_loss": 0.04805737938731909, + "loss/crossentropy": 2.7175555408000944, + "loss/logits": 0.7974839717149734, + "step": 64190 + }, + { + "epoch": 0.642, + "grad_norm": 15.8125, + "grad_norm_var": 8.923697916666667, + "learning_rate": 0.0003, + "loss": 10.8089, + "loss/aux_loss": 0.048077397607266904, + "loss/crossentropy": 2.632685160636902, + "loss/logits": 0.8100000500679017, + "step": 64200 + }, + { + "epoch": 0.6421, + "grad_norm": 16.625, + "grad_norm_var": 0.4105305989583333, + "learning_rate": 0.0003, + "loss": 10.9696, + "loss/aux_loss": 0.04807335864752531, + "loss/crossentropy": 2.6972862422466277, + "loss/logits": 0.7903000891208649, + "step": 64210 + }, + { + "epoch": 0.6422, + "grad_norm": 15.1875, + "grad_norm_var": 0.42576497395833335, + "learning_rate": 0.0003, + "loss": 10.7641, + "loss/aux_loss": 0.048062941245734694, + "loss/crossentropy": 2.5591843128204346, + "loss/logits": 0.7982923656702041, + "step": 64220 + }, + { + "epoch": 0.6423, + "grad_norm": 15.25, + "grad_norm_var": 0.5587890625, + "learning_rate": 0.0003, + "loss": 10.882, + "loss/aux_loss": 0.04808053784072399, + "loss/crossentropy": 2.68142853975296, + "loss/logits": 0.8112878233194352, + "step": 64230 + }, + { + "epoch": 0.6424, + "grad_norm": 14.5625, + "grad_norm_var": 0.7109212239583333, + "learning_rate": 0.0003, + "loss": 10.903, + "loss/aux_loss": 0.04807339143007994, + "loss/crossentropy": 2.6936437368392943, + "loss/logits": 0.7852249950170517, + "step": 64240 + }, + { + "epoch": 0.6425, + "grad_norm": 16.5, + "grad_norm_var": 0.6098307291666667, + "learning_rate": 0.0003, + "loss": 10.8919, + "loss/aux_loss": 0.048065887205302714, + "loss/crossentropy": 2.6046033978462217, + "loss/logits": 0.8367180943489074, + "step": 64250 + }, + { + "epoch": 0.6426, + "grad_norm": 16.375, + "grad_norm_var": 1.3727701822916667, + "learning_rate": 0.0003, + "loss": 10.8046, + "loss/aux_loss": 0.048077253997325896, + "loss/crossentropy": 2.7186995148658752, + "loss/logits": 0.8272952169179917, + "step": 64260 + }, + { + "epoch": 0.6427, + "grad_norm": 15.0, + "grad_norm_var": 1.4869140625, + "learning_rate": 0.0003, + "loss": 11.0264, + "loss/aux_loss": 0.04806565903127193, + "loss/crossentropy": 2.7013957381248472, + "loss/logits": 0.8380502104759217, + "step": 64270 + }, + { + "epoch": 0.6428, + "grad_norm": 15.3125, + "grad_norm_var": 0.6114583333333333, + "learning_rate": 0.0003, + "loss": 10.8768, + "loss/aux_loss": 0.048069261759519574, + "loss/crossentropy": 2.6103322327136995, + "loss/logits": 0.824969407916069, + "step": 64280 + }, + { + "epoch": 0.6429, + "grad_norm": 15.125, + "grad_norm_var": 0.39791666666666664, + "learning_rate": 0.0003, + "loss": 10.9467, + "loss/aux_loss": 0.04807145707309246, + "loss/crossentropy": 2.5910118997097014, + "loss/logits": 0.7979099124670028, + "step": 64290 + }, + { + "epoch": 0.643, + "grad_norm": 14.6875, + "grad_norm_var": 0.7320149739583334, + "learning_rate": 0.0003, + "loss": 10.9267, + "loss/aux_loss": 0.048070203140378, + "loss/crossentropy": 2.789473479986191, + "loss/logits": 0.7883479207754135, + "step": 64300 + }, + { + "epoch": 0.6431, + "grad_norm": 15.25, + "grad_norm_var": 1.0072916666666667, + "learning_rate": 0.0003, + "loss": 10.8941, + "loss/aux_loss": 0.04807113241404295, + "loss/crossentropy": 2.706324911117554, + "loss/logits": 0.8335605084896087, + "step": 64310 + }, + { + "epoch": 0.6432, + "grad_norm": 16.5, + "grad_norm_var": 517.822509765625, + "learning_rate": 0.0003, + "loss": 10.9202, + "loss/aux_loss": 0.04807917233556509, + "loss/crossentropy": 2.5871843814849855, + "loss/logits": 0.7983285367488862, + "step": 64320 + }, + { + "epoch": 0.6433, + "grad_norm": 16.375, + "grad_norm_var": 1.3219889322916667, + "learning_rate": 0.0003, + "loss": 10.8152, + "loss/aux_loss": 0.04806725718080997, + "loss/crossentropy": 2.714830732345581, + "loss/logits": 0.7972691237926484, + "step": 64330 + }, + { + "epoch": 0.6434, + "grad_norm": 14.9375, + "grad_norm_var": 0.44464518229166666, + "learning_rate": 0.0003, + "loss": 11.0088, + "loss/aux_loss": 0.048060521483421326, + "loss/crossentropy": 2.863740932941437, + "loss/logits": 0.8662774622440338, + "step": 64340 + }, + { + "epoch": 0.6435, + "grad_norm": 14.375, + "grad_norm_var": 0.461572265625, + "learning_rate": 0.0003, + "loss": 10.8927, + "loss/aux_loss": 0.048076084814965725, + "loss/crossentropy": 2.6789645075798036, + "loss/logits": 0.8414348632097244, + "step": 64350 + }, + { + "epoch": 0.6436, + "grad_norm": 13.5, + "grad_norm_var": 0.2843098958333333, + "learning_rate": 0.0003, + "loss": 10.7573, + "loss/aux_loss": 0.04807751737535, + "loss/crossentropy": 2.5758812725543976, + "loss/logits": 0.7923395410180092, + "step": 64360 + }, + { + "epoch": 0.6437, + "grad_norm": 14.8125, + "grad_norm_var": 0.5059733072916667, + "learning_rate": 0.0003, + "loss": 10.9254, + "loss/aux_loss": 0.04806223139166832, + "loss/crossentropy": 2.824587380886078, + "loss/logits": 0.779283007979393, + "step": 64370 + }, + { + "epoch": 0.6438, + "grad_norm": 15.9375, + "grad_norm_var": 0.23162434895833334, + "learning_rate": 0.0003, + "loss": 11.1465, + "loss/aux_loss": 0.04806876201182604, + "loss/crossentropy": 2.7761879444122313, + "loss/logits": 0.8234198421239853, + "step": 64380 + }, + { + "epoch": 0.6439, + "grad_norm": 14.25, + "grad_norm_var": 23.5265625, + "learning_rate": 0.0003, + "loss": 10.8994, + "loss/aux_loss": 0.048068377934396264, + "loss/crossentropy": 2.849505627155304, + "loss/logits": 0.7996205180883408, + "step": 64390 + }, + { + "epoch": 0.644, + "grad_norm": 14.875, + "grad_norm_var": 24.1228515625, + "learning_rate": 0.0003, + "loss": 10.8842, + "loss/aux_loss": 0.04806913807988167, + "loss/crossentropy": 2.699050772190094, + "loss/logits": 0.8141772150993347, + "step": 64400 + }, + { + "epoch": 0.6441, + "grad_norm": 14.9375, + "grad_norm_var": 0.29635416666666664, + "learning_rate": 0.0003, + "loss": 10.8829, + "loss/aux_loss": 0.04808142352849245, + "loss/crossentropy": 2.6662961184978484, + "loss/logits": 0.8306810945272446, + "step": 64410 + }, + { + "epoch": 0.6442, + "grad_norm": 15.25, + "grad_norm_var": 0.5077473958333333, + "learning_rate": 0.0003, + "loss": 10.8389, + "loss/aux_loss": 0.0480579923838377, + "loss/crossentropy": 2.5457189321517943, + "loss/logits": 0.7988121956586838, + "step": 64420 + }, + { + "epoch": 0.6443, + "grad_norm": 14.5, + "grad_norm_var": 0.24060872395833333, + "learning_rate": 0.0003, + "loss": 10.9997, + "loss/aux_loss": 0.048068817704916, + "loss/crossentropy": 2.6517822682857513, + "loss/logits": 0.7966990500688553, + "step": 64430 + }, + { + "epoch": 0.6444, + "grad_norm": 15.75, + "grad_norm_var": 0.5145182291666667, + "learning_rate": 0.0003, + "loss": 10.9814, + "loss/aux_loss": 0.048084283247590065, + "loss/crossentropy": 2.663684105873108, + "loss/logits": 0.8236712843179703, + "step": 64440 + }, + { + "epoch": 0.6445, + "grad_norm": 16.0, + "grad_norm_var": 0.5385416666666667, + "learning_rate": 0.0003, + "loss": 11.0452, + "loss/aux_loss": 0.04805535394698381, + "loss/crossentropy": 2.7321596264839174, + "loss/logits": 0.81631198823452, + "step": 64450 + }, + { + "epoch": 0.6446, + "grad_norm": 16.0, + "grad_norm_var": 0.3385416666666667, + "learning_rate": 0.0003, + "loss": 10.7714, + "loss/aux_loss": 0.0480718620121479, + "loss/crossentropy": 2.5779692411422728, + "loss/logits": 0.7793363690376282, + "step": 64460 + }, + { + "epoch": 0.6447, + "grad_norm": 13.9375, + "grad_norm_var": 0.6223795572916667, + "learning_rate": 0.0003, + "loss": 10.9604, + "loss/aux_loss": 0.048071989230811595, + "loss/crossentropy": 2.749948966503143, + "loss/logits": 0.8526365518569946, + "step": 64470 + }, + { + "epoch": 0.6448, + "grad_norm": 15.875, + "grad_norm_var": 0.33229166666666665, + "learning_rate": 0.0003, + "loss": 10.9431, + "loss/aux_loss": 0.04805717971175909, + "loss/crossentropy": 2.7506559550762177, + "loss/logits": 0.8254747807979583, + "step": 64480 + }, + { + "epoch": 0.6449, + "grad_norm": 15.125, + "grad_norm_var": 0.2518229166666667, + "learning_rate": 0.0003, + "loss": 11.1098, + "loss/aux_loss": 0.04806588124483824, + "loss/crossentropy": 2.728802466392517, + "loss/logits": 0.8510254561901093, + "step": 64490 + }, + { + "epoch": 0.645, + "grad_norm": 14.1875, + "grad_norm_var": 0.5056640625, + "learning_rate": 0.0003, + "loss": 11.0118, + "loss/aux_loss": 0.04808861147612333, + "loss/crossentropy": 2.828171968460083, + "loss/logits": 0.8330163925886154, + "step": 64500 + }, + { + "epoch": 0.6451, + "grad_norm": 15.0625, + "grad_norm_var": 0.24659830729166668, + "learning_rate": 0.0003, + "loss": 10.6805, + "loss/aux_loss": 0.048062241077423094, + "loss/crossentropy": 2.703647696971893, + "loss/logits": 0.8099043250083924, + "step": 64510 + }, + { + "epoch": 0.6452, + "grad_norm": 14.1875, + "grad_norm_var": 0.42967122395833335, + "learning_rate": 0.0003, + "loss": 10.9858, + "loss/aux_loss": 0.048076873645186424, + "loss/crossentropy": 2.5978757619857786, + "loss/logits": 0.807657128572464, + "step": 64520 + }, + { + "epoch": 0.6453, + "grad_norm": 16.25, + "grad_norm_var": 106.27864583333333, + "learning_rate": 0.0003, + "loss": 10.9333, + "loss/aux_loss": 0.04806945752352476, + "loss/crossentropy": 2.7974547028541563, + "loss/logits": 0.8342884957790375, + "step": 64530 + }, + { + "epoch": 0.6454, + "grad_norm": 14.3125, + "grad_norm_var": 1.3356770833333333, + "learning_rate": 0.0003, + "loss": 10.9232, + "loss/aux_loss": 0.04806990846991539, + "loss/crossentropy": 2.650894695520401, + "loss/logits": 0.7819722086191178, + "step": 64540 + }, + { + "epoch": 0.6455, + "grad_norm": 15.375, + "grad_norm_var": 0.658447265625, + "learning_rate": 0.0003, + "loss": 10.9033, + "loss/aux_loss": 0.04806603621691465, + "loss/crossentropy": 2.7686345756053923, + "loss/logits": 0.7982172280550003, + "step": 64550 + }, + { + "epoch": 0.6456, + "grad_norm": 14.6875, + "grad_norm_var": 0.490087890625, + "learning_rate": 0.0003, + "loss": 10.7961, + "loss/aux_loss": 0.048073521070182326, + "loss/crossentropy": 2.632987970113754, + "loss/logits": 0.8115098506212235, + "step": 64560 + }, + { + "epoch": 0.6457, + "grad_norm": 15.5, + "grad_norm_var": 1.2235514322916667, + "learning_rate": 0.0003, + "loss": 10.9672, + "loss/aux_loss": 0.048069255985319616, + "loss/crossentropy": 2.612694835662842, + "loss/logits": 0.8157520830631256, + "step": 64570 + }, + { + "epoch": 0.6458, + "grad_norm": 14.75, + "grad_norm_var": 0.765087890625, + "learning_rate": 0.0003, + "loss": 10.9776, + "loss/aux_loss": 0.04807736426591873, + "loss/crossentropy": 2.5973219871520996, + "loss/logits": 0.7989464849233627, + "step": 64580 + }, + { + "epoch": 0.6459, + "grad_norm": 17.125, + "grad_norm_var": 0.5703125, + "learning_rate": 0.0003, + "loss": 10.88, + "loss/aux_loss": 0.048061249777674675, + "loss/crossentropy": 2.6475314140319823, + "loss/logits": 0.7981739670038224, + "step": 64590 + }, + { + "epoch": 0.646, + "grad_norm": 15.1875, + "grad_norm_var": 0.57109375, + "learning_rate": 0.0003, + "loss": 10.8287, + "loss/aux_loss": 0.048082700744271276, + "loss/crossentropy": 2.8341873228549956, + "loss/logits": 0.8232771545648575, + "step": 64600 + }, + { + "epoch": 0.6461, + "grad_norm": 16.25, + "grad_norm_var": 1.3593587239583333, + "learning_rate": 0.0003, + "loss": 10.7596, + "loss/aux_loss": 0.04807372409850359, + "loss/crossentropy": 2.4986962258815764, + "loss/logits": 0.7847854226827622, + "step": 64610 + }, + { + "epoch": 0.6462, + "grad_norm": 17.375, + "grad_norm_var": 1.5445149739583333, + "learning_rate": 0.0003, + "loss": 10.8791, + "loss/aux_loss": 0.04804991818964481, + "loss/crossentropy": 2.841459035873413, + "loss/logits": 0.8310028403997421, + "step": 64620 + }, + { + "epoch": 0.6463, + "grad_norm": 16.375, + "grad_norm_var": 0.7822265625, + "learning_rate": 0.0003, + "loss": 10.9785, + "loss/aux_loss": 0.048074251785874364, + "loss/crossentropy": 2.6962937235832216, + "loss/logits": 0.7954140931367875, + "step": 64630 + }, + { + "epoch": 0.6464, + "grad_norm": 14.875, + "grad_norm_var": 0.6072265625, + "learning_rate": 0.0003, + "loss": 11.0638, + "loss/aux_loss": 0.04808508362621069, + "loss/crossentropy": 2.764870321750641, + "loss/logits": 0.8515766054391861, + "step": 64640 + }, + { + "epoch": 0.6465, + "grad_norm": 15.6875, + "grad_norm_var": 0.8242024739583333, + "learning_rate": 0.0003, + "loss": 11.0148, + "loss/aux_loss": 0.04806652627885342, + "loss/crossentropy": 2.594625836610794, + "loss/logits": 0.7873844116926193, + "step": 64650 + }, + { + "epoch": 0.6466, + "grad_norm": 15.1875, + "grad_norm_var": 0.69609375, + "learning_rate": 0.0003, + "loss": 10.8542, + "loss/aux_loss": 0.048073071800172326, + "loss/crossentropy": 2.815520566701889, + "loss/logits": 0.8115405261516571, + "step": 64660 + }, + { + "epoch": 0.6467, + "grad_norm": 14.75, + "grad_norm_var": 0.25323893229166666, + "learning_rate": 0.0003, + "loss": 10.866, + "loss/aux_loss": 0.048070530965924264, + "loss/crossentropy": 2.541173154115677, + "loss/logits": 0.8035361468791962, + "step": 64670 + }, + { + "epoch": 0.6468, + "grad_norm": 14.6875, + "grad_norm_var": 3.2783854166666666, + "learning_rate": 0.0003, + "loss": 10.7697, + "loss/aux_loss": 0.04807205218821764, + "loss/crossentropy": 2.77914103269577, + "loss/logits": 0.8178933262825012, + "step": 64680 + }, + { + "epoch": 0.6469, + "grad_norm": 14.9375, + "grad_norm_var": 1.8332682291666667, + "learning_rate": 0.0003, + "loss": 10.9381, + "loss/aux_loss": 0.048067630268633366, + "loss/crossentropy": 2.6256860315799715, + "loss/logits": 0.833366334438324, + "step": 64690 + }, + { + "epoch": 0.647, + "grad_norm": 14.6875, + "grad_norm_var": 2.178759765625, + "learning_rate": 0.0003, + "loss": 11.0279, + "loss/aux_loss": 0.04807144869118929, + "loss/crossentropy": 2.684089946746826, + "loss/logits": 0.8047218829393387, + "step": 64700 + }, + { + "epoch": 0.6471, + "grad_norm": 15.3125, + "grad_norm_var": 0.6446451822916667, + "learning_rate": 0.0003, + "loss": 10.895, + "loss/aux_loss": 0.04806271083652973, + "loss/crossentropy": 2.5051504015922545, + "loss/logits": 0.814690887928009, + "step": 64710 + }, + { + "epoch": 0.6472, + "grad_norm": 15.875, + "grad_norm_var": 0.5378743489583333, + "learning_rate": 0.0003, + "loss": 11.0694, + "loss/aux_loss": 0.04807141367346048, + "loss/crossentropy": 2.6718297123909, + "loss/logits": 0.8078697264194489, + "step": 64720 + }, + { + "epoch": 0.6473, + "grad_norm": 15.1875, + "grad_norm_var": 0.5561848958333333, + "learning_rate": 0.0003, + "loss": 11.1501, + "loss/aux_loss": 0.04807430915534496, + "loss/crossentropy": 2.7116922199726106, + "loss/logits": 0.8617299765348434, + "step": 64730 + }, + { + "epoch": 0.6474, + "grad_norm": 16.75, + "grad_norm_var": 0.32024739583333334, + "learning_rate": 0.0003, + "loss": 11.0262, + "loss/aux_loss": 0.04805788192898035, + "loss/crossentropy": 2.7382602095603943, + "loss/logits": 0.8403635859489441, + "step": 64740 + }, + { + "epoch": 0.6475, + "grad_norm": 14.625, + "grad_norm_var": 0.27708333333333335, + "learning_rate": 0.0003, + "loss": 10.9941, + "loss/aux_loss": 0.048070405051112174, + "loss/crossentropy": 2.7188424825668336, + "loss/logits": 0.7995573878288269, + "step": 64750 + }, + { + "epoch": 0.6476, + "grad_norm": 15.5625, + "grad_norm_var": 0.367431640625, + "learning_rate": 0.0003, + "loss": 10.9026, + "loss/aux_loss": 0.04807642940431833, + "loss/crossentropy": 2.7417497992515565, + "loss/logits": 0.8509037971496582, + "step": 64760 + }, + { + "epoch": 0.6477, + "grad_norm": 15.5, + "grad_norm_var": 1.25546875, + "learning_rate": 0.0003, + "loss": 10.9626, + "loss/aux_loss": 0.0480690760537982, + "loss/crossentropy": 2.647187089920044, + "loss/logits": 0.8061125695705413, + "step": 64770 + }, + { + "epoch": 0.6478, + "grad_norm": 16.625, + "grad_norm_var": 0.746728515625, + "learning_rate": 0.0003, + "loss": 10.9044, + "loss/aux_loss": 0.048072075471282005, + "loss/crossentropy": 2.6679067850112914, + "loss/logits": 0.8102442860603333, + "step": 64780 + }, + { + "epoch": 0.6479, + "grad_norm": 14.3125, + "grad_norm_var": 0.6597493489583334, + "learning_rate": 0.0003, + "loss": 10.8774, + "loss/aux_loss": 0.04807139951735735, + "loss/crossentropy": 2.679821991920471, + "loss/logits": 0.8046330511569977, + "step": 64790 + }, + { + "epoch": 0.648, + "grad_norm": 15.375, + "grad_norm_var": 0.49777018229166664, + "learning_rate": 0.0003, + "loss": 10.8229, + "loss/aux_loss": 0.04807682540267706, + "loss/crossentropy": 2.7562019169330596, + "loss/logits": 0.8137243837118149, + "step": 64800 + }, + { + "epoch": 0.6481, + "grad_norm": 14.375, + "grad_norm_var": 0.509619140625, + "learning_rate": 0.0003, + "loss": 10.8284, + "loss/aux_loss": 0.048072621785104276, + "loss/crossentropy": 2.723770010471344, + "loss/logits": 0.8341768980026245, + "step": 64810 + }, + { + "epoch": 0.6482, + "grad_norm": 16.0, + "grad_norm_var": 0.34230143229166665, + "learning_rate": 0.0003, + "loss": 10.9179, + "loss/aux_loss": 0.04807199724018574, + "loss/crossentropy": 2.5898614048957826, + "loss/logits": 0.8297031134366989, + "step": 64820 + }, + { + "epoch": 0.6483, + "grad_norm": 15.1875, + "grad_norm_var": 0.40089518229166665, + "learning_rate": 0.0003, + "loss": 10.7685, + "loss/aux_loss": 0.04806529227644205, + "loss/crossentropy": 2.736210232973099, + "loss/logits": 0.8199382722377777, + "step": 64830 + }, + { + "epoch": 0.6484, + "grad_norm": 15.5, + "grad_norm_var": 0.6505208333333333, + "learning_rate": 0.0003, + "loss": 10.8731, + "loss/aux_loss": 0.04806740824133158, + "loss/crossentropy": 2.7414426445960998, + "loss/logits": 0.829671436548233, + "step": 64840 + }, + { + "epoch": 0.6485, + "grad_norm": 15.5, + "grad_norm_var": 0.7796875, + "learning_rate": 0.0003, + "loss": 11.1061, + "loss/aux_loss": 0.04807157013565302, + "loss/crossentropy": 2.6303915977478027, + "loss/logits": 0.8207491040229797, + "step": 64850 + }, + { + "epoch": 0.6486, + "grad_norm": 16.375, + "grad_norm_var": 0.650244140625, + "learning_rate": 0.0003, + "loss": 10.8262, + "loss/aux_loss": 0.04807078931480646, + "loss/crossentropy": 2.7495046079158785, + "loss/logits": 0.8125106036663056, + "step": 64860 + }, + { + "epoch": 0.6487, + "grad_norm": 14.5625, + "grad_norm_var": 0.374072265625, + "learning_rate": 0.0003, + "loss": 10.8215, + "loss/aux_loss": 0.04806679226458073, + "loss/crossentropy": 2.52020383477211, + "loss/logits": 0.7938949555158615, + "step": 64870 + }, + { + "epoch": 0.6488, + "grad_norm": 14.875, + "grad_norm_var": 0.44680989583333336, + "learning_rate": 0.0003, + "loss": 10.981, + "loss/aux_loss": 0.04807438552379608, + "loss/crossentropy": 2.713279777765274, + "loss/logits": 0.8202762633562088, + "step": 64880 + }, + { + "epoch": 0.6489, + "grad_norm": 15.3125, + "grad_norm_var": 0.53046875, + "learning_rate": 0.0003, + "loss": 10.8555, + "loss/aux_loss": 0.048073222115635875, + "loss/crossentropy": 2.6626059472560883, + "loss/logits": 0.8038224250078201, + "step": 64890 + }, + { + "epoch": 0.649, + "grad_norm": 16.0, + "grad_norm_var": 0.2659993489583333, + "learning_rate": 0.0003, + "loss": 10.8778, + "loss/aux_loss": 0.04806961547583342, + "loss/crossentropy": 2.7846663117408754, + "loss/logits": 0.8211091995239258, + "step": 64900 + }, + { + "epoch": 0.6491, + "grad_norm": 14.25, + "grad_norm_var": 0.40260416666666665, + "learning_rate": 0.0003, + "loss": 10.8055, + "loss/aux_loss": 0.04807712137699127, + "loss/crossentropy": 2.6188452005386353, + "loss/logits": 0.7841671526432037, + "step": 64910 + }, + { + "epoch": 0.6492, + "grad_norm": 15.375, + "grad_norm_var": 0.5683430989583333, + "learning_rate": 0.0003, + "loss": 10.8163, + "loss/aux_loss": 0.04806164372712374, + "loss/crossentropy": 2.6171076774597166, + "loss/logits": 0.8012127339839935, + "step": 64920 + }, + { + "epoch": 0.6493, + "grad_norm": 15.25, + "grad_norm_var": 0.430322265625, + "learning_rate": 0.0003, + "loss": 10.894, + "loss/aux_loss": 0.04807732943445444, + "loss/crossentropy": 2.6409548163414, + "loss/logits": 0.8105780005455017, + "step": 64930 + }, + { + "epoch": 0.6494, + "grad_norm": 15.6875, + "grad_norm_var": 0.6157389322916667, + "learning_rate": 0.0003, + "loss": 10.8077, + "loss/aux_loss": 0.04806876610964537, + "loss/crossentropy": 2.5447192013263704, + "loss/logits": 0.7904939085245133, + "step": 64940 + }, + { + "epoch": 0.6495, + "grad_norm": 17.0, + "grad_norm_var": 0.6981770833333333, + "learning_rate": 0.0003, + "loss": 11.0475, + "loss/aux_loss": 0.048071413300931454, + "loss/crossentropy": 2.706156146526337, + "loss/logits": 0.8606253623962402, + "step": 64950 + }, + { + "epoch": 0.6496, + "grad_norm": 16.25, + "grad_norm_var": 0.6577962239583334, + "learning_rate": 0.0003, + "loss": 10.9165, + "loss/aux_loss": 0.04807477165013552, + "loss/crossentropy": 2.676371121406555, + "loss/logits": 0.8254680544137954, + "step": 64960 + }, + { + "epoch": 0.6497, + "grad_norm": 16.0, + "grad_norm_var": 0.37161458333333336, + "learning_rate": 0.0003, + "loss": 10.9431, + "loss/aux_loss": 0.048062054254114625, + "loss/crossentropy": 2.687245047092438, + "loss/logits": 0.8192354500293731, + "step": 64970 + }, + { + "epoch": 0.6498, + "grad_norm": 14.6875, + "grad_norm_var": 0.697900390625, + "learning_rate": 0.0003, + "loss": 10.8736, + "loss/aux_loss": 0.04806843213737011, + "loss/crossentropy": 2.6569925785064696, + "loss/logits": 0.8111906111240387, + "step": 64980 + }, + { + "epoch": 0.6499, + "grad_norm": 16.25, + "grad_norm_var": 0.6346354166666667, + "learning_rate": 0.0003, + "loss": 10.8889, + "loss/aux_loss": 0.04806424044072628, + "loss/crossentropy": 2.7027989625930786, + "loss/logits": 0.8000789701938629, + "step": 64990 + }, + { + "epoch": 0.65, + "grad_norm": 75.5, + "grad_norm_var": 223.42849934895833, + "learning_rate": 0.0003, + "loss": 10.9805, + "loss/aux_loss": 0.048073144629597664, + "loss/crossentropy": 2.808467972278595, + "loss/logits": 0.8571766018867493, + "step": 65000 + }, + { + "epoch": 0.6501, + "grad_norm": 16.0, + "grad_norm_var": 220.27537434895834, + "learning_rate": 0.0003, + "loss": 10.8087, + "loss/aux_loss": 0.048067417740821836, + "loss/crossentropy": 2.7439105987548826, + "loss/logits": 0.824249017238617, + "step": 65010 + }, + { + "epoch": 0.6502, + "grad_norm": 14.8125, + "grad_norm_var": 0.4066243489583333, + "learning_rate": 0.0003, + "loss": 10.795, + "loss/aux_loss": 0.048070468753576276, + "loss/crossentropy": 2.763742119073868, + "loss/logits": 0.8238922148942948, + "step": 65020 + }, + { + "epoch": 0.6503, + "grad_norm": 14.75, + "grad_norm_var": 0.5936848958333333, + "learning_rate": 0.0003, + "loss": 10.9753, + "loss/aux_loss": 0.04807634837925434, + "loss/crossentropy": 2.721911084651947, + "loss/logits": 0.8021749824285507, + "step": 65030 + }, + { + "epoch": 0.6504, + "grad_norm": 14.9375, + "grad_norm_var": 7.0462890625, + "learning_rate": 0.0003, + "loss": 10.7693, + "loss/aux_loss": 0.048060786351561545, + "loss/crossentropy": 2.63610897064209, + "loss/logits": 0.8058023959398269, + "step": 65040 + }, + { + "epoch": 0.6505, + "grad_norm": 15.8125, + "grad_norm_var": 0.7958333333333333, + "learning_rate": 0.0003, + "loss": 10.9499, + "loss/aux_loss": 0.0480716660618782, + "loss/crossentropy": 2.7749450325965883, + "loss/logits": 0.8092481285333634, + "step": 65050 + }, + { + "epoch": 0.6506, + "grad_norm": 15.1875, + "grad_norm_var": 0.32081705729166665, + "learning_rate": 0.0003, + "loss": 10.9791, + "loss/aux_loss": 0.048079953715205195, + "loss/crossentropy": 2.901500105857849, + "loss/logits": 0.8558017522096634, + "step": 65060 + }, + { + "epoch": 0.6507, + "grad_norm": 14.625, + "grad_norm_var": 0.49420572916666666, + "learning_rate": 0.0003, + "loss": 10.886, + "loss/aux_loss": 0.04804853610694408, + "loss/crossentropy": 2.8315212607383726, + "loss/logits": 0.8276244908571243, + "step": 65070 + }, + { + "epoch": 0.6508, + "grad_norm": 14.9375, + "grad_norm_var": 1.0609375, + "learning_rate": 0.0003, + "loss": 10.9138, + "loss/aux_loss": 0.048084153421223165, + "loss/crossentropy": 2.681584632396698, + "loss/logits": 0.7945889711380005, + "step": 65080 + }, + { + "epoch": 0.6509, + "grad_norm": 15.625, + "grad_norm_var": 0.94609375, + "learning_rate": 0.0003, + "loss": 10.9791, + "loss/aux_loss": 0.04806964471936226, + "loss/crossentropy": 2.7207436323165894, + "loss/logits": 0.8214473009109498, + "step": 65090 + }, + { + "epoch": 0.651, + "grad_norm": 16.125, + "grad_norm_var": 1.1119791666666667, + "learning_rate": 0.0003, + "loss": 10.9402, + "loss/aux_loss": 0.048057034611701965, + "loss/crossentropy": 2.6684858202934265, + "loss/logits": 0.7792475908994675, + "step": 65100 + }, + { + "epoch": 0.6511, + "grad_norm": 15.25, + "grad_norm_var": 1.1333333333333333, + "learning_rate": 0.0003, + "loss": 10.9291, + "loss/aux_loss": 0.04806646164506674, + "loss/crossentropy": 2.779614543914795, + "loss/logits": 0.8046439945697784, + "step": 65110 + }, + { + "epoch": 0.6512, + "grad_norm": 16.0, + "grad_norm_var": 1.5386555989583333, + "learning_rate": 0.0003, + "loss": 11.059, + "loss/aux_loss": 0.048073740862309935, + "loss/crossentropy": 2.8252889752388, + "loss/logits": 0.7956268131732941, + "step": 65120 + }, + { + "epoch": 0.6513, + "grad_norm": 14.75, + "grad_norm_var": 0.410791015625, + "learning_rate": 0.0003, + "loss": 11.017, + "loss/aux_loss": 0.048064406216144565, + "loss/crossentropy": 2.746646058559418, + "loss/logits": 0.8085185199975967, + "step": 65130 + }, + { + "epoch": 0.6514, + "grad_norm": 14.0625, + "grad_norm_var": 0.24296875, + "learning_rate": 0.0003, + "loss": 10.9533, + "loss/aux_loss": 0.04806833751499653, + "loss/crossentropy": 2.7476025104522703, + "loss/logits": 0.7968869656324387, + "step": 65140 + }, + { + "epoch": 0.6515, + "grad_norm": 15.0625, + "grad_norm_var": 0.35857747395833334, + "learning_rate": 0.0003, + "loss": 10.7864, + "loss/aux_loss": 0.04806720819324255, + "loss/crossentropy": 2.7668261766433715, + "loss/logits": 0.7852804720401764, + "step": 65150 + }, + { + "epoch": 0.6516, + "grad_norm": 14.6875, + "grad_norm_var": 0.259228515625, + "learning_rate": 0.0003, + "loss": 10.6857, + "loss/aux_loss": 0.048074799962341784, + "loss/crossentropy": 2.6271615505218504, + "loss/logits": 0.7937258869409561, + "step": 65160 + }, + { + "epoch": 0.6517, + "grad_norm": 14.625, + "grad_norm_var": 0.27858072916666665, + "learning_rate": 0.0003, + "loss": 11.0121, + "loss/aux_loss": 0.048050605691969395, + "loss/crossentropy": 2.762240695953369, + "loss/logits": 0.8204376786947251, + "step": 65170 + }, + { + "epoch": 0.6518, + "grad_norm": 14.1875, + "grad_norm_var": 0.278125, + "learning_rate": 0.0003, + "loss": 10.9931, + "loss/aux_loss": 0.048072893917560575, + "loss/crossentropy": 2.6644616603851317, + "loss/logits": 0.826135328412056, + "step": 65180 + }, + { + "epoch": 0.6519, + "grad_norm": 14.75, + "grad_norm_var": 0.7619140625, + "learning_rate": 0.0003, + "loss": 10.9311, + "loss/aux_loss": 0.04807901922613382, + "loss/crossentropy": 2.5317931294441225, + "loss/logits": 0.7690812319517135, + "step": 65190 + }, + { + "epoch": 0.652, + "grad_norm": 15.125, + "grad_norm_var": 0.4796875, + "learning_rate": 0.0003, + "loss": 10.795, + "loss/aux_loss": 0.04806858953088522, + "loss/crossentropy": 2.638898861408234, + "loss/logits": 0.7892222136259079, + "step": 65200 + }, + { + "epoch": 0.6521, + "grad_norm": 16.0, + "grad_norm_var": 0.9770182291666667, + "learning_rate": 0.0003, + "loss": 10.888, + "loss/aux_loss": 0.048061699606478214, + "loss/crossentropy": 2.654205119609833, + "loss/logits": 0.8139635503292084, + "step": 65210 + }, + { + "epoch": 0.6522, + "grad_norm": 15.3125, + "grad_norm_var": 4.557535807291667, + "learning_rate": 0.0003, + "loss": 10.9919, + "loss/aux_loss": 0.04807271305471659, + "loss/crossentropy": 2.6672019481658937, + "loss/logits": 0.7892401427030563, + "step": 65220 + }, + { + "epoch": 0.6523, + "grad_norm": 15.625, + "grad_norm_var": 2.3764973958333333, + "learning_rate": 0.0003, + "loss": 10.718, + "loss/aux_loss": 0.04806781802326441, + "loss/crossentropy": 2.7000105381011963, + "loss/logits": 0.7944419324398041, + "step": 65230 + }, + { + "epoch": 0.6524, + "grad_norm": 14.5, + "grad_norm_var": 0.5551920572916667, + "learning_rate": 0.0003, + "loss": 10.8404, + "loss/aux_loss": 0.04805983956903219, + "loss/crossentropy": 2.5622940182685854, + "loss/logits": 0.7859783351421357, + "step": 65240 + }, + { + "epoch": 0.6525, + "grad_norm": 14.8125, + "grad_norm_var": 0.9098307291666666, + "learning_rate": 0.0003, + "loss": 11.0443, + "loss/aux_loss": 0.048075672797858716, + "loss/crossentropy": 2.7040555834770204, + "loss/logits": 0.8166841179132461, + "step": 65250 + }, + { + "epoch": 0.6526, + "grad_norm": 15.8125, + "grad_norm_var": 0.656103515625, + "learning_rate": 0.0003, + "loss": 10.8139, + "loss/aux_loss": 0.048063617758452894, + "loss/crossentropy": 2.736673855781555, + "loss/logits": 0.7982501238584518, + "step": 65260 + }, + { + "epoch": 0.6527, + "grad_norm": 16.125, + "grad_norm_var": 0.4697265625, + "learning_rate": 0.0003, + "loss": 10.9216, + "loss/aux_loss": 0.04806849732995033, + "loss/crossentropy": 2.7332601666450502, + "loss/logits": 0.8461134731769562, + "step": 65270 + }, + { + "epoch": 0.6528, + "grad_norm": 14.6875, + "grad_norm_var": 0.9911458333333333, + "learning_rate": 0.0003, + "loss": 10.9565, + "loss/aux_loss": 0.04807373881340027, + "loss/crossentropy": 2.682792294025421, + "loss/logits": 0.8356576085090637, + "step": 65280 + }, + { + "epoch": 0.6529, + "grad_norm": 15.125, + "grad_norm_var": 1.2997395833333334, + "learning_rate": 0.0003, + "loss": 10.8412, + "loss/aux_loss": 0.048058380000293256, + "loss/crossentropy": 2.598079466819763, + "loss/logits": 0.8082356095314026, + "step": 65290 + }, + { + "epoch": 0.653, + "grad_norm": 14.875, + "grad_norm_var": 1.1613118489583334, + "learning_rate": 0.0003, + "loss": 10.8663, + "loss/aux_loss": 0.04806201159954071, + "loss/crossentropy": 2.6836532950401306, + "loss/logits": 0.7990910440683365, + "step": 65300 + }, + { + "epoch": 0.6531, + "grad_norm": 14.8125, + "grad_norm_var": 0.5763020833333333, + "learning_rate": 0.0003, + "loss": 10.9157, + "loss/aux_loss": 0.04807909373193979, + "loss/crossentropy": 2.7008519947528837, + "loss/logits": 0.8079722136259079, + "step": 65310 + }, + { + "epoch": 0.6532, + "grad_norm": 13.75, + "grad_norm_var": 0.53671875, + "learning_rate": 0.0003, + "loss": 10.8207, + "loss/aux_loss": 0.04807158559560776, + "loss/crossentropy": 2.5901060104370117, + "loss/logits": 0.8299726933240891, + "step": 65320 + }, + { + "epoch": 0.6533, + "grad_norm": 16.0, + "grad_norm_var": 0.8528645833333334, + "learning_rate": 0.0003, + "loss": 10.891, + "loss/aux_loss": 0.0480627816170454, + "loss/crossentropy": 2.6457729578018188, + "loss/logits": 0.8211910486221313, + "step": 65330 + }, + { + "epoch": 0.6534, + "grad_norm": 14.875, + "grad_norm_var": 0.904150390625, + "learning_rate": 0.0003, + "loss": 10.8124, + "loss/aux_loss": 0.04807421285659075, + "loss/crossentropy": 2.6744938433170318, + "loss/logits": 0.7985322535037994, + "step": 65340 + }, + { + "epoch": 0.6535, + "grad_norm": 15.4375, + "grad_norm_var": 1.1171223958333334, + "learning_rate": 0.0003, + "loss": 10.6537, + "loss/aux_loss": 0.04807833898812532, + "loss/crossentropy": 2.623306268453598, + "loss/logits": 0.7695679128170013, + "step": 65350 + }, + { + "epoch": 0.6536, + "grad_norm": 14.75, + "grad_norm_var": 0.36248372395833334, + "learning_rate": 0.0003, + "loss": 10.7607, + "loss/aux_loss": 0.048066575266420844, + "loss/crossentropy": 2.7030728101730346, + "loss/logits": 0.8045616328716279, + "step": 65360 + }, + { + "epoch": 0.6537, + "grad_norm": 14.4375, + "grad_norm_var": 0.7317708333333334, + "learning_rate": 0.0003, + "loss": 11.0547, + "loss/aux_loss": 0.048070757277309896, + "loss/crossentropy": 2.8383954405784606, + "loss/logits": 0.8456792950630188, + "step": 65370 + }, + { + "epoch": 0.6538, + "grad_norm": 15.1875, + "grad_norm_var": 0.709228515625, + "learning_rate": 0.0003, + "loss": 10.8044, + "loss/aux_loss": 0.04807031229138374, + "loss/crossentropy": 2.7254865407943725, + "loss/logits": 0.8441527247428894, + "step": 65380 + }, + { + "epoch": 0.6539, + "grad_norm": 16.75, + "grad_norm_var": 0.485400390625, + "learning_rate": 0.0003, + "loss": 10.8881, + "loss/aux_loss": 0.04807039219886065, + "loss/crossentropy": 2.7737102448940276, + "loss/logits": 0.8173193544149399, + "step": 65390 + }, + { + "epoch": 0.654, + "grad_norm": 14.625, + "grad_norm_var": 0.7863932291666667, + "learning_rate": 0.0003, + "loss": 10.9235, + "loss/aux_loss": 0.04807139802724123, + "loss/crossentropy": 2.7199482560157775, + "loss/logits": 0.8173371762037277, + "step": 65400 + }, + { + "epoch": 0.6541, + "grad_norm": 14.75, + "grad_norm_var": 1.0648274739583334, + "learning_rate": 0.0003, + "loss": 10.9632, + "loss/aux_loss": 0.048072163760662076, + "loss/crossentropy": 2.7658130168914794, + "loss/logits": 0.8008842885494232, + "step": 65410 + }, + { + "epoch": 0.6542, + "grad_norm": 15.875, + "grad_norm_var": 0.55390625, + "learning_rate": 0.0003, + "loss": 10.9842, + "loss/aux_loss": 0.04805851969867945, + "loss/crossentropy": 2.8067606568336485, + "loss/logits": 0.7942769289016723, + "step": 65420 + }, + { + "epoch": 0.6543, + "grad_norm": 16.0, + "grad_norm_var": 0.332275390625, + "learning_rate": 0.0003, + "loss": 10.9043, + "loss/aux_loss": 0.048070518858730794, + "loss/crossentropy": 2.683800792694092, + "loss/logits": 0.8469008475542068, + "step": 65430 + }, + { + "epoch": 0.6544, + "grad_norm": 14.6875, + "grad_norm_var": 0.6820149739583333, + "learning_rate": 0.0003, + "loss": 10.9415, + "loss/aux_loss": 0.04807311110198498, + "loss/crossentropy": 2.7242295682430266, + "loss/logits": 0.8172059804201126, + "step": 65440 + }, + { + "epoch": 0.6545, + "grad_norm": 14.4375, + "grad_norm_var": 0.5926432291666667, + "learning_rate": 0.0003, + "loss": 10.7765, + "loss/aux_loss": 0.04806800615042448, + "loss/crossentropy": 2.7106892645359038, + "loss/logits": 0.8149084568023681, + "step": 65450 + }, + { + "epoch": 0.6546, + "grad_norm": 15.3125, + "grad_norm_var": 0.5994140625, + "learning_rate": 0.0003, + "loss": 11.0235, + "loss/aux_loss": 0.048058449663221835, + "loss/crossentropy": 2.6193589746952055, + "loss/logits": 0.8009304910898208, + "step": 65460 + }, + { + "epoch": 0.6547, + "grad_norm": 16.75, + "grad_norm_var": 0.7122233072916667, + "learning_rate": 0.0003, + "loss": 10.9139, + "loss/aux_loss": 0.04806790947914123, + "loss/crossentropy": 2.677752900123596, + "loss/logits": 0.8047443449497222, + "step": 65470 + }, + { + "epoch": 0.6548, + "grad_norm": 14.1875, + "grad_norm_var": 0.9540201822916666, + "learning_rate": 0.0003, + "loss": 10.966, + "loss/aux_loss": 0.04808517023921013, + "loss/crossentropy": 2.6315142631530763, + "loss/logits": 0.8178540676832199, + "step": 65480 + }, + { + "epoch": 0.6549, + "grad_norm": 15.875, + "grad_norm_var": 0.6160807291666667, + "learning_rate": 0.0003, + "loss": 10.8793, + "loss/aux_loss": 0.04805512484163046, + "loss/crossentropy": 2.75492285490036, + "loss/logits": 0.7663827478885651, + "step": 65490 + }, + { + "epoch": 0.655, + "grad_norm": 14.8125, + "grad_norm_var": 0.5440104166666667, + "learning_rate": 0.0003, + "loss": 10.945, + "loss/aux_loss": 0.048072614893317225, + "loss/crossentropy": 2.5011337757110597, + "loss/logits": 0.8080022811889649, + "step": 65500 + }, + { + "epoch": 0.6551, + "grad_norm": 15.5, + "grad_norm_var": 0.43203125, + "learning_rate": 0.0003, + "loss": 11.0347, + "loss/aux_loss": 0.048070020973682404, + "loss/crossentropy": 2.806613862514496, + "loss/logits": 0.8248602509498596, + "step": 65510 + }, + { + "epoch": 0.6552, + "grad_norm": 15.5, + "grad_norm_var": 0.265869140625, + "learning_rate": 0.0003, + "loss": 10.8658, + "loss/aux_loss": 0.048069593869149684, + "loss/crossentropy": 2.7143899381160734, + "loss/logits": 0.812701740860939, + "step": 65520 + }, + { + "epoch": 0.6553, + "grad_norm": 16.125, + "grad_norm_var": 0.42355143229166664, + "learning_rate": 0.0003, + "loss": 10.8261, + "loss/aux_loss": 0.0480674784630537, + "loss/crossentropy": 2.8173023641109465, + "loss/logits": 0.8293744832277298, + "step": 65530 + }, + { + "epoch": 0.6554, + "grad_norm": 14.8125, + "grad_norm_var": 1.043212890625, + "learning_rate": 0.0003, + "loss": 10.8558, + "loss/aux_loss": 0.04806223623454571, + "loss/crossentropy": 2.5952585637569427, + "loss/logits": 0.7964704751968383, + "step": 65540 + }, + { + "epoch": 0.6555, + "grad_norm": 15.875, + "grad_norm_var": 0.7447916666666666, + "learning_rate": 0.0003, + "loss": 10.8759, + "loss/aux_loss": 0.048076456785202025, + "loss/crossentropy": 2.5286275029182432, + "loss/logits": 0.8162722438573837, + "step": 65550 + }, + { + "epoch": 0.6556, + "grad_norm": 15.0625, + "grad_norm_var": 0.545556640625, + "learning_rate": 0.0003, + "loss": 10.8139, + "loss/aux_loss": 0.04805875848978758, + "loss/crossentropy": 2.9092560052871703, + "loss/logits": 0.8326963096857071, + "step": 65560 + }, + { + "epoch": 0.6557, + "grad_norm": 16.875, + "grad_norm_var": 0.45284830729166664, + "learning_rate": 0.0003, + "loss": 10.6835, + "loss/aux_loss": 0.04808024391531944, + "loss/crossentropy": 2.513635885715485, + "loss/logits": 0.7660587877035141, + "step": 65570 + }, + { + "epoch": 0.6558, + "grad_norm": 15.375, + "grad_norm_var": 0.9692545572916667, + "learning_rate": 0.0003, + "loss": 10.8846, + "loss/aux_loss": 0.04806702360510826, + "loss/crossentropy": 2.7251508593559266, + "loss/logits": 0.815480324625969, + "step": 65580 + }, + { + "epoch": 0.6559, + "grad_norm": 14.1875, + "grad_norm_var": 0.6831868489583334, + "learning_rate": 0.0003, + "loss": 10.8619, + "loss/aux_loss": 0.04806389529258013, + "loss/crossentropy": 2.5533951461315154, + "loss/logits": 0.7619587257504463, + "step": 65590 + }, + { + "epoch": 0.656, + "grad_norm": 15.5, + "grad_norm_var": 0.7244791666666667, + "learning_rate": 0.0003, + "loss": 10.8078, + "loss/aux_loss": 0.048077251948416236, + "loss/crossentropy": 2.7818902015686033, + "loss/logits": 0.8328968584537506, + "step": 65600 + }, + { + "epoch": 0.6561, + "grad_norm": 14.4375, + "grad_norm_var": 0.9035807291666667, + "learning_rate": 0.0003, + "loss": 10.8577, + "loss/aux_loss": 0.048065769299864766, + "loss/crossentropy": 2.6951212108135225, + "loss/logits": 0.7890205055475235, + "step": 65610 + }, + { + "epoch": 0.6562, + "grad_norm": 15.4375, + "grad_norm_var": 0.708837890625, + "learning_rate": 0.0003, + "loss": 11.0567, + "loss/aux_loss": 0.048072610050439835, + "loss/crossentropy": 2.7550642490386963, + "loss/logits": 0.8612349301576614, + "step": 65620 + }, + { + "epoch": 0.6563, + "grad_norm": 14.9375, + "grad_norm_var": 0.28880208333333335, + "learning_rate": 0.0003, + "loss": 11.0369, + "loss/aux_loss": 0.04806470815092325, + "loss/crossentropy": 2.7217436909675596, + "loss/logits": 0.8420159697532654, + "step": 65630 + }, + { + "epoch": 0.6564, + "grad_norm": 15.0625, + "grad_norm_var": 0.35618489583333335, + "learning_rate": 0.0003, + "loss": 10.9453, + "loss/aux_loss": 0.048069029301404956, + "loss/crossentropy": 2.7288358211517334, + "loss/logits": 0.8264297485351563, + "step": 65640 + }, + { + "epoch": 0.6565, + "grad_norm": 15.125, + "grad_norm_var": 1.4625, + "learning_rate": 0.0003, + "loss": 10.9144, + "loss/aux_loss": 0.048067984730005266, + "loss/crossentropy": 2.7028370201587677, + "loss/logits": 0.7846741080284119, + "step": 65650 + }, + { + "epoch": 0.6566, + "grad_norm": 14.9375, + "grad_norm_var": 46.948681640625, + "learning_rate": 0.0003, + "loss": 10.8727, + "loss/aux_loss": 0.04807305838912725, + "loss/crossentropy": 2.77775114774704, + "loss/logits": 0.8294332057237626, + "step": 65660 + }, + { + "epoch": 0.6567, + "grad_norm": 15.25, + "grad_norm_var": 172.750634765625, + "learning_rate": 0.0003, + "loss": 10.8284, + "loss/aux_loss": 0.04807068482041359, + "loss/crossentropy": 2.617089319229126, + "loss/logits": 0.8040230572223663, + "step": 65670 + }, + { + "epoch": 0.6568, + "grad_norm": 17.125, + "grad_norm_var": 12.295247395833334, + "learning_rate": 0.0003, + "loss": 10.8871, + "loss/aux_loss": 0.048083870112895964, + "loss/crossentropy": 2.7740320563316345, + "loss/logits": 0.8273166418075562, + "step": 65680 + }, + { + "epoch": 0.6569, + "grad_norm": 14.25, + "grad_norm_var": 0.57421875, + "learning_rate": 0.0003, + "loss": 10.8403, + "loss/aux_loss": 0.048068479262292386, + "loss/crossentropy": 2.646625280380249, + "loss/logits": 0.7698414534330368, + "step": 65690 + }, + { + "epoch": 0.657, + "grad_norm": 16.0, + "grad_norm_var": 0.8505208333333333, + "learning_rate": 0.0003, + "loss": 10.7472, + "loss/aux_loss": 0.04805990979075432, + "loss/crossentropy": 2.5781956732273104, + "loss/logits": 0.7773859173059463, + "step": 65700 + }, + { + "epoch": 0.6571, + "grad_norm": 15.375, + "grad_norm_var": 0.9051432291666667, + "learning_rate": 0.0003, + "loss": 10.9749, + "loss/aux_loss": 0.048071971721947195, + "loss/crossentropy": 2.6975907385349274, + "loss/logits": 0.8216118335723877, + "step": 65710 + }, + { + "epoch": 0.6572, + "grad_norm": 14.75, + "grad_norm_var": 0.89609375, + "learning_rate": 0.0003, + "loss": 10.9318, + "loss/aux_loss": 0.04807693250477314, + "loss/crossentropy": 2.7426917433738707, + "loss/logits": 0.8241723477840424, + "step": 65720 + }, + { + "epoch": 0.6573, + "grad_norm": 16.5, + "grad_norm_var": 0.9436848958333334, + "learning_rate": 0.0003, + "loss": 11.0239, + "loss/aux_loss": 0.04806841984391212, + "loss/crossentropy": 2.820639455318451, + "loss/logits": 0.8430281788110733, + "step": 65730 + }, + { + "epoch": 0.6574, + "grad_norm": 16.375, + "grad_norm_var": 0.9322265625, + "learning_rate": 0.0003, + "loss": 10.9057, + "loss/aux_loss": 0.04807513765990734, + "loss/crossentropy": 2.6548421382904053, + "loss/logits": 0.8545916020870209, + "step": 65740 + }, + { + "epoch": 0.6575, + "grad_norm": 14.875, + "grad_norm_var": 1.0886555989583333, + "learning_rate": 0.0003, + "loss": 11.003, + "loss/aux_loss": 0.048061134107410905, + "loss/crossentropy": 2.6990270137786867, + "loss/logits": 0.823108297586441, + "step": 65750 + }, + { + "epoch": 0.6576, + "grad_norm": 15.375, + "grad_norm_var": 0.6048014322916667, + "learning_rate": 0.0003, + "loss": 10.8804, + "loss/aux_loss": 0.04807158131152391, + "loss/crossentropy": 2.6189096331596375, + "loss/logits": 0.7941760838031768, + "step": 65760 + }, + { + "epoch": 0.6577, + "grad_norm": 15.125, + "grad_norm_var": 0.2613118489583333, + "learning_rate": 0.0003, + "loss": 10.7767, + "loss/aux_loss": 0.04806184582412243, + "loss/crossentropy": 2.6444436371326447, + "loss/logits": 0.8036207824945449, + "step": 65770 + }, + { + "epoch": 0.6578, + "grad_norm": 15.25, + "grad_norm_var": 0.3494140625, + "learning_rate": 0.0003, + "loss": 10.9201, + "loss/aux_loss": 0.04807628132402897, + "loss/crossentropy": 2.7261348724365235, + "loss/logits": 0.8091208696365356, + "step": 65780 + }, + { + "epoch": 0.6579, + "grad_norm": 14.25, + "grad_norm_var": 0.7093587239583333, + "learning_rate": 0.0003, + "loss": 10.7387, + "loss/aux_loss": 0.04807419925928116, + "loss/crossentropy": 2.5443699240684508, + "loss/logits": 0.8154137402772903, + "step": 65790 + }, + { + "epoch": 0.658, + "grad_norm": 15.5625, + "grad_norm_var": 0.664697265625, + "learning_rate": 0.0003, + "loss": 10.9833, + "loss/aux_loss": 0.0480806415900588, + "loss/crossentropy": 2.5583129703998564, + "loss/logits": 0.830548295378685, + "step": 65800 + }, + { + "epoch": 0.6581, + "grad_norm": 14.375, + "grad_norm_var": 0.8320149739583333, + "learning_rate": 0.0003, + "loss": 10.9038, + "loss/aux_loss": 0.04805789217352867, + "loss/crossentropy": 2.6957422375679014, + "loss/logits": 0.8219772160053254, + "step": 65810 + }, + { + "epoch": 0.6582, + "grad_norm": 15.0625, + "grad_norm_var": 0.801025390625, + "learning_rate": 0.0003, + "loss": 10.9157, + "loss/aux_loss": 0.04807321224361658, + "loss/crossentropy": 2.6964453876018526, + "loss/logits": 0.8138649493455887, + "step": 65820 + }, + { + "epoch": 0.6583, + "grad_norm": 15.75, + "grad_norm_var": 0.337353515625, + "learning_rate": 0.0003, + "loss": 10.9783, + "loss/aux_loss": 0.048068931140005586, + "loss/crossentropy": 2.7101471066474914, + "loss/logits": 0.8215384483337402, + "step": 65830 + }, + { + "epoch": 0.6584, + "grad_norm": 16.0, + "grad_norm_var": 0.6258951822916666, + "learning_rate": 0.0003, + "loss": 10.9277, + "loss/aux_loss": 0.048083207570016384, + "loss/crossentropy": 2.4620073318481444, + "loss/logits": 0.7914715379476547, + "step": 65840 + }, + { + "epoch": 0.6585, + "grad_norm": 15.875, + "grad_norm_var": 0.5516764322916666, + "learning_rate": 0.0003, + "loss": 10.9253, + "loss/aux_loss": 0.048061787895858285, + "loss/crossentropy": 2.727776914834976, + "loss/logits": 0.7932167321443557, + "step": 65850 + }, + { + "epoch": 0.6586, + "grad_norm": 14.875, + "grad_norm_var": 0.5926920572916666, + "learning_rate": 0.0003, + "loss": 10.9381, + "loss/aux_loss": 0.04806985054165125, + "loss/crossentropy": 2.8461714386940002, + "loss/logits": 0.7947121143341065, + "step": 65860 + }, + { + "epoch": 0.6587, + "grad_norm": 14.125, + "grad_norm_var": 0.3098958333333333, + "learning_rate": 0.0003, + "loss": 10.8217, + "loss/aux_loss": 0.04806865192949772, + "loss/crossentropy": 2.6267981052398683, + "loss/logits": 0.7960964858531951, + "step": 65870 + }, + { + "epoch": 0.6588, + "grad_norm": 15.375, + "grad_norm_var": 0.40670572916666664, + "learning_rate": 0.0003, + "loss": 10.8895, + "loss/aux_loss": 0.04806759785860777, + "loss/crossentropy": 2.741170364618301, + "loss/logits": 0.8563113749027252, + "step": 65880 + }, + { + "epoch": 0.6589, + "grad_norm": 15.0625, + "grad_norm_var": 0.19542643229166667, + "learning_rate": 0.0003, + "loss": 10.8893, + "loss/aux_loss": 0.04807578232139349, + "loss/crossentropy": 2.7358233749866487, + "loss/logits": 0.838133355975151, + "step": 65890 + }, + { + "epoch": 0.659, + "grad_norm": 15.5625, + "grad_norm_var": 0.13956705729166666, + "learning_rate": 0.0003, + "loss": 10.9873, + "loss/aux_loss": 0.048049908503890036, + "loss/crossentropy": 2.681365489959717, + "loss/logits": 0.8367834746837616, + "step": 65900 + }, + { + "epoch": 0.6591, + "grad_norm": 14.9375, + "grad_norm_var": 0.347119140625, + "learning_rate": 0.0003, + "loss": 10.9322, + "loss/aux_loss": 0.04807221945375204, + "loss/crossentropy": 2.7976817011833193, + "loss/logits": 0.8617210656404495, + "step": 65910 + }, + { + "epoch": 0.6592, + "grad_norm": 14.75, + "grad_norm_var": 1.5262858072916667, + "learning_rate": 0.0003, + "loss": 10.9087, + "loss/aux_loss": 0.04806236419826746, + "loss/crossentropy": 2.6097477436065675, + "loss/logits": 0.8578928947448731, + "step": 65920 + }, + { + "epoch": 0.6593, + "grad_norm": 17.125, + "grad_norm_var": 1.06640625, + "learning_rate": 0.0003, + "loss": 11.0508, + "loss/aux_loss": 0.048072699643671515, + "loss/crossentropy": 2.638018161058426, + "loss/logits": 0.8202035665512085, + "step": 65930 + }, + { + "epoch": 0.6594, + "grad_norm": 16.5, + "grad_norm_var": 1.5949055989583334, + "learning_rate": 0.0003, + "loss": 11.0022, + "loss/aux_loss": 0.0480684619396925, + "loss/crossentropy": 2.7689769506454467, + "loss/logits": 0.8445602893829346, + "step": 65940 + }, + { + "epoch": 0.6595, + "grad_norm": 15.5, + "grad_norm_var": 2.2742024739583333, + "learning_rate": 0.0003, + "loss": 10.9084, + "loss/aux_loss": 0.048072042688727376, + "loss/crossentropy": 2.7954149782657622, + "loss/logits": 0.7957364201545716, + "step": 65950 + }, + { + "epoch": 0.6596, + "grad_norm": 15.0625, + "grad_norm_var": 2.113916015625, + "learning_rate": 0.0003, + "loss": 10.9942, + "loss/aux_loss": 0.04806188233196736, + "loss/crossentropy": 2.71195827126503, + "loss/logits": 0.7810200721025466, + "step": 65960 + }, + { + "epoch": 0.6597, + "grad_norm": 15.0625, + "grad_norm_var": 0.23932291666666666, + "learning_rate": 0.0003, + "loss": 10.8403, + "loss/aux_loss": 0.04807203095406294, + "loss/crossentropy": 2.6094238460063934, + "loss/logits": 0.7922606945037842, + "step": 65970 + }, + { + "epoch": 0.6598, + "grad_norm": 15.4375, + "grad_norm_var": 0.396728515625, + "learning_rate": 0.0003, + "loss": 10.7105, + "loss/aux_loss": 0.048070499673485756, + "loss/crossentropy": 2.6893329977989198, + "loss/logits": 0.7958266377449036, + "step": 65980 + }, + { + "epoch": 0.6599, + "grad_norm": 14.6875, + "grad_norm_var": 0.8907389322916667, + "learning_rate": 0.0003, + "loss": 11.0927, + "loss/aux_loss": 0.048058568872511385, + "loss/crossentropy": 2.7194202184677123, + "loss/logits": 0.8332067221403122, + "step": 65990 + }, + { + "epoch": 0.66, + "grad_norm": 15.6875, + "grad_norm_var": 0.6409993489583333, + "learning_rate": 0.0003, + "loss": 10.7974, + "loss/aux_loss": 0.04807265195995569, + "loss/crossentropy": 2.690684497356415, + "loss/logits": 0.8148763328790665, + "step": 66000 + }, + { + "epoch": 0.6601, + "grad_norm": 15.8125, + "grad_norm_var": 0.5574055989583333, + "learning_rate": 0.0003, + "loss": 10.888, + "loss/aux_loss": 0.04805999808013439, + "loss/crossentropy": 2.8406422197818757, + "loss/logits": 0.8340202659368515, + "step": 66010 + }, + { + "epoch": 0.6602, + "grad_norm": 16.625, + "grad_norm_var": 0.6265462239583334, + "learning_rate": 0.0003, + "loss": 10.9604, + "loss/aux_loss": 0.04808083530515432, + "loss/crossentropy": 2.6737845301628114, + "loss/logits": 0.7914625614881515, + "step": 66020 + }, + { + "epoch": 0.6603, + "grad_norm": 14.75, + "grad_norm_var": 0.5699055989583334, + "learning_rate": 0.0003, + "loss": 10.954, + "loss/aux_loss": 0.048062770254909994, + "loss/crossentropy": 2.8464788436889648, + "loss/logits": 0.8319862931966782, + "step": 66030 + }, + { + "epoch": 0.6604, + "grad_norm": 16.5, + "grad_norm_var": 0.5905598958333333, + "learning_rate": 0.0003, + "loss": 11.004, + "loss/aux_loss": 0.04807984437793493, + "loss/crossentropy": 2.741937702894211, + "loss/logits": 0.8358243376016616, + "step": 66040 + }, + { + "epoch": 0.6605, + "grad_norm": 15.8125, + "grad_norm_var": 1.010009765625, + "learning_rate": 0.0003, + "loss": 10.8128, + "loss/aux_loss": 0.04805782604962587, + "loss/crossentropy": 2.4419663667678835, + "loss/logits": 0.8032531976699829, + "step": 66050 + }, + { + "epoch": 0.6606, + "grad_norm": 14.75, + "grad_norm_var": 0.459619140625, + "learning_rate": 0.0003, + "loss": 10.8743, + "loss/aux_loss": 0.048064283281564715, + "loss/crossentropy": 2.7457400977611544, + "loss/logits": 0.8262648940086365, + "step": 66060 + }, + { + "epoch": 0.6607, + "grad_norm": 14.875, + "grad_norm_var": 0.4903645833333333, + "learning_rate": 0.0003, + "loss": 10.8164, + "loss/aux_loss": 0.048074642196297646, + "loss/crossentropy": 2.6913636445999147, + "loss/logits": 0.8233010709285736, + "step": 66070 + }, + { + "epoch": 0.6608, + "grad_norm": 15.5, + "grad_norm_var": 0.5546223958333333, + "learning_rate": 0.0003, + "loss": 10.9488, + "loss/aux_loss": 0.04807223491370678, + "loss/crossentropy": 2.7321924567222595, + "loss/logits": 0.8634290426969529, + "step": 66080 + }, + { + "epoch": 0.6609, + "grad_norm": 14.5, + "grad_norm_var": 0.6843098958333333, + "learning_rate": 0.0003, + "loss": 10.6757, + "loss/aux_loss": 0.048049288988113406, + "loss/crossentropy": 2.68677796125412, + "loss/logits": 0.8587844461202622, + "step": 66090 + }, + { + "epoch": 0.661, + "grad_norm": 15.125, + "grad_norm_var": 0.6515625, + "learning_rate": 0.0003, + "loss": 10.8042, + "loss/aux_loss": 0.048063311353325845, + "loss/crossentropy": 2.6101099252700806, + "loss/logits": 0.8087275177240372, + "step": 66100 + }, + { + "epoch": 0.6611, + "grad_norm": 13.875, + "grad_norm_var": 0.5549479166666667, + "learning_rate": 0.0003, + "loss": 10.8571, + "loss/aux_loss": 0.04808393083512783, + "loss/crossentropy": 2.6760826587677, + "loss/logits": 0.780521473288536, + "step": 66110 + }, + { + "epoch": 0.6612, + "grad_norm": 15.4375, + "grad_norm_var": 0.451025390625, + "learning_rate": 0.0003, + "loss": 10.9433, + "loss/aux_loss": 0.04805605374276638, + "loss/crossentropy": 2.692962384223938, + "loss/logits": 0.8174421191215515, + "step": 66120 + }, + { + "epoch": 0.6613, + "grad_norm": 14.9375, + "grad_norm_var": 0.76171875, + "learning_rate": 0.0003, + "loss": 10.9211, + "loss/aux_loss": 0.04807621203362942, + "loss/crossentropy": 2.6863688111305235, + "loss/logits": 0.798431122303009, + "step": 66130 + }, + { + "epoch": 0.6614, + "grad_norm": 13.9375, + "grad_norm_var": 0.47630208333333335, + "learning_rate": 0.0003, + "loss": 10.9411, + "loss/aux_loss": 0.04805770944803953, + "loss/crossentropy": 2.7181039690971374, + "loss/logits": 0.8259652733802796, + "step": 66140 + }, + { + "epoch": 0.6615, + "grad_norm": 16.5, + "grad_norm_var": 3.274853515625, + "learning_rate": 0.0003, + "loss": 10.8523, + "loss/aux_loss": 0.04807611126452684, + "loss/crossentropy": 2.8777012705802916, + "loss/logits": 0.8209088236093521, + "step": 66150 + }, + { + "epoch": 0.6616, + "grad_norm": 14.9375, + "grad_norm_var": 3.316145833333333, + "learning_rate": 0.0003, + "loss": 11.0427, + "loss/aux_loss": 0.048064004816114905, + "loss/crossentropy": 2.9656025767326355, + "loss/logits": 0.8535489648580551, + "step": 66160 + }, + { + "epoch": 0.6617, + "grad_norm": 17.125, + "grad_norm_var": 0.8879557291666667, + "learning_rate": 0.0003, + "loss": 10.7944, + "loss/aux_loss": 0.04805393647402525, + "loss/crossentropy": 2.6846187472343446, + "loss/logits": 0.7934094220399857, + "step": 66170 + }, + { + "epoch": 0.6618, + "grad_norm": 15.1875, + "grad_norm_var": 0.6486979166666667, + "learning_rate": 0.0003, + "loss": 10.9017, + "loss/aux_loss": 0.04807519093155861, + "loss/crossentropy": 2.854272258281708, + "loss/logits": 0.8095389395952225, + "step": 66180 + }, + { + "epoch": 0.6619, + "grad_norm": 14.9375, + "grad_norm_var": 0.613525390625, + "learning_rate": 0.0003, + "loss": 10.946, + "loss/aux_loss": 0.04806575421243906, + "loss/crossentropy": 2.7491804242134092, + "loss/logits": 0.8097761183977127, + "step": 66190 + }, + { + "epoch": 0.662, + "grad_norm": 16.625, + "grad_norm_var": 0.4331868489583333, + "learning_rate": 0.0003, + "loss": 10.9215, + "loss/aux_loss": 0.04807676300406456, + "loss/crossentropy": 2.6795652329921724, + "loss/logits": 0.801378121972084, + "step": 66200 + }, + { + "epoch": 0.6621, + "grad_norm": 14.625, + "grad_norm_var": 0.563134765625, + "learning_rate": 0.0003, + "loss": 10.8342, + "loss/aux_loss": 0.04805709309875965, + "loss/crossentropy": 2.752977591753006, + "loss/logits": 0.8294487535953522, + "step": 66210 + }, + { + "epoch": 0.6622, + "grad_norm": 15.375, + "grad_norm_var": 0.8949055989583333, + "learning_rate": 0.0003, + "loss": 10.9269, + "loss/aux_loss": 0.048077603057026866, + "loss/crossentropy": 2.8537149250507357, + "loss/logits": 0.8224076896905899, + "step": 66220 + }, + { + "epoch": 0.6623, + "grad_norm": 16.75, + "grad_norm_var": 0.5244140625, + "learning_rate": 0.0003, + "loss": 10.9263, + "loss/aux_loss": 0.04807521738111973, + "loss/crossentropy": 2.651250684261322, + "loss/logits": 0.8318710893392562, + "step": 66230 + }, + { + "epoch": 0.6624, + "grad_norm": 15.9375, + "grad_norm_var": 0.4410807291666667, + "learning_rate": 0.0003, + "loss": 10.9263, + "loss/aux_loss": 0.048059741780161855, + "loss/crossentropy": 2.849357432126999, + "loss/logits": 0.8336068332195282, + "step": 66240 + }, + { + "epoch": 0.6625, + "grad_norm": 16.25, + "grad_norm_var": 0.6070149739583334, + "learning_rate": 0.0003, + "loss": 10.8793, + "loss/aux_loss": 0.04806891325861216, + "loss/crossentropy": 2.5584873795509337, + "loss/logits": 0.8233877867460251, + "step": 66250 + }, + { + "epoch": 0.6626, + "grad_norm": 15.625, + "grad_norm_var": 0.7613932291666666, + "learning_rate": 0.0003, + "loss": 10.8628, + "loss/aux_loss": 0.04806743785738945, + "loss/crossentropy": 2.732908582687378, + "loss/logits": 0.8170014798641205, + "step": 66260 + }, + { + "epoch": 0.6627, + "grad_norm": 14.875, + "grad_norm_var": 0.363525390625, + "learning_rate": 0.0003, + "loss": 10.7629, + "loss/aux_loss": 0.04805696085095405, + "loss/crossentropy": 2.81008266210556, + "loss/logits": 0.8305856496095657, + "step": 66270 + }, + { + "epoch": 0.6628, + "grad_norm": 15.125, + "grad_norm_var": 0.25045572916666664, + "learning_rate": 0.0003, + "loss": 10.8067, + "loss/aux_loss": 0.048075790703296664, + "loss/crossentropy": 2.528480714559555, + "loss/logits": 0.7946991354227066, + "step": 66280 + }, + { + "epoch": 0.6629, + "grad_norm": 15.6875, + "grad_norm_var": 0.4090983072916667, + "learning_rate": 0.0003, + "loss": 10.9142, + "loss/aux_loss": 0.048072568513453005, + "loss/crossentropy": 2.7415711283683777, + "loss/logits": 0.8110772639513015, + "step": 66290 + }, + { + "epoch": 0.663, + "grad_norm": 14.375, + "grad_norm_var": 0.7744791666666667, + "learning_rate": 0.0003, + "loss": 10.8882, + "loss/aux_loss": 0.048075555637478826, + "loss/crossentropy": 2.724819177389145, + "loss/logits": 0.7944720953702926, + "step": 66300 + }, + { + "epoch": 0.6631, + "grad_norm": 15.125, + "grad_norm_var": 0.7395182291666667, + "learning_rate": 0.0003, + "loss": 10.9813, + "loss/aux_loss": 0.04807244967669248, + "loss/crossentropy": 2.643247830867767, + "loss/logits": 0.8237002640962601, + "step": 66310 + }, + { + "epoch": 0.6632, + "grad_norm": 14.9375, + "grad_norm_var": 0.5169108072916667, + "learning_rate": 0.0003, + "loss": 10.7386, + "loss/aux_loss": 0.048069367185235023, + "loss/crossentropy": 2.7121821284294128, + "loss/logits": 0.8011160790920258, + "step": 66320 + }, + { + "epoch": 0.6633, + "grad_norm": 14.625, + "grad_norm_var": 0.30402018229166666, + "learning_rate": 0.0003, + "loss": 10.8735, + "loss/aux_loss": 0.04806159436702728, + "loss/crossentropy": 2.7342435657978057, + "loss/logits": 0.8220883011817932, + "step": 66330 + }, + { + "epoch": 0.6634, + "grad_norm": 15.625, + "grad_norm_var": 0.394775390625, + "learning_rate": 0.0003, + "loss": 10.8719, + "loss/aux_loss": 0.0480760769918561, + "loss/crossentropy": 2.803659129142761, + "loss/logits": 0.8247465431690216, + "step": 66340 + }, + { + "epoch": 0.6635, + "grad_norm": 14.625, + "grad_norm_var": 0.9119791666666667, + "learning_rate": 0.0003, + "loss": 10.9, + "loss/aux_loss": 0.04806450568139553, + "loss/crossentropy": 2.7340852856636046, + "loss/logits": 0.810696679353714, + "step": 66350 + }, + { + "epoch": 0.6636, + "grad_norm": 14.875, + "grad_norm_var": 0.8149576822916667, + "learning_rate": 0.0003, + "loss": 10.9793, + "loss/aux_loss": 0.048075934126973155, + "loss/crossentropy": 2.481299436092377, + "loss/logits": 0.7764072805643082, + "step": 66360 + }, + { + "epoch": 0.6637, + "grad_norm": 14.75, + "grad_norm_var": 0.2515462239583333, + "learning_rate": 0.0003, + "loss": 10.9342, + "loss/aux_loss": 0.04805976003408432, + "loss/crossentropy": 2.8292657256126406, + "loss/logits": 0.8594965010881424, + "step": 66370 + }, + { + "epoch": 0.6638, + "grad_norm": 15.5, + "grad_norm_var": 0.47784830729166666, + "learning_rate": 0.0003, + "loss": 10.7546, + "loss/aux_loss": 0.048081879131495954, + "loss/crossentropy": 2.621725058555603, + "loss/logits": 0.778819665312767, + "step": 66380 + }, + { + "epoch": 0.6639, + "grad_norm": 16.125, + "grad_norm_var": 0.46261393229166664, + "learning_rate": 0.0003, + "loss": 10.8435, + "loss/aux_loss": 0.04807425364851951, + "loss/crossentropy": 2.6788039445877074, + "loss/logits": 0.8278974890708923, + "step": 66390 + }, + { + "epoch": 0.664, + "grad_norm": 14.5625, + "grad_norm_var": 0.8796712239583333, + "learning_rate": 0.0003, + "loss": 10.9036, + "loss/aux_loss": 0.04806184228509665, + "loss/crossentropy": 2.79437460899353, + "loss/logits": 0.8034818679094314, + "step": 66400 + }, + { + "epoch": 0.6641, + "grad_norm": 14.75, + "grad_norm_var": 1.0442057291666667, + "learning_rate": 0.0003, + "loss": 10.9, + "loss/aux_loss": 0.0480803944170475, + "loss/crossentropy": 2.8535486102104186, + "loss/logits": 0.8078803330659866, + "step": 66410 + }, + { + "epoch": 0.6642, + "grad_norm": 15.8125, + "grad_norm_var": 0.6542805989583333, + "learning_rate": 0.0003, + "loss": 10.8832, + "loss/aux_loss": 0.04806033242493868, + "loss/crossentropy": 2.7427396893501284, + "loss/logits": 0.8372073888778686, + "step": 66420 + }, + { + "epoch": 0.6643, + "grad_norm": 14.6875, + "grad_norm_var": 0.7947265625, + "learning_rate": 0.0003, + "loss": 10.7244, + "loss/aux_loss": 0.048065388575196266, + "loss/crossentropy": 2.710828936100006, + "loss/logits": 0.8116421043872833, + "step": 66430 + }, + { + "epoch": 0.6644, + "grad_norm": 15.3125, + "grad_norm_var": 5.3578125, + "learning_rate": 0.0003, + "loss": 10.8632, + "loss/aux_loss": 0.04807053990662098, + "loss/crossentropy": 2.719035828113556, + "loss/logits": 0.8238852351903916, + "step": 66440 + }, + { + "epoch": 0.6645, + "grad_norm": 14.875, + "grad_norm_var": 4.597639973958334, + "learning_rate": 0.0003, + "loss": 10.9155, + "loss/aux_loss": 0.04806662444025278, + "loss/crossentropy": 2.7444665908813475, + "loss/logits": 0.8264550715684891, + "step": 66450 + }, + { + "epoch": 0.6646, + "grad_norm": 15.125, + "grad_norm_var": 0.21243489583333333, + "learning_rate": 0.0003, + "loss": 10.8299, + "loss/aux_loss": 0.04806863609701395, + "loss/crossentropy": 2.6724780917167665, + "loss/logits": 0.8069694906473159, + "step": 66460 + }, + { + "epoch": 0.6647, + "grad_norm": 19.0, + "grad_norm_var": 1.3813639322916667, + "learning_rate": 0.0003, + "loss": 10.8276, + "loss/aux_loss": 0.04807253777980804, + "loss/crossentropy": 2.675402784347534, + "loss/logits": 0.7911726206541061, + "step": 66470 + }, + { + "epoch": 0.6648, + "grad_norm": 15.4375, + "grad_norm_var": 1.024853515625, + "learning_rate": 0.0003, + "loss": 10.9674, + "loss/aux_loss": 0.048071261309087274, + "loss/crossentropy": 2.833948886394501, + "loss/logits": 0.8181155323982239, + "step": 66480 + }, + { + "epoch": 0.6649, + "grad_norm": 15.5, + "grad_norm_var": 0.44503580729166664, + "learning_rate": 0.0003, + "loss": 10.9112, + "loss/aux_loss": 0.048069555498659614, + "loss/crossentropy": 2.501497894525528, + "loss/logits": 0.7850837290287018, + "step": 66490 + }, + { + "epoch": 0.665, + "grad_norm": 14.625, + "grad_norm_var": 0.45546875, + "learning_rate": 0.0003, + "loss": 10.9807, + "loss/aux_loss": 0.04806936550885439, + "loss/crossentropy": 2.6703847885131835, + "loss/logits": 0.8007300883531571, + "step": 66500 + }, + { + "epoch": 0.6651, + "grad_norm": 15.0, + "grad_norm_var": 0.30974934895833334, + "learning_rate": 0.0003, + "loss": 10.9003, + "loss/aux_loss": 0.04807521235197783, + "loss/crossentropy": 2.5529791355133056, + "loss/logits": 0.8032549649477005, + "step": 66510 + }, + { + "epoch": 0.6652, + "grad_norm": 15.1875, + "grad_norm_var": 0.3719889322916667, + "learning_rate": 0.0003, + "loss": 10.8097, + "loss/aux_loss": 0.048067670315504074, + "loss/crossentropy": 2.7725186586380004, + "loss/logits": 0.7987766593694687, + "step": 66520 + }, + { + "epoch": 0.6653, + "grad_norm": 14.3125, + "grad_norm_var": 0.4192708333333333, + "learning_rate": 0.0003, + "loss": 11.0081, + "loss/aux_loss": 0.04806581847369671, + "loss/crossentropy": 2.6651151537895204, + "loss/logits": 0.8225375205278397, + "step": 66530 + }, + { + "epoch": 0.6654, + "grad_norm": 15.375, + "grad_norm_var": 1.2058430989583333, + "learning_rate": 0.0003, + "loss": 10.9034, + "loss/aux_loss": 0.04806330688297748, + "loss/crossentropy": 2.8271175622940063, + "loss/logits": 0.8273738652467728, + "step": 66540 + }, + { + "epoch": 0.6655, + "grad_norm": 14.5, + "grad_norm_var": 0.6911458333333333, + "learning_rate": 0.0003, + "loss": 10.9021, + "loss/aux_loss": 0.0480733385309577, + "loss/crossentropy": 2.6847646474838256, + "loss/logits": 0.7972731322050095, + "step": 66550 + }, + { + "epoch": 0.6656, + "grad_norm": 14.0, + "grad_norm_var": 0.6874837239583333, + "learning_rate": 0.0003, + "loss": 10.7575, + "loss/aux_loss": 0.048074370436370376, + "loss/crossentropy": 2.680130976438522, + "loss/logits": 0.7642890572547912, + "step": 66560 + }, + { + "epoch": 0.6657, + "grad_norm": 14.125, + "grad_norm_var": 0.6921712239583333, + "learning_rate": 0.0003, + "loss": 10.8822, + "loss/aux_loss": 0.04806676432490349, + "loss/crossentropy": 2.791566550731659, + "loss/logits": 0.8233636647462845, + "step": 66570 + }, + { + "epoch": 0.6658, + "grad_norm": 15.5625, + "grad_norm_var": 0.6950358072916667, + "learning_rate": 0.0003, + "loss": 10.9591, + "loss/aux_loss": 0.04806861318647861, + "loss/crossentropy": 2.824068772792816, + "loss/logits": 0.8359575748443604, + "step": 66580 + }, + { + "epoch": 0.6659, + "grad_norm": 16.5, + "grad_norm_var": 3.5942057291666667, + "learning_rate": 0.0003, + "loss": 10.8459, + "loss/aux_loss": 0.04806522503495216, + "loss/crossentropy": 2.7881508350372313, + "loss/logits": 0.8030070185661315, + "step": 66590 + }, + { + "epoch": 0.666, + "grad_norm": 15.125, + "grad_norm_var": 0.3900390625, + "learning_rate": 0.0003, + "loss": 10.9317, + "loss/aux_loss": 0.048059667088091375, + "loss/crossentropy": 2.747694218158722, + "loss/logits": 0.8331306129693985, + "step": 66600 + }, + { + "epoch": 0.6661, + "grad_norm": 16.0, + "grad_norm_var": 0.3963541666666667, + "learning_rate": 0.0003, + "loss": 10.7401, + "loss/aux_loss": 0.04808664340525866, + "loss/crossentropy": 2.5784662127494813, + "loss/logits": 0.7583020776510239, + "step": 66610 + }, + { + "epoch": 0.6662, + "grad_norm": 15.75, + "grad_norm_var": 0.7514973958333333, + "learning_rate": 0.0003, + "loss": 10.8734, + "loss/aux_loss": 0.04807219747453928, + "loss/crossentropy": 2.6055344462394716, + "loss/logits": 0.8134197026491166, + "step": 66620 + }, + { + "epoch": 0.6663, + "grad_norm": 15.0625, + "grad_norm_var": 0.31495768229166665, + "learning_rate": 0.0003, + "loss": 10.8212, + "loss/aux_loss": 0.048065138049423695, + "loss/crossentropy": 2.6512358248233796, + "loss/logits": 0.7796749144792556, + "step": 66630 + }, + { + "epoch": 0.6664, + "grad_norm": 15.3125, + "grad_norm_var": 0.6481770833333333, + "learning_rate": 0.0003, + "loss": 10.8075, + "loss/aux_loss": 0.04805928375571966, + "loss/crossentropy": 2.748984879255295, + "loss/logits": 0.782018169760704, + "step": 66640 + }, + { + "epoch": 0.6665, + "grad_norm": 14.6875, + "grad_norm_var": 0.526806640625, + "learning_rate": 0.0003, + "loss": 10.9915, + "loss/aux_loss": 0.048084696754813194, + "loss/crossentropy": 2.6403255581855776, + "loss/logits": 0.8360554903745652, + "step": 66650 + }, + { + "epoch": 0.6666, + "grad_norm": 14.6875, + "grad_norm_var": 0.5416666666666666, + "learning_rate": 0.0003, + "loss": 10.7651, + "loss/aux_loss": 0.0480577452108264, + "loss/crossentropy": 2.7464575350284575, + "loss/logits": 0.816826593875885, + "step": 66660 + }, + { + "epoch": 0.6667, + "grad_norm": 15.3125, + "grad_norm_var": 0.6425618489583333, + "learning_rate": 0.0003, + "loss": 10.8847, + "loss/aux_loss": 0.04806127417832613, + "loss/crossentropy": 2.687554585933685, + "loss/logits": 0.8001707077026368, + "step": 66670 + }, + { + "epoch": 0.6668, + "grad_norm": 15.0625, + "grad_norm_var": 0.3346354166666667, + "learning_rate": 0.0003, + "loss": 10.9473, + "loss/aux_loss": 0.048075456917285916, + "loss/crossentropy": 2.8568101286888123, + "loss/logits": 0.8553566783666611, + "step": 66680 + }, + { + "epoch": 0.6669, + "grad_norm": 15.25, + "grad_norm_var": 0.2999348958333333, + "learning_rate": 0.0003, + "loss": 10.9538, + "loss/aux_loss": 0.04806934054940939, + "loss/crossentropy": 2.657517743110657, + "loss/logits": 0.8053190678358078, + "step": 66690 + }, + { + "epoch": 0.667, + "grad_norm": 15.9375, + "grad_norm_var": 0.9419108072916667, + "learning_rate": 0.0003, + "loss": 10.8266, + "loss/aux_loss": 0.04807275123894215, + "loss/crossentropy": 2.449772423505783, + "loss/logits": 0.7906589955091476, + "step": 66700 + }, + { + "epoch": 0.6671, + "grad_norm": 14.75, + "grad_norm_var": 0.5174479166666667, + "learning_rate": 0.0003, + "loss": 10.9636, + "loss/aux_loss": 0.048078746907413004, + "loss/crossentropy": 2.7611198365688323, + "loss/logits": 0.8044763505458832, + "step": 66710 + }, + { + "epoch": 0.6672, + "grad_norm": 17.5, + "grad_norm_var": 3.1023274739583333, + "learning_rate": 0.0003, + "loss": 10.9441, + "loss/aux_loss": 0.04806906506419182, + "loss/crossentropy": 2.8186138391494753, + "loss/logits": 0.8578125566244126, + "step": 66720 + }, + { + "epoch": 0.6673, + "grad_norm": 16.625, + "grad_norm_var": 5.800504557291666, + "learning_rate": 0.0003, + "loss": 10.7651, + "loss/aux_loss": 0.04806268475949764, + "loss/crossentropy": 2.5886098742485046, + "loss/logits": 0.7767158389091492, + "step": 66730 + }, + { + "epoch": 0.6674, + "grad_norm": 15.5625, + "grad_norm_var": 5.258854166666667, + "learning_rate": 0.0003, + "loss": 10.9085, + "loss/aux_loss": 0.04808114189654589, + "loss/crossentropy": 2.7260345458984374, + "loss/logits": 0.79498670399189, + "step": 66740 + }, + { + "epoch": 0.6675, + "grad_norm": 14.5625, + "grad_norm_var": 0.98125, + "learning_rate": 0.0003, + "loss": 10.8072, + "loss/aux_loss": 0.048059665225446226, + "loss/crossentropy": 2.596436160802841, + "loss/logits": 0.7788780838251114, + "step": 66750 + }, + { + "epoch": 0.6676, + "grad_norm": 14.3125, + "grad_norm_var": 0.4231770833333333, + "learning_rate": 0.0003, + "loss": 10.9401, + "loss/aux_loss": 0.048060267791152, + "loss/crossentropy": 2.6341087102890013, + "loss/logits": 0.8312490910291672, + "step": 66760 + }, + { + "epoch": 0.6677, + "grad_norm": 14.0, + "grad_norm_var": 0.336962890625, + "learning_rate": 0.0003, + "loss": 11.0289, + "loss/aux_loss": 0.048079300485551354, + "loss/crossentropy": 2.835488021373749, + "loss/logits": 0.8537455588579178, + "step": 66770 + }, + { + "epoch": 0.6678, + "grad_norm": 16.25, + "grad_norm_var": 0.43333333333333335, + "learning_rate": 0.0003, + "loss": 10.8726, + "loss/aux_loss": 0.04806447774171829, + "loss/crossentropy": 2.640529549121857, + "loss/logits": 0.8138740628957748, + "step": 66780 + }, + { + "epoch": 0.6679, + "grad_norm": 14.4375, + "grad_norm_var": 0.384228515625, + "learning_rate": 0.0003, + "loss": 10.8864, + "loss/aux_loss": 0.048069434240460396, + "loss/crossentropy": 2.718799889087677, + "loss/logits": 0.804791709780693, + "step": 66790 + }, + { + "epoch": 0.668, + "grad_norm": 15.4375, + "grad_norm_var": 1.0369140625, + "learning_rate": 0.0003, + "loss": 10.8305, + "loss/aux_loss": 0.04806906692683697, + "loss/crossentropy": 2.598710483312607, + "loss/logits": 0.8012362569570541, + "step": 66800 + }, + { + "epoch": 0.6681, + "grad_norm": 15.8125, + "grad_norm_var": 1.2259765625, + "learning_rate": 0.0003, + "loss": 10.9184, + "loss/aux_loss": 0.048060832917690276, + "loss/crossentropy": 2.6800104796886446, + "loss/logits": 0.8155199468135834, + "step": 66810 + }, + { + "epoch": 0.6682, + "grad_norm": 16.25, + "grad_norm_var": 23.108268229166665, + "learning_rate": 0.0003, + "loss": 10.7432, + "loss/aux_loss": 0.04805918000638485, + "loss/crossentropy": 2.729696071147919, + "loss/logits": 0.8015040099620819, + "step": 66820 + }, + { + "epoch": 0.6683, + "grad_norm": 16.875, + "grad_norm_var": 23.670035807291665, + "learning_rate": 0.0003, + "loss": 10.8194, + "loss/aux_loss": 0.04807895701378584, + "loss/crossentropy": 2.8300251722335816, + "loss/logits": 0.8447652935981751, + "step": 66830 + }, + { + "epoch": 0.6684, + "grad_norm": 17.125, + "grad_norm_var": 0.650244140625, + "learning_rate": 0.0003, + "loss": 10.9207, + "loss/aux_loss": 0.048053346760571006, + "loss/crossentropy": 2.675478661060333, + "loss/logits": 0.8098080486059189, + "step": 66840 + }, + { + "epoch": 0.6685, + "grad_norm": 14.9375, + "grad_norm_var": 0.5291015625, + "learning_rate": 0.0003, + "loss": 10.8967, + "loss/aux_loss": 0.048054653219878674, + "loss/crossentropy": 2.858685314655304, + "loss/logits": 0.8220966100692749, + "step": 66850 + }, + { + "epoch": 0.6686, + "grad_norm": 15.25, + "grad_norm_var": 1.038134765625, + "learning_rate": 0.0003, + "loss": 10.8929, + "loss/aux_loss": 0.04808123260736465, + "loss/crossentropy": 2.7240459442138674, + "loss/logits": 0.8174852192401886, + "step": 66860 + }, + { + "epoch": 0.6687, + "grad_norm": 17.125, + "grad_norm_var": 0.8919270833333334, + "learning_rate": 0.0003, + "loss": 11.0093, + "loss/aux_loss": 0.04807883575558662, + "loss/crossentropy": 2.665953540802002, + "loss/logits": 0.8114261239767074, + "step": 66870 + }, + { + "epoch": 0.6688, + "grad_norm": 15.4375, + "grad_norm_var": 1.1030598958333333, + "learning_rate": 0.0003, + "loss": 10.9429, + "loss/aux_loss": 0.04805787615478039, + "loss/crossentropy": 2.685689914226532, + "loss/logits": 0.806901153922081, + "step": 66880 + }, + { + "epoch": 0.6689, + "grad_norm": 16.375, + "grad_norm_var": 0.511181640625, + "learning_rate": 0.0003, + "loss": 10.8641, + "loss/aux_loss": 0.0480721453204751, + "loss/crossentropy": 2.829719823598862, + "loss/logits": 0.8213761389255524, + "step": 66890 + }, + { + "epoch": 0.669, + "grad_norm": 15.1875, + "grad_norm_var": 0.6014973958333333, + "learning_rate": 0.0003, + "loss": 10.9479, + "loss/aux_loss": 0.04806473944336176, + "loss/crossentropy": 2.6682372391223907, + "loss/logits": 0.8243909746408462, + "step": 66900 + }, + { + "epoch": 0.6691, + "grad_norm": 15.1875, + "grad_norm_var": 0.3916015625, + "learning_rate": 0.0003, + "loss": 10.8806, + "loss/aux_loss": 0.048068069666624066, + "loss/crossentropy": 2.672402673959732, + "loss/logits": 0.8250930517911911, + "step": 66910 + }, + { + "epoch": 0.6692, + "grad_norm": 14.5625, + "grad_norm_var": 1.8825358072916667, + "learning_rate": 0.0003, + "loss": 11.0711, + "loss/aux_loss": 0.04807632770389318, + "loss/crossentropy": 2.706685644388199, + "loss/logits": 0.8509224832057953, + "step": 66920 + }, + { + "epoch": 0.6693, + "grad_norm": 17.25, + "grad_norm_var": 0.6629557291666667, + "learning_rate": 0.0003, + "loss": 10.6838, + "loss/aux_loss": 0.048064228519797324, + "loss/crossentropy": 2.7776548743247984, + "loss/logits": 0.7853770822286605, + "step": 66930 + }, + { + "epoch": 0.6694, + "grad_norm": 14.625, + "grad_norm_var": 0.8407389322916666, + "learning_rate": 0.0003, + "loss": 10.912, + "loss/aux_loss": 0.048056557215750216, + "loss/crossentropy": 2.7243767201900484, + "loss/logits": 0.7943523436784744, + "step": 66940 + }, + { + "epoch": 0.6695, + "grad_norm": 15.1875, + "grad_norm_var": 1.0590983072916667, + "learning_rate": 0.0003, + "loss": 10.9122, + "loss/aux_loss": 0.048080825619399546, + "loss/crossentropy": 2.541512316465378, + "loss/logits": 0.7956676542758941, + "step": 66950 + }, + { + "epoch": 0.6696, + "grad_norm": 14.25, + "grad_norm_var": 0.619775390625, + "learning_rate": 0.0003, + "loss": 10.8168, + "loss/aux_loss": 0.04807412121444941, + "loss/crossentropy": 2.71076363325119, + "loss/logits": 0.8469307273626328, + "step": 66960 + }, + { + "epoch": 0.6697, + "grad_norm": 15.5625, + "grad_norm_var": 1.1841145833333333, + "learning_rate": 0.0003, + "loss": 10.7509, + "loss/aux_loss": 0.0480673236772418, + "loss/crossentropy": 2.56371031999588, + "loss/logits": 0.7859199553728103, + "step": 66970 + }, + { + "epoch": 0.6698, + "grad_norm": 17.375, + "grad_norm_var": 0.7645670572916666, + "learning_rate": 0.0003, + "loss": 10.8529, + "loss/aux_loss": 0.04807327184826136, + "loss/crossentropy": 2.7085660099983215, + "loss/logits": 0.8172822952270508, + "step": 66980 + }, + { + "epoch": 0.6699, + "grad_norm": 15.875, + "grad_norm_var": 0.6764973958333333, + "learning_rate": 0.0003, + "loss": 10.6937, + "loss/aux_loss": 0.048074647411704065, + "loss/crossentropy": 2.551885908842087, + "loss/logits": 0.7823489457368851, + "step": 66990 + }, + { + "epoch": 0.67, + "grad_norm": 15.1875, + "grad_norm_var": 0.36652018229166666, + "learning_rate": 0.0003, + "loss": 10.999, + "loss/aux_loss": 0.04806416109204292, + "loss/crossentropy": 2.615692639350891, + "loss/logits": 0.8171136409044266, + "step": 67000 + }, + { + "epoch": 0.6701, + "grad_norm": 15.1875, + "grad_norm_var": 0.22615559895833334, + "learning_rate": 0.0003, + "loss": 10.8361, + "loss/aux_loss": 0.04806163609027862, + "loss/crossentropy": 2.696385371685028, + "loss/logits": 0.7908263862133026, + "step": 67010 + }, + { + "epoch": 0.6702, + "grad_norm": 16.5, + "grad_norm_var": 0.5753743489583333, + "learning_rate": 0.0003, + "loss": 10.8475, + "loss/aux_loss": 0.04808085560798645, + "loss/crossentropy": 2.712275046110153, + "loss/logits": 0.824543422460556, + "step": 67020 + }, + { + "epoch": 0.6703, + "grad_norm": 14.25, + "grad_norm_var": 0.71640625, + "learning_rate": 0.0003, + "loss": 10.7361, + "loss/aux_loss": 0.04806180745363235, + "loss/crossentropy": 2.527262020111084, + "loss/logits": 0.7941514313220978, + "step": 67030 + }, + { + "epoch": 0.6704, + "grad_norm": 14.5625, + "grad_norm_var": 0.5059895833333333, + "learning_rate": 0.0003, + "loss": 10.8482, + "loss/aux_loss": 0.04806319680064917, + "loss/crossentropy": 2.6808030009269714, + "loss/logits": 0.7904377818107605, + "step": 67040 + }, + { + "epoch": 0.6705, + "grad_norm": 15.125, + "grad_norm_var": 0.41380208333333335, + "learning_rate": 0.0003, + "loss": 10.8096, + "loss/aux_loss": 0.048068330809473994, + "loss/crossentropy": 2.5817946434020995, + "loss/logits": 0.8056021362543107, + "step": 67050 + }, + { + "epoch": 0.6706, + "grad_norm": 15.0, + "grad_norm_var": 0.9958170572916667, + "learning_rate": 0.0003, + "loss": 10.9695, + "loss/aux_loss": 0.048056199215352535, + "loss/crossentropy": 2.782631528377533, + "loss/logits": 0.8429341733455658, + "step": 67060 + }, + { + "epoch": 0.6707, + "grad_norm": 14.5625, + "grad_norm_var": 0.7150390625, + "learning_rate": 0.0003, + "loss": 11.0217, + "loss/aux_loss": 0.04807750023901462, + "loss/crossentropy": 2.7971240878105164, + "loss/logits": 0.8311490327119827, + "step": 67070 + }, + { + "epoch": 0.6708, + "grad_norm": 14.125, + "grad_norm_var": 0.28619791666666666, + "learning_rate": 0.0003, + "loss": 10.9232, + "loss/aux_loss": 0.048067571222782136, + "loss/crossentropy": 2.7258487045764923, + "loss/logits": 0.8153935343027114, + "step": 67080 + }, + { + "epoch": 0.6709, + "grad_norm": 14.5625, + "grad_norm_var": 0.5079264322916667, + "learning_rate": 0.0003, + "loss": 10.7639, + "loss/aux_loss": 0.04806251842528582, + "loss/crossentropy": 2.828408050537109, + "loss/logits": 0.8246762096881867, + "step": 67090 + }, + { + "epoch": 0.671, + "grad_norm": 15.25, + "grad_norm_var": 0.5353515625, + "learning_rate": 0.0003, + "loss": 10.8017, + "loss/aux_loss": 0.04806637335568666, + "loss/crossentropy": 2.6909542202949526, + "loss/logits": 0.7874203026294708, + "step": 67100 + }, + { + "epoch": 0.6711, + "grad_norm": 14.625, + "grad_norm_var": 1.8065104166666666, + "learning_rate": 0.0003, + "loss": 10.7695, + "loss/aux_loss": 0.04807747136801481, + "loss/crossentropy": 2.7141048312187195, + "loss/logits": 0.7738794207572937, + "step": 67110 + }, + { + "epoch": 0.6712, + "grad_norm": 15.0625, + "grad_norm_var": 1.5468098958333334, + "learning_rate": 0.0003, + "loss": 10.8937, + "loss/aux_loss": 0.0480605298653245, + "loss/crossentropy": 2.6752750635147096, + "loss/logits": 0.7823032259941101, + "step": 67120 + }, + { + "epoch": 0.6713, + "grad_norm": 14.625, + "grad_norm_var": 0.30323893229166665, + "learning_rate": 0.0003, + "loss": 10.785, + "loss/aux_loss": 0.04806859977543354, + "loss/crossentropy": 2.962781381607056, + "loss/logits": 0.8514289349317551, + "step": 67130 + }, + { + "epoch": 0.6714, + "grad_norm": 14.9375, + "grad_norm_var": 0.32109375, + "learning_rate": 0.0003, + "loss": 10.9113, + "loss/aux_loss": 0.04805906768888235, + "loss/crossentropy": 2.6575462460517882, + "loss/logits": 0.8165967971086502, + "step": 67140 + }, + { + "epoch": 0.6715, + "grad_norm": 15.6875, + "grad_norm_var": 0.472119140625, + "learning_rate": 0.0003, + "loss": 10.8406, + "loss/aux_loss": 0.04806913398206234, + "loss/crossentropy": 2.6859039187431337, + "loss/logits": 0.7919700384140015, + "step": 67150 + }, + { + "epoch": 0.6716, + "grad_norm": 14.6875, + "grad_norm_var": 0.8429524739583333, + "learning_rate": 0.0003, + "loss": 10.7851, + "loss/aux_loss": 0.04807139337062836, + "loss/crossentropy": 2.7103774309158326, + "loss/logits": 0.8116638362407684, + "step": 67160 + }, + { + "epoch": 0.6717, + "grad_norm": 15.1875, + "grad_norm_var": 0.40870768229166665, + "learning_rate": 0.0003, + "loss": 10.9319, + "loss/aux_loss": 0.04806287419050932, + "loss/crossentropy": 2.6983575582504273, + "loss/logits": 0.7836291432380676, + "step": 67170 + }, + { + "epoch": 0.6718, + "grad_norm": 16.25, + "grad_norm_var": 0.44733072916666666, + "learning_rate": 0.0003, + "loss": 10.8345, + "loss/aux_loss": 0.04806513842195272, + "loss/crossentropy": 2.8540413081645966, + "loss/logits": 0.8250791281461716, + "step": 67180 + }, + { + "epoch": 0.6719, + "grad_norm": 14.875, + "grad_norm_var": 0.6218587239583333, + "learning_rate": 0.0003, + "loss": 10.9294, + "loss/aux_loss": 0.048067282512784006, + "loss/crossentropy": 2.7587247133255004, + "loss/logits": 0.8604197174310684, + "step": 67190 + }, + { + "epoch": 0.672, + "grad_norm": 14.9375, + "grad_norm_var": 1.4669108072916666, + "learning_rate": 0.0003, + "loss": 10.7877, + "loss/aux_loss": 0.04807413946837187, + "loss/crossentropy": 2.6343702554702757, + "loss/logits": 0.8144495546817779, + "step": 67200 + }, + { + "epoch": 0.6721, + "grad_norm": 17.25, + "grad_norm_var": 0.6837076822916667, + "learning_rate": 0.0003, + "loss": 10.9378, + "loss/aux_loss": 0.04806031696498394, + "loss/crossentropy": 2.6540800809860228, + "loss/logits": 0.8209151834249496, + "step": 67210 + }, + { + "epoch": 0.6722, + "grad_norm": 17.25, + "grad_norm_var": 0.6257649739583333, + "learning_rate": 0.0003, + "loss": 10.9277, + "loss/aux_loss": 0.04808404482901096, + "loss/crossentropy": 2.6092947840690615, + "loss/logits": 0.7998175516724586, + "step": 67220 + }, + { + "epoch": 0.6723, + "grad_norm": 15.6875, + "grad_norm_var": 0.7684733072916666, + "learning_rate": 0.0003, + "loss": 10.7706, + "loss/aux_loss": 0.04805383253842592, + "loss/crossentropy": 2.633415186405182, + "loss/logits": 0.8068418264389038, + "step": 67230 + }, + { + "epoch": 0.6724, + "grad_norm": 16.25, + "grad_norm_var": 0.66875, + "learning_rate": 0.0003, + "loss": 10.8816, + "loss/aux_loss": 0.04806835651397705, + "loss/crossentropy": 2.6598873853683473, + "loss/logits": 0.8227006554603576, + "step": 67240 + }, + { + "epoch": 0.6725, + "grad_norm": 14.3125, + "grad_norm_var": 0.51484375, + "learning_rate": 0.0003, + "loss": 10.776, + "loss/aux_loss": 0.04807889815419912, + "loss/crossentropy": 2.6441542148590087, + "loss/logits": 0.8324314415454864, + "step": 67250 + }, + { + "epoch": 0.6726, + "grad_norm": 14.5, + "grad_norm_var": 0.522900390625, + "learning_rate": 0.0003, + "loss": 10.8126, + "loss/aux_loss": 0.04807432275265455, + "loss/crossentropy": 2.5847804844379425, + "loss/logits": 0.766998502612114, + "step": 67260 + }, + { + "epoch": 0.6727, + "grad_norm": 16.0, + "grad_norm_var": 1.5234212239583333, + "learning_rate": 0.0003, + "loss": 10.9851, + "loss/aux_loss": 0.04805461261421442, + "loss/crossentropy": 2.6628905653953554, + "loss/logits": 0.8119227319955826, + "step": 67270 + }, + { + "epoch": 0.6728, + "grad_norm": 15.5, + "grad_norm_var": 0.8550618489583334, + "learning_rate": 0.0003, + "loss": 10.8064, + "loss/aux_loss": 0.0480805242434144, + "loss/crossentropy": 2.709455114603043, + "loss/logits": 0.7887743502855301, + "step": 67280 + }, + { + "epoch": 0.6729, + "grad_norm": 15.6875, + "grad_norm_var": 0.3078125, + "learning_rate": 0.0003, + "loss": 10.8286, + "loss/aux_loss": 0.0480688139796257, + "loss/crossentropy": 2.5801384925842283, + "loss/logits": 0.8218373239040375, + "step": 67290 + }, + { + "epoch": 0.673, + "grad_norm": 14.1875, + "grad_norm_var": 0.9432291666666667, + "learning_rate": 0.0003, + "loss": 10.885, + "loss/aux_loss": 0.04806957859545946, + "loss/crossentropy": 2.741364133358002, + "loss/logits": 0.8136252701282501, + "step": 67300 + }, + { + "epoch": 0.6731, + "grad_norm": 14.6875, + "grad_norm_var": 0.7747395833333334, + "learning_rate": 0.0003, + "loss": 10.8628, + "loss/aux_loss": 0.04806738197803497, + "loss/crossentropy": 2.7313589334487913, + "loss/logits": 0.8280026108026505, + "step": 67310 + }, + { + "epoch": 0.6732, + "grad_norm": 14.625, + "grad_norm_var": 0.38483072916666666, + "learning_rate": 0.0003, + "loss": 10.935, + "loss/aux_loss": 0.04806512799113989, + "loss/crossentropy": 2.731142336130142, + "loss/logits": 0.8196133434772491, + "step": 67320 + }, + { + "epoch": 0.6733, + "grad_norm": 15.1875, + "grad_norm_var": 0.5726399739583333, + "learning_rate": 0.0003, + "loss": 10.9229, + "loss/aux_loss": 0.04806940630078316, + "loss/crossentropy": 2.833940917253494, + "loss/logits": 0.8169467687606812, + "step": 67330 + }, + { + "epoch": 0.6734, + "grad_norm": 15.1875, + "grad_norm_var": 1.2751139322916667, + "learning_rate": 0.0003, + "loss": 10.8098, + "loss/aux_loss": 0.048075680062174796, + "loss/crossentropy": 2.5253068923950197, + "loss/logits": 0.7722189128398895, + "step": 67340 + }, + { + "epoch": 0.6735, + "grad_norm": 16.5, + "grad_norm_var": 1.2885416666666667, + "learning_rate": 0.0003, + "loss": 10.7254, + "loss/aux_loss": 0.04805800002068281, + "loss/crossentropy": 2.618474489450455, + "loss/logits": 0.7805281549692153, + "step": 67350 + }, + { + "epoch": 0.6736, + "grad_norm": 14.5625, + "grad_norm_var": 0.41456705729166665, + "learning_rate": 0.0003, + "loss": 10.8487, + "loss/aux_loss": 0.04805983938276768, + "loss/crossentropy": 2.54278524518013, + "loss/logits": 0.7987161606550217, + "step": 67360 + }, + { + "epoch": 0.6737, + "grad_norm": 14.1875, + "grad_norm_var": 0.29889322916666666, + "learning_rate": 0.0003, + "loss": 10.8717, + "loss/aux_loss": 0.04807660169899464, + "loss/crossentropy": 2.815138578414917, + "loss/logits": 0.8121901094913483, + "step": 67370 + }, + { + "epoch": 0.6738, + "grad_norm": 15.125, + "grad_norm_var": 0.50078125, + "learning_rate": 0.0003, + "loss": 10.96, + "loss/aux_loss": 0.04807771537452936, + "loss/crossentropy": 2.770961511135101, + "loss/logits": 0.8236127972602845, + "step": 67380 + }, + { + "epoch": 0.6739, + "grad_norm": 14.875, + "grad_norm_var": 0.22838541666666667, + "learning_rate": 0.0003, + "loss": 10.9856, + "loss/aux_loss": 0.048058960027992724, + "loss/crossentropy": 2.645844268798828, + "loss/logits": 0.8427874892950058, + "step": 67390 + }, + { + "epoch": 0.674, + "grad_norm": 16.0, + "grad_norm_var": 3.4936848958333333, + "learning_rate": 0.0003, + "loss": 10.8695, + "loss/aux_loss": 0.04807248618453741, + "loss/crossentropy": 2.620198917388916, + "loss/logits": 0.8078225284814835, + "step": 67400 + }, + { + "epoch": 0.6741, + "grad_norm": 16.625, + "grad_norm_var": 3.437613932291667, + "learning_rate": 0.0003, + "loss": 10.7651, + "loss/aux_loss": 0.0480686979368329, + "loss/crossentropy": 2.5131695568561554, + "loss/logits": 0.7861690491437912, + "step": 67410 + }, + { + "epoch": 0.6742, + "grad_norm": 20.125, + "grad_norm_var": 2.122119140625, + "learning_rate": 0.0003, + "loss": 10.8228, + "loss/aux_loss": 0.04807369913905859, + "loss/crossentropy": 2.7605391681194305, + "loss/logits": 0.7911547005176545, + "step": 67420 + }, + { + "epoch": 0.6743, + "grad_norm": 16.125, + "grad_norm_var": 1.8026041666666666, + "learning_rate": 0.0003, + "loss": 10.726, + "loss/aux_loss": 0.04808209650218487, + "loss/crossentropy": 2.621247559785843, + "loss/logits": 0.7596588641405105, + "step": 67430 + }, + { + "epoch": 0.6744, + "grad_norm": 16.5, + "grad_norm_var": 0.91328125, + "learning_rate": 0.0003, + "loss": 10.8972, + "loss/aux_loss": 0.04805378243327141, + "loss/crossentropy": 2.679558593034744, + "loss/logits": 0.8097591936588288, + "step": 67440 + }, + { + "epoch": 0.6745, + "grad_norm": 15.3125, + "grad_norm_var": 0.8770182291666667, + "learning_rate": 0.0003, + "loss": 10.8668, + "loss/aux_loss": 0.04808393493294716, + "loss/crossentropy": 2.6547152400016785, + "loss/logits": 0.7948015958070755, + "step": 67450 + }, + { + "epoch": 0.6746, + "grad_norm": 16.75, + "grad_norm_var": 0.7478515625, + "learning_rate": 0.0003, + "loss": 10.943, + "loss/aux_loss": 0.048087403364479545, + "loss/crossentropy": 2.739590084552765, + "loss/logits": 0.8123195976018905, + "step": 67460 + }, + { + "epoch": 0.6747, + "grad_norm": 14.875, + "grad_norm_var": 1.882666015625, + "learning_rate": 0.0003, + "loss": 10.9271, + "loss/aux_loss": 0.04805273432284594, + "loss/crossentropy": 2.7345412015914916, + "loss/logits": 0.8273571223020554, + "step": 67470 + }, + { + "epoch": 0.6748, + "grad_norm": 15.8125, + "grad_norm_var": 0.30930989583333335, + "learning_rate": 0.0003, + "loss": 10.8483, + "loss/aux_loss": 0.04808681271970272, + "loss/crossentropy": 2.637076383829117, + "loss/logits": 0.7990337044000626, + "step": 67480 + }, + { + "epoch": 0.6749, + "grad_norm": 15.0, + "grad_norm_var": 0.326806640625, + "learning_rate": 0.0003, + "loss": 10.8461, + "loss/aux_loss": 0.04807149153202772, + "loss/crossentropy": 2.694927138090134, + "loss/logits": 0.7909066528081894, + "step": 67490 + }, + { + "epoch": 0.675, + "grad_norm": 15.25, + "grad_norm_var": 0.28899739583333334, + "learning_rate": 0.0003, + "loss": 10.8772, + "loss/aux_loss": 0.04805634953081608, + "loss/crossentropy": 2.811596691608429, + "loss/logits": 0.8140271067619324, + "step": 67500 + }, + { + "epoch": 0.6751, + "grad_norm": 15.5625, + "grad_norm_var": 1.0056640625, + "learning_rate": 0.0003, + "loss": 10.9332, + "loss/aux_loss": 0.04807057473808527, + "loss/crossentropy": 2.724851429462433, + "loss/logits": 0.8114170014858246, + "step": 67510 + }, + { + "epoch": 0.6752, + "grad_norm": 16.25, + "grad_norm_var": 1.0827962239583333, + "learning_rate": 0.0003, + "loss": 10.8187, + "loss/aux_loss": 0.04808598104864359, + "loss/crossentropy": 2.514454412460327, + "loss/logits": 0.7901194989681244, + "step": 67520 + }, + { + "epoch": 0.6753, + "grad_norm": 15.125, + "grad_norm_var": 1.7328125, + "learning_rate": 0.0003, + "loss": 10.7733, + "loss/aux_loss": 0.04806674364954233, + "loss/crossentropy": 2.6894044280052185, + "loss/logits": 0.8146826893091201, + "step": 67530 + }, + { + "epoch": 0.6754, + "grad_norm": 15.8125, + "grad_norm_var": 1.8983723958333334, + "learning_rate": 0.0003, + "loss": 10.9634, + "loss/aux_loss": 0.04807586632668972, + "loss/crossentropy": 2.7447893381118775, + "loss/logits": 0.7936123460531235, + "step": 67540 + }, + { + "epoch": 0.6755, + "grad_norm": 14.0625, + "grad_norm_var": 1.3093098958333333, + "learning_rate": 0.0003, + "loss": 10.7428, + "loss/aux_loss": 0.04806775413453579, + "loss/crossentropy": 2.6494940400123594, + "loss/logits": 0.8177571147680283, + "step": 67550 + }, + { + "epoch": 0.6756, + "grad_norm": 15.1875, + "grad_norm_var": 0.5523274739583334, + "learning_rate": 0.0003, + "loss": 10.9695, + "loss/aux_loss": 0.04806860536336899, + "loss/crossentropy": 2.6481561064720154, + "loss/logits": 0.8320757627487183, + "step": 67560 + }, + { + "epoch": 0.6757, + "grad_norm": 15.4375, + "grad_norm_var": 0.31354166666666666, + "learning_rate": 0.0003, + "loss": 10.9555, + "loss/aux_loss": 0.04807276241481304, + "loss/crossentropy": 2.7160579323768617, + "loss/logits": 0.8053322076797486, + "step": 67570 + }, + { + "epoch": 0.6758, + "grad_norm": 14.5625, + "grad_norm_var": 0.3108723958333333, + "learning_rate": 0.0003, + "loss": 10.8702, + "loss/aux_loss": 0.04807675499469042, + "loss/crossentropy": 2.67366309762001, + "loss/logits": 0.7897478014230728, + "step": 67580 + }, + { + "epoch": 0.6759, + "grad_norm": 15.6875, + "grad_norm_var": 1.7702473958333333, + "learning_rate": 0.0003, + "loss": 10.867, + "loss/aux_loss": 0.048066642321646216, + "loss/crossentropy": 2.6360503315925596, + "loss/logits": 0.8279545217752456, + "step": 67590 + }, + { + "epoch": 0.676, + "grad_norm": 18.5, + "grad_norm_var": 105.2556640625, + "learning_rate": 0.0003, + "loss": 10.8562, + "loss/aux_loss": 0.04806712754070759, + "loss/crossentropy": 2.723428654670715, + "loss/logits": 0.8046755522489548, + "step": 67600 + }, + { + "epoch": 0.6761, + "grad_norm": 15.5, + "grad_norm_var": 1.7884765625, + "learning_rate": 0.0003, + "loss": 10.9624, + "loss/aux_loss": 0.04807472750544548, + "loss/crossentropy": 2.721188187599182, + "loss/logits": 0.8297373622655868, + "step": 67610 + }, + { + "epoch": 0.6762, + "grad_norm": 15.0, + "grad_norm_var": 0.8551920572916667, + "learning_rate": 0.0003, + "loss": 10.8363, + "loss/aux_loss": 0.04807953592389822, + "loss/crossentropy": 2.59203023314476, + "loss/logits": 0.8205517113208771, + "step": 67620 + }, + { + "epoch": 0.6763, + "grad_norm": 15.6875, + "grad_norm_var": 0.448681640625, + "learning_rate": 0.0003, + "loss": 10.9757, + "loss/aux_loss": 0.048056138679385185, + "loss/crossentropy": 2.7822072982788084, + "loss/logits": 0.8061872452497483, + "step": 67630 + }, + { + "epoch": 0.6764, + "grad_norm": 16.625, + "grad_norm_var": 0.5163899739583333, + "learning_rate": 0.0003, + "loss": 10.7339, + "loss/aux_loss": 0.04806150645017624, + "loss/crossentropy": 2.551578390598297, + "loss/logits": 0.7957051217555999, + "step": 67640 + }, + { + "epoch": 0.6765, + "grad_norm": 15.125, + "grad_norm_var": 0.308837890625, + "learning_rate": 0.0003, + "loss": 10.8204, + "loss/aux_loss": 0.048080853372812274, + "loss/crossentropy": 2.618013346195221, + "loss/logits": 0.7895002514123917, + "step": 67650 + }, + { + "epoch": 0.6766, + "grad_norm": 16.125, + "grad_norm_var": 0.353759765625, + "learning_rate": 0.0003, + "loss": 11.013, + "loss/aux_loss": 0.048052550107240674, + "loss/crossentropy": 2.812237298488617, + "loss/logits": 0.8751404196023941, + "step": 67660 + }, + { + "epoch": 0.6767, + "grad_norm": 14.6875, + "grad_norm_var": 0.385009765625, + "learning_rate": 0.0003, + "loss": 10.7287, + "loss/aux_loss": 0.048068588599562645, + "loss/crossentropy": 2.4027431547641753, + "loss/logits": 0.7714583456516266, + "step": 67670 + }, + { + "epoch": 0.6768, + "grad_norm": 15.75, + "grad_norm_var": 0.5728515625, + "learning_rate": 0.0003, + "loss": 10.9295, + "loss/aux_loss": 0.048069121316075325, + "loss/crossentropy": 2.750480669736862, + "loss/logits": 0.8289047926664352, + "step": 67680 + }, + { + "epoch": 0.6769, + "grad_norm": 15.9375, + "grad_norm_var": 1.20859375, + "learning_rate": 0.0003, + "loss": 10.9895, + "loss/aux_loss": 0.04808015916496515, + "loss/crossentropy": 2.8340500593185425, + "loss/logits": 0.8436507463455201, + "step": 67690 + }, + { + "epoch": 0.677, + "grad_norm": 15.1875, + "grad_norm_var": 1.0079264322916666, + "learning_rate": 0.0003, + "loss": 10.9163, + "loss/aux_loss": 0.048059361055493355, + "loss/crossentropy": 2.6917006373405457, + "loss/logits": 0.8295173823833466, + "step": 67700 + }, + { + "epoch": 0.6771, + "grad_norm": 15.9375, + "grad_norm_var": 0.6603515625, + "learning_rate": 0.0003, + "loss": 10.8006, + "loss/aux_loss": 0.04806683622300625, + "loss/crossentropy": 2.6369792103767393, + "loss/logits": 0.7871336251497268, + "step": 67710 + }, + { + "epoch": 0.6772, + "grad_norm": 16.125, + "grad_norm_var": 1.2552083333333333, + "learning_rate": 0.0003, + "loss": 11.0148, + "loss/aux_loss": 0.04807326085865497, + "loss/crossentropy": 2.8675316095352175, + "loss/logits": 0.8307450711727142, + "step": 67720 + }, + { + "epoch": 0.6773, + "grad_norm": 16.375, + "grad_norm_var": 0.6858723958333334, + "learning_rate": 0.0003, + "loss": 10.7185, + "loss/aux_loss": 0.04807858150452375, + "loss/crossentropy": 2.577520215511322, + "loss/logits": 0.8100444704294205, + "step": 67730 + }, + { + "epoch": 0.6774, + "grad_norm": 14.875, + "grad_norm_var": 0.5481770833333334, + "learning_rate": 0.0003, + "loss": 10.849, + "loss/aux_loss": 0.04805320855230093, + "loss/crossentropy": 2.7695468187332155, + "loss/logits": 0.8083236128091812, + "step": 67740 + }, + { + "epoch": 0.6775, + "grad_norm": 15.0625, + "grad_norm_var": 1.6390462239583334, + "learning_rate": 0.0003, + "loss": 10.8675, + "loss/aux_loss": 0.048079329542815685, + "loss/crossentropy": 2.576527512073517, + "loss/logits": 0.8159837514162064, + "step": 67750 + }, + { + "epoch": 0.6776, + "grad_norm": 15.4375, + "grad_norm_var": 0.375, + "learning_rate": 0.0003, + "loss": 10.8023, + "loss/aux_loss": 0.048062733933329584, + "loss/crossentropy": 2.8369166016578675, + "loss/logits": 0.8276311069726944, + "step": 67760 + }, + { + "epoch": 0.6777, + "grad_norm": 14.0, + "grad_norm_var": 1.247119140625, + "learning_rate": 0.0003, + "loss": 10.7408, + "loss/aux_loss": 0.04806175995618105, + "loss/crossentropy": 2.706052553653717, + "loss/logits": 0.8145518273115158, + "step": 67770 + }, + { + "epoch": 0.6778, + "grad_norm": 15.5625, + "grad_norm_var": 0.5853515625, + "learning_rate": 0.0003, + "loss": 10.8887, + "loss/aux_loss": 0.04806300979107618, + "loss/crossentropy": 2.667705309391022, + "loss/logits": 0.8158227071166039, + "step": 67780 + }, + { + "epoch": 0.6779, + "grad_norm": 15.125, + "grad_norm_var": 0.24635416666666668, + "learning_rate": 0.0003, + "loss": 10.8023, + "loss/aux_loss": 0.04807372950017452, + "loss/crossentropy": 2.7875693142414093, + "loss/logits": 0.7981827527284622, + "step": 67790 + }, + { + "epoch": 0.678, + "grad_norm": 16.625, + "grad_norm_var": 0.38553059895833336, + "learning_rate": 0.0003, + "loss": 10.8735, + "loss/aux_loss": 0.048066407814621924, + "loss/crossentropy": 2.7808514714241026, + "loss/logits": 0.8458144783973693, + "step": 67800 + }, + { + "epoch": 0.6781, + "grad_norm": 15.4375, + "grad_norm_var": 0.6945149739583333, + "learning_rate": 0.0003, + "loss": 10.8798, + "loss/aux_loss": 0.048075595125555995, + "loss/crossentropy": 2.605326008796692, + "loss/logits": 0.7936393201351166, + "step": 67810 + }, + { + "epoch": 0.6782, + "grad_norm": 14.875, + "grad_norm_var": 0.8359375, + "learning_rate": 0.0003, + "loss": 10.9042, + "loss/aux_loss": 0.048067840933799746, + "loss/crossentropy": 2.6910835683345793, + "loss/logits": 0.8230360358953476, + "step": 67820 + }, + { + "epoch": 0.6783, + "grad_norm": 16.25, + "grad_norm_var": 3.9468587239583335, + "learning_rate": 0.0003, + "loss": 10.8921, + "loss/aux_loss": 0.048061666823923585, + "loss/crossentropy": 2.8486799359321595, + "loss/logits": 0.8791530191898346, + "step": 67830 + }, + { + "epoch": 0.6784, + "grad_norm": 15.625, + "grad_norm_var": 0.6270182291666667, + "learning_rate": 0.0003, + "loss": 11.0865, + "loss/aux_loss": 0.04808611553162336, + "loss/crossentropy": 2.694787919521332, + "loss/logits": 0.7927749201655387, + "step": 67840 + }, + { + "epoch": 0.6785, + "grad_norm": 15.1875, + "grad_norm_var": 0.6895670572916667, + "learning_rate": 0.0003, + "loss": 10.8049, + "loss/aux_loss": 0.048073581978678705, + "loss/crossentropy": 2.5416905343532563, + "loss/logits": 0.7923329859972, + "step": 67850 + }, + { + "epoch": 0.6786, + "grad_norm": 14.0, + "grad_norm_var": 0.9050618489583333, + "learning_rate": 0.0003, + "loss": 10.9769, + "loss/aux_loss": 0.04805105049163103, + "loss/crossentropy": 2.754929852485657, + "loss/logits": 0.8427582740783691, + "step": 67860 + }, + { + "epoch": 0.6787, + "grad_norm": 16.375, + "grad_norm_var": 0.6212890625, + "learning_rate": 0.0003, + "loss": 10.9937, + "loss/aux_loss": 0.04806809015572071, + "loss/crossentropy": 2.8406196355819704, + "loss/logits": 0.8406887620687484, + "step": 67870 + }, + { + "epoch": 0.6788, + "grad_norm": 14.625, + "grad_norm_var": 0.38201497395833334, + "learning_rate": 0.0003, + "loss": 10.818, + "loss/aux_loss": 0.04806934855878353, + "loss/crossentropy": 2.591457462310791, + "loss/logits": 0.8105394840240479, + "step": 67880 + }, + { + "epoch": 0.6789, + "grad_norm": 15.8125, + "grad_norm_var": 0.4227701822916667, + "learning_rate": 0.0003, + "loss": 10.8923, + "loss/aux_loss": 0.048066012747585776, + "loss/crossentropy": 2.705242431163788, + "loss/logits": 0.8005460679531098, + "step": 67890 + }, + { + "epoch": 0.679, + "grad_norm": 14.6875, + "grad_norm_var": 0.7940104166666667, + "learning_rate": 0.0003, + "loss": 10.7941, + "loss/aux_loss": 0.04805295336991548, + "loss/crossentropy": 2.646135312318802, + "loss/logits": 0.8061649814248085, + "step": 67900 + }, + { + "epoch": 0.6791, + "grad_norm": 16.0, + "grad_norm_var": 0.6841145833333333, + "learning_rate": 0.0003, + "loss": 10.8556, + "loss/aux_loss": 0.04807949885725975, + "loss/crossentropy": 2.5016194105148317, + "loss/logits": 0.7795857936143875, + "step": 67910 + }, + { + "epoch": 0.6792, + "grad_norm": 16.0, + "grad_norm_var": 0.5186848958333333, + "learning_rate": 0.0003, + "loss": 10.8385, + "loss/aux_loss": 0.04806459601968527, + "loss/crossentropy": 2.701348972320557, + "loss/logits": 0.804606556892395, + "step": 67920 + }, + { + "epoch": 0.6793, + "grad_norm": 15.0625, + "grad_norm_var": 0.3329264322916667, + "learning_rate": 0.0003, + "loss": 10.8824, + "loss/aux_loss": 0.04806465972214937, + "loss/crossentropy": 2.708846724033356, + "loss/logits": 0.7914534270763397, + "step": 67930 + }, + { + "epoch": 0.6794, + "grad_norm": 16.5, + "grad_norm_var": 285.405712890625, + "learning_rate": 0.0003, + "loss": 10.877, + "loss/aux_loss": 0.048062794655561444, + "loss/crossentropy": 2.6500784277915956, + "loss/logits": 0.791703137755394, + "step": 67940 + }, + { + "epoch": 0.6795, + "grad_norm": 17.0, + "grad_norm_var": 281.64635416666664, + "learning_rate": 0.0003, + "loss": 11.0174, + "loss/aux_loss": 0.04806160032749176, + "loss/crossentropy": 2.6734558582305907, + "loss/logits": 0.7966185420751571, + "step": 67950 + }, + { + "epoch": 0.6796, + "grad_norm": 17.375, + "grad_norm_var": 2.1645833333333333, + "learning_rate": 0.0003, + "loss": 10.655, + "loss/aux_loss": 0.04807016905397177, + "loss/crossentropy": 2.725010406970978, + "loss/logits": 0.7850367069244385, + "step": 67960 + }, + { + "epoch": 0.6797, + "grad_norm": 15.5, + "grad_norm_var": 1.8442057291666667, + "learning_rate": 0.0003, + "loss": 11.0602, + "loss/aux_loss": 0.04807140734046698, + "loss/crossentropy": 2.7007255434989927, + "loss/logits": 0.843214625120163, + "step": 67970 + }, + { + "epoch": 0.6798, + "grad_norm": 15.6875, + "grad_norm_var": 0.37303059895833335, + "learning_rate": 0.0003, + "loss": 10.7407, + "loss/aux_loss": 0.0480677118524909, + "loss/crossentropy": 2.6968838930130006, + "loss/logits": 0.7861210882663727, + "step": 67980 + }, + { + "epoch": 0.6799, + "grad_norm": 16.625, + "grad_norm_var": 1.053369140625, + "learning_rate": 0.0003, + "loss": 10.8169, + "loss/aux_loss": 0.04806595295667648, + "loss/crossentropy": 2.692828023433685, + "loss/logits": 0.8357434421777725, + "step": 67990 + }, + { + "epoch": 0.68, + "grad_norm": 17.625, + "grad_norm_var": 1.1062337239583333, + "learning_rate": 0.0003, + "loss": 10.9263, + "loss/aux_loss": 0.04806125350296497, + "loss/crossentropy": 2.8140974402427674, + "loss/logits": 0.8311895668506623, + "step": 68000 + }, + { + "epoch": 0.6801, + "grad_norm": 16.75, + "grad_norm_var": 2.515738932291667, + "learning_rate": 0.0003, + "loss": 10.8513, + "loss/aux_loss": 0.04805536307394505, + "loss/crossentropy": 2.7598934888839723, + "loss/logits": 0.817400798201561, + "step": 68010 + }, + { + "epoch": 0.6802, + "grad_norm": 16.5, + "grad_norm_var": 3.612434895833333, + "learning_rate": 0.0003, + "loss": 11.0643, + "loss/aux_loss": 0.04808505550026894, + "loss/crossentropy": 2.6912566304206846, + "loss/logits": 0.819332605600357, + "step": 68020 + }, + { + "epoch": 0.6803, + "grad_norm": 15.375, + "grad_norm_var": 1.826025390625, + "learning_rate": 0.0003, + "loss": 10.9309, + "loss/aux_loss": 0.04807189963757992, + "loss/crossentropy": 2.62255003452301, + "loss/logits": 0.8158010393381119, + "step": 68030 + }, + { + "epoch": 0.6804, + "grad_norm": 16.375, + "grad_norm_var": 2.052604166666667, + "learning_rate": 0.0003, + "loss": 10.9309, + "loss/aux_loss": 0.048042737506330015, + "loss/crossentropy": 2.7515727818012237, + "loss/logits": 0.8346040636301041, + "step": 68040 + }, + { + "epoch": 0.6805, + "grad_norm": 14.5625, + "grad_norm_var": 2.3296223958333333, + "learning_rate": 0.0003, + "loss": 10.8757, + "loss/aux_loss": 0.048085806891322136, + "loss/crossentropy": 2.7306901931762697, + "loss/logits": 0.7944915473461152, + "step": 68050 + }, + { + "epoch": 0.6806, + "grad_norm": 16.875, + "grad_norm_var": 0.5645833333333333, + "learning_rate": 0.0003, + "loss": 10.9716, + "loss/aux_loss": 0.04805421140044928, + "loss/crossentropy": 2.759699082374573, + "loss/logits": 0.8278161138296127, + "step": 68060 + }, + { + "epoch": 0.6807, + "grad_norm": 16.5, + "grad_norm_var": 0.5499348958333333, + "learning_rate": 0.0003, + "loss": 10.984, + "loss/aux_loss": 0.04806980360299349, + "loss/crossentropy": 2.7111220836639403, + "loss/logits": 0.8429874509572983, + "step": 68070 + }, + { + "epoch": 0.6808, + "grad_norm": 14.625, + "grad_norm_var": 0.7937337239583333, + "learning_rate": 0.0003, + "loss": 10.8038, + "loss/aux_loss": 0.04806726835668087, + "loss/crossentropy": 2.7184378623962404, + "loss/logits": 0.8051327586174011, + "step": 68080 + }, + { + "epoch": 0.6809, + "grad_norm": 19.625, + "grad_norm_var": 1.8660807291666666, + "learning_rate": 0.0003, + "loss": 10.9857, + "loss/aux_loss": 0.048076169565320015, + "loss/crossentropy": 2.623359727859497, + "loss/logits": 0.8120907008647918, + "step": 68090 + }, + { + "epoch": 0.681, + "grad_norm": 15.3125, + "grad_norm_var": 1.6804524739583333, + "learning_rate": 0.0003, + "loss": 10.7738, + "loss/aux_loss": 0.04806400742381811, + "loss/crossentropy": 2.5843277633190156, + "loss/logits": 0.7641439378261566, + "step": 68100 + }, + { + "epoch": 0.6811, + "grad_norm": 14.9375, + "grad_norm_var": 0.33671875, + "learning_rate": 0.0003, + "loss": 11.0272, + "loss/aux_loss": 0.048069718293845654, + "loss/crossentropy": 2.7510910749435427, + "loss/logits": 0.8262420713901519, + "step": 68110 + }, + { + "epoch": 0.6812, + "grad_norm": 15.1875, + "grad_norm_var": 0.5855305989583334, + "learning_rate": 0.0003, + "loss": 10.9944, + "loss/aux_loss": 0.04808128289878368, + "loss/crossentropy": 2.745889973640442, + "loss/logits": 0.8404574304819107, + "step": 68120 + }, + { + "epoch": 0.6813, + "grad_norm": 15.625, + "grad_norm_var": 0.40792643229166664, + "learning_rate": 0.0003, + "loss": 10.9824, + "loss/aux_loss": 0.04806574918329716, + "loss/crossentropy": 2.8554471492767335, + "loss/logits": 0.827678182721138, + "step": 68130 + }, + { + "epoch": 0.6814, + "grad_norm": 18.125, + "grad_norm_var": 0.9089680989583333, + "learning_rate": 0.0003, + "loss": 10.8597, + "loss/aux_loss": 0.048068815097212794, + "loss/crossentropy": 2.684776157140732, + "loss/logits": 0.7816103935241699, + "step": 68140 + }, + { + "epoch": 0.6815, + "grad_norm": 16.375, + "grad_norm_var": 0.9602701822916667, + "learning_rate": 0.0003, + "loss": 10.7884, + "loss/aux_loss": 0.04806648455560207, + "loss/crossentropy": 2.6992865085601805, + "loss/logits": 0.8477433979511261, + "step": 68150 + }, + { + "epoch": 0.6816, + "grad_norm": 15.9375, + "grad_norm_var": 0.48072916666666665, + "learning_rate": 0.0003, + "loss": 10.8101, + "loss/aux_loss": 0.048081612959504125, + "loss/crossentropy": 2.5780653059482574, + "loss/logits": 0.7993250101804733, + "step": 68160 + }, + { + "epoch": 0.6817, + "grad_norm": 14.5, + "grad_norm_var": 0.5562337239583334, + "learning_rate": 0.0003, + "loss": 10.8097, + "loss/aux_loss": 0.04806546475738287, + "loss/crossentropy": 2.5478107750415804, + "loss/logits": 0.8167504072189331, + "step": 68170 + }, + { + "epoch": 0.6818, + "grad_norm": 16.0, + "grad_norm_var": 0.9333170572916667, + "learning_rate": 0.0003, + "loss": 10.8763, + "loss/aux_loss": 0.048067673482000826, + "loss/crossentropy": 2.6177058100700377, + "loss/logits": 0.8171544075012207, + "step": 68180 + }, + { + "epoch": 0.6819, + "grad_norm": 13.9375, + "grad_norm_var": 0.3876139322916667, + "learning_rate": 0.0003, + "loss": 10.8284, + "loss/aux_loss": 0.04806404709815979, + "loss/crossentropy": 2.6563953340053557, + "loss/logits": 0.8121503591537476, + "step": 68190 + }, + { + "epoch": 0.682, + "grad_norm": 20.625, + "grad_norm_var": 2.582666015625, + "learning_rate": 0.0003, + "loss": 10.7687, + "loss/aux_loss": 0.048071419820189476, + "loss/crossentropy": 2.6710281014442443, + "loss/logits": 0.8111292243003845, + "step": 68200 + }, + { + "epoch": 0.6821, + "grad_norm": 16.25, + "grad_norm_var": 2.412483723958333, + "learning_rate": 0.0003, + "loss": 10.9164, + "loss/aux_loss": 0.04807704593986273, + "loss/crossentropy": 2.562616801261902, + "loss/logits": 0.8096489131450653, + "step": 68210 + }, + { + "epoch": 0.6822, + "grad_norm": 16.5, + "grad_norm_var": 1.0212076822916667, + "learning_rate": 0.0003, + "loss": 10.8428, + "loss/aux_loss": 0.04805521406233311, + "loss/crossentropy": 2.7523882746696473, + "loss/logits": 0.8298066765069961, + "step": 68220 + }, + { + "epoch": 0.6823, + "grad_norm": 14.875, + "grad_norm_var": 0.9782389322916667, + "learning_rate": 0.0003, + "loss": 10.9563, + "loss/aux_loss": 0.04807618539780378, + "loss/crossentropy": 2.7051311850547792, + "loss/logits": 0.7973050862550736, + "step": 68230 + }, + { + "epoch": 0.6824, + "grad_norm": 14.875, + "grad_norm_var": 0.420166015625, + "learning_rate": 0.0003, + "loss": 10.882, + "loss/aux_loss": 0.048058620654046535, + "loss/crossentropy": 2.6264807403087618, + "loss/logits": 0.8164067506790161, + "step": 68240 + }, + { + "epoch": 0.6825, + "grad_norm": 16.25, + "grad_norm_var": 0.4014973958333333, + "learning_rate": 0.0003, + "loss": 11.0203, + "loss/aux_loss": 0.048072101175785066, + "loss/crossentropy": 2.7212952256202696, + "loss/logits": 0.8090760707855225, + "step": 68250 + }, + { + "epoch": 0.6826, + "grad_norm": 14.9375, + "grad_norm_var": 0.3411458333333333, + "learning_rate": 0.0003, + "loss": 10.9662, + "loss/aux_loss": 0.04806735776364803, + "loss/crossentropy": 2.8346715688705446, + "loss/logits": 0.8262193471193313, + "step": 68260 + }, + { + "epoch": 0.6827, + "grad_norm": 15.25, + "grad_norm_var": 0.539697265625, + "learning_rate": 0.0003, + "loss": 10.845, + "loss/aux_loss": 0.04806448295712471, + "loss/crossentropy": 2.8213131070137023, + "loss/logits": 0.8190656453371048, + "step": 68270 + }, + { + "epoch": 0.6828, + "grad_norm": 16.125, + "grad_norm_var": 0.6005045572916666, + "learning_rate": 0.0003, + "loss": 10.8459, + "loss/aux_loss": 0.048075663857162, + "loss/crossentropy": 2.743839997053146, + "loss/logits": 0.8118878155946732, + "step": 68280 + }, + { + "epoch": 0.6829, + "grad_norm": 15.625, + "grad_norm_var": 0.4852701822916667, + "learning_rate": 0.0003, + "loss": 11.0045, + "loss/aux_loss": 0.048066263645887376, + "loss/crossentropy": 2.638486051559448, + "loss/logits": 0.8266521632671356, + "step": 68290 + }, + { + "epoch": 0.683, + "grad_norm": 15.5625, + "grad_norm_var": 0.48776041666666664, + "learning_rate": 0.0003, + "loss": 11.0403, + "loss/aux_loss": 0.04806353356689215, + "loss/crossentropy": 2.6697838246822356, + "loss/logits": 0.8448014736175538, + "step": 68300 + }, + { + "epoch": 0.6831, + "grad_norm": 14.875, + "grad_norm_var": 0.6639973958333333, + "learning_rate": 0.0003, + "loss": 10.967, + "loss/aux_loss": 0.04807850923389197, + "loss/crossentropy": 2.746517300605774, + "loss/logits": 0.8058427959680557, + "step": 68310 + }, + { + "epoch": 0.6832, + "grad_norm": 15.125, + "grad_norm_var": 0.6393229166666666, + "learning_rate": 0.0003, + "loss": 10.8865, + "loss/aux_loss": 0.0480657272040844, + "loss/crossentropy": 2.5843187630176545, + "loss/logits": 0.768140897154808, + "step": 68320 + }, + { + "epoch": 0.6833, + "grad_norm": 16.25, + "grad_norm_var": 0.5322265625, + "learning_rate": 0.0003, + "loss": 11.0019, + "loss/aux_loss": 0.04805855434387922, + "loss/crossentropy": 2.7240235090255736, + "loss/logits": 0.8157852947711944, + "step": 68330 + }, + { + "epoch": 0.6834, + "grad_norm": 15.9375, + "grad_norm_var": 0.8430826822916667, + "learning_rate": 0.0003, + "loss": 10.8722, + "loss/aux_loss": 0.04806504771113396, + "loss/crossentropy": 2.6174010276794433, + "loss/logits": 0.8211093872785569, + "step": 68340 + }, + { + "epoch": 0.6835, + "grad_norm": 15.9375, + "grad_norm_var": 0.5624348958333333, + "learning_rate": 0.0003, + "loss": 10.7703, + "loss/aux_loss": 0.0480718906968832, + "loss/crossentropy": 2.7147361874580382, + "loss/logits": 0.8232816010713577, + "step": 68350 + }, + { + "epoch": 0.6836, + "grad_norm": 17.125, + "grad_norm_var": 0.713916015625, + "learning_rate": 0.0003, + "loss": 10.7898, + "loss/aux_loss": 0.048076497949659826, + "loss/crossentropy": 2.6525621116161346, + "loss/logits": 0.7956002086400986, + "step": 68360 + }, + { + "epoch": 0.6837, + "grad_norm": 14.8125, + "grad_norm_var": 0.9614583333333333, + "learning_rate": 0.0003, + "loss": 11.029, + "loss/aux_loss": 0.0480541817843914, + "loss/crossentropy": 2.7248359322547913, + "loss/logits": 0.8173481345176696, + "step": 68370 + }, + { + "epoch": 0.6838, + "grad_norm": 14.125, + "grad_norm_var": 0.4525390625, + "learning_rate": 0.0003, + "loss": 11.0335, + "loss/aux_loss": 0.0480697525665164, + "loss/crossentropy": 2.772666358947754, + "loss/logits": 0.8384395599365234, + "step": 68380 + }, + { + "epoch": 0.6839, + "grad_norm": 16.5, + "grad_norm_var": 0.455322265625, + "learning_rate": 0.0003, + "loss": 11.01, + "loss/aux_loss": 0.04807109702378511, + "loss/crossentropy": 2.5988758385181425, + "loss/logits": 0.8189985305070877, + "step": 68390 + }, + { + "epoch": 0.684, + "grad_norm": 15.4375, + "grad_norm_var": 0.2900390625, + "learning_rate": 0.0003, + "loss": 10.8031, + "loss/aux_loss": 0.04807253852486611, + "loss/crossentropy": 2.8541467905044557, + "loss/logits": 0.8169606924057007, + "step": 68400 + }, + { + "epoch": 0.6841, + "grad_norm": 15.9375, + "grad_norm_var": 0.44680989583333336, + "learning_rate": 0.0003, + "loss": 10.8272, + "loss/aux_loss": 0.04806390330195427, + "loss/crossentropy": 2.6487698316574098, + "loss/logits": 0.83295978307724, + "step": 68410 + }, + { + "epoch": 0.6842, + "grad_norm": 15.3125, + "grad_norm_var": 0.5655598958333333, + "learning_rate": 0.0003, + "loss": 10.7979, + "loss/aux_loss": 0.0480561263859272, + "loss/crossentropy": 2.6623626351356506, + "loss/logits": 0.8088810354471206, + "step": 68420 + }, + { + "epoch": 0.6843, + "grad_norm": 16.125, + "grad_norm_var": 0.6546223958333334, + "learning_rate": 0.0003, + "loss": 10.9656, + "loss/aux_loss": 0.04806807264685631, + "loss/crossentropy": 2.766378217935562, + "loss/logits": 0.8358203887939453, + "step": 68430 + }, + { + "epoch": 0.6844, + "grad_norm": 16.375, + "grad_norm_var": 0.37745768229166665, + "learning_rate": 0.0003, + "loss": 10.9736, + "loss/aux_loss": 0.04807545747607946, + "loss/crossentropy": 2.5721539914608003, + "loss/logits": 0.8290560871362687, + "step": 68440 + }, + { + "epoch": 0.6845, + "grad_norm": 16.75, + "grad_norm_var": 0.4228515625, + "learning_rate": 0.0003, + "loss": 10.696, + "loss/aux_loss": 0.04805667717009783, + "loss/crossentropy": 2.80389918088913, + "loss/logits": 0.8191724687814712, + "step": 68450 + }, + { + "epoch": 0.6846, + "grad_norm": 15.1875, + "grad_norm_var": 0.42823893229166665, + "learning_rate": 0.0003, + "loss": 10.7993, + "loss/aux_loss": 0.04807515386492014, + "loss/crossentropy": 2.597550481557846, + "loss/logits": 0.7920944511890411, + "step": 68460 + }, + { + "epoch": 0.6847, + "grad_norm": 16.75, + "grad_norm_var": 0.52421875, + "learning_rate": 0.0003, + "loss": 10.7448, + "loss/aux_loss": 0.048058021068573, + "loss/crossentropy": 2.759678292274475, + "loss/logits": 0.8082679748535156, + "step": 68470 + }, + { + "epoch": 0.6848, + "grad_norm": 16.625, + "grad_norm_var": 0.5075358072916667, + "learning_rate": 0.0003, + "loss": 10.8687, + "loss/aux_loss": 0.0480703329667449, + "loss/crossentropy": 2.637404328584671, + "loss/logits": 0.8026473224163055, + "step": 68480 + }, + { + "epoch": 0.6849, + "grad_norm": 15.375, + "grad_norm_var": 1.1485514322916666, + "learning_rate": 0.0003, + "loss": 10.8992, + "loss/aux_loss": 0.04807462096214295, + "loss/crossentropy": 2.501510390639305, + "loss/logits": 0.7927737981081009, + "step": 68490 + }, + { + "epoch": 0.685, + "grad_norm": 15.4375, + "grad_norm_var": 1.1325520833333333, + "learning_rate": 0.0003, + "loss": 10.9832, + "loss/aux_loss": 0.04806219376623631, + "loss/crossentropy": 2.791027194261551, + "loss/logits": 0.8303872972726822, + "step": 68500 + }, + { + "epoch": 0.6851, + "grad_norm": 15.3125, + "grad_norm_var": 0.8973307291666667, + "learning_rate": 0.0003, + "loss": 11.0139, + "loss/aux_loss": 0.04807308483868837, + "loss/crossentropy": 2.621407997608185, + "loss/logits": 0.8331938594579696, + "step": 68510 + }, + { + "epoch": 0.6852, + "grad_norm": 15.0625, + "grad_norm_var": 0.6895670572916667, + "learning_rate": 0.0003, + "loss": 10.9548, + "loss/aux_loss": 0.04806851521134377, + "loss/crossentropy": 2.852750539779663, + "loss/logits": 0.8136879056692123, + "step": 68520 + }, + { + "epoch": 0.6853, + "grad_norm": 14.875, + "grad_norm_var": 0.3726399739583333, + "learning_rate": 0.0003, + "loss": 10.7068, + "loss/aux_loss": 0.04807034097611904, + "loss/crossentropy": 2.626950180530548, + "loss/logits": 0.8226811677217484, + "step": 68530 + }, + { + "epoch": 0.6854, + "grad_norm": 15.25, + "grad_norm_var": 0.39609375, + "learning_rate": 0.0003, + "loss": 10.893, + "loss/aux_loss": 0.048067341558635235, + "loss/crossentropy": 2.7131328761577604, + "loss/logits": 0.8315279483795166, + "step": 68540 + }, + { + "epoch": 0.6855, + "grad_norm": 14.75, + "grad_norm_var": 0.36666666666666664, + "learning_rate": 0.0003, + "loss": 10.8631, + "loss/aux_loss": 0.048068701103329656, + "loss/crossentropy": 2.760225808620453, + "loss/logits": 0.8215482652187347, + "step": 68550 + }, + { + "epoch": 0.6856, + "grad_norm": 15.0625, + "grad_norm_var": 1.3462890625, + "learning_rate": 0.0003, + "loss": 10.728, + "loss/aux_loss": 0.04806747734546661, + "loss/crossentropy": 2.6939137518405913, + "loss/logits": 0.7923362493515015, + "step": 68560 + }, + { + "epoch": 0.6857, + "grad_norm": 15.75, + "grad_norm_var": 0.9894368489583333, + "learning_rate": 0.0003, + "loss": 10.8082, + "loss/aux_loss": 0.048071125708520415, + "loss/crossentropy": 2.578646457195282, + "loss/logits": 0.7892535030841827, + "step": 68570 + }, + { + "epoch": 0.6858, + "grad_norm": 15.5625, + "grad_norm_var": 0.22578125, + "learning_rate": 0.0003, + "loss": 10.7482, + "loss/aux_loss": 0.048062393255531785, + "loss/crossentropy": 2.7403541207313538, + "loss/logits": 0.7984594285488129, + "step": 68580 + }, + { + "epoch": 0.6859, + "grad_norm": 15.4375, + "grad_norm_var": 1.2327473958333333, + "learning_rate": 0.0003, + "loss": 10.8886, + "loss/aux_loss": 0.04806282836943865, + "loss/crossentropy": 2.47440989613533, + "loss/logits": 0.7788677424192428, + "step": 68590 + }, + { + "epoch": 0.686, + "grad_norm": 17.25, + "grad_norm_var": 0.651806640625, + "learning_rate": 0.0003, + "loss": 10.8691, + "loss/aux_loss": 0.04807308055460453, + "loss/crossentropy": 2.790786528587341, + "loss/logits": 0.8180954813957214, + "step": 68600 + }, + { + "epoch": 0.6861, + "grad_norm": 15.4375, + "grad_norm_var": 0.53203125, + "learning_rate": 0.0003, + "loss": 10.8299, + "loss/aux_loss": 0.04806508533656597, + "loss/crossentropy": 2.843202555179596, + "loss/logits": 0.8343409359455108, + "step": 68610 + }, + { + "epoch": 0.6862, + "grad_norm": 16.375, + "grad_norm_var": 0.434228515625, + "learning_rate": 0.0003, + "loss": 10.9189, + "loss/aux_loss": 0.048060751520097256, + "loss/crossentropy": 2.684927535057068, + "loss/logits": 0.8208200216293335, + "step": 68620 + }, + { + "epoch": 0.6863, + "grad_norm": 16.0, + "grad_norm_var": 0.5707682291666667, + "learning_rate": 0.0003, + "loss": 10.9396, + "loss/aux_loss": 0.04805750884115696, + "loss/crossentropy": 2.725750833749771, + "loss/logits": 0.8101972997188568, + "step": 68630 + }, + { + "epoch": 0.6864, + "grad_norm": 14.8125, + "grad_norm_var": 0.48118489583333335, + "learning_rate": 0.0003, + "loss": 10.8437, + "loss/aux_loss": 0.04807729534804821, + "loss/crossentropy": 2.7160585641860964, + "loss/logits": 0.8321511924266816, + "step": 68640 + }, + { + "epoch": 0.6865, + "grad_norm": 15.0625, + "grad_norm_var": 60.0947265625, + "learning_rate": 0.0003, + "loss": 10.7697, + "loss/aux_loss": 0.048073595762252806, + "loss/crossentropy": 2.7934968948364256, + "loss/logits": 0.8110801339149475, + "step": 68650 + }, + { + "epoch": 0.6866, + "grad_norm": 17.625, + "grad_norm_var": 59.416666666666664, + "learning_rate": 0.0003, + "loss": 10.8273, + "loss/aux_loss": 0.048069017380476, + "loss/crossentropy": 2.611198389530182, + "loss/logits": 0.7912805765867233, + "step": 68660 + }, + { + "epoch": 0.6867, + "grad_norm": 15.875, + "grad_norm_var": 2.5278645833333333, + "learning_rate": 0.0003, + "loss": 10.8545, + "loss/aux_loss": 0.0480608643963933, + "loss/crossentropy": 2.714128017425537, + "loss/logits": 0.8020155668258667, + "step": 68670 + }, + { + "epoch": 0.6868, + "grad_norm": 15.375, + "grad_norm_var": 0.8078125, + "learning_rate": 0.0003, + "loss": 10.8363, + "loss/aux_loss": 0.048069582879543306, + "loss/crossentropy": 2.7775703012943267, + "loss/logits": 0.8089812129735947, + "step": 68680 + }, + { + "epoch": 0.6869, + "grad_norm": 17.875, + "grad_norm_var": 1.0457682291666666, + "learning_rate": 0.0003, + "loss": 10.7583, + "loss/aux_loss": 0.048077752627432344, + "loss/crossentropy": 2.6942376673221586, + "loss/logits": 0.7967723488807679, + "step": 68690 + }, + { + "epoch": 0.687, + "grad_norm": 14.375, + "grad_norm_var": 0.8697265625, + "learning_rate": 0.0003, + "loss": 10.9463, + "loss/aux_loss": 0.04806126933544874, + "loss/crossentropy": 2.6490222990512846, + "loss/logits": 0.8524997681379318, + "step": 68700 + }, + { + "epoch": 0.6871, + "grad_norm": 15.375, + "grad_norm_var": 0.35149739583333334, + "learning_rate": 0.0003, + "loss": 10.6899, + "loss/aux_loss": 0.04805758167058229, + "loss/crossentropy": 2.6266492545604705, + "loss/logits": 0.7861977398395539, + "step": 68710 + }, + { + "epoch": 0.6872, + "grad_norm": 16.375, + "grad_norm_var": 0.5044270833333333, + "learning_rate": 0.0003, + "loss": 11.0628, + "loss/aux_loss": 0.048070350848138335, + "loss/crossentropy": 2.7783553838729858, + "loss/logits": 0.8349015235900878, + "step": 68720 + }, + { + "epoch": 0.6873, + "grad_norm": 14.5625, + "grad_norm_var": 0.9837890625, + "learning_rate": 0.0003, + "loss": 10.8576, + "loss/aux_loss": 0.04807405862957239, + "loss/crossentropy": 2.7715602993965147, + "loss/logits": 0.7893172383308411, + "step": 68730 + }, + { + "epoch": 0.6874, + "grad_norm": 15.0625, + "grad_norm_var": 0.4676920572916667, + "learning_rate": 0.0003, + "loss": 10.7511, + "loss/aux_loss": 0.04806721042841673, + "loss/crossentropy": 2.721312952041626, + "loss/logits": 0.8222410500049591, + "step": 68740 + }, + { + "epoch": 0.6875, + "grad_norm": 14.375, + "grad_norm_var": 0.4657389322916667, + "learning_rate": 0.0003, + "loss": 11.0532, + "loss/aux_loss": 0.04806132633239031, + "loss/crossentropy": 2.6498945474624636, + "loss/logits": 0.8259395629167556, + "step": 68750 + }, + { + "epoch": 0.6876, + "grad_norm": 14.0625, + "grad_norm_var": 0.298681640625, + "learning_rate": 0.0003, + "loss": 10.7399, + "loss/aux_loss": 0.04808110278099775, + "loss/crossentropy": 2.6188538670539856, + "loss/logits": 0.7968878641724586, + "step": 68760 + }, + { + "epoch": 0.6877, + "grad_norm": 14.6875, + "grad_norm_var": 0.4171223958333333, + "learning_rate": 0.0003, + "loss": 10.9881, + "loss/aux_loss": 0.048065911047160625, + "loss/crossentropy": 2.623822647333145, + "loss/logits": 0.8174702137708664, + "step": 68770 + }, + { + "epoch": 0.6878, + "grad_norm": 14.5, + "grad_norm_var": 1.20625, + "learning_rate": 0.0003, + "loss": 10.9751, + "loss/aux_loss": 0.04807086084038019, + "loss/crossentropy": 2.7264267265796662, + "loss/logits": 0.8461625635623932, + "step": 68780 + }, + { + "epoch": 0.6879, + "grad_norm": 15.0625, + "grad_norm_var": 0.814697265625, + "learning_rate": 0.0003, + "loss": 10.7834, + "loss/aux_loss": 0.04806190486997366, + "loss/crossentropy": 2.8499103784561157, + "loss/logits": 0.846915426850319, + "step": 68790 + }, + { + "epoch": 0.688, + "grad_norm": 14.5, + "grad_norm_var": 0.7809895833333333, + "learning_rate": 0.0003, + "loss": 11.0182, + "loss/aux_loss": 0.048072070069611075, + "loss/crossentropy": 2.6494312465190886, + "loss/logits": 0.8267855823040009, + "step": 68800 + }, + { + "epoch": 0.6881, + "grad_norm": 17.5, + "grad_norm_var": 1.037744140625, + "learning_rate": 0.0003, + "loss": 10.9319, + "loss/aux_loss": 0.04805615525692701, + "loss/crossentropy": 2.509679216146469, + "loss/logits": 0.79253771007061, + "step": 68810 + }, + { + "epoch": 0.6882, + "grad_norm": 15.4375, + "grad_norm_var": 0.62421875, + "learning_rate": 0.0003, + "loss": 10.9166, + "loss/aux_loss": 0.04807599224150181, + "loss/crossentropy": 2.87341451048851, + "loss/logits": 0.8474191457033158, + "step": 68820 + }, + { + "epoch": 0.6883, + "grad_norm": 15.375, + "grad_norm_var": 0.2955729166666667, + "learning_rate": 0.0003, + "loss": 11.0466, + "loss/aux_loss": 0.048066175729036334, + "loss/crossentropy": 2.767427670955658, + "loss/logits": 0.8232954949140548, + "step": 68830 + }, + { + "epoch": 0.6884, + "grad_norm": 14.875, + "grad_norm_var": 0.38409830729166666, + "learning_rate": 0.0003, + "loss": 10.9533, + "loss/aux_loss": 0.04806645177304745, + "loss/crossentropy": 2.707893443107605, + "loss/logits": 0.8553780347108841, + "step": 68840 + }, + { + "epoch": 0.6885, + "grad_norm": 15.5625, + "grad_norm_var": 0.5254557291666667, + "learning_rate": 0.0003, + "loss": 11.0061, + "loss/aux_loss": 0.04807383120059967, + "loss/crossentropy": 2.6774882674217224, + "loss/logits": 0.8080006390810013, + "step": 68850 + }, + { + "epoch": 0.6886, + "grad_norm": 15.4375, + "grad_norm_var": 2.9567057291666665, + "learning_rate": 0.0003, + "loss": 10.856, + "loss/aux_loss": 0.048062294721603394, + "loss/crossentropy": 2.6642824053764342, + "loss/logits": 0.7930444091558456, + "step": 68860 + }, + { + "epoch": 0.6887, + "grad_norm": 15.8125, + "grad_norm_var": 3.0775390625, + "learning_rate": 0.0003, + "loss": 10.9136, + "loss/aux_loss": 0.048069755733013156, + "loss/crossentropy": 2.7065019488334654, + "loss/logits": 0.8077657282352447, + "step": 68870 + }, + { + "epoch": 0.6888, + "grad_norm": 16.875, + "grad_norm_var": 0.6134765625, + "learning_rate": 0.0003, + "loss": 11.0391, + "loss/aux_loss": 0.048073760420084, + "loss/crossentropy": 2.668808138370514, + "loss/logits": 0.8384716600179672, + "step": 68880 + }, + { + "epoch": 0.6889, + "grad_norm": 15.75, + "grad_norm_var": 0.561962890625, + "learning_rate": 0.0003, + "loss": 10.8771, + "loss/aux_loss": 0.048064139857888225, + "loss/crossentropy": 2.6391066908836365, + "loss/logits": 0.8003528326749801, + "step": 68890 + }, + { + "epoch": 0.689, + "grad_norm": 15.0, + "grad_norm_var": 328.2468587239583, + "learning_rate": 0.0003, + "loss": 10.9101, + "loss/aux_loss": 0.048088057711720464, + "loss/crossentropy": 2.670439213514328, + "loss/logits": 0.7999959751963616, + "step": 68900 + }, + { + "epoch": 0.6891, + "grad_norm": 16.75, + "grad_norm_var": 0.926416015625, + "learning_rate": 0.0003, + "loss": 10.7369, + "loss/aux_loss": 0.048071911372244355, + "loss/crossentropy": 2.590096038579941, + "loss/logits": 0.7963913947343826, + "step": 68910 + }, + { + "epoch": 0.6892, + "grad_norm": 16.25, + "grad_norm_var": 0.5931640625, + "learning_rate": 0.0003, + "loss": 10.8433, + "loss/aux_loss": 0.04806236661970616, + "loss/crossentropy": 2.7095581710338594, + "loss/logits": 0.808975538611412, + "step": 68920 + }, + { + "epoch": 0.6893, + "grad_norm": 15.0, + "grad_norm_var": 0.7262858072916667, + "learning_rate": 0.0003, + "loss": 10.8429, + "loss/aux_loss": 0.04807297587394714, + "loss/crossentropy": 2.716857922077179, + "loss/logits": 0.8298698961734772, + "step": 68930 + }, + { + "epoch": 0.6894, + "grad_norm": 16.125, + "grad_norm_var": 0.5973958333333333, + "learning_rate": 0.0003, + "loss": 10.8001, + "loss/aux_loss": 0.04806691724807024, + "loss/crossentropy": 2.7727342784404754, + "loss/logits": 0.7861407697200775, + "step": 68940 + }, + { + "epoch": 0.6895, + "grad_norm": 15.875, + "grad_norm_var": 0.4127604166666667, + "learning_rate": 0.0003, + "loss": 10.6628, + "loss/aux_loss": 0.04806840233504772, + "loss/crossentropy": 2.7098950922489164, + "loss/logits": 0.82764173746109, + "step": 68950 + }, + { + "epoch": 0.6896, + "grad_norm": 16.5, + "grad_norm_var": 0.3653483072916667, + "learning_rate": 0.0003, + "loss": 10.8558, + "loss/aux_loss": 0.048067934811115265, + "loss/crossentropy": 2.679134911298752, + "loss/logits": 0.8346406280994415, + "step": 68960 + }, + { + "epoch": 0.6897, + "grad_norm": 15.3125, + "grad_norm_var": 0.42083333333333334, + "learning_rate": 0.0003, + "loss": 10.706, + "loss/aux_loss": 0.04807941559702158, + "loss/crossentropy": 2.5862072229385378, + "loss/logits": 0.784822764992714, + "step": 68970 + }, + { + "epoch": 0.6898, + "grad_norm": 15.25, + "grad_norm_var": 0.38333333333333336, + "learning_rate": 0.0003, + "loss": 10.969, + "loss/aux_loss": 0.04805704411119223, + "loss/crossentropy": 2.7672839522361756, + "loss/logits": 0.8178101569414139, + "step": 68980 + }, + { + "epoch": 0.6899, + "grad_norm": 15.25, + "grad_norm_var": 3.1681640625, + "learning_rate": 0.0003, + "loss": 10.7957, + "loss/aux_loss": 0.0480627154931426, + "loss/crossentropy": 2.7157513022422792, + "loss/logits": 0.8189638644456864, + "step": 68990 + }, + { + "epoch": 0.69, + "grad_norm": 14.625, + "grad_norm_var": 0.68828125, + "learning_rate": 0.0003, + "loss": 10.8248, + "loss/aux_loss": 0.048067789524793625, + "loss/crossentropy": 2.6678106248378755, + "loss/logits": 0.8147345900535583, + "step": 69000 + }, + { + "epoch": 0.6901, + "grad_norm": 16.5, + "grad_norm_var": 0.6473958333333333, + "learning_rate": 0.0003, + "loss": 10.9064, + "loss/aux_loss": 0.04807272534817457, + "loss/crossentropy": 2.617716884613037, + "loss/logits": 0.8001246243715286, + "step": 69010 + }, + { + "epoch": 0.6902, + "grad_norm": 16.75, + "grad_norm_var": 3.076416015625, + "learning_rate": 0.0003, + "loss": 10.9515, + "loss/aux_loss": 0.0480681125074625, + "loss/crossentropy": 2.7977681756019592, + "loss/logits": 0.8114293158054352, + "step": 69020 + }, + { + "epoch": 0.6903, + "grad_norm": 15.875, + "grad_norm_var": 2.7058430989583333, + "learning_rate": 0.0003, + "loss": 10.7312, + "loss/aux_loss": 0.048084371723234653, + "loss/crossentropy": 2.6386759102344515, + "loss/logits": 0.808656194806099, + "step": 69030 + }, + { + "epoch": 0.6904, + "grad_norm": 16.75, + "grad_norm_var": 0.43430989583333335, + "learning_rate": 0.0003, + "loss": 10.9418, + "loss/aux_loss": 0.04805478285998106, + "loss/crossentropy": 2.640076959133148, + "loss/logits": 0.7902828812599182, + "step": 69040 + }, + { + "epoch": 0.6905, + "grad_norm": 16.375, + "grad_norm_var": 1.1566243489583334, + "learning_rate": 0.0003, + "loss": 10.8981, + "loss/aux_loss": 0.048069654405117034, + "loss/crossentropy": 2.6928559839725494, + "loss/logits": 0.8163411170244217, + "step": 69050 + }, + { + "epoch": 0.6906, + "grad_norm": 15.1875, + "grad_norm_var": 0.7915201822916667, + "learning_rate": 0.0003, + "loss": 10.9199, + "loss/aux_loss": 0.048076878674328326, + "loss/crossentropy": 2.6690307438373564, + "loss/logits": 0.8052759945392609, + "step": 69060 + }, + { + "epoch": 0.6907, + "grad_norm": 14.875, + "grad_norm_var": 0.46599934895833334, + "learning_rate": 0.0003, + "loss": 10.7491, + "loss/aux_loss": 0.048068926110863684, + "loss/crossentropy": 2.593144977092743, + "loss/logits": 0.7935424596071243, + "step": 69070 + }, + { + "epoch": 0.6908, + "grad_norm": 15.5625, + "grad_norm_var": 0.41868489583333335, + "learning_rate": 0.0003, + "loss": 10.8175, + "loss/aux_loss": 0.04806020874530077, + "loss/crossentropy": 2.6784588575363157, + "loss/logits": 0.8021058231592179, + "step": 69080 + }, + { + "epoch": 0.6909, + "grad_norm": 15.125, + "grad_norm_var": 0.7081868489583333, + "learning_rate": 0.0003, + "loss": 10.9733, + "loss/aux_loss": 0.04807012509554624, + "loss/crossentropy": 2.667149120569229, + "loss/logits": 0.8256219893693924, + "step": 69090 + }, + { + "epoch": 0.691, + "grad_norm": 15.5625, + "grad_norm_var": 0.5306640625, + "learning_rate": 0.0003, + "loss": 10.6466, + "loss/aux_loss": 0.04807849489152431, + "loss/crossentropy": 2.6570124447345735, + "loss/logits": 0.7915462791919708, + "step": 69100 + }, + { + "epoch": 0.6911, + "grad_norm": 14.4375, + "grad_norm_var": 0.324853515625, + "learning_rate": 0.0003, + "loss": 10.8218, + "loss/aux_loss": 0.048054102994501594, + "loss/crossentropy": 2.5824302971363067, + "loss/logits": 0.8005838513374328, + "step": 69110 + }, + { + "epoch": 0.6912, + "grad_norm": 16.75, + "grad_norm_var": 0.48748372395833334, + "learning_rate": 0.0003, + "loss": 10.9588, + "loss/aux_loss": 0.04807182941585779, + "loss/crossentropy": 2.7127415359020235, + "loss/logits": 0.824643325805664, + "step": 69120 + }, + { + "epoch": 0.6913, + "grad_norm": 15.5, + "grad_norm_var": 0.5541015625, + "learning_rate": 0.0003, + "loss": 10.8355, + "loss/aux_loss": 0.0480794757604599, + "loss/crossentropy": 2.832990896701813, + "loss/logits": 0.835834476351738, + "step": 69130 + }, + { + "epoch": 0.6914, + "grad_norm": 15.8125, + "grad_norm_var": 0.6528483072916667, + "learning_rate": 0.0003, + "loss": 10.8559, + "loss/aux_loss": 0.04806724786758423, + "loss/crossentropy": 2.5886457681655886, + "loss/logits": 0.792993089556694, + "step": 69140 + }, + { + "epoch": 0.6915, + "grad_norm": 14.875, + "grad_norm_var": 0.24088541666666666, + "learning_rate": 0.0003, + "loss": 10.8768, + "loss/aux_loss": 0.04806765224784613, + "loss/crossentropy": 2.728767251968384, + "loss/logits": 0.8263113409280777, + "step": 69150 + }, + { + "epoch": 0.6916, + "grad_norm": 16.0, + "grad_norm_var": 0.5479166666666667, + "learning_rate": 0.0003, + "loss": 10.8623, + "loss/aux_loss": 0.048062493838369844, + "loss/crossentropy": 2.649292767047882, + "loss/logits": 0.8204698622226715, + "step": 69160 + }, + { + "epoch": 0.6917, + "grad_norm": 17.375, + "grad_norm_var": 0.8387858072916666, + "learning_rate": 0.0003, + "loss": 11.0247, + "loss/aux_loss": 0.04808208290487528, + "loss/crossentropy": 2.7280562281608582, + "loss/logits": 0.8480396270751953, + "step": 69170 + }, + { + "epoch": 0.6918, + "grad_norm": 14.6875, + "grad_norm_var": 1.9921223958333334, + "learning_rate": 0.0003, + "loss": 10.8223, + "loss/aux_loss": 0.04805909302085638, + "loss/crossentropy": 2.467037004232407, + "loss/logits": 0.7907186537981034, + "step": 69180 + }, + { + "epoch": 0.6919, + "grad_norm": 17.75, + "grad_norm_var": 0.7739583333333333, + "learning_rate": 0.0003, + "loss": 10.8994, + "loss/aux_loss": 0.04806915447115898, + "loss/crossentropy": 2.8982559561729433, + "loss/logits": 0.8535742044448853, + "step": 69190 + }, + { + "epoch": 0.692, + "grad_norm": 15.3125, + "grad_norm_var": 0.9844889322916667, + "learning_rate": 0.0003, + "loss": 10.8113, + "loss/aux_loss": 0.04806341417133808, + "loss/crossentropy": 2.7215474128723143, + "loss/logits": 0.8118506580591202, + "step": 69200 + }, + { + "epoch": 0.6921, + "grad_norm": 15.4375, + "grad_norm_var": 1.2166015625, + "learning_rate": 0.0003, + "loss": 10.9878, + "loss/aux_loss": 0.04807480592280626, + "loss/crossentropy": 2.6422359228134153, + "loss/logits": 0.8116691440343857, + "step": 69210 + }, + { + "epoch": 0.6922, + "grad_norm": 16.125, + "grad_norm_var": 0.509375, + "learning_rate": 0.0003, + "loss": 10.9659, + "loss/aux_loss": 0.048063729889690876, + "loss/crossentropy": 2.697904723882675, + "loss/logits": 0.8118081420660019, + "step": 69220 + }, + { + "epoch": 0.6923, + "grad_norm": 14.8125, + "grad_norm_var": 3.744124348958333, + "learning_rate": 0.0003, + "loss": 10.758, + "loss/aux_loss": 0.04806367959827185, + "loss/crossentropy": 2.5523222506046297, + "loss/logits": 0.7924966961145401, + "step": 69230 + }, + { + "epoch": 0.6924, + "grad_norm": 15.375, + "grad_norm_var": 4.15390625, + "learning_rate": 0.0003, + "loss": 10.8883, + "loss/aux_loss": 0.04806468244642019, + "loss/crossentropy": 2.8213607549667357, + "loss/logits": 0.8581427276134491, + "step": 69240 + }, + { + "epoch": 0.6925, + "grad_norm": 15.0625, + "grad_norm_var": 0.776806640625, + "learning_rate": 0.0003, + "loss": 10.9512, + "loss/aux_loss": 0.048063874058425424, + "loss/crossentropy": 2.7791464805603026, + "loss/logits": 0.8555617034435272, + "step": 69250 + }, + { + "epoch": 0.6926, + "grad_norm": 15.5625, + "grad_norm_var": 0.5567057291666667, + "learning_rate": 0.0003, + "loss": 10.8125, + "loss/aux_loss": 0.04807287901639938, + "loss/crossentropy": 2.721563369035721, + "loss/logits": 0.8256457418203353, + "step": 69260 + }, + { + "epoch": 0.6927, + "grad_norm": 15.625, + "grad_norm_var": 1.0127604166666666, + "learning_rate": 0.0003, + "loss": 10.7597, + "loss/aux_loss": 0.04807109721004963, + "loss/crossentropy": 2.6567570507526397, + "loss/logits": 0.7937098532915116, + "step": 69270 + }, + { + "epoch": 0.6928, + "grad_norm": 14.625, + "grad_norm_var": 0.8239420572916667, + "learning_rate": 0.0003, + "loss": 10.922, + "loss/aux_loss": 0.04806143771857023, + "loss/crossentropy": 2.618191432952881, + "loss/logits": 0.7884357571601868, + "step": 69280 + }, + { + "epoch": 0.6929, + "grad_norm": 15.3125, + "grad_norm_var": 1.0782389322916666, + "learning_rate": 0.0003, + "loss": 10.9623, + "loss/aux_loss": 0.04808080028742552, + "loss/crossentropy": 2.732917082309723, + "loss/logits": 0.8358202904462815, + "step": 69290 + }, + { + "epoch": 0.693, + "grad_norm": 14.625, + "grad_norm_var": 1.6150390625, + "learning_rate": 0.0003, + "loss": 10.949, + "loss/aux_loss": 0.048059186339378356, + "loss/crossentropy": 2.7098691940307615, + "loss/logits": 0.8036975592374802, + "step": 69300 + }, + { + "epoch": 0.6931, + "grad_norm": 14.5, + "grad_norm_var": 0.6299479166666667, + "learning_rate": 0.0003, + "loss": 10.8893, + "loss/aux_loss": 0.04807203523814678, + "loss/crossentropy": 2.675479108095169, + "loss/logits": 0.8248216599225998, + "step": 69310 + }, + { + "epoch": 0.6932, + "grad_norm": 16.375, + "grad_norm_var": 0.5020833333333333, + "learning_rate": 0.0003, + "loss": 10.7723, + "loss/aux_loss": 0.04805539548397064, + "loss/crossentropy": 2.6765593349933625, + "loss/logits": 0.8203548967838288, + "step": 69320 + }, + { + "epoch": 0.6933, + "grad_norm": 14.125, + "grad_norm_var": 1.6524576822916666, + "learning_rate": 0.0003, + "loss": 10.8687, + "loss/aux_loss": 0.04807414021342993, + "loss/crossentropy": 2.7173945188522337, + "loss/logits": 0.8036992192268372, + "step": 69330 + }, + { + "epoch": 0.6934, + "grad_norm": 15.25, + "grad_norm_var": 1.7469889322916667, + "learning_rate": 0.0003, + "loss": 10.9554, + "loss/aux_loss": 0.04807598683983087, + "loss/crossentropy": 2.8862260222434997, + "loss/logits": 0.809663537144661, + "step": 69340 + }, + { + "epoch": 0.6935, + "grad_norm": 15.375, + "grad_norm_var": 0.6254557291666667, + "learning_rate": 0.0003, + "loss": 10.6077, + "loss/aux_loss": 0.04807170238345861, + "loss/crossentropy": 2.7360516667366026, + "loss/logits": 0.8002244532108307, + "step": 69350 + }, + { + "epoch": 0.6936, + "grad_norm": 15.625, + "grad_norm_var": 0.5206868489583333, + "learning_rate": 0.0003, + "loss": 10.9361, + "loss/aux_loss": 0.04806582704186439, + "loss/crossentropy": 2.6366516649723053, + "loss/logits": 0.7836940854787826, + "step": 69360 + }, + { + "epoch": 0.6937, + "grad_norm": 15.0625, + "grad_norm_var": 1.025244140625, + "learning_rate": 0.0003, + "loss": 10.8865, + "loss/aux_loss": 0.04807285238057375, + "loss/crossentropy": 2.700425660610199, + "loss/logits": 0.786887913942337, + "step": 69370 + }, + { + "epoch": 0.6938, + "grad_norm": 15.9375, + "grad_norm_var": 1.1244140625, + "learning_rate": 0.0003, + "loss": 10.7267, + "loss/aux_loss": 0.048068417236208916, + "loss/crossentropy": 2.751612478494644, + "loss/logits": 0.8176582008600235, + "step": 69380 + }, + { + "epoch": 0.6939, + "grad_norm": 14.5, + "grad_norm_var": 0.6707682291666667, + "learning_rate": 0.0003, + "loss": 10.8515, + "loss/aux_loss": 0.04806055538356304, + "loss/crossentropy": 2.6871352314949037, + "loss/logits": 0.8337746620178222, + "step": 69390 + }, + { + "epoch": 0.694, + "grad_norm": 14.8125, + "grad_norm_var": 0.42239583333333336, + "learning_rate": 0.0003, + "loss": 10.9541, + "loss/aux_loss": 0.04807214047759771, + "loss/crossentropy": 2.9056159615516663, + "loss/logits": 0.8280203819274903, + "step": 69400 + }, + { + "epoch": 0.6941, + "grad_norm": 14.25, + "grad_norm_var": 0.9585774739583334, + "learning_rate": 0.0003, + "loss": 10.8548, + "loss/aux_loss": 0.048070849664509294, + "loss/crossentropy": 2.7352217197418214, + "loss/logits": 0.822101253271103, + "step": 69410 + }, + { + "epoch": 0.6942, + "grad_norm": 17.0, + "grad_norm_var": 1.1666015625, + "learning_rate": 0.0003, + "loss": 10.8417, + "loss/aux_loss": 0.04806300960481167, + "loss/crossentropy": 2.747141933441162, + "loss/logits": 0.8111588656902313, + "step": 69420 + }, + { + "epoch": 0.6943, + "grad_norm": 15.0625, + "grad_norm_var": 0.666259765625, + "learning_rate": 0.0003, + "loss": 10.9775, + "loss/aux_loss": 0.04806242380291224, + "loss/crossentropy": 2.7164130806922913, + "loss/logits": 0.8515024065971375, + "step": 69430 + }, + { + "epoch": 0.6944, + "grad_norm": 16.75, + "grad_norm_var": 128.33709309895832, + "learning_rate": 0.0003, + "loss": 11.0138, + "loss/aux_loss": 0.048085224255919455, + "loss/crossentropy": 2.6374498426914217, + "loss/logits": 0.8130934327840805, + "step": 69440 + }, + { + "epoch": 0.6945, + "grad_norm": 16.25, + "grad_norm_var": 2.3645833333333335, + "learning_rate": 0.0003, + "loss": 10.7526, + "loss/aux_loss": 0.04807570818811655, + "loss/crossentropy": 2.6251815021038056, + "loss/logits": 0.8005797922611236, + "step": 69450 + }, + { + "epoch": 0.6946, + "grad_norm": 14.875, + "grad_norm_var": 0.9205729166666666, + "learning_rate": 0.0003, + "loss": 10.845, + "loss/aux_loss": 0.048060524836182596, + "loss/crossentropy": 2.734144937992096, + "loss/logits": 0.8050288885831833, + "step": 69460 + }, + { + "epoch": 0.6947, + "grad_norm": 15.625, + "grad_norm_var": 0.60078125, + "learning_rate": 0.0003, + "loss": 10.8845, + "loss/aux_loss": 0.048075083270668985, + "loss/crossentropy": 2.705908918380737, + "loss/logits": 0.8168601602315902, + "step": 69470 + }, + { + "epoch": 0.6948, + "grad_norm": 15.5, + "grad_norm_var": 0.7356770833333334, + "learning_rate": 0.0003, + "loss": 10.7593, + "loss/aux_loss": 0.048058357648551465, + "loss/crossentropy": 2.618184173107147, + "loss/logits": 0.8116142481565476, + "step": 69480 + }, + { + "epoch": 0.6949, + "grad_norm": 16.125, + "grad_norm_var": 1.1541015625, + "learning_rate": 0.0003, + "loss": 10.9445, + "loss/aux_loss": 0.0480714937672019, + "loss/crossentropy": 2.6427643597126007, + "loss/logits": 0.8324632406234741, + "step": 69490 + }, + { + "epoch": 0.695, + "grad_norm": 15.0625, + "grad_norm_var": 0.9432291666666667, + "learning_rate": 0.0003, + "loss": 10.7763, + "loss/aux_loss": 0.048056024312973025, + "loss/crossentropy": 2.7632019460201263, + "loss/logits": 0.8252893060445785, + "step": 69500 + }, + { + "epoch": 0.6951, + "grad_norm": 15.25, + "grad_norm_var": 0.4384765625, + "learning_rate": 0.0003, + "loss": 10.904, + "loss/aux_loss": 0.04805950913578272, + "loss/crossentropy": 2.741923874616623, + "loss/logits": 0.8043259769678116, + "step": 69510 + }, + { + "epoch": 0.6952, + "grad_norm": 14.8125, + "grad_norm_var": 0.211962890625, + "learning_rate": 0.0003, + "loss": 10.8344, + "loss/aux_loss": 0.04806842133402824, + "loss/crossentropy": 2.824173706769943, + "loss/logits": 0.8347906857728958, + "step": 69520 + }, + { + "epoch": 0.6953, + "grad_norm": 15.75, + "grad_norm_var": 0.379541015625, + "learning_rate": 0.0003, + "loss": 10.9516, + "loss/aux_loss": 0.04806260485202074, + "loss/crossentropy": 2.81458033323288, + "loss/logits": 0.8338780552148819, + "step": 69530 + }, + { + "epoch": 0.6954, + "grad_norm": 15.4375, + "grad_norm_var": 0.35201822916666664, + "learning_rate": 0.0003, + "loss": 10.7228, + "loss/aux_loss": 0.04807497151196003, + "loss/crossentropy": 2.718851935863495, + "loss/logits": 0.7873619675636292, + "step": 69540 + }, + { + "epoch": 0.6955, + "grad_norm": 15.25, + "grad_norm_var": 0.40358072916666665, + "learning_rate": 0.0003, + "loss": 11.0079, + "loss/aux_loss": 0.04806389175355434, + "loss/crossentropy": 2.8200283885002135, + "loss/logits": 0.8427982300519943, + "step": 69550 + }, + { + "epoch": 0.6956, + "grad_norm": 16.125, + "grad_norm_var": 0.8676432291666667, + "learning_rate": 0.0003, + "loss": 10.9712, + "loss/aux_loss": 0.04806965868920088, + "loss/crossentropy": 2.617030918598175, + "loss/logits": 0.7800733983516693, + "step": 69560 + }, + { + "epoch": 0.6957, + "grad_norm": 15.25, + "grad_norm_var": 0.790869140625, + "learning_rate": 0.0003, + "loss": 10.6778, + "loss/aux_loss": 0.0480612862855196, + "loss/crossentropy": 2.6069294095039366, + "loss/logits": 0.7806821346282959, + "step": 69570 + }, + { + "epoch": 0.6958, + "grad_norm": 15.3125, + "grad_norm_var": 0.493994140625, + "learning_rate": 0.0003, + "loss": 10.7764, + "loss/aux_loss": 0.04807495810091496, + "loss/crossentropy": 2.655094450712204, + "loss/logits": 0.8132032155990601, + "step": 69580 + }, + { + "epoch": 0.6959, + "grad_norm": 15.875, + "grad_norm_var": 0.35052083333333334, + "learning_rate": 0.0003, + "loss": 10.6866, + "loss/aux_loss": 0.04805904570966959, + "loss/crossentropy": 2.5849641382694246, + "loss/logits": 0.7947615712881089, + "step": 69590 + }, + { + "epoch": 0.696, + "grad_norm": 15.625, + "grad_norm_var": 0.5973958333333333, + "learning_rate": 0.0003, + "loss": 10.9278, + "loss/aux_loss": 0.048071693442761895, + "loss/crossentropy": 2.529132205247879, + "loss/logits": 0.7914834886789321, + "step": 69600 + }, + { + "epoch": 0.6961, + "grad_norm": 15.5, + "grad_norm_var": 0.7426432291666667, + "learning_rate": 0.0003, + "loss": 10.7448, + "loss/aux_loss": 0.048061798140406606, + "loss/crossentropy": 2.5973862528800966, + "loss/logits": 0.762446054816246, + "step": 69610 + }, + { + "epoch": 0.6962, + "grad_norm": 15.625, + "grad_norm_var": 0.7150390625, + "learning_rate": 0.0003, + "loss": 10.7611, + "loss/aux_loss": 0.04808028992265463, + "loss/crossentropy": 2.580735170841217, + "loss/logits": 0.8113909959793091, + "step": 69620 + }, + { + "epoch": 0.6963, + "grad_norm": 14.375, + "grad_norm_var": 0.9327473958333333, + "learning_rate": 0.0003, + "loss": 10.7292, + "loss/aux_loss": 0.04805351886898279, + "loss/crossentropy": 2.6865119695663453, + "loss/logits": 0.7755003601312638, + "step": 69630 + }, + { + "epoch": 0.6964, + "grad_norm": 14.4375, + "grad_norm_var": 0.8343098958333334, + "learning_rate": 0.0003, + "loss": 10.8817, + "loss/aux_loss": 0.048071601428091526, + "loss/crossentropy": 2.7158267498016357, + "loss/logits": 0.813782611489296, + "step": 69640 + }, + { + "epoch": 0.6965, + "grad_norm": 15.4375, + "grad_norm_var": 0.877978515625, + "learning_rate": 0.0003, + "loss": 10.7957, + "loss/aux_loss": 0.048066675662994385, + "loss/crossentropy": 2.715426343679428, + "loss/logits": 0.8276042312383651, + "step": 69650 + }, + { + "epoch": 0.6966, + "grad_norm": 15.8125, + "grad_norm_var": 0.6184895833333334, + "learning_rate": 0.0003, + "loss": 10.9359, + "loss/aux_loss": 0.048063802719116214, + "loss/crossentropy": 2.6631912708282472, + "loss/logits": 0.8037597626447678, + "step": 69660 + }, + { + "epoch": 0.6967, + "grad_norm": 15.5625, + "grad_norm_var": 1.0847493489583333, + "learning_rate": 0.0003, + "loss": 11.0285, + "loss/aux_loss": 0.0480733547359705, + "loss/crossentropy": 2.6649845838546753, + "loss/logits": 0.824079555273056, + "step": 69670 + }, + { + "epoch": 0.6968, + "grad_norm": 19.75, + "grad_norm_var": 2.1988932291666665, + "learning_rate": 0.0003, + "loss": 10.7858, + "loss/aux_loss": 0.04807499777525663, + "loss/crossentropy": 2.559821057319641, + "loss/logits": 0.8164191097021103, + "step": 69680 + }, + { + "epoch": 0.6969, + "grad_norm": 16.0, + "grad_norm_var": 1.6702473958333333, + "learning_rate": 0.0003, + "loss": 10.7732, + "loss/aux_loss": 0.048064416646957396, + "loss/crossentropy": 2.5081125438213348, + "loss/logits": 0.7806854665279388, + "step": 69690 + }, + { + "epoch": 0.697, + "grad_norm": 17.125, + "grad_norm_var": 0.537744140625, + "learning_rate": 0.0003, + "loss": 10.911, + "loss/aux_loss": 0.048071997612714766, + "loss/crossentropy": 2.798080360889435, + "loss/logits": 0.849156191945076, + "step": 69700 + }, + { + "epoch": 0.6971, + "grad_norm": 15.5, + "grad_norm_var": 0.8555826822916667, + "learning_rate": 0.0003, + "loss": 10.7062, + "loss/aux_loss": 0.048063857667148116, + "loss/crossentropy": 2.721605783700943, + "loss/logits": 0.7785751849412919, + "step": 69710 + }, + { + "epoch": 0.6972, + "grad_norm": 15.5625, + "grad_norm_var": 0.411572265625, + "learning_rate": 0.0003, + "loss": 10.8278, + "loss/aux_loss": 0.048065428622066975, + "loss/crossentropy": 2.7682719230651855, + "loss/logits": 0.8334241211414337, + "step": 69720 + }, + { + "epoch": 0.6973, + "grad_norm": 15.25, + "grad_norm_var": 0.6947265625, + "learning_rate": 0.0003, + "loss": 10.7517, + "loss/aux_loss": 0.04808158706873655, + "loss/crossentropy": 2.7136940717697144, + "loss/logits": 0.799360203742981, + "step": 69730 + }, + { + "epoch": 0.6974, + "grad_norm": 15.8125, + "grad_norm_var": 0.6452473958333333, + "learning_rate": 0.0003, + "loss": 10.871, + "loss/aux_loss": 0.04807530529797077, + "loss/crossentropy": 2.723317527770996, + "loss/logits": 0.7894266813993454, + "step": 69740 + }, + { + "epoch": 0.6975, + "grad_norm": 15.8125, + "grad_norm_var": 140.85598958333333, + "learning_rate": 0.0003, + "loss": 10.8967, + "loss/aux_loss": 0.048066935315728185, + "loss/crossentropy": 2.7271577537059786, + "loss/logits": 0.8186393707990647, + "step": 69750 + }, + { + "epoch": 0.6976, + "grad_norm": 17.125, + "grad_norm_var": 3.801806640625, + "learning_rate": 0.0003, + "loss": 10.8478, + "loss/aux_loss": 0.04806602392345667, + "loss/crossentropy": 2.499061381816864, + "loss/logits": 0.7904832571744919, + "step": 69760 + }, + { + "epoch": 0.6977, + "grad_norm": 15.1875, + "grad_norm_var": 0.8098307291666667, + "learning_rate": 0.0003, + "loss": 10.9179, + "loss/aux_loss": 0.04806220382452011, + "loss/crossentropy": 2.8025425612926482, + "loss/logits": 0.8429517328739167, + "step": 69770 + }, + { + "epoch": 0.6978, + "grad_norm": 15.4375, + "grad_norm_var": 0.5463541666666667, + "learning_rate": 0.0003, + "loss": 10.8747, + "loss/aux_loss": 0.0480809373781085, + "loss/crossentropy": 2.549819737672806, + "loss/logits": 0.7724178716540336, + "step": 69780 + }, + { + "epoch": 0.6979, + "grad_norm": 14.4375, + "grad_norm_var": 0.5058430989583333, + "learning_rate": 0.0003, + "loss": 10.7673, + "loss/aux_loss": 0.048068621568381785, + "loss/crossentropy": 2.629793846607208, + "loss/logits": 0.8028360933065415, + "step": 69790 + }, + { + "epoch": 0.698, + "grad_norm": 14.9375, + "grad_norm_var": 0.8476399739583333, + "learning_rate": 0.0003, + "loss": 10.6601, + "loss/aux_loss": 0.048068128526210785, + "loss/crossentropy": 2.6547737777233125, + "loss/logits": 0.8297581821680069, + "step": 69800 + }, + { + "epoch": 0.6981, + "grad_norm": 15.8125, + "grad_norm_var": 0.863916015625, + "learning_rate": 0.0003, + "loss": 10.9559, + "loss/aux_loss": 0.04806609004735947, + "loss/crossentropy": 2.636369228363037, + "loss/logits": 0.8080786511301994, + "step": 69810 + }, + { + "epoch": 0.6982, + "grad_norm": 16.375, + "grad_norm_var": 16.676416015625, + "learning_rate": 0.0003, + "loss": 10.9601, + "loss/aux_loss": 0.04808267876505852, + "loss/crossentropy": 2.6405935764312742, + "loss/logits": 0.7962011188268662, + "step": 69820 + }, + { + "epoch": 0.6983, + "grad_norm": 15.4375, + "grad_norm_var": 1.4374348958333334, + "learning_rate": 0.0003, + "loss": 10.7869, + "loss/aux_loss": 0.04806174710392952, + "loss/crossentropy": 2.7505713582038878, + "loss/logits": 0.7872932314872741, + "step": 69830 + }, + { + "epoch": 0.6984, + "grad_norm": 14.9375, + "grad_norm_var": 0.9222493489583333, + "learning_rate": 0.0003, + "loss": 10.9906, + "loss/aux_loss": 0.04806131403893232, + "loss/crossentropy": 2.8466971039772035, + "loss/logits": 0.8570821315050126, + "step": 69840 + }, + { + "epoch": 0.6985, + "grad_norm": 15.25, + "grad_norm_var": 0.22389322916666668, + "learning_rate": 0.0003, + "loss": 10.897, + "loss/aux_loss": 0.048060120269656184, + "loss/crossentropy": 2.6072149515151977, + "loss/logits": 0.8024741411209106, + "step": 69850 + }, + { + "epoch": 0.6986, + "grad_norm": 14.9375, + "grad_norm_var": 0.25201822916666666, + "learning_rate": 0.0003, + "loss": 10.7511, + "loss/aux_loss": 0.048070829920470716, + "loss/crossentropy": 2.805588722229004, + "loss/logits": 0.8278827935457229, + "step": 69860 + }, + { + "epoch": 0.6987, + "grad_norm": 16.125, + "grad_norm_var": 0.35154622395833335, + "learning_rate": 0.0003, + "loss": 10.905, + "loss/aux_loss": 0.04806218836456537, + "loss/crossentropy": 2.727338945865631, + "loss/logits": 0.7851577132940293, + "step": 69870 + }, + { + "epoch": 0.6988, + "grad_norm": 15.375, + "grad_norm_var": 0.5751139322916666, + "learning_rate": 0.0003, + "loss": 10.75, + "loss/aux_loss": 0.04807434901595116, + "loss/crossentropy": 2.7194652020931245, + "loss/logits": 0.8074722796678543, + "step": 69880 + }, + { + "epoch": 0.6989, + "grad_norm": 15.6875, + "grad_norm_var": 0.5465983072916667, + "learning_rate": 0.0003, + "loss": 10.7433, + "loss/aux_loss": 0.048061872646212575, + "loss/crossentropy": 2.685518753528595, + "loss/logits": 0.8617990851402283, + "step": 69890 + }, + { + "epoch": 0.699, + "grad_norm": 14.9375, + "grad_norm_var": 0.4423014322916667, + "learning_rate": 0.0003, + "loss": 10.7309, + "loss/aux_loss": 0.04807621408253908, + "loss/crossentropy": 2.622563087940216, + "loss/logits": 0.809404906630516, + "step": 69900 + }, + { + "epoch": 0.6991, + "grad_norm": 14.75, + "grad_norm_var": 0.31417643229166664, + "learning_rate": 0.0003, + "loss": 10.9159, + "loss/aux_loss": 0.04805949460715055, + "loss/crossentropy": 2.748386710882187, + "loss/logits": 0.8233764231204986, + "step": 69910 + }, + { + "epoch": 0.6992, + "grad_norm": 15.25, + "grad_norm_var": 0.4359212239583333, + "learning_rate": 0.0003, + "loss": 10.7166, + "loss/aux_loss": 0.04807121455669403, + "loss/crossentropy": 2.661421650648117, + "loss/logits": 0.7985509872436524, + "step": 69920 + }, + { + "epoch": 0.6993, + "grad_norm": 15.9375, + "grad_norm_var": 1.0502604166666667, + "learning_rate": 0.0003, + "loss": 10.8889, + "loss/aux_loss": 0.048076589964330195, + "loss/crossentropy": 2.886363685131073, + "loss/logits": 0.8310510069131851, + "step": 69930 + }, + { + "epoch": 0.6994, + "grad_norm": 14.875, + "grad_norm_var": 1.1340983072916666, + "learning_rate": 0.0003, + "loss": 10.9695, + "loss/aux_loss": 0.04806211348623037, + "loss/crossentropy": 2.730562311410904, + "loss/logits": 0.8359936803579331, + "step": 69940 + }, + { + "epoch": 0.6995, + "grad_norm": 15.0, + "grad_norm_var": 0.21261393229166667, + "learning_rate": 0.0003, + "loss": 10.8209, + "loss/aux_loss": 0.04806340225040913, + "loss/crossentropy": 2.6769157886505126, + "loss/logits": 0.7736663967370987, + "step": 69950 + }, + { + "epoch": 0.6996, + "grad_norm": 17.25, + "grad_norm_var": 0.4559733072916667, + "learning_rate": 0.0003, + "loss": 10.8065, + "loss/aux_loss": 0.048078119195997714, + "loss/crossentropy": 2.6120175421237946, + "loss/logits": 0.8157159000635147, + "step": 69960 + }, + { + "epoch": 0.6997, + "grad_norm": 15.9375, + "grad_norm_var": 0.9344889322916666, + "learning_rate": 0.0003, + "loss": 10.9741, + "loss/aux_loss": 0.04807462692260742, + "loss/crossentropy": 2.7217097640037538, + "loss/logits": 0.806912750005722, + "step": 69970 + }, + { + "epoch": 0.6998, + "grad_norm": 14.4375, + "grad_norm_var": 0.9408854166666667, + "learning_rate": 0.0003, + "loss": 10.7853, + "loss/aux_loss": 0.04805513937026262, + "loss/crossentropy": 2.603360629081726, + "loss/logits": 0.8143667846918106, + "step": 69980 + }, + { + "epoch": 0.6999, + "grad_norm": 15.6875, + "grad_norm_var": 0.5075358072916667, + "learning_rate": 0.0003, + "loss": 10.9126, + "loss/aux_loss": 0.04806795883923769, + "loss/crossentropy": 2.860744071006775, + "loss/logits": 0.8322966694831848, + "step": 69990 + }, + { + "epoch": 0.7, + "grad_norm": 16.125, + "grad_norm_var": 0.4610514322916667, + "learning_rate": 0.0003, + "loss": 10.8683, + "loss/aux_loss": 0.04808022417128086, + "loss/crossentropy": 2.708540141582489, + "loss/logits": 0.8456598520278931, + "step": 70000 + } + ], + "logging_steps": 10, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9787190713817498e+20, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}