| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.0125, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 2.5e-05, |
| "grad_norm": 8.947145462036133, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 127.4759, |
| "loss/crossentropy": 2.8921360969543457, |
| "loss/hidden": 0.373046875, |
| "loss/logits": 0.1864406317472458, |
| "loss/reg": 126.91645812988281, |
| "step": 1 |
| }, |
| { |
| "epoch": 5e-05, |
| "grad_norm": 13.883983612060547, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 115.8264, |
| "loss/crossentropy": 2.8499865531921387, |
| "loss/hidden": 0.6484375, |
| "loss/logits": 0.27192822098731995, |
| "loss/reg": 114.90603637695312, |
| "step": 2 |
| }, |
| { |
| "epoch": 7.5e-05, |
| "grad_norm": 19.773080825805664, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 116.012, |
| "loss/crossentropy": 3.1656363010406494, |
| "loss/hidden": 0.72265625, |
| "loss/logits": 0.3859825134277344, |
| "loss/reg": 114.9033432006836, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0001, |
| "grad_norm": 18.47258758544922, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 115.903, |
| "loss/crossentropy": 2.8063976764678955, |
| "loss/hidden": 0.6875, |
| "loss/logits": 0.31685781478881836, |
| "loss/reg": 114.89862823486328, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.000125, |
| "grad_norm": 18.90447998046875, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 115.9715, |
| "loss/crossentropy": 3.020209312438965, |
| "loss/hidden": 0.7265625, |
| "loss/logits": 0.3528100252151489, |
| "loss/reg": 114.89217376708984, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00015, |
| "grad_norm": 15.197858810424805, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 115.9385, |
| "loss/crossentropy": 2.891916036605835, |
| "loss/hidden": 0.6953125, |
| "loss/logits": 0.35902339220046997, |
| "loss/reg": 114.8841781616211, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.000175, |
| "grad_norm": 15.15339469909668, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 115.8638, |
| "loss/crossentropy": 2.888406753540039, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.3095901906490326, |
| "loss/reg": 114.8744888305664, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0002, |
| "grad_norm": 13.61527156829834, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 115.85, |
| "loss/crossentropy": 2.9885716438293457, |
| "loss/hidden": 0.671875, |
| "loss/logits": 0.3148782253265381, |
| "loss/reg": 114.86323547363281, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.000225, |
| "grad_norm": 13.632570266723633, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 115.7999, |
| "loss/crossentropy": 2.6390163898468018, |
| "loss/hidden": 0.6640625, |
| "loss/logits": 0.2855065166950226, |
| "loss/reg": 114.85034942626953, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.00025, |
| "grad_norm": 12.151652336120605, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 115.7602, |
| "loss/crossentropy": 2.9643092155456543, |
| "loss/hidden": 0.6328125, |
| "loss/logits": 0.2915680408477783, |
| "loss/reg": 114.83580780029297, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.000275, |
| "grad_norm": 10.124210357666016, |
| "learning_rate": 2.2e-06, |
| "loss": 115.6818, |
| "loss/crossentropy": 2.7221367359161377, |
| "loss/hidden": 0.609375, |
| "loss/logits": 0.2528327405452728, |
| "loss/reg": 114.81963348388672, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0003, |
| "grad_norm": 12.582488059997559, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 112.7148, |
| "loss/crossentropy": 3.236349582672119, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.3880160450935364, |
| "loss/reg": 111.53773498535156, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.000325, |
| "grad_norm": 9.126688957214355, |
| "learning_rate": 2.6e-06, |
| "loss": 112.5513, |
| "loss/crossentropy": 2.9401042461395264, |
| "loss/hidden": 0.7109375, |
| "loss/logits": 0.3218693733215332, |
| "loss/reg": 111.51847839355469, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.00035, |
| "grad_norm": 5.668403625488281, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 112.3898, |
| "loss/crossentropy": 2.5529444217681885, |
| "loss/hidden": 0.66796875, |
| "loss/logits": 0.22446033358573914, |
| "loss/reg": 111.49739837646484, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.000375, |
| "grad_norm": 5.202491283416748, |
| "learning_rate": 3e-06, |
| "loss": 112.4113, |
| "loss/crossentropy": 3.1544463634490967, |
| "loss/hidden": 0.65625, |
| "loss/logits": 0.28114181756973267, |
| "loss/reg": 111.47393035888672, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0004, |
| "grad_norm": 4.127603054046631, |
| "grad_norm_var": 23.16038376789459, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 112.257, |
| "loss/crossentropy": 3.014888286590576, |
| "loss/hidden": 0.6015625, |
| "loss/logits": 0.20710057020187378, |
| "loss/reg": 111.44828796386719, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.000425, |
| "grad_norm": 3.9600610733032227, |
| "grad_norm_var": 26.934472753712136, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 112.1822, |
| "loss/crossentropy": 2.7658724784851074, |
| "loss/hidden": 0.57421875, |
| "loss/logits": 0.18775172531604767, |
| "loss/reg": 111.42024230957031, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.00045, |
| "grad_norm": 4.745978355407715, |
| "grad_norm_var": 29.825747343372463, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 112.1734, |
| "loss/crossentropy": 3.176030158996582, |
| "loss/hidden": 0.58203125, |
| "loss/logits": 0.2011646032333374, |
| "loss/reg": 111.39022064208984, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.000475, |
| "grad_norm": 3.7388970851898193, |
| "grad_norm_var": 27.998639503348578, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 112.0506, |
| "loss/crossentropy": 3.0487682819366455, |
| "loss/hidden": 0.53515625, |
| "loss/logits": 0.15711598098278046, |
| "loss/reg": 111.3582763671875, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 3.213416576385498, |
| "grad_norm_var": 26.127740304788446, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 111.9841, |
| "loss/crossentropy": 2.9822959899902344, |
| "loss/hidden": 0.51171875, |
| "loss/logits": 0.14767958223819733, |
| "loss/reg": 111.32469177246094, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.000525, |
| "grad_norm": 5.186110019683838, |
| "grad_norm_var": 20.59027009605237, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 111.9338, |
| "loss/crossentropy": 2.6938610076904297, |
| "loss/hidden": 0.50390625, |
| "loss/logits": 0.14082029461860657, |
| "loss/reg": 111.2890625, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.00055, |
| "grad_norm": 3.830970525741577, |
| "grad_norm_var": 18.64965973207226, |
| "learning_rate": 4.4e-06, |
| "loss": 111.3757, |
| "loss/crossentropy": 2.60425066947937, |
| "loss/hidden": 0.5234375, |
| "loss/logits": 0.11805371195077896, |
| "loss/reg": 110.73417663574219, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.000575, |
| "grad_norm": 4.170963764190674, |
| "grad_norm_var": 15.535602850312825, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 111.3937, |
| "loss/crossentropy": 3.180182695388794, |
| "loss/hidden": 0.55859375, |
| "loss/logits": 0.14081788063049316, |
| "loss/reg": 110.69429779052734, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.0006, |
| "grad_norm": 2.6631879806518555, |
| "grad_norm_var": 13.65312757585727, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 111.2621, |
| "loss/crossentropy": 2.8149499893188477, |
| "loss/hidden": 0.48828125, |
| "loss/logits": 0.12221543490886688, |
| "loss/reg": 110.65160369873047, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.000625, |
| "grad_norm": 2.618382453918457, |
| "grad_norm_var": 10.772089347301707, |
| "learning_rate": 5e-06, |
| "loss": 111.1819, |
| "loss/crossentropy": 2.8573601245880127, |
| "loss/hidden": 0.458984375, |
| "loss/logits": 0.11608506739139557, |
| "loss/reg": 110.60684204101562, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.00065, |
| "grad_norm": 2.31030011177063, |
| "grad_norm_var": 8.516386613252116, |
| "learning_rate": 5.2e-06, |
| "loss": 111.117, |
| "loss/crossentropy": 3.055554151535034, |
| "loss/hidden": 0.439453125, |
| "loss/logits": 0.11802805215120316, |
| "loss/reg": 110.5595474243164, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.000675, |
| "grad_norm": 2.017902374267578, |
| "grad_norm_var": 7.3058512031776175, |
| "learning_rate": 5.400000000000001e-06, |
| "loss": 110.9925, |
| "loss/crossentropy": 3.038731813430786, |
| "loss/hidden": 0.388671875, |
| "loss/logits": 0.09406228363513947, |
| "loss/reg": 110.50971984863281, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0007, |
| "grad_norm": 2.0806379318237305, |
| "grad_norm_var": 3.1583128509652356, |
| "learning_rate": 5.600000000000001e-06, |
| "loss": 110.9178, |
| "loss/crossentropy": 2.8045244216918945, |
| "loss/hidden": 0.37109375, |
| "loss/logits": 0.0893578976392746, |
| "loss/reg": 110.45733642578125, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.000725, |
| "grad_norm": 1.8444489240646362, |
| "grad_norm_var": 1.5350877177341649, |
| "learning_rate": 5.8e-06, |
| "loss": 110.8323, |
| "loss/crossentropy": 2.9446072578430176, |
| "loss/hidden": 0.3515625, |
| "loss/logits": 0.07847409695386887, |
| "loss/reg": 110.40225219726562, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 1.900687575340271, |
| "grad_norm_var": 1.3763151431979745, |
| "learning_rate": 6e-06, |
| "loss": 110.766, |
| "loss/crossentropy": 2.991264581680298, |
| "loss/hidden": 0.345703125, |
| "loss/logits": 0.07573088258504868, |
| "loss/reg": 110.34453582763672, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.000775, |
| "grad_norm": 1.8288308382034302, |
| "grad_norm_var": 1.254712449102712, |
| "learning_rate": 6.200000000000001e-06, |
| "loss": 110.6691, |
| "loss/crossentropy": 2.695741653442383, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.06882831454277039, |
| "loss/reg": 110.28388214111328, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.0008, |
| "grad_norm": 7.6339874267578125, |
| "grad_norm_var": 2.4849026575088917, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": 110.524, |
| "loss/crossentropy": 2.8983194828033447, |
| "loss/hidden": 0.30078125, |
| "loss/logits": 0.0650840774178505, |
| "loss/reg": 110.15809631347656, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.000825, |
| "grad_norm": 1.7091763019561768, |
| "grad_norm_var": 2.6211828816924214, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 110.421, |
| "loss/crossentropy": 3.1094229221343994, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.055526312440633774, |
| "loss/reg": 110.09200286865234, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.00085, |
| "grad_norm": 2.408731698989868, |
| "grad_norm_var": 2.486549186444365, |
| "learning_rate": 6.800000000000001e-06, |
| "loss": 110.3723, |
| "loss/crossentropy": 3.0703020095825195, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.06386865675449371, |
| "loss/reg": 110.02332305908203, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.000875, |
| "grad_norm": 5.639738082885742, |
| "grad_norm_var": 2.8813226444205484, |
| "learning_rate": 7e-06, |
| "loss": 110.2663, |
| "loss/crossentropy": 2.6366381645202637, |
| "loss/hidden": 0.259765625, |
| "loss/logits": 0.05466887354850769, |
| "loss/reg": 109.95182800292969, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0009, |
| "grad_norm": 1.5474259853363037, |
| "grad_norm_var": 3.049833938595448, |
| "learning_rate": 7.2000000000000005e-06, |
| "loss": 110.163, |
| "loss/crossentropy": 2.9982497692108154, |
| "loss/hidden": 0.2373046875, |
| "loss/logits": 0.0486503466963768, |
| "loss/reg": 109.8770751953125, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.000925, |
| "grad_norm": 1.822043776512146, |
| "grad_norm_var": 2.8155889180573683, |
| "learning_rate": 7.4e-06, |
| "loss": 110.0863, |
| "loss/crossentropy": 2.5750184059143066, |
| "loss/hidden": 0.244140625, |
| "loss/logits": 0.04214131087064743, |
| "loss/reg": 109.80003356933594, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.00095, |
| "grad_norm": 1.646996259689331, |
| "grad_norm_var": 2.8358215165690646, |
| "learning_rate": 7.600000000000001e-06, |
| "loss": 109.9793, |
| "loss/crossentropy": 2.6531567573547363, |
| "loss/hidden": 0.216796875, |
| "loss/logits": 0.04228401929140091, |
| "loss/reg": 109.72017669677734, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.000975, |
| "grad_norm": 1.6782788038253784, |
| "grad_norm_var": 2.7486431517133116, |
| "learning_rate": 7.800000000000002e-06, |
| "loss": 109.9137, |
| "loss/crossentropy": 2.8859128952026367, |
| "loss/hidden": 0.2333984375, |
| "loss/logits": 0.04267453774809837, |
| "loss/reg": 109.63758087158203, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 1.1451095342636108, |
| "grad_norm_var": 2.8767352862187083, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 109.7821, |
| "loss/crossentropy": 2.717745304107666, |
| "loss/hidden": 0.1982421875, |
| "loss/logits": 0.03221501410007477, |
| "loss/reg": 109.55165100097656, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.001025, |
| "grad_norm": 1.5123186111450195, |
| "grad_norm_var": 2.934195649765045, |
| "learning_rate": 8.2e-06, |
| "loss": 109.6931, |
| "loss/crossentropy": 3.1204404830932617, |
| "loss/hidden": 0.193359375, |
| "loss/logits": 0.03670802339911461, |
| "loss/reg": 109.46299743652344, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.00105, |
| "grad_norm": 1.5102463960647583, |
| "grad_norm_var": 2.985947226480973, |
| "learning_rate": 8.400000000000001e-06, |
| "loss": 109.5902, |
| "loss/crossentropy": 2.754708766937256, |
| "loss/hidden": 0.193359375, |
| "loss/logits": 0.0338013619184494, |
| "loss/reg": 109.36308288574219, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.001075, |
| "grad_norm": 1.7040822505950928, |
| "grad_norm_var": 3.00685228461388, |
| "learning_rate": 8.6e-06, |
| "loss": 109.4694, |
| "loss/crossentropy": 2.99281907081604, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.030927911400794983, |
| "loss/reg": 109.26850891113281, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.0011, |
| "grad_norm": 2.202953338623047, |
| "grad_norm_var": 3.0033814137683112, |
| "learning_rate": 8.8e-06, |
| "loss": 109.3558, |
| "loss/crossentropy": 2.8213930130004883, |
| "loss/hidden": 0.1572265625, |
| "loss/logits": 0.027427691966295242, |
| "loss/reg": 109.17113494873047, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.001125, |
| "grad_norm": 2.543388843536377, |
| "grad_norm_var": 2.986013784076106, |
| "learning_rate": 9e-06, |
| "loss": 109.2579, |
| "loss/crossentropy": 2.9007112979888916, |
| "loss/hidden": 0.1572265625, |
| "loss/logits": 0.02932737208902836, |
| "loss/reg": 109.07130432128906, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.00115, |
| "grad_norm": 2.870173692703247, |
| "grad_norm_var": 2.9799395606014794, |
| "learning_rate": 9.200000000000002e-06, |
| "loss": 109.153, |
| "loss/crossentropy": 2.8239502906799316, |
| "loss/hidden": 0.1572265625, |
| "loss/logits": 0.028089674189686775, |
| "loss/reg": 108.96764373779297, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.001175, |
| "grad_norm": 1.966118574142456, |
| "grad_norm_var": 2.9695142383622777, |
| "learning_rate": 9.4e-06, |
| "loss": 109.0394, |
| "loss/crossentropy": 2.8659369945526123, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.025958221405744553, |
| "loss/reg": 108.862060546875, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.0012, |
| "grad_norm": 1.9000473022460938, |
| "grad_norm_var": 1.0773867075877055, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 108.9211, |
| "loss/crossentropy": 2.765244722366333, |
| "loss/hidden": 0.1455078125, |
| "loss/logits": 0.02295522391796112, |
| "loss/reg": 108.75267028808594, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.001225, |
| "grad_norm": 2.161703109741211, |
| "grad_norm_var": 1.0658244305761075, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 108.8061, |
| "loss/crossentropy": 2.999718427658081, |
| "loss/hidden": 0.140625, |
| "loss/logits": 0.025498513132333755, |
| "loss/reg": 108.64002227783203, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 2.4916954040527344, |
| "grad_norm_var": 1.0692138980283508, |
| "learning_rate": 1e-05, |
| "loss": 108.6839, |
| "loss/crossentropy": 2.717797040939331, |
| "loss/hidden": 0.138671875, |
| "loss/logits": 0.021402183920145035, |
| "loss/reg": 108.52385711669922, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.001275, |
| "grad_norm": 1.824416995048523, |
| "grad_norm_var": 0.20190934714180309, |
| "learning_rate": 1.02e-05, |
| "loss": 108.5772, |
| "loss/crossentropy": 3.0746214389801025, |
| "loss/hidden": 0.1484375, |
| "loss/logits": 0.023359911516308784, |
| "loss/reg": 108.4053726196289, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0013, |
| "grad_norm": 2.363981008529663, |
| "grad_norm_var": 0.20433165463635838, |
| "learning_rate": 1.04e-05, |
| "loss": 108.4089, |
| "loss/crossentropy": 2.7881290912628174, |
| "loss/hidden": 0.115234375, |
| "loss/logits": 0.0219597015529871, |
| "loss/reg": 108.27169036865234, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.001325, |
| "grad_norm": 2.2850818634033203, |
| "grad_norm_var": 0.2092781831522757, |
| "learning_rate": 1.0600000000000002e-05, |
| "loss": 108.2794, |
| "loss/crossentropy": 2.877086639404297, |
| "loss/hidden": 0.11328125, |
| "loss/logits": 0.019944053143262863, |
| "loss/reg": 108.1462173461914, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.00135, |
| "grad_norm": 2.2029547691345215, |
| "grad_norm_var": 0.203324965479775, |
| "learning_rate": 1.0800000000000002e-05, |
| "loss": 108.1448, |
| "loss/crossentropy": 2.5684406757354736, |
| "loss/hidden": 0.10888671875, |
| "loss/logits": 0.018222380429506302, |
| "loss/reg": 108.01766204833984, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.001375, |
| "grad_norm": 4.355873107910156, |
| "grad_norm_var": 0.5284712138594291, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 108.0238, |
| "loss/crossentropy": 3.1050121784210205, |
| "loss/hidden": 0.11474609375, |
| "loss/logits": 0.023576244711875916, |
| "loss/reg": 107.8854751586914, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.0014, |
| "grad_norm": 8.262518882751465, |
| "grad_norm_var": 2.702968942542203, |
| "learning_rate": 1.1200000000000001e-05, |
| "loss": 107.8767, |
| "loss/crossentropy": 2.6639468669891357, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.017037922516465187, |
| "loss/reg": 107.75032806396484, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.001425, |
| "grad_norm": 1.5480672121047974, |
| "grad_norm_var": 2.6976983052976697, |
| "learning_rate": 1.14e-05, |
| "loss": 107.734, |
| "loss/crossentropy": 2.7018520832061768, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.016180139034986496, |
| "loss/reg": 107.61087036132812, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.00145, |
| "grad_norm": 1.2781832218170166, |
| "grad_norm_var": 2.7359303926605016, |
| "learning_rate": 1.16e-05, |
| "loss": 107.5901, |
| "loss/crossentropy": 2.8058838844299316, |
| "loss/hidden": 0.1044921875, |
| "loss/logits": 0.016446152701973915, |
| "loss/reg": 107.46919250488281, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.001475, |
| "grad_norm": 1.5537561178207397, |
| "grad_norm_var": 2.755752608942243, |
| "learning_rate": 1.18e-05, |
| "loss": 107.4512, |
| "loss/crossentropy": 2.6371188163757324, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.018213655799627304, |
| "loss/reg": 107.32366180419922, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 3.3459622859954834, |
| "grad_norm_var": 2.774887598663352, |
| "learning_rate": 1.2e-05, |
| "loss": 107.3005, |
| "loss/crossentropy": 3.190295457839966, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.016267115250229836, |
| "loss/reg": 107.1749038696289, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.001525, |
| "grad_norm": 1.5608716011047363, |
| "grad_norm_var": 2.8537229826550923, |
| "learning_rate": 1.22e-05, |
| "loss": 107.1405, |
| "loss/crossentropy": 2.689225673675537, |
| "loss/hidden": 0.10205078125, |
| "loss/logits": 0.015642033889889717, |
| "loss/reg": 107.0228271484375, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.00155, |
| "grad_norm": 2.9783740043640137, |
| "grad_norm_var": 2.858017521797208, |
| "learning_rate": 1.2400000000000002e-05, |
| "loss": 106.9817, |
| "loss/crossentropy": 3.0421159267425537, |
| "loss/hidden": 0.10205078125, |
| "loss/logits": 0.016691621392965317, |
| "loss/reg": 106.86290740966797, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.001575, |
| "grad_norm": 1.751490831375122, |
| "grad_norm_var": 2.8798941991380254, |
| "learning_rate": 1.2600000000000001e-05, |
| "loss": 106.8293, |
| "loss/crossentropy": 2.954704523086548, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.018012236803770065, |
| "loss/reg": 106.7043228149414, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.0016, |
| "grad_norm": 2.2218408584594727, |
| "grad_norm_var": 2.855623539050459, |
| "learning_rate": 1.2800000000000001e-05, |
| "loss": 106.6657, |
| "loss/crossentropy": 2.735278844833374, |
| "loss/hidden": 0.1044921875, |
| "loss/logits": 0.019818950444459915, |
| "loss/reg": 106.54141998291016, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.001625, |
| "grad_norm": 1.6660186052322388, |
| "grad_norm_var": 2.9023713600347114, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 106.4947, |
| "loss/crossentropy": 3.00019907951355, |
| "loss/hidden": 0.1044921875, |
| "loss/logits": 0.015060758218169212, |
| "loss/reg": 106.3750991821289, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.00165, |
| "grad_norm": 1.7433619499206543, |
| "grad_norm_var": 2.948745965901263, |
| "learning_rate": 1.3200000000000002e-05, |
| "loss": 106.3047, |
| "loss/crossentropy": 2.4090218544006348, |
| "loss/hidden": 0.0859375, |
| "loss/logits": 0.013767421245574951, |
| "loss/reg": 106.20496368408203, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.001675, |
| "grad_norm": 1.8968372344970703, |
| "grad_norm_var": 2.941981354522321, |
| "learning_rate": 1.3400000000000002e-05, |
| "loss": 106.158, |
| "loss/crossentropy": 2.8800578117370605, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.019842734560370445, |
| "loss/reg": 106.03123474121094, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.0017, |
| "grad_norm": 2.075490951538086, |
| "grad_norm_var": 2.95485559076868, |
| "learning_rate": 1.3600000000000002e-05, |
| "loss": 105.9487, |
| "loss/crossentropy": 3.121166944503784, |
| "loss/hidden": 0.07861328125, |
| "loss/logits": 0.015778005123138428, |
| "loss/reg": 105.85435485839844, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.001725, |
| "grad_norm": 2.8943285942077637, |
| "grad_norm_var": 2.9569066036314915, |
| "learning_rate": 1.38e-05, |
| "loss": 105.765, |
| "loss/crossentropy": 2.750092029571533, |
| "loss/hidden": 0.0732421875, |
| "loss/logits": 0.01827952079474926, |
| "loss/reg": 105.67351531982422, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 1.740116000175476, |
| "grad_norm_var": 2.993779212225659, |
| "learning_rate": 1.4e-05, |
| "loss": 105.5679, |
| "loss/crossentropy": 2.808587074279785, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.014089204370975494, |
| "loss/reg": 105.48979949951172, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.001775, |
| "grad_norm": 3.365384101867676, |
| "grad_norm_var": 2.817206299433534, |
| "learning_rate": 1.4200000000000001e-05, |
| "loss": 105.4282, |
| "loss/crossentropy": 3.0790698528289795, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.01681649312376976, |
| "loss/reg": 105.30203247070312, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.0018, |
| "grad_norm": 1.2450242042541504, |
| "grad_norm_var": 0.49637522068055445, |
| "learning_rate": 1.4400000000000001e-05, |
| "loss": 105.1742, |
| "loss/crossentropy": 2.7480971813201904, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.011092130094766617, |
| "loss/reg": 105.10601806640625, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.001825, |
| "grad_norm": 1.7799638509750366, |
| "grad_norm_var": 0.48409086806424567, |
| "learning_rate": 1.46e-05, |
| "loss": 104.9855, |
| "loss/crossentropy": 2.8875551223754883, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.013171407394111156, |
| "loss/reg": 104.91083526611328, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.00185, |
| "grad_norm": 3.0080645084381104, |
| "grad_norm_var": 0.4888197405301819, |
| "learning_rate": 1.48e-05, |
| "loss": 104.7925, |
| "loss/crossentropy": 2.947901725769043, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.015885423868894577, |
| "loss/reg": 104.71261596679688, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.001875, |
| "grad_norm": 2.132399320602417, |
| "grad_norm_var": 0.46168637093159354, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 104.5821, |
| "loss/crossentropy": 2.8153300285339355, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.012619425542652607, |
| "loss/reg": 104.51014709472656, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.0019, |
| "grad_norm": 4.225804328918457, |
| "grad_norm_var": 0.642997495639734, |
| "learning_rate": 1.5200000000000002e-05, |
| "loss": 104.3881, |
| "loss/crossentropy": 3.0654807090759277, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.017460748553276062, |
| "loss/reg": 104.30471801757812, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.001925, |
| "grad_norm": 1.3250384330749512, |
| "grad_norm_var": 0.6687036530869581, |
| "learning_rate": 1.54e-05, |
| "loss": 104.1632, |
| "loss/crossentropy": 2.7858409881591797, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.010722491890192032, |
| "loss/reg": 104.09539031982422, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.00195, |
| "grad_norm": 2.416010618209839, |
| "grad_norm_var": 0.6340868082909626, |
| "learning_rate": 1.5600000000000003e-05, |
| "loss": 103.955, |
| "loss/crossentropy": 3.242889642715454, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.013778477907180786, |
| "loss/reg": 103.8821792602539, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.001975, |
| "grad_norm": 2.7921998500823975, |
| "grad_norm_var": 0.6370527990787508, |
| "learning_rate": 1.58e-05, |
| "loss": 103.7406, |
| "loss/crossentropy": 2.815262794494629, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.012466374784708023, |
| "loss/reg": 103.66657257080078, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 2.4101223945617676, |
| "grad_norm_var": 0.6377332514460964, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 103.5152, |
| "loss/crossentropy": 2.6628899574279785, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.011213818565011024, |
| "loss/reg": 103.44680786132812, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.002025, |
| "grad_norm": 4.508922576904297, |
| "grad_norm_var": 0.9045377168218747, |
| "learning_rate": 1.62e-05, |
| "loss": 103.3373, |
| "loss/crossentropy": 2.809832811355591, |
| "loss/hidden": 0.099609375, |
| "loss/logits": 0.014138867147266865, |
| "loss/reg": 103.2235107421875, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.00205, |
| "grad_norm": 2.5658440589904785, |
| "grad_norm_var": 0.8668634995421407, |
| "learning_rate": 1.64e-05, |
| "loss": 103.0584, |
| "loss/crossentropy": 2.9585628509521484, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.011128315702080727, |
| "loss/reg": 102.99018859863281, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.002075, |
| "grad_norm": 1.03963041305542, |
| "grad_norm_var": 0.9844523199274171, |
| "learning_rate": 1.66e-05, |
| "loss": 102.8268, |
| "loss/crossentropy": 2.9794199466705322, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.010468900203704834, |
| "loss/reg": 102.75920104980469, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.0021, |
| "grad_norm": 2.0196003913879395, |
| "grad_norm_var": 0.9875894888249034, |
| "learning_rate": 1.6800000000000002e-05, |
| "loss": 102.5951, |
| "loss/crossentropy": 3.0950756072998047, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.012872470542788506, |
| "loss/reg": 102.52505493164062, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.002125, |
| "grad_norm": 2.780224323272705, |
| "grad_norm_var": 0.9818985175549242, |
| "learning_rate": 1.7e-05, |
| "loss": 102.3623, |
| "loss/crossentropy": 2.8365509510040283, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.013720070943236351, |
| "loss/reg": 102.28706359863281, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.00215, |
| "grad_norm": 4.0338544845581055, |
| "grad_norm_var": 1.0906703730721057, |
| "learning_rate": 1.72e-05, |
| "loss": 102.1238, |
| "loss/crossentropy": 2.8407351970672607, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.013968261890113354, |
| "loss/reg": 102.04585266113281, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.002175, |
| "grad_norm": 1.787563681602478, |
| "grad_norm_var": 1.0858789976663803, |
| "learning_rate": 1.7400000000000003e-05, |
| "loss": 101.8693, |
| "loss/crossentropy": 2.7350587844848633, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.010529162362217903, |
| "loss/reg": 101.80165100097656, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.0022, |
| "grad_norm": 2.832834005355835, |
| "grad_norm_var": 0.9768321018939086, |
| "learning_rate": 1.76e-05, |
| "loss": 101.6227, |
| "loss/crossentropy": 2.9185311794281006, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.01262863352894783, |
| "loss/reg": 101.55294036865234, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.002225, |
| "grad_norm": 3.1689658164978027, |
| "grad_norm_var": 0.944871850947833, |
| "learning_rate": 1.7800000000000002e-05, |
| "loss": 101.3782, |
| "loss/crossentropy": 3.041252374649048, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01572200283408165, |
| "loss/reg": 101.30099487304688, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 1.7799317836761475, |
| "grad_norm_var": 0.9871302861463281, |
| "learning_rate": 1.8e-05, |
| "loss": 101.1145, |
| "loss/crossentropy": 2.7331812381744385, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.011585134081542492, |
| "loss/reg": 101.04578399658203, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.002275, |
| "grad_norm": 2.1665501594543457, |
| "grad_norm_var": 0.9850116745237391, |
| "learning_rate": 1.8200000000000002e-05, |
| "loss": 100.8592, |
| "loss/crossentropy": 2.4707322120666504, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.013475198298692703, |
| "loss/reg": 100.7863998413086, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.0023, |
| "grad_norm": 1.3672914505004883, |
| "grad_norm_var": 0.8820831680751643, |
| "learning_rate": 1.8400000000000003e-05, |
| "loss": 100.5874, |
| "loss/crossentropy": 2.577828884124756, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.01066804863512516, |
| "loss/reg": 100.51959228515625, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.002325, |
| "grad_norm": 1.8506819009780884, |
| "grad_norm_var": 0.8214079520174674, |
| "learning_rate": 1.86e-05, |
| "loss": 100.323, |
| "loss/crossentropy": 2.847485065460205, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.011566294357180595, |
| "loss/reg": 100.25430297851562, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.00235, |
| "grad_norm": 3.947273015975952, |
| "grad_norm_var": 0.956929905592916, |
| "learning_rate": 1.88e-05, |
| "loss": 100.0834, |
| "loss/crossentropy": 2.833409070968628, |
| "loss/hidden": 0.0849609375, |
| "loss/logits": 0.013771452009677887, |
| "loss/reg": 99.98463439941406, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.002375, |
| "grad_norm": 1.522775650024414, |
| "grad_norm_var": 1.0193112536277242, |
| "learning_rate": 1.9e-05, |
| "loss": 99.7793, |
| "loss/crossentropy": 2.616933822631836, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.01004397589713335, |
| "loss/reg": 99.71215057373047, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.0024, |
| "grad_norm": 2.6764161586761475, |
| "grad_norm_var": 1.0210357175361253, |
| "learning_rate": 1.9200000000000003e-05, |
| "loss": 99.5082, |
| "loss/crossentropy": 3.357677936553955, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.01289614848792553, |
| "loss/reg": 99.43595886230469, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.002425, |
| "grad_norm": 2.411071538925171, |
| "grad_norm_var": 0.7350196269207759, |
| "learning_rate": 1.94e-05, |
| "loss": 99.2327, |
| "loss/crossentropy": 2.9822490215301514, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.014493342489004135, |
| "loss/reg": 99.15665435791016, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.00245, |
| "grad_norm": 5.154429912567139, |
| "grad_norm_var": 1.2207546039745256, |
| "learning_rate": 1.9600000000000002e-05, |
| "loss": 98.9606, |
| "loss/crossentropy": 2.878635883331299, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.018285058438777924, |
| "loss/reg": 98.87391662597656, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.002475, |
| "grad_norm": 5.486807823181152, |
| "grad_norm_var": 1.5709261870313864, |
| "learning_rate": 1.98e-05, |
| "loss": 98.6813, |
| "loss/crossentropy": 3.3747873306274414, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.02096904069185257, |
| "loss/reg": 98.58756256103516, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 2.2651255130767822, |
| "grad_norm_var": 1.548765033441659, |
| "learning_rate": 2e-05, |
| "loss": 98.378, |
| "loss/crossentropy": 2.667487621307373, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.015711724758148193, |
| "loss/reg": 98.29834747314453, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.002525, |
| "grad_norm": 2.8461413383483887, |
| "grad_norm_var": 1.548625602237729, |
| "learning_rate": 2e-05, |
| "loss": 98.0811, |
| "loss/crossentropy": 2.8070576190948486, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.01651117019355297, |
| "loss/reg": 98.00546264648438, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.00255, |
| "grad_norm": 3.3563625812530518, |
| "grad_norm_var": 1.4686659287860662, |
| "learning_rate": 2e-05, |
| "loss": 97.79, |
| "loss/crossentropy": 3.0256845951080322, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.01792600378394127, |
| "loss/reg": 97.70619201660156, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.002575, |
| "grad_norm": 3.0281784534454346, |
| "grad_norm_var": 1.3992474053637958, |
| "learning_rate": 2e-05, |
| "loss": 97.485, |
| "loss/crossentropy": 2.7928712368011475, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.016789790242910385, |
| "loss/reg": 97.40911102294922, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.0026, |
| "grad_norm": 2.899623394012451, |
| "grad_norm_var": 1.399228163006243, |
| "learning_rate": 2e-05, |
| "loss": 97.1889, |
| "loss/crossentropy": 2.6017889976501465, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.014727754518389702, |
| "loss/reg": 97.11270141601562, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.002625, |
| "grad_norm": 4.760987758636475, |
| "grad_norm_var": 1.6209967019940714, |
| "learning_rate": 2e-05, |
| "loss": 96.9561, |
| "loss/crossentropy": 2.79957914352417, |
| "loss/hidden": 0.1044921875, |
| "loss/logits": 0.036032598465681076, |
| "loss/reg": 96.81553649902344, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.00265, |
| "grad_norm": 1.9602891206741333, |
| "grad_norm_var": 1.5944119405426551, |
| "learning_rate": 2e-05, |
| "loss": 96.5917, |
| "loss/crossentropy": 3.0477099418640137, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014111923985183239, |
| "loss/reg": 96.51824951171875, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.002675, |
| "grad_norm": 3.8924121856689453, |
| "grad_norm_var": 1.593099738565498, |
| "learning_rate": 2e-05, |
| "loss": 96.306, |
| "loss/crossentropy": 2.725721597671509, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.01935645565390587, |
| "loss/reg": 96.22071838378906, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.0027, |
| "grad_norm": 2.1384472846984863, |
| "grad_norm_var": 1.45322790572492, |
| "learning_rate": 2e-05, |
| "loss": 95.9971, |
| "loss/crossentropy": 2.9953653812408447, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.015377204865217209, |
| "loss/reg": 95.92265319824219, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.002725, |
| "grad_norm": 2.844248056411743, |
| "grad_norm_var": 1.3444790509526579, |
| "learning_rate": 2e-05, |
| "loss": 95.7164, |
| "loss/crossentropy": 3.295865297317505, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.02641688659787178, |
| "loss/reg": 95.62409973144531, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 1.7702182531356812, |
| "grad_norm_var": 1.423617390900579, |
| "learning_rate": 2e-05, |
| "loss": 95.4, |
| "loss/crossentropy": 2.6738994121551514, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014722645282745361, |
| "loss/reg": 95.32594299316406, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.002775, |
| "grad_norm": 1.3760358095169067, |
| "grad_norm_var": 1.4551049156645277, |
| "learning_rate": 2e-05, |
| "loss": 95.0977, |
| "loss/crossentropy": 2.6674644947052, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.013482127338647842, |
| "loss/reg": 95.02709197998047, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.0028, |
| "grad_norm": 1.5607060194015503, |
| "grad_norm_var": 1.589101356479648, |
| "learning_rate": 2e-05, |
| "loss": 94.8153, |
| "loss/crossentropy": 2.852834939956665, |
| "loss/hidden": 0.08251953125, |
| "loss/logits": 0.014033878222107887, |
| "loss/reg": 94.71875, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.002825, |
| "grad_norm": 3.75045108795166, |
| "grad_norm_var": 1.5988275534787362, |
| "learning_rate": 2e-05, |
| "loss": 94.5009, |
| "loss/crossentropy": 2.9057774543762207, |
| "loss/hidden": 0.061279296875, |
| "loss/logits": 0.020055318251252174, |
| "loss/reg": 94.4195556640625, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.00285, |
| "grad_norm": 2.267350912094116, |
| "grad_norm_var": 1.3166800230582432, |
| "learning_rate": 2e-05, |
| "loss": 94.1944, |
| "loss/crossentropy": 2.7482869625091553, |
| "loss/hidden": 0.056884765625, |
| "loss/logits": 0.017577335238456726, |
| "loss/reg": 94.1199722290039, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.002875, |
| "grad_norm": 2.194246768951416, |
| "grad_norm_var": 0.8532155162290316, |
| "learning_rate": 2e-05, |
| "loss": 93.8912, |
| "loss/crossentropy": 2.743758201599121, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.013515879400074482, |
| "loss/reg": 93.82050323486328, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.0029, |
| "grad_norm": 1.7801834344863892, |
| "grad_norm_var": 0.8948634812187672, |
| "learning_rate": 2e-05, |
| "loss": 93.5896, |
| "loss/crossentropy": 2.744076728820801, |
| "loss/hidden": 0.0546875, |
| "loss/logits": 0.014069687575101852, |
| "loss/reg": 93.5208740234375, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.002925, |
| "grad_norm": 1.7759678363800049, |
| "grad_norm_var": 0.9386863932751007, |
| "learning_rate": 2e-05, |
| "loss": 93.2962, |
| "loss/crossentropy": 2.806662082672119, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.015871506184339523, |
| "loss/reg": 93.22098541259766, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.00295, |
| "grad_norm": 2.6695425510406494, |
| "grad_norm_var": 0.8975061402584836, |
| "learning_rate": 2e-05, |
| "loss": 92.9929, |
| "loss/crossentropy": 2.797207832336426, |
| "loss/hidden": 0.056884765625, |
| "loss/logits": 0.01545205432921648, |
| "loss/reg": 92.92053985595703, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.002975, |
| "grad_norm": 2.0127737522125244, |
| "grad_norm_var": 0.8960978495062031, |
| "learning_rate": 2e-05, |
| "loss": 92.6882, |
| "loss/crossentropy": 3.020660638809204, |
| "loss/hidden": 0.0546875, |
| "loss/logits": 0.013032155111432076, |
| "loss/reg": 92.62051391601562, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 2.1293652057647705, |
| "grad_norm_var": 0.88991297365724, |
| "learning_rate": 2e-05, |
| "loss": 92.3946, |
| "loss/crossentropy": 2.962179660797119, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014865359291434288, |
| "loss/reg": 92.32038116455078, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.003025, |
| "grad_norm": 2.659437894821167, |
| "grad_norm_var": 0.5128431927902019, |
| "learning_rate": 2e-05, |
| "loss": 92.0991, |
| "loss/crossentropy": 2.7553904056549072, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.017567995935678482, |
| "loss/reg": 92.01996612548828, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.00305, |
| "grad_norm": 3.1532716751098633, |
| "grad_norm_var": 0.5479399334418377, |
| "learning_rate": 2e-05, |
| "loss": 91.7898, |
| "loss/crossentropy": 3.20906138420105, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.01771578937768936, |
| "loss/reg": 91.71299743652344, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.003075, |
| "grad_norm": 2.2405543327331543, |
| "grad_norm_var": 0.3839241818330661, |
| "learning_rate": 2e-05, |
| "loss": 91.4828, |
| "loss/crossentropy": 2.6381375789642334, |
| "loss/hidden": 0.056884765625, |
| "loss/logits": 0.013064563274383545, |
| "loss/reg": 91.41288757324219, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.0031, |
| "grad_norm": 3.476731300354004, |
| "grad_norm_var": 0.4723567159636746, |
| "learning_rate": 2e-05, |
| "loss": 91.1941, |
| "loss/crossentropy": 2.8266453742980957, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.01818031072616577, |
| "loss/reg": 91.11192321777344, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.003125, |
| "grad_norm": 1.7917191982269287, |
| "grad_norm_var": 0.4727696793815556, |
| "learning_rate": 2e-05, |
| "loss": 90.8828, |
| "loss/crossentropy": 3.0270473957061768, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.013925662264227867, |
| "loss/reg": 90.81177520751953, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.00315, |
| "grad_norm": 3.2372536659240723, |
| "grad_norm_var": 0.5059943836231217, |
| "learning_rate": 2e-05, |
| "loss": 90.5864, |
| "loss/crossentropy": 2.838686227798462, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.016410067677497864, |
| "loss/reg": 90.51092529296875, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.003175, |
| "grad_norm": 1.6228362321853638, |
| "grad_norm_var": 0.4767731820654664, |
| "learning_rate": 2e-05, |
| "loss": 90.2798, |
| "loss/crossentropy": 2.6141273975372314, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.011823797598481178, |
| "loss/reg": 90.21080780029297, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.0032, |
| "grad_norm": 2.675074577331543, |
| "grad_norm_var": 0.4304030863895955, |
| "learning_rate": 2e-05, |
| "loss": 89.9815, |
| "loss/crossentropy": 2.7882752418518066, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.014153627678751945, |
| "loss/reg": 89.91018676757812, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.003225, |
| "grad_norm": 2.7944464683532715, |
| "grad_norm_var": 0.32364587258530453, |
| "learning_rate": 2e-05, |
| "loss": 89.685, |
| "loss/crossentropy": 2.5254697799682617, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.016538312658667564, |
| "loss/reg": 89.6093978881836, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 1.8436682224273682, |
| "grad_norm_var": 0.34264366805227553, |
| "learning_rate": 2e-05, |
| "loss": 89.3803, |
| "loss/crossentropy": 2.928762197494507, |
| "loss/hidden": 0.056884765625, |
| "loss/logits": 0.014276012778282166, |
| "loss/reg": 89.30912017822266, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.003275, |
| "grad_norm": 2.6479427814483643, |
| "grad_norm_var": 0.3443586211365847, |
| "learning_rate": 2e-05, |
| "loss": 89.084, |
| "loss/crossentropy": 2.7882349491119385, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.016245754435658455, |
| "loss/reg": 89.00865936279297, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.0033, |
| "grad_norm": 4.339890956878662, |
| "grad_norm_var": 0.5399623455134483, |
| "learning_rate": 2e-05, |
| "loss": 88.7743, |
| "loss/crossentropy": 2.843156337738037, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014006150886416435, |
| "loss/reg": 88.70095825195312, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.003325, |
| "grad_norm": 1.3859492540359497, |
| "grad_norm_var": 0.5906001827105968, |
| "learning_rate": 2e-05, |
| "loss": 88.4686, |
| "loss/crossentropy": 2.56451153755188, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.011048642918467522, |
| "loss/reg": 88.40039825439453, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.00335, |
| "grad_norm": 3.4068877696990967, |
| "grad_norm_var": 0.6370671369808453, |
| "learning_rate": 2e-05, |
| "loss": 88.1735, |
| "loss/crossentropy": 2.813845634460449, |
| "loss/hidden": 0.056884765625, |
| "loss/logits": 0.016296520829200745, |
| "loss/reg": 88.1003646850586, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.003375, |
| "grad_norm": 2.2145471572875977, |
| "grad_norm_var": 0.6241198038182761, |
| "learning_rate": 2e-05, |
| "loss": 87.8709, |
| "loss/crossentropy": 2.9568309783935547, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.013690088875591755, |
| "loss/reg": 87.80007934570312, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.0034, |
| "grad_norm": 2.6741232872009277, |
| "grad_norm_var": 0.6083942369195152, |
| "learning_rate": 2e-05, |
| "loss": 87.5719, |
| "loss/crossentropy": 2.7441060543060303, |
| "loss/hidden": 0.056884765625, |
| "loss/logits": 0.014466654509305954, |
| "loss/reg": 87.50055694580078, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.003425, |
| "grad_norm": 2.338223934173584, |
| "grad_norm_var": 0.6138078516363993, |
| "learning_rate": 2e-05, |
| "loss": 87.2744, |
| "loss/crossentropy": 2.4464330673217773, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014399213716387749, |
| "loss/reg": 87.20066833496094, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.00345, |
| "grad_norm": 1.8222837448120117, |
| "grad_norm_var": 0.6290386070508833, |
| "learning_rate": 2e-05, |
| "loss": 86.9703, |
| "loss/crossentropy": 2.8742918968200684, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.011516067199409008, |
| "loss/reg": 86.90167999267578, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.003475, |
| "grad_norm": 2.03706955909729, |
| "grad_norm_var": 0.6395340114512105, |
| "learning_rate": 2e-05, |
| "loss": 86.6734, |
| "loss/crossentropy": 2.651973009109497, |
| "loss/hidden": 0.056884765625, |
| "loss/logits": 0.01390259712934494, |
| "loss/reg": 86.60263061523438, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 2.9460866451263428, |
| "grad_norm_var": 0.5893915505304466, |
| "learning_rate": 2e-05, |
| "loss": 86.4269, |
| "loss/crossentropy": 3.348736047744751, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.016606930643320084, |
| "loss/reg": 86.30339050292969, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.003525, |
| "grad_norm": 2.397338390350342, |
| "grad_norm_var": 0.5562422152076701, |
| "learning_rate": 2e-05, |
| "loss": 86.0776, |
| "loss/crossentropy": 2.462433099746704, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.01363131683319807, |
| "loss/reg": 86.00459289550781, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.00355, |
| "grad_norm": 2.2476322650909424, |
| "grad_norm_var": 0.5233350316059858, |
| "learning_rate": 2e-05, |
| "loss": 85.7713, |
| "loss/crossentropy": 2.6098899841308594, |
| "loss/hidden": 0.056884765625, |
| "loss/logits": 0.015527555719017982, |
| "loss/reg": 85.69889831542969, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.003575, |
| "grad_norm": 1.603454828262329, |
| "grad_norm_var": 0.5255273885155527, |
| "learning_rate": 2e-05, |
| "loss": 85.4701, |
| "loss/crossentropy": 2.62600040435791, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.012892360799014568, |
| "loss/reg": 85.40011596679688, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.0036, |
| "grad_norm": 1.814164400100708, |
| "grad_norm_var": 0.5472671850991829, |
| "learning_rate": 2e-05, |
| "loss": 85.1766, |
| "loss/crossentropy": 2.7545013427734375, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.015227731317281723, |
| "loss/reg": 85.10199737548828, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.003625, |
| "grad_norm": 3.5215349197387695, |
| "grad_norm_var": 0.6178589741537791, |
| "learning_rate": 2e-05, |
| "loss": 84.8787, |
| "loss/crossentropy": 3.249708890914917, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.015877550467848778, |
| "loss/reg": 84.80371856689453, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.00365, |
| "grad_norm": 1.9731359481811523, |
| "grad_norm_var": 0.6083958559047544, |
| "learning_rate": 2e-05, |
| "loss": 84.5763, |
| "loss/crossentropy": 3.2176551818847656, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.013086882419884205, |
| "loss/reg": 84.50607299804688, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.003675, |
| "grad_norm": 4.117058753967285, |
| "grad_norm_var": 0.7799786660544029, |
| "learning_rate": 2e-05, |
| "loss": 84.2939, |
| "loss/crossentropy": 2.667962074279785, |
| "loss/hidden": 0.0634765625, |
| "loss/logits": 0.022213537245988846, |
| "loss/reg": 84.20823669433594, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.0037, |
| "grad_norm": 3.0872607231140137, |
| "grad_norm_var": 0.5795145425234849, |
| "learning_rate": 2e-05, |
| "loss": 84.0006, |
| "loss/crossentropy": 2.904244899749756, |
| "loss/hidden": 0.07080078125, |
| "loss/logits": 0.01870722323656082, |
| "loss/reg": 83.91111755371094, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.003725, |
| "grad_norm": 2.1129748821258545, |
| "grad_norm_var": 0.507061159842186, |
| "learning_rate": 2e-05, |
| "loss": 83.6913, |
| "loss/crossentropy": 2.4297919273376465, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01596939191222191, |
| "loss/reg": 83.61376953125, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 1.9368386268615723, |
| "grad_norm_var": 0.46821439415792837, |
| "learning_rate": 2e-05, |
| "loss": 83.3904, |
| "loss/crossentropy": 2.7391281127929688, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014091565273702145, |
| "loss/reg": 83.31694030761719, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.003775, |
| "grad_norm": 3.6663706302642822, |
| "grad_norm_var": 0.5586835942509762, |
| "learning_rate": 2e-05, |
| "loss": 83.1007, |
| "loss/crossentropy": 2.8713481426239014, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01851273886859417, |
| "loss/reg": 83.02061462402344, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.0038, |
| "grad_norm": 2.134021043777466, |
| "grad_norm_var": 0.5657064796702647, |
| "learning_rate": 2e-05, |
| "loss": 82.7923, |
| "loss/crossentropy": 3.2020580768585205, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01641719415783882, |
| "loss/reg": 82.71440124511719, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.003825, |
| "grad_norm": 2.897209405899048, |
| "grad_norm_var": 0.574317300050713, |
| "learning_rate": 2e-05, |
| "loss": 82.5031, |
| "loss/crossentropy": 2.975339412689209, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.021052952855825424, |
| "loss/reg": 82.41812133789062, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.00385, |
| "grad_norm": 1.751135230064392, |
| "grad_norm_var": 0.5812492457165689, |
| "learning_rate": 2e-05, |
| "loss": 82.2003, |
| "loss/crossentropy": 2.38250994682312, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.016858752816915512, |
| "loss/reg": 82.12188720703125, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.003875, |
| "grad_norm": 2.5146450996398926, |
| "grad_norm_var": 0.5650580150903565, |
| "learning_rate": 2e-05, |
| "loss": 81.9018, |
| "loss/crossentropy": 2.493891954421997, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.013709386810660362, |
| "loss/reg": 81.82661437988281, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.0039, |
| "grad_norm": 2.2257137298583984, |
| "grad_norm_var": 0.5589724988129731, |
| "learning_rate": 2e-05, |
| "loss": 81.606, |
| "loss/crossentropy": 2.713038682937622, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.015239729546010494, |
| "loss/reg": 81.53138732910156, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.003925, |
| "grad_norm": 5.219791412353516, |
| "grad_norm_var": 1.01821672382146, |
| "learning_rate": 2e-05, |
| "loss": 81.3182, |
| "loss/crossentropy": 2.7188966274261475, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.017340324819087982, |
| "loss/reg": 81.23693084716797, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.00395, |
| "grad_norm": 1.601794958114624, |
| "grad_norm_var": 1.0812106477349142, |
| "learning_rate": 2e-05, |
| "loss": 81.0152, |
| "loss/crossentropy": 2.647388219833374, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.013692192733287811, |
| "loss/reg": 80.9421615600586, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.003975, |
| "grad_norm": 2.963209390640259, |
| "grad_norm_var": 1.009555342899499, |
| "learning_rate": 2e-05, |
| "loss": 80.7271, |
| "loss/crossentropy": 2.849640369415283, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.019920967519283295, |
| "loss/reg": 80.64808654785156, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 2.3866851329803467, |
| "grad_norm_var": 0.9608132022483118, |
| "learning_rate": 2e-05, |
| "loss": 80.4313, |
| "loss/crossentropy": 2.6206085681915283, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.015263768844306469, |
| "loss/reg": 80.35447692871094, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.004025, |
| "grad_norm": 2.6866133213043213, |
| "grad_norm_var": 0.9192531230990411, |
| "learning_rate": 2e-05, |
| "loss": 80.1358, |
| "loss/crossentropy": 2.6675243377685547, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.015180731192231178, |
| "loss/reg": 80.061279296875, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.00405, |
| "grad_norm": 2.2380247116088867, |
| "grad_norm_var": 0.8978023926442598, |
| "learning_rate": 2e-05, |
| "loss": 79.8366, |
| "loss/crossentropy": 2.9008312225341797, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014321748167276382, |
| "loss/reg": 79.76295471191406, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.004075, |
| "grad_norm": 5.501147747039795, |
| "grad_norm_var": 1.2751311244587156, |
| "learning_rate": 2e-05, |
| "loss": 79.5506, |
| "loss/crossentropy": 2.774265766143799, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.016338951885700226, |
| "loss/reg": 79.47030639648438, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.0041, |
| "grad_norm": 1.578844666481018, |
| "grad_norm_var": 1.3611156779384619, |
| "learning_rate": 2e-05, |
| "loss": 79.2495, |
| "loss/crossentropy": 2.8486151695251465, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.01422686967998743, |
| "loss/reg": 79.17816925048828, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.004125, |
| "grad_norm": 1.8603459596633911, |
| "grad_norm_var": 1.3853304523501924, |
| "learning_rate": 2e-05, |
| "loss": 78.9609, |
| "loss/crossentropy": 2.9763364791870117, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.015356909483671188, |
| "loss/reg": 78.88626098632812, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.00415, |
| "grad_norm": 2.4613988399505615, |
| "grad_norm_var": 1.3493160215050024, |
| "learning_rate": 2e-05, |
| "loss": 78.6768, |
| "loss/crossentropy": 2.6457760334014893, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01994500868022442, |
| "loss/reg": 78.59528350830078, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.004175, |
| "grad_norm": 2.5572216510772705, |
| "grad_norm_var": 1.2877918838448412, |
| "learning_rate": 2e-05, |
| "loss": 78.3792, |
| "loss/crossentropy": 2.726158380508423, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.015283918008208275, |
| "loss/reg": 78.30459594726562, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.0042, |
| "grad_norm": 3.149622917175293, |
| "grad_norm_var": 1.2808819694844975, |
| "learning_rate": 2e-05, |
| "loss": 78.0974, |
| "loss/crossentropy": 2.464186906814575, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.014741174876689911, |
| "loss/reg": 78.01426696777344, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.004225, |
| "grad_norm": 1.4037340879440308, |
| "grad_norm_var": 1.3859120968246847, |
| "learning_rate": 2e-05, |
| "loss": 77.8036, |
| "loss/crossentropy": 2.8845841884613037, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.017704475671052933, |
| "loss/reg": 77.724365234375, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 4.424335479736328, |
| "grad_norm_var": 1.518842252075294, |
| "learning_rate": 2e-05, |
| "loss": 77.5411, |
| "loss/crossentropy": 3.1228435039520264, |
| "loss/hidden": 0.0859375, |
| "loss/logits": 0.020280467346310616, |
| "loss/reg": 77.4349136352539, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.004275, |
| "grad_norm": 1.2430707216262817, |
| "grad_norm_var": 1.6679938506527754, |
| "learning_rate": 2e-05, |
| "loss": 77.2194, |
| "loss/crossentropy": 2.7156295776367188, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014160854741930962, |
| "loss/reg": 77.14595794677734, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.0043, |
| "grad_norm": 2.0627477169036865, |
| "grad_norm_var": 1.6803689194951175, |
| "learning_rate": 2e-05, |
| "loss": 76.9251, |
| "loss/crossentropy": 2.817188262939453, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.01637103408575058, |
| "loss/reg": 76.84935760498047, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.004325, |
| "grad_norm": 2.538249969482422, |
| "grad_norm_var": 1.2319590329252075, |
| "learning_rate": 2e-05, |
| "loss": 76.637, |
| "loss/crossentropy": 2.71427321434021, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.014527076855301857, |
| "loss/reg": 76.56098937988281, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.00435, |
| "grad_norm": 1.848551869392395, |
| "grad_norm_var": 1.2048617288824057, |
| "learning_rate": 2e-05, |
| "loss": 76.3474, |
| "loss/crossentropy": 2.9161787033081055, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.015176617540419102, |
| "loss/reg": 76.27291870117188, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.004375, |
| "grad_norm": 2.4840588569641113, |
| "grad_norm_var": 1.1932266879105464, |
| "learning_rate": 2e-05, |
| "loss": 76.0652, |
| "loss/crossentropy": 3.0494370460510254, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.018287423998117447, |
| "loss/reg": 75.98535919189453, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.0044, |
| "grad_norm": 1.4861845970153809, |
| "grad_norm_var": 1.2607000339864975, |
| "learning_rate": 2e-05, |
| "loss": 75.7693, |
| "loss/crossentropy": 2.6618382930755615, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.013419630005955696, |
| "loss/reg": 75.69873046875, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.004425, |
| "grad_norm": 2.652388334274292, |
| "grad_norm_var": 1.2597859494322088, |
| "learning_rate": 2e-05, |
| "loss": 75.4947, |
| "loss/crossentropy": 3.0685219764709473, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.020900297909975052, |
| "loss/reg": 75.41223907470703, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.00445, |
| "grad_norm": 2.228562831878662, |
| "grad_norm_var": 1.2600818300469971, |
| "learning_rate": 2e-05, |
| "loss": 75.2042, |
| "loss/crossentropy": 2.6521048545837402, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01636704057455063, |
| "loss/reg": 75.1263427734375, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.004475, |
| "grad_norm": 2.145296096801758, |
| "grad_norm_var": 0.6065573977317332, |
| "learning_rate": 2e-05, |
| "loss": 74.9156, |
| "loss/crossentropy": 2.7815215587615967, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.01532922312617302, |
| "loss/reg": 74.84095764160156, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 3.027330160140991, |
| "grad_norm_var": 0.6065639312435943, |
| "learning_rate": 2e-05, |
| "loss": 74.6397, |
| "loss/crossentropy": 2.828174591064453, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.020114243030548096, |
| "loss/reg": 74.55561828613281, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.004525, |
| "grad_norm": 1.8776589632034302, |
| "grad_norm_var": 0.6054562283667906, |
| "learning_rate": 2e-05, |
| "loss": 74.3442, |
| "loss/crossentropy": 2.7055771350860596, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.01365465484559536, |
| "loss/reg": 74.27119445800781, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.00455, |
| "grad_norm": 3.372591733932495, |
| "grad_norm_var": 0.6709551658510691, |
| "learning_rate": 2e-05, |
| "loss": 74.0724, |
| "loss/crossentropy": 3.0145347118377686, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.022765886038541794, |
| "loss/reg": 73.97685241699219, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.004575, |
| "grad_norm": 1.4099830389022827, |
| "grad_norm_var": 0.7301368956932454, |
| "learning_rate": 2e-05, |
| "loss": 73.7664, |
| "loss/crossentropy": 2.795443058013916, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.013609878718852997, |
| "loss/reg": 73.69348907470703, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.0046, |
| "grad_norm": 1.8791447877883911, |
| "grad_norm_var": 0.6929646819722675, |
| "learning_rate": 2e-05, |
| "loss": 73.4903, |
| "loss/crossentropy": 2.60862135887146, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.017894212156534195, |
| "loss/reg": 73.41085052490234, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.004625, |
| "grad_norm": 4.824383735656738, |
| "grad_norm_var": 1.0359055758899136, |
| "learning_rate": 2e-05, |
| "loss": 73.2051, |
| "loss/crossentropy": 2.708460569381714, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.015174289233982563, |
| "loss/reg": 73.12835693359375, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.00465, |
| "grad_norm": 1.6892882585525513, |
| "grad_norm_var": 0.7903900820577945, |
| "learning_rate": 2e-05, |
| "loss": 72.9203, |
| "loss/crossentropy": 2.5275192260742188, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014362558722496033, |
| "loss/reg": 72.84661102294922, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.004675, |
| "grad_norm": 2.1509838104248047, |
| "grad_norm_var": 0.7141933855878756, |
| "learning_rate": 2e-05, |
| "loss": 72.6431, |
| "loss/crossentropy": 2.828364849090576, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.016414958983659744, |
| "loss/reg": 72.56517028808594, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.0047, |
| "grad_norm": 2.066053867340088, |
| "grad_norm_var": 0.7140653095859344, |
| "learning_rate": 2e-05, |
| "loss": 72.3592, |
| "loss/crossentropy": 2.918135166168213, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.015539775602519512, |
| "loss/reg": 72.2843246459961, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.004725, |
| "grad_norm": 1.7003145217895508, |
| "grad_norm_var": 0.7374802094477738, |
| "learning_rate": 2e-05, |
| "loss": 72.0782, |
| "loss/crossentropy": 2.828505754470825, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.01498749665915966, |
| "loss/reg": 72.00386047363281, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 2.66283917427063, |
| "grad_norm_var": 0.7296169710249801, |
| "learning_rate": 2e-05, |
| "loss": 71.8013, |
| "loss/crossentropy": 2.9118659496307373, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.015920046716928482, |
| "loss/reg": 71.723876953125, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.004775, |
| "grad_norm": 2.2389256954193115, |
| "grad_norm_var": 0.7291075429880787, |
| "learning_rate": 2e-05, |
| "loss": 71.5216, |
| "loss/crossentropy": 2.6960675716400146, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.014915665611624718, |
| "loss/reg": 71.4451675415039, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.0048, |
| "grad_norm": 2.652817487716675, |
| "grad_norm_var": 0.6816330763082435, |
| "learning_rate": 2e-05, |
| "loss": 71.2388, |
| "loss/crossentropy": 2.7465293407440186, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01686110533773899, |
| "loss/reg": 71.160400390625, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.004825, |
| "grad_norm": 2.2822165489196777, |
| "grad_norm_var": 0.6782911578411197, |
| "learning_rate": 2e-05, |
| "loss": 70.9582, |
| "loss/crossentropy": 2.776250123977661, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.016233820468187332, |
| "loss/reg": 70.88268280029297, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.00485, |
| "grad_norm": 2.540499210357666, |
| "grad_norm_var": 0.6777404274996485, |
| "learning_rate": 2e-05, |
| "loss": 70.6821, |
| "loss/crossentropy": 2.798632860183716, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.015345659106969833, |
| "loss/reg": 70.60520935058594, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.004875, |
| "grad_norm": 1.9856505393981934, |
| "grad_norm_var": 0.6849150734256606, |
| "learning_rate": 2e-05, |
| "loss": 70.405, |
| "loss/crossentropy": 2.6718432903289795, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.014745904132723808, |
| "loss/reg": 70.32874298095703, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.0049, |
| "grad_norm": 2.3494439125061035, |
| "grad_norm_var": 0.6567124398874036, |
| "learning_rate": 2e-05, |
| "loss": 70.1346, |
| "loss/crossentropy": 2.9501113891601562, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.02032644674181938, |
| "loss/reg": 70.05279541015625, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.004925, |
| "grad_norm": 3.5448572635650635, |
| "grad_norm_var": 0.7242858678478853, |
| "learning_rate": 2e-05, |
| "loss": 69.8519, |
| "loss/crossentropy": 2.6818275451660156, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014945561066269875, |
| "loss/reg": 69.7776107788086, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.00495, |
| "grad_norm": 1.8181650638580322, |
| "grad_norm_var": 0.6860304672396395, |
| "learning_rate": 2e-05, |
| "loss": 69.5793, |
| "loss/crossentropy": 2.56965708732605, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.014989674091339111, |
| "loss/reg": 69.50276947021484, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.004975, |
| "grad_norm": 3.4242897033691406, |
| "grad_norm_var": 0.6838728374435107, |
| "learning_rate": 2e-05, |
| "loss": 69.3558, |
| "loss/crossentropy": 3.0659897327423096, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.017705179750919342, |
| "loss/reg": 69.22872161865234, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 1.973741888999939, |
| "grad_norm_var": 0.6767511902871185, |
| "learning_rate": 2e-05, |
| "loss": 69.0331, |
| "loss/crossentropy": 2.637833595275879, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.01859479956328869, |
| "loss/reg": 68.95521545410156, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.005025, |
| "grad_norm": 2.6389999389648438, |
| "grad_norm_var": 0.2962159441771213, |
| "learning_rate": 2e-05, |
| "loss": 68.7623, |
| "loss/crossentropy": 3.0950043201446533, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01863202080130577, |
| "loss/reg": 68.68214416503906, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.00505, |
| "grad_norm": 2.583447217941284, |
| "grad_norm_var": 0.26652776132981815, |
| "learning_rate": 2e-05, |
| "loss": 68.4826, |
| "loss/crossentropy": 3.238170862197876, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.019120125100016594, |
| "loss/reg": 68.40199279785156, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.005075, |
| "grad_norm": 1.9552291631698608, |
| "grad_norm_var": 0.275770096339541, |
| "learning_rate": 2e-05, |
| "loss": 68.2168, |
| "loss/crossentropy": 2.745107650756836, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.02080545760691166, |
| "loss/reg": 68.13004302978516, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.0051, |
| "grad_norm": 2.0590291023254395, |
| "grad_norm_var": 0.2760869902247992, |
| "learning_rate": 2e-05, |
| "loss": 67.9385, |
| "loss/crossentropy": 2.9923768043518066, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.018183503299951553, |
| "loss/reg": 67.85881805419922, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.005125, |
| "grad_norm": 2.838015079498291, |
| "grad_norm_var": 0.25074774510742726, |
| "learning_rate": 2e-05, |
| "loss": 67.6847, |
| "loss/crossentropy": 3.02968692779541, |
| "loss/hidden": 0.07080078125, |
| "loss/logits": 0.02540656551718712, |
| "loss/reg": 67.58853149414062, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.00515, |
| "grad_norm": 3.082895278930664, |
| "grad_norm_var": 0.2724775294796672, |
| "learning_rate": 2e-05, |
| "loss": 67.4095, |
| "loss/crossentropy": 2.6776514053344727, |
| "loss/hidden": 0.07080078125, |
| "loss/logits": 0.020375534892082214, |
| "loss/reg": 67.3183364868164, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.005175, |
| "grad_norm": 2.3748674392700195, |
| "grad_norm_var": 0.2689364182377325, |
| "learning_rate": 2e-05, |
| "loss": 67.1321, |
| "loss/crossentropy": 2.7331337928771973, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.018977412953972816, |
| "loss/reg": 67.04918670654297, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.0052, |
| "grad_norm": 1.8422597646713257, |
| "grad_norm_var": 0.2941871012735906, |
| "learning_rate": 2e-05, |
| "loss": 66.8571, |
| "loss/crossentropy": 2.6536264419555664, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01525039691478014, |
| "loss/reg": 66.78032684326172, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.005225, |
| "grad_norm": 2.370894193649292, |
| "grad_norm_var": 0.29262559140895844, |
| "learning_rate": 2e-05, |
| "loss": 66.5972, |
| "loss/crossentropy": 2.880850076675415, |
| "loss/hidden": 0.06640625, |
| "loss/logits": 0.01811142824590206, |
| "loss/reg": 66.51265716552734, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 1.8227527141571045, |
| "grad_norm_var": 0.3172526467082339, |
| "learning_rate": 2e-05, |
| "loss": 66.3284, |
| "loss/crossentropy": 2.6867871284484863, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.018997574225068092, |
| "loss/reg": 66.24544525146484, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.005275, |
| "grad_norm": 1.5266575813293457, |
| "grad_norm_var": 0.3567894464636623, |
| "learning_rate": 2e-05, |
| "loss": 66.0535, |
| "loss/crossentropy": 2.711437940597534, |
| "loss/hidden": 0.059326171875, |
| "loss/logits": 0.014955010265111923, |
| "loss/reg": 65.97917175292969, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.0053, |
| "grad_norm": 2.41679310798645, |
| "grad_norm_var": 0.3567280892475267, |
| "learning_rate": 2e-05, |
| "loss": 65.7917, |
| "loss/crossentropy": 2.9064691066741943, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.0197412371635437, |
| "loss/reg": 65.70800018310547, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.005325, |
| "grad_norm": 1.972184181213379, |
| "grad_norm_var": 0.2695787564070026, |
| "learning_rate": 2e-05, |
| "loss": 65.5564, |
| "loss/crossentropy": 2.7818334102630615, |
| "loss/hidden": 0.0947265625, |
| "loss/logits": 0.019062824547290802, |
| "loss/reg": 65.4426040649414, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.00535, |
| "grad_norm": 1.5892772674560547, |
| "grad_norm_var": 0.2873676086054596, |
| "learning_rate": 2e-05, |
| "loss": 65.259, |
| "loss/crossentropy": 2.737332344055176, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.019478369504213333, |
| "loss/reg": 65.17799377441406, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.005375, |
| "grad_norm": 6.980996131896973, |
| "grad_norm_var": 1.6209131844775593, |
| "learning_rate": 2e-05, |
| "loss": 65.0488, |
| "loss/crossentropy": 2.853780746459961, |
| "loss/hidden": 0.095703125, |
| "loss/logits": 0.03903430327773094, |
| "loss/reg": 64.91402435302734, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.0054, |
| "grad_norm": 3.549551486968994, |
| "grad_norm_var": 1.6651724517150548, |
| "learning_rate": 2e-05, |
| "loss": 64.7459, |
| "loss/crossentropy": 2.9141762256622314, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.022131089121103287, |
| "loss/reg": 64.6510238647461, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.005425, |
| "grad_norm": 1.94115149974823, |
| "grad_norm_var": 1.6920030605329315, |
| "learning_rate": 2e-05, |
| "loss": 64.4759, |
| "loss/crossentropy": 2.94362473487854, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.019080817699432373, |
| "loss/reg": 64.38845825195312, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.00545, |
| "grad_norm": 2.473371744155884, |
| "grad_norm_var": 1.6923666873172403, |
| "learning_rate": 2e-05, |
| "loss": 64.2163, |
| "loss/crossentropy": 3.031843662261963, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.02138814702630043, |
| "loss/reg": 64.12654113769531, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.005475, |
| "grad_norm": 2.364009141921997, |
| "grad_norm_var": 1.6704069952290912, |
| "learning_rate": 2e-05, |
| "loss": 63.9948, |
| "loss/crossentropy": 2.869771718978882, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.020195169374346733, |
| "loss/reg": 63.86522674560547, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 2.0074684619903564, |
| "grad_norm_var": 1.6741223453539247, |
| "learning_rate": 2e-05, |
| "loss": 63.6908, |
| "loss/crossentropy": 2.9813449382781982, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.01999806985259056, |
| "loss/reg": 63.60487365722656, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.005525, |
| "grad_norm": 1.478585958480835, |
| "grad_norm_var": 1.741421135603261, |
| "learning_rate": 2e-05, |
| "loss": 63.4223, |
| "loss/crossentropy": 2.450387716293335, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01574028469622135, |
| "loss/reg": 63.34505081176758, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.00555, |
| "grad_norm": 1.767280101776123, |
| "grad_norm_var": 1.7450884712307488, |
| "learning_rate": 2e-05, |
| "loss": 63.165, |
| "loss/crossentropy": 2.9608945846557617, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.02006276696920395, |
| "loss/reg": 63.07899856567383, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.005575, |
| "grad_norm": 2.900290012359619, |
| "grad_norm_var": 1.7602401150279823, |
| "learning_rate": 2e-05, |
| "loss": 62.9101, |
| "loss/crossentropy": 2.7069523334503174, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.021145779639482498, |
| "loss/reg": 62.82056427001953, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.0056, |
| "grad_norm": 2.2942614555358887, |
| "grad_norm_var": 1.7371226601830352, |
| "learning_rate": 2e-05, |
| "loss": 62.6463, |
| "loss/crossentropy": 2.82142972946167, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.01956920698285103, |
| "loss/reg": 62.56277847290039, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.005625, |
| "grad_norm": 2.362708330154419, |
| "grad_norm_var": 1.7372306188924502, |
| "learning_rate": 2e-05, |
| "loss": 62.3916, |
| "loss/crossentropy": 2.910395622253418, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.020157648250460625, |
| "loss/reg": 62.30548095703125, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.00565, |
| "grad_norm": 2.486630439758301, |
| "grad_norm_var": 1.7078860460967495, |
| "learning_rate": 2e-05, |
| "loss": 62.1414, |
| "loss/crossentropy": 3.0249392986297607, |
| "loss/hidden": 0.07080078125, |
| "loss/logits": 0.021398965269327164, |
| "loss/reg": 62.04918670654297, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.005675, |
| "grad_norm": 2.1476340293884277, |
| "grad_norm_var": 1.6508215590496325, |
| "learning_rate": 2e-05, |
| "loss": 61.8835, |
| "loss/crossentropy": 3.085418462753296, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.021710071712732315, |
| "loss/reg": 61.79338455200195, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.0057, |
| "grad_norm": 1.5186433792114258, |
| "grad_norm_var": 1.7166830776668944, |
| "learning_rate": 2e-05, |
| "loss": 61.6184, |
| "loss/crossentropy": 2.4423468112945557, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.016175974160432816, |
| "loss/reg": 61.53825378417969, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.005725, |
| "grad_norm": 2.6107263565063477, |
| "grad_norm_var": 1.6981119809293725, |
| "learning_rate": 2e-05, |
| "loss": 61.373, |
| "loss/crossentropy": 2.9523637294769287, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.0207897387444973, |
| "loss/reg": 61.283809661865234, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 1.9942364692687988, |
| "grad_norm_var": 1.6575925882854556, |
| "learning_rate": 2e-05, |
| "loss": 61.159, |
| "loss/crossentropy": 2.678663492202759, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.019178075715899467, |
| "loss/reg": 61.030418395996094, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.005775, |
| "grad_norm": 1.4201483726501465, |
| "grad_norm_var": 0.30852930383016425, |
| "learning_rate": 2e-05, |
| "loss": 60.8589, |
| "loss/crossentropy": 2.6229424476623535, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.017583219334483147, |
| "loss/reg": 60.777366638183594, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.0058, |
| "grad_norm": 1.8850197792053223, |
| "grad_norm_var": 0.18379847000516059, |
| "learning_rate": 2e-05, |
| "loss": 60.5975, |
| "loss/crossentropy": 2.717855930328369, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.018108602613210678, |
| "loss/reg": 60.51786422729492, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.005825, |
| "grad_norm": 2.635830879211426, |
| "grad_norm_var": 0.19894452868075493, |
| "learning_rate": 2e-05, |
| "loss": 60.3529, |
| "loss/crossentropy": 2.717303991317749, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.02073700539767742, |
| "loss/reg": 60.26628112792969, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.00585, |
| "grad_norm": 2.0215964317321777, |
| "grad_norm_var": 0.19202186958297318, |
| "learning_rate": 2e-05, |
| "loss": 60.1403, |
| "loss/crossentropy": 2.547003984451294, |
| "loss/hidden": 0.1044921875, |
| "loss/logits": 0.020180463790893555, |
| "loss/reg": 60.015602111816406, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.005875, |
| "grad_norm": 2.0360617637634277, |
| "grad_norm_var": 0.18800595898524625, |
| "learning_rate": 2e-05, |
| "loss": 59.8916, |
| "loss/crossentropy": 2.7054409980773926, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.019324198365211487, |
| "loss/reg": 59.76530456542969, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.0059, |
| "grad_norm": 1.9470698833465576, |
| "grad_norm_var": 0.18896257994485927, |
| "learning_rate": 2e-05, |
| "loss": 59.6092, |
| "loss/crossentropy": 2.5585479736328125, |
| "loss/hidden": 0.0703125, |
| "loss/logits": 0.02303473837673664, |
| "loss/reg": 59.515865325927734, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.005925, |
| "grad_norm": 1.3322805166244507, |
| "grad_norm_var": 0.20230885388968142, |
| "learning_rate": 2e-05, |
| "loss": 59.3467, |
| "loss/crossentropy": 2.845449924468994, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.01773185282945633, |
| "loss/reg": 59.26749038696289, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.00595, |
| "grad_norm": 1.921771764755249, |
| "grad_norm_var": 0.1972553683811412, |
| "learning_rate": 2e-05, |
| "loss": 59.1028, |
| "loss/crossentropy": 2.437079668045044, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.016928989440202713, |
| "loss/reg": 59.01997756958008, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.005975, |
| "grad_norm": 2.337186574935913, |
| "grad_norm_var": 0.15658778213605107, |
| "learning_rate": 2e-05, |
| "loss": 58.8627, |
| "loss/crossentropy": 2.7914915084838867, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.02136731520295143, |
| "loss/reg": 58.77294921875, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 1.9505559206008911, |
| "grad_norm_var": 0.1532120628996779, |
| "learning_rate": 2e-05, |
| "loss": 58.6135, |
| "loss/crossentropy": 2.8501033782958984, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.020856186747550964, |
| "loss/reg": 58.52672576904297, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.006025, |
| "grad_norm": 1.3474467992782593, |
| "grad_norm_var": 0.17367998148026603, |
| "learning_rate": 2e-05, |
| "loss": 58.3605, |
| "loss/crossentropy": 2.4960248470306396, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.015542775392532349, |
| "loss/reg": 58.280967712402344, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.00605, |
| "grad_norm": 2.888327121734619, |
| "grad_norm_var": 0.21119166554983568, |
| "learning_rate": 2e-05, |
| "loss": 58.123, |
| "loss/crossentropy": 2.714336395263672, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.02144978940486908, |
| "loss/reg": 58.028785705566406, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.006075, |
| "grad_norm": 2.533353805541992, |
| "grad_norm_var": 0.22810067793098288, |
| "learning_rate": 2e-05, |
| "loss": 57.8713, |
| "loss/crossentropy": 2.735057830810547, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.020662324503064156, |
| "loss/reg": 57.78470993041992, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.0061, |
| "grad_norm": 2.4481282234191895, |
| "grad_norm_var": 0.21949654966500648, |
| "learning_rate": 2e-05, |
| "loss": 57.6443, |
| "loss/crossentropy": 2.7660956382751465, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.025949764996767044, |
| "loss/reg": 57.541160583496094, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.006125, |
| "grad_norm": 1.7993848323822021, |
| "grad_norm_var": 0.2034264459530992, |
| "learning_rate": 2e-05, |
| "loss": 57.3799, |
| "loss/crossentropy": 2.469017505645752, |
| "loss/hidden": 0.06396484375, |
| "loss/logits": 0.017876872792840004, |
| "loss/reg": 57.2980842590332, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.00615, |
| "grad_norm": 2.93696928024292, |
| "grad_norm_var": 0.2543330785650771, |
| "learning_rate": 2e-05, |
| "loss": 57.1865, |
| "loss/crossentropy": 2.988600254058838, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.020821403712034225, |
| "loss/reg": 57.0562629699707, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.006175, |
| "grad_norm": 1.6419404745101929, |
| "grad_norm_var": 0.23759642989912186, |
| "learning_rate": 2e-05, |
| "loss": 56.9006, |
| "loss/crossentropy": 2.8848021030426025, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.020043395459651947, |
| "loss/reg": 56.81468200683594, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.0062, |
| "grad_norm": 1.7423299551010132, |
| "grad_norm_var": 0.24303384118528198, |
| "learning_rate": 2e-05, |
| "loss": 56.6711, |
| "loss/crossentropy": 2.8383066654205322, |
| "loss/hidden": 0.0751953125, |
| "loss/logits": 0.021477218717336655, |
| "loss/reg": 56.57439422607422, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.006225, |
| "grad_norm": 1.8884074687957764, |
| "grad_norm_var": 0.22405312170182767, |
| "learning_rate": 2e-05, |
| "loss": 56.4196, |
| "loss/crossentropy": 3.002903461456299, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.019266938790678978, |
| "loss/reg": 56.33445739746094, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 2.0407156944274902, |
| "grad_norm_var": 0.22400789294475007, |
| "learning_rate": 2e-05, |
| "loss": 56.1875, |
| "loss/crossentropy": 2.517186164855957, |
| "loss/hidden": 0.07080078125, |
| "loss/logits": 0.02101137302815914, |
| "loss/reg": 56.09565734863281, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.006275, |
| "grad_norm": 2.173579216003418, |
| "grad_norm_var": 0.22494351474196836, |
| "learning_rate": 2e-05, |
| "loss": 55.945, |
| "loss/crossentropy": 2.853949785232544, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.019413135945796967, |
| "loss/reg": 55.85718536376953, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.0063, |
| "grad_norm": 10.485960960388184, |
| "grad_norm_var": 4.655585789275835, |
| "learning_rate": 2e-05, |
| "loss": 55.7284, |
| "loss/crossentropy": 2.9653632640838623, |
| "loss/hidden": 0.091796875, |
| "loss/logits": 0.025614306330680847, |
| "loss/reg": 55.61101150512695, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.006325, |
| "grad_norm": 2.01010799407959, |
| "grad_norm_var": 4.570472437484742, |
| "learning_rate": 2e-05, |
| "loss": 55.461, |
| "loss/crossentropy": 2.8666720390319824, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.020927395671606064, |
| "loss/reg": 55.37415313720703, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.00635, |
| "grad_norm": 2.971705913543701, |
| "grad_norm_var": 4.539645393263383, |
| "learning_rate": 2e-05, |
| "loss": 55.2367, |
| "loss/crossentropy": 3.0727896690368652, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.02156730368733406, |
| "loss/reg": 55.137996673583984, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.006375, |
| "grad_norm": 2.025726556777954, |
| "grad_norm_var": 4.560765147149087, |
| "learning_rate": 2e-05, |
| "loss": 54.9904, |
| "loss/crossentropy": 2.8021695613861084, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.019413193687796593, |
| "loss/reg": 54.90263366699219, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.0064, |
| "grad_norm": 2.515801429748535, |
| "grad_norm_var": 4.525736863311493, |
| "learning_rate": 2e-05, |
| "loss": 54.7593, |
| "loss/crossentropy": 2.818976402282715, |
| "loss/hidden": 0.07080078125, |
| "loss/logits": 0.020367074757814407, |
| "loss/reg": 54.668121337890625, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.006425, |
| "grad_norm": 6.981767177581787, |
| "grad_norm_var": 5.482006202620502, |
| "learning_rate": 2e-05, |
| "loss": 54.5586, |
| "loss/crossentropy": 2.819366693496704, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.01748763397336006, |
| "loss/reg": 54.434200286865234, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.00645, |
| "grad_norm": 1.3410686254501343, |
| "grad_norm_var": 5.668649556530609, |
| "learning_rate": 2e-05, |
| "loss": 54.2886, |
| "loss/crossentropy": 2.730813503265381, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.01936509646475315, |
| "loss/reg": 54.200897216796875, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.006475, |
| "grad_norm": 1.8726478815078735, |
| "grad_norm_var": 5.734492101602018, |
| "learning_rate": 2e-05, |
| "loss": 54.0538, |
| "loss/crossentropy": 3.02189564704895, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.01970936357975006, |
| "loss/reg": 53.96820831298828, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 3.454385995864868, |
| "grad_norm_var": 5.733156656528479, |
| "learning_rate": 2e-05, |
| "loss": 53.8456, |
| "loss/crossentropy": 2.5955543518066406, |
| "loss/hidden": 0.0751953125, |
| "loss/logits": 0.03384391963481903, |
| "loss/reg": 53.73658752441406, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.006525, |
| "grad_norm": 1.4927403926849365, |
| "grad_norm_var": 5.787821586949178, |
| "learning_rate": 2e-05, |
| "loss": 53.59, |
| "loss/crossentropy": 2.7993125915527344, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.018266569823026657, |
| "loss/reg": 53.50581359863281, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.00655, |
| "grad_norm": 2.202789068222046, |
| "grad_norm_var": 5.825085503943205, |
| "learning_rate": 2e-05, |
| "loss": 53.3542, |
| "loss/crossentropy": 2.6597094535827637, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.019392475485801697, |
| "loss/reg": 53.26885986328125, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.006575, |
| "grad_norm": 2.4559764862060547, |
| "grad_norm_var": 5.72695782376293, |
| "learning_rate": 2e-05, |
| "loss": 53.1381, |
| "loss/crossentropy": 2.9647631645202637, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.02172122895717621, |
| "loss/reg": 53.03927993774414, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.0066, |
| "grad_norm": 1.6547397375106812, |
| "grad_norm_var": 5.741873969332573, |
| "learning_rate": 2e-05, |
| "loss": 52.8978, |
| "loss/crossentropy": 2.8823652267456055, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.020957665517926216, |
| "loss/reg": 52.81092071533203, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.006625, |
| "grad_norm": 2.558666706085205, |
| "grad_norm_var": 5.673023506020719, |
| "learning_rate": 2e-05, |
| "loss": 52.6838, |
| "loss/crossentropy": 2.993384599685669, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.023681573569774628, |
| "loss/reg": 52.58296203613281, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.00665, |
| "grad_norm": 1.5857821702957153, |
| "grad_norm_var": 5.745050591443283, |
| "learning_rate": 2e-05, |
| "loss": 52.4834, |
| "loss/crossentropy": 2.769315242767334, |
| "loss/hidden": 0.10400390625, |
| "loss/logits": 0.023636557161808014, |
| "loss/reg": 52.35578918457031, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.006675, |
| "grad_norm": 2.601391553878784, |
| "grad_norm_var": 5.710121188860965, |
| "learning_rate": 2e-05, |
| "loss": 52.2297, |
| "loss/crossentropy": 2.79392671585083, |
| "loss/hidden": 0.0751953125, |
| "loss/logits": 0.02494995854794979, |
| "loss/reg": 52.12955856323242, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.0067, |
| "grad_norm": 1.7602055072784424, |
| "grad_norm_var": 1.7747363411260721, |
| "learning_rate": 2e-05, |
| "loss": 51.9948, |
| "loss/crossentropy": 2.9216957092285156, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.022738244384527206, |
| "loss/reg": 51.903709411621094, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.006725, |
| "grad_norm": 1.9448193311691284, |
| "grad_norm_var": 1.7789874166887367, |
| "learning_rate": 2e-05, |
| "loss": 51.7677, |
| "loss/crossentropy": 2.7518603801727295, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.020492523908615112, |
| "loss/reg": 51.67887878417969, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 5.49958610534668, |
| "grad_norm_var": 2.349575931185125, |
| "learning_rate": 2e-05, |
| "loss": 51.5654, |
| "loss/crossentropy": 2.48050856590271, |
| "loss/hidden": 0.0869140625, |
| "loss/logits": 0.0240333154797554, |
| "loss/reg": 51.454437255859375, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.006775, |
| "grad_norm": 1.8433462381362915, |
| "grad_norm_var": 2.3661487125074525, |
| "learning_rate": 2e-05, |
| "loss": 51.34, |
| "loss/crossentropy": 2.705430030822754, |
| "loss/hidden": 0.08740234375, |
| "loss/logits": 0.021868586540222168, |
| "loss/reg": 51.230751037597656, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.0068, |
| "grad_norm": 1.7715281248092651, |
| "grad_norm_var": 2.410153507700154, |
| "learning_rate": 2e-05, |
| "loss": 51.1084, |
| "loss/crossentropy": 2.9934589862823486, |
| "loss/hidden": 0.0849609375, |
| "loss/logits": 0.020941920578479767, |
| "loss/reg": 51.00248718261719, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.006825, |
| "grad_norm": 2.3719260692596436, |
| "grad_norm_var": 1.0228592647114596, |
| "learning_rate": 2e-05, |
| "loss": 50.876, |
| "loss/crossentropy": 2.6865527629852295, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.022343173623085022, |
| "loss/reg": 50.78093338012695, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.00685, |
| "grad_norm": 1.5276238918304443, |
| "grad_norm_var": 1.00178576807754, |
| "learning_rate": 2e-05, |
| "loss": 50.6493, |
| "loss/crossentropy": 2.901967763900757, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.02106352336704731, |
| "loss/reg": 50.55984115600586, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.006875, |
| "grad_norm": 1.6692343950271606, |
| "grad_norm_var": 1.0156202418807194, |
| "learning_rate": 2e-05, |
| "loss": 50.4286, |
| "loss/crossentropy": 2.760937213897705, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.020720936357975006, |
| "loss/reg": 50.33951950073242, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.0069, |
| "grad_norm": 1.5566948652267456, |
| "grad_norm_var": 0.9421993519803222, |
| "learning_rate": 2e-05, |
| "loss": 50.2053, |
| "loss/crossentropy": 2.9708621501922607, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.019328070804476738, |
| "loss/reg": 50.12004089355469, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.006925, |
| "grad_norm": 2.321723222732544, |
| "grad_norm_var": 0.911832100377154, |
| "learning_rate": 2e-05, |
| "loss": 49.9954, |
| "loss/crossentropy": 2.669133186340332, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.021549934521317482, |
| "loss/reg": 49.90113830566406, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.00695, |
| "grad_norm": 2.465369462966919, |
| "grad_norm_var": 0.9159632439841786, |
| "learning_rate": 2e-05, |
| "loss": 49.7791, |
| "loss/crossentropy": 2.7726550102233887, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.023049041628837585, |
| "loss/reg": 49.683250427246094, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.006975, |
| "grad_norm": 2.3777921199798584, |
| "grad_norm_var": 0.9139300418415842, |
| "learning_rate": 2e-05, |
| "loss": 49.574, |
| "loss/crossentropy": 2.879091739654541, |
| "loss/hidden": 0.08447265625, |
| "loss/logits": 0.023941390216350555, |
| "loss/reg": 49.46562957763672, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 1.9317271709442139, |
| "grad_norm_var": 0.897871261555297, |
| "learning_rate": 2e-05, |
| "loss": 49.3497, |
| "loss/crossentropy": 2.8615469932556152, |
| "loss/hidden": 0.07958984375, |
| "loss/logits": 0.02096753567457199, |
| "loss/reg": 49.249176025390625, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.007025, |
| "grad_norm": 1.8541237115859985, |
| "grad_norm_var": 0.8986510855344348, |
| "learning_rate": 2e-05, |
| "loss": 49.127, |
| "loss/crossentropy": 2.6375844478607178, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.020916959270834923, |
| "loss/reg": 49.03334426879883, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.00705, |
| "grad_norm": 2.8278629779815674, |
| "grad_norm_var": 0.8945651245224343, |
| "learning_rate": 2e-05, |
| "loss": 48.9067, |
| "loss/crossentropy": 2.7122256755828857, |
| "loss/hidden": 0.0751953125, |
| "loss/logits": 0.020496461540460587, |
| "loss/reg": 48.81095886230469, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.007075, |
| "grad_norm": 1.7860355377197266, |
| "grad_norm_var": 0.9001221835000724, |
| "learning_rate": 2e-05, |
| "loss": 48.687, |
| "loss/crossentropy": 2.636594772338867, |
| "loss/hidden": 0.07080078125, |
| "loss/logits": 0.019465088844299316, |
| "loss/reg": 48.596717834472656, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.0071, |
| "grad_norm": 2.448721170425415, |
| "grad_norm_var": 0.8876001311071434, |
| "learning_rate": 2e-05, |
| "loss": 48.5043, |
| "loss/crossentropy": 2.9665403366088867, |
| "loss/hidden": 0.09912109375, |
| "loss/logits": 0.02214897982776165, |
| "loss/reg": 48.38304901123047, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.007125, |
| "grad_norm": 1.8128719329833984, |
| "grad_norm_var": 0.8942751417055016, |
| "learning_rate": 2e-05, |
| "loss": 48.2649, |
| "loss/crossentropy": 2.886575698852539, |
| "loss/hidden": 0.0751953125, |
| "loss/logits": 0.0196707583963871, |
| "loss/reg": 48.17000198364258, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.00715, |
| "grad_norm": 1.8738943338394165, |
| "grad_norm_var": 0.14694385548773212, |
| "learning_rate": 2e-05, |
| "loss": 48.093, |
| "loss/crossentropy": 2.9793410301208496, |
| "loss/hidden": 0.111328125, |
| "loss/logits": 0.023733166977763176, |
| "loss/reg": 47.957942962646484, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.007175, |
| "grad_norm": 1.8546899557113647, |
| "grad_norm_var": 0.14667332129106872, |
| "learning_rate": 2e-05, |
| "loss": 47.8435, |
| "loss/crossentropy": 2.5512354373931885, |
| "loss/hidden": 0.0751953125, |
| "loss/logits": 0.021910887211561203, |
| "loss/reg": 47.74641036987305, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.0072, |
| "grad_norm": 2.126343250274658, |
| "grad_norm_var": 0.1423970435536375, |
| "learning_rate": 2e-05, |
| "loss": 47.6355, |
| "loss/crossentropy": 2.801690101623535, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.022434473037719727, |
| "loss/reg": 47.53596496582031, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.007225, |
| "grad_norm": 2.5778279304504395, |
| "grad_norm_var": 0.15387340759936208, |
| "learning_rate": 2e-05, |
| "loss": 47.4306, |
| "loss/crossentropy": 2.7154979705810547, |
| "loss/hidden": 0.07958984375, |
| "loss/logits": 0.025169990956783295, |
| "loss/reg": 47.32587814331055, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 2.0250680446624756, |
| "grad_norm_var": 0.13381097704940614, |
| "learning_rate": 2e-05, |
| "loss": 47.2131, |
| "loss/crossentropy": 2.8832640647888184, |
| "loss/hidden": 0.0751953125, |
| "loss/logits": 0.021248571574687958, |
| "loss/reg": 47.11664962768555, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.007275, |
| "grad_norm": 1.7236976623535156, |
| "grad_norm_var": 0.13090910370116995, |
| "learning_rate": 2e-05, |
| "loss": 47.0029, |
| "loss/crossentropy": 2.8528189659118652, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.021527128294110298, |
| "loss/reg": 46.90858459472656, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.0073, |
| "grad_norm": 1.55807363986969, |
| "grad_norm_var": 0.1308097516741924, |
| "learning_rate": 2e-05, |
| "loss": 46.7919, |
| "loss/crossentropy": 2.664276361465454, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.021483037620782852, |
| "loss/reg": 46.69326400756836, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.007325, |
| "grad_norm": 2.162757396697998, |
| "grad_norm_var": 0.12764433705414124, |
| "learning_rate": 2e-05, |
| "loss": 46.6093, |
| "loss/crossentropy": 2.663728952407837, |
| "loss/hidden": 0.09375, |
| "loss/logits": 0.028836317360401154, |
| "loss/reg": 46.486732482910156, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.00735, |
| "grad_norm": 3.1162450313568115, |
| "grad_norm_var": 0.18687738678810295, |
| "learning_rate": 2e-05, |
| "loss": 46.3789, |
| "loss/crossentropy": 2.6197454929351807, |
| "loss/hidden": 0.0751953125, |
| "loss/logits": 0.02272343635559082, |
| "loss/reg": 46.28093719482422, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.007375, |
| "grad_norm": 1.504614233970642, |
| "grad_norm_var": 0.20551894946539725, |
| "learning_rate": 2e-05, |
| "loss": 46.1705, |
| "loss/crossentropy": 2.695830821990967, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.02187586948275566, |
| "loss/reg": 46.075836181640625, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.0074, |
| "grad_norm": 2.918551206588745, |
| "grad_norm_var": 0.24765848230744034, |
| "learning_rate": 2e-05, |
| "loss": 45.9933, |
| "loss/crossentropy": 2.9710702896118164, |
| "loss/hidden": 0.09326171875, |
| "loss/logits": 0.028397060930728912, |
| "loss/reg": 45.87164306640625, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.007425, |
| "grad_norm": 2.2940146923065186, |
| "grad_norm_var": 0.24323678513681368, |
| "learning_rate": 2e-05, |
| "loss": 45.7702, |
| "loss/crossentropy": 2.80556058883667, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.025129806250333786, |
| "loss/reg": 45.66794204711914, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.00745, |
| "grad_norm": 3.3687565326690674, |
| "grad_norm_var": 0.3094567617568638, |
| "learning_rate": 2e-05, |
| "loss": 45.5908, |
| "loss/crossentropy": 3.0187761783599854, |
| "loss/hidden": 0.09521484375, |
| "loss/logits": 0.03083011880517006, |
| "loss/reg": 45.46473693847656, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.007475, |
| "grad_norm": 1.961309790611267, |
| "grad_norm_var": 0.3017723922679678, |
| "learning_rate": 2e-05, |
| "loss": 45.3716, |
| "loss/crossentropy": 2.801523447036743, |
| "loss/hidden": 0.083984375, |
| "loss/logits": 0.025450080633163452, |
| "loss/reg": 45.26218032836914, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 2.01747989654541, |
| "grad_norm_var": 0.2995522458701202, |
| "learning_rate": 2e-05, |
| "loss": 45.1653, |
| "loss/crossentropy": 2.8323147296905518, |
| "loss/hidden": 0.07958984375, |
| "loss/logits": 0.02548621967434883, |
| "loss/reg": 45.06023406982422, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.007525, |
| "grad_norm": 2.3311493396759033, |
| "grad_norm_var": 0.2909006236155415, |
| "learning_rate": 2e-05, |
| "loss": 44.9708, |
| "loss/crossentropy": 2.7468738555908203, |
| "loss/hidden": 0.083984375, |
| "loss/logits": 0.027555497363209724, |
| "loss/reg": 44.859291076660156, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.00755, |
| "grad_norm": 2.1381912231445312, |
| "grad_norm_var": 0.2833022269660657, |
| "learning_rate": 2e-05, |
| "loss": 44.7534, |
| "loss/crossentropy": 2.472642421722412, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.022814348340034485, |
| "loss/reg": 44.65345001220703, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.007575, |
| "grad_norm": 2.22025203704834, |
| "grad_norm_var": 0.2733649855868098, |
| "learning_rate": 2e-05, |
| "loss": 44.5945, |
| "loss/crossentropy": 2.7359273433685303, |
| "loss/hidden": 0.10791015625, |
| "loss/logits": 0.03234729915857315, |
| "loss/reg": 44.45427703857422, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.0076, |
| "grad_norm": 1.7129307985305786, |
| "grad_norm_var": 0.29101574490811494, |
| "learning_rate": 2e-05, |
| "loss": 44.3613, |
| "loss/crossentropy": 2.7747325897216797, |
| "loss/hidden": 0.07958984375, |
| "loss/logits": 0.025640204548835754, |
| "loss/reg": 44.25602340698242, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.007625, |
| "grad_norm": 1.9883134365081787, |
| "grad_norm_var": 0.2851551419506978, |
| "learning_rate": 2e-05, |
| "loss": 44.1729, |
| "loss/crossentropy": 2.8054683208465576, |
| "loss/hidden": 0.0888671875, |
| "loss/logits": 0.02528468333184719, |
| "loss/reg": 44.05876159667969, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.00765, |
| "grad_norm": 1.863852858543396, |
| "grad_norm_var": 0.29032669692645163, |
| "learning_rate": 2e-05, |
| "loss": 43.9655, |
| "loss/crossentropy": 2.888584852218628, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.02664921060204506, |
| "loss/reg": 43.861751556396484, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.007675, |
| "grad_norm": 2.4267005920410156, |
| "grad_norm_var": 0.27844298773661547, |
| "learning_rate": 2e-05, |
| "loss": 43.768, |
| "loss/crossentropy": 2.7776331901550293, |
| "loss/hidden": 0.0791015625, |
| "loss/logits": 0.023454634472727776, |
| "loss/reg": 43.6654052734375, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.0077, |
| "grad_norm": 1.952818512916565, |
| "grad_norm_var": 0.25313514417236144, |
| "learning_rate": 2e-05, |
| "loss": 43.5914, |
| "loss/crossentropy": 2.8431732654571533, |
| "loss/hidden": 0.09375, |
| "loss/logits": 0.027756892144680023, |
| "loss/reg": 43.46989440917969, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.007725, |
| "grad_norm": 1.5612258911132812, |
| "grad_norm_var": 0.2826367832773698, |
| "learning_rate": 2e-05, |
| "loss": 43.3769, |
| "loss/crossentropy": 2.791144847869873, |
| "loss/hidden": 0.07958984375, |
| "loss/logits": 0.02249729633331299, |
| "loss/reg": 43.274803161621094, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 1.583145022392273, |
| "grad_norm_var": 0.24449754312349892, |
| "learning_rate": 2e-05, |
| "loss": 43.1754, |
| "loss/crossentropy": 2.503157615661621, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.021705985069274902, |
| "loss/reg": 43.08091354370117, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.007775, |
| "grad_norm": 4.015507221221924, |
| "grad_norm_var": 0.4341163960002968, |
| "learning_rate": 2e-05, |
| "loss": 43.029, |
| "loss/crossentropy": 2.5782852172851562, |
| "loss/hidden": 0.11279296875, |
| "loss/logits": 0.028784021735191345, |
| "loss/reg": 42.88740921020508, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.0078, |
| "grad_norm": 8.164732933044434, |
| "grad_norm_var": 2.6064283600035822, |
| "learning_rate": 2e-05, |
| "loss": 42.834, |
| "loss/crossentropy": 2.8343613147735596, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.03768664225935936, |
| "loss/reg": 42.68938446044922, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.007825, |
| "grad_norm": 1.5896762609481812, |
| "grad_norm_var": 2.666172012313534, |
| "learning_rate": 2e-05, |
| "loss": 42.6196, |
| "loss/crossentropy": 2.701037883758545, |
| "loss/hidden": 0.09619140625, |
| "loss/logits": 0.0258449986577034, |
| "loss/reg": 42.497581481933594, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.00785, |
| "grad_norm": 3.047536611557007, |
| "grad_norm_var": 2.6378112036190617, |
| "learning_rate": 2e-05, |
| "loss": 42.4423, |
| "loss/crossentropy": 3.052617073059082, |
| "loss/hidden": 0.10302734375, |
| "loss/logits": 0.032618869096040726, |
| "loss/reg": 42.30662536621094, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.007875, |
| "grad_norm": 3.6461970806121826, |
| "grad_norm_var": 2.686150464102299, |
| "learning_rate": 2e-05, |
| "loss": 42.2375, |
| "loss/crossentropy": 2.7235336303710938, |
| "loss/hidden": 0.09375, |
| "loss/logits": 0.027443446218967438, |
| "loss/reg": 42.11628723144531, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.0079, |
| "grad_norm": 2.1187753677368164, |
| "grad_norm_var": 2.6783673292138563, |
| "learning_rate": 2e-05, |
| "loss": 42.0521, |
| "loss/crossentropy": 3.022662401199341, |
| "loss/hidden": 0.0986328125, |
| "loss/logits": 0.02708546072244644, |
| "loss/reg": 41.926414489746094, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.007925, |
| "grad_norm": 3.020916700363159, |
| "grad_norm_var": 2.6790032915758424, |
| "learning_rate": 2e-05, |
| "loss": 41.8743, |
| "loss/crossentropy": 3.040327787399292, |
| "loss/hidden": 0.10498046875, |
| "loss/logits": 0.03200379014015198, |
| "loss/reg": 41.737281799316406, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.00795, |
| "grad_norm": 1.2161192893981934, |
| "grad_norm_var": 2.800065592587199, |
| "learning_rate": 2e-05, |
| "loss": 41.6418, |
| "loss/crossentropy": 2.455482006072998, |
| "loss/hidden": 0.07275390625, |
| "loss/logits": 0.01976654678583145, |
| "loss/reg": 41.54926300048828, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.007975, |
| "grad_norm": 1.7299896478652954, |
| "grad_norm_var": 2.842071420926294, |
| "learning_rate": 2e-05, |
| "loss": 41.4748, |
| "loss/crossentropy": 2.58695650100708, |
| "loss/hidden": 0.08935546875, |
| "loss/logits": 0.023488441482186317, |
| "loss/reg": 41.36199951171875, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 1.4372546672821045, |
| "grad_norm_var": 2.8795153989340108, |
| "learning_rate": 2e-05, |
| "loss": 41.2741, |
| "loss/crossentropy": 2.516500473022461, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.021874364465475082, |
| "loss/reg": 41.175071716308594, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.008025, |
| "grad_norm": 1.4945651292800903, |
| "grad_norm_var": 2.934045206445749, |
| "learning_rate": 2e-05, |
| "loss": 41.094, |
| "loss/crossentropy": 2.5344505310058594, |
| "loss/hidden": 0.08154296875, |
| "loss/logits": 0.023724202066659927, |
| "loss/reg": 40.98875427246094, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.00805, |
| "grad_norm": 2.882326602935791, |
| "grad_norm_var": 2.9051136352536324, |
| "learning_rate": 2e-05, |
| "loss": 40.9209, |
| "loss/crossentropy": 2.9912266731262207, |
| "loss/hidden": 0.09375, |
| "loss/logits": 0.028768811374902725, |
| "loss/reg": 40.798377990722656, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.008075, |
| "grad_norm": 2.139209270477295, |
| "grad_norm_var": 2.917611033862551, |
| "learning_rate": 2e-05, |
| "loss": 40.7388, |
| "loss/crossentropy": 3.03482985496521, |
| "loss/hidden": 0.0986328125, |
| "loss/logits": 0.02618756890296936, |
| "loss/reg": 40.61396789550781, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.0081, |
| "grad_norm": 1.5827956199645996, |
| "grad_norm_var": 2.9580979264872167, |
| "learning_rate": 2e-05, |
| "loss": 40.5364, |
| "loss/crossentropy": 3.321441411972046, |
| "loss/hidden": 0.0771484375, |
| "loss/logits": 0.02882487140595913, |
| "loss/reg": 40.43039321899414, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.008125, |
| "grad_norm": 3.9542465209960938, |
| "grad_norm_var": 2.9919451226799567, |
| "learning_rate": 2e-05, |
| "loss": 40.3997, |
| "loss/crossentropy": 2.867952585220337, |
| "loss/hidden": 0.11669921875, |
| "loss/logits": 0.03544551879167557, |
| "loss/reg": 40.24754333496094, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.00815, |
| "grad_norm": 1.3926026821136475, |
| "grad_norm_var": 3.023260345272987, |
| "learning_rate": 2e-05, |
| "loss": 40.1766, |
| "loss/crossentropy": 2.859464645385742, |
| "loss/hidden": 0.08642578125, |
| "loss/logits": 0.02500808984041214, |
| "loss/reg": 40.06516647338867, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.008175, |
| "grad_norm": 1.7433418035507202, |
| "grad_norm_var": 2.9517927278006177, |
| "learning_rate": 2e-05, |
| "loss": 39.9911, |
| "loss/crossentropy": 2.5700864791870117, |
| "loss/hidden": 0.08447265625, |
| "loss/logits": 0.02326771430671215, |
| "loss/reg": 39.883399963378906, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.0082, |
| "grad_norm": 1.7338348627090454, |
| "grad_norm_var": 0.7415088588988434, |
| "learning_rate": 2e-05, |
| "loss": 39.823, |
| "loss/crossentropy": 3.0460493564605713, |
| "loss/hidden": 0.09375, |
| "loss/logits": 0.026862995699048042, |
| "loss/reg": 39.70237731933594, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.008225, |
| "grad_norm": 2.071286916732788, |
| "grad_norm_var": 0.7187026357001777, |
| "learning_rate": 2e-05, |
| "loss": 39.6565, |
| "loss/crossentropy": 2.8687374591827393, |
| "loss/hidden": 0.10546875, |
| "loss/logits": 0.028884585946798325, |
| "loss/reg": 39.52215576171875, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 3.727687120437622, |
| "grad_norm_var": 0.8244134178781979, |
| "learning_rate": 2e-05, |
| "loss": 39.4632, |
| "loss/crossentropy": 3.016721248626709, |
| "loss/hidden": 0.091796875, |
| "loss/logits": 0.02862412855029106, |
| "loss/reg": 39.34282684326172, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.008275, |
| "grad_norm": 1.4077656269073486, |
| "grad_norm_var": 0.7188383933289703, |
| "learning_rate": 2e-05, |
| "loss": 39.2819, |
| "loss/crossentropy": 2.6854705810546875, |
| "loss/hidden": 0.09375, |
| "loss/logits": 0.02421317994594574, |
| "loss/reg": 39.163978576660156, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.0083, |
| "grad_norm": 2.4871857166290283, |
| "grad_norm_var": 0.7280817035919266, |
| "learning_rate": 2e-05, |
| "loss": 39.1068, |
| "loss/crossentropy": 2.6913976669311523, |
| "loss/hidden": 0.09814453125, |
| "loss/logits": 0.030065573751926422, |
| "loss/reg": 38.9786262512207, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.008325, |
| "grad_norm": 2.2533462047576904, |
| "grad_norm_var": 0.6733490639853538, |
| "learning_rate": 2e-05, |
| "loss": 38.9318, |
| "loss/crossentropy": 2.960143804550171, |
| "loss/hidden": 0.09814453125, |
| "loss/logits": 0.032106850296258926, |
| "loss/reg": 38.80150604248047, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.00835, |
| "grad_norm": 9.487396240234375, |
| "grad_norm_var": 3.99832851363437, |
| "learning_rate": 2e-05, |
| "loss": 38.7644, |
| "loss/crossentropy": 2.7557554244995117, |
| "loss/hidden": 0.10986328125, |
| "loss/logits": 0.029345765709877014, |
| "loss/reg": 38.62519454956055, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.008375, |
| "grad_norm": 2.2959508895874023, |
| "grad_norm_var": 3.9530502420570093, |
| "learning_rate": 2e-05, |
| "loss": 38.5785, |
| "loss/crossentropy": 2.8349740505218506, |
| "loss/hidden": 0.10009765625, |
| "loss/logits": 0.028996147215366364, |
| "loss/reg": 38.449363708496094, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.0084, |
| "grad_norm": 2.147952079772949, |
| "grad_norm_var": 3.8715303432503254, |
| "learning_rate": 2e-05, |
| "loss": 38.4335, |
| "loss/crossentropy": 2.791761636734009, |
| "loss/hidden": 0.1259765625, |
| "loss/logits": 0.033383168280124664, |
| "loss/reg": 38.274166107177734, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.008425, |
| "grad_norm": 2.610886573791504, |
| "grad_norm_var": 3.7737029983097736, |
| "learning_rate": 2e-05, |
| "loss": 38.2484, |
| "loss/crossentropy": 2.7260618209838867, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.03957097977399826, |
| "loss/reg": 38.09947204589844, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.00845, |
| "grad_norm": 2.4849438667297363, |
| "grad_norm_var": 3.776289163852828, |
| "learning_rate": 2e-05, |
| "loss": 38.116, |
| "loss/crossentropy": 2.946385145187378, |
| "loss/hidden": 0.16015625, |
| "loss/logits": 0.03063117153942585, |
| "loss/reg": 37.9251708984375, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.008475, |
| "grad_norm": 2.560534954071045, |
| "grad_norm_var": 3.754755415172888, |
| "learning_rate": 2e-05, |
| "loss": 37.9078, |
| "loss/crossentropy": 2.9236810207366943, |
| "loss/hidden": 0.1240234375, |
| "loss/logits": 0.03169061243534088, |
| "loss/reg": 37.75209045410156, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 2.5575692653656006, |
| "grad_norm_var": 3.662913660444159, |
| "learning_rate": 2e-05, |
| "loss": 37.7183, |
| "loss/crossentropy": 3.100344181060791, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.03185933083295822, |
| "loss/reg": 37.57951354980469, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.008525, |
| "grad_norm": 2.165684461593628, |
| "grad_norm_var": 3.5893262917243276, |
| "learning_rate": 2e-05, |
| "loss": 37.5408, |
| "loss/crossentropy": 2.663774251937866, |
| "loss/hidden": 0.10595703125, |
| "loss/logits": 0.027293076738715172, |
| "loss/reg": 37.407588958740234, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.00855, |
| "grad_norm": 1.7527039051055908, |
| "grad_norm_var": 3.534874283949813, |
| "learning_rate": 2e-05, |
| "loss": 37.3585, |
| "loss/crossentropy": 2.862217903137207, |
| "loss/hidden": 0.10107421875, |
| "loss/logits": 0.025328852236270905, |
| "loss/reg": 37.23208236694336, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.008575, |
| "grad_norm": 1.9454602003097534, |
| "grad_norm_var": 3.5111612253458673, |
| "learning_rate": 2e-05, |
| "loss": 37.1783, |
| "loss/crossentropy": 2.7598438262939453, |
| "loss/hidden": 0.09375, |
| "loss/logits": 0.023144034668803215, |
| "loss/reg": 37.061397552490234, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.0086, |
| "grad_norm": 1.8846484422683716, |
| "grad_norm_var": 3.4925386021223233, |
| "learning_rate": 2e-05, |
| "loss": 37.0198, |
| "loss/crossentropy": 2.5700979232788086, |
| "loss/hidden": 0.1005859375, |
| "loss/logits": 0.027288008481264114, |
| "loss/reg": 36.89188003540039, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.008625, |
| "grad_norm": 2.6235435009002686, |
| "grad_norm_var": 3.4623555366449352, |
| "learning_rate": 2e-05, |
| "loss": 36.8804, |
| "loss/crossentropy": 2.765270471572876, |
| "loss/hidden": 0.1259765625, |
| "loss/logits": 0.03164152801036835, |
| "loss/reg": 36.72273254394531, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.00865, |
| "grad_norm": 1.9208968877792358, |
| "grad_norm_var": 3.4367772871203273, |
| "learning_rate": 2e-05, |
| "loss": 36.6959, |
| "loss/crossentropy": 3.0344183444976807, |
| "loss/hidden": 0.11572265625, |
| "loss/logits": 0.0260776337236166, |
| "loss/reg": 36.55405807495117, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.008675, |
| "grad_norm": 2.031083583831787, |
| "grad_norm_var": 3.356850606629226, |
| "learning_rate": 2e-05, |
| "loss": 36.5175, |
| "loss/crossentropy": 2.8031561374664307, |
| "loss/hidden": 0.1005859375, |
| "loss/logits": 0.030941586941480637, |
| "loss/reg": 36.38599395751953, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.0087, |
| "grad_norm": 2.3042380809783936, |
| "grad_norm_var": 3.3641485746872584, |
| "learning_rate": 2e-05, |
| "loss": 36.3562, |
| "loss/crossentropy": 2.727903366088867, |
| "loss/hidden": 0.107421875, |
| "loss/logits": 0.029567444697022438, |
| "loss/reg": 36.219173431396484, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.008725, |
| "grad_norm": 2.40014910697937, |
| "grad_norm_var": 3.356964679129802, |
| "learning_rate": 2e-05, |
| "loss": 36.2139, |
| "loss/crossentropy": 3.2525317668914795, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.030907781794667244, |
| "loss/reg": 36.0531005859375, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 2.395150661468506, |
| "grad_norm_var": 0.08076944890508499, |
| "learning_rate": 2e-05, |
| "loss": 36.0546, |
| "loss/crossentropy": 2.973644256591797, |
| "loss/hidden": 0.130859375, |
| "loss/logits": 0.03634234890341759, |
| "loss/reg": 35.887351989746094, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.008775, |
| "grad_norm": 2.5341644287109375, |
| "grad_norm_var": 0.0856139565907459, |
| "learning_rate": 2e-05, |
| "loss": 35.8731, |
| "loss/crossentropy": 2.683912754058838, |
| "loss/hidden": 0.11962890625, |
| "loss/logits": 0.03108149953186512, |
| "loss/reg": 35.722354888916016, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.0088, |
| "grad_norm": 2.0005195140838623, |
| "grad_norm_var": 0.08937117842573526, |
| "learning_rate": 2e-05, |
| "loss": 35.6898, |
| "loss/crossentropy": 3.0332581996917725, |
| "loss/hidden": 0.10498046875, |
| "loss/logits": 0.03197905793786049, |
| "loss/reg": 35.552879333496094, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.008825, |
| "grad_norm": 2.4667744636535645, |
| "grad_norm_var": 0.08394155421554826, |
| "learning_rate": 2e-05, |
| "loss": 35.5462, |
| "loss/crossentropy": 2.7992653846740723, |
| "loss/hidden": 0.1279296875, |
| "loss/logits": 0.02878217026591301, |
| "loss/reg": 35.38950729370117, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.00885, |
| "grad_norm": 1.8536417484283447, |
| "grad_norm_var": 0.0892220247213709, |
| "learning_rate": 2e-05, |
| "loss": 35.3542, |
| "loss/crossentropy": 2.6289925575256348, |
| "loss/hidden": 0.09814453125, |
| "loss/logits": 0.029167503118515015, |
| "loss/reg": 35.226863861083984, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.008875, |
| "grad_norm": 2.0940122604370117, |
| "grad_norm_var": 0.08116332781685945, |
| "learning_rate": 2e-05, |
| "loss": 35.2059, |
| "loss/crossentropy": 2.63510799407959, |
| "loss/hidden": 0.1123046875, |
| "loss/logits": 0.028763707727193832, |
| "loss/reg": 35.06479263305664, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.0089, |
| "grad_norm": 2.8152213096618652, |
| "grad_norm_var": 0.098175358135505, |
| "learning_rate": 2e-05, |
| "loss": 35.0467, |
| "loss/crossentropy": 2.7934632301330566, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.03379831463098526, |
| "loss/reg": 34.903560638427734, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.008925, |
| "grad_norm": 2.6323490142822266, |
| "grad_norm_var": 0.10969825083783077, |
| "learning_rate": 2e-05, |
| "loss": 34.8794, |
| "loss/crossentropy": 2.75350284576416, |
| "loss/hidden": 0.10546875, |
| "loss/logits": 0.031125463545322418, |
| "loss/reg": 34.74283218383789, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.00895, |
| "grad_norm": 1.6894372701644897, |
| "grad_norm_var": 0.11396125918644202, |
| "learning_rate": 2e-05, |
| "loss": 34.7202, |
| "loss/crossentropy": 2.6154894828796387, |
| "loss/hidden": 0.1103515625, |
| "loss/logits": 0.027205005288124084, |
| "loss/reg": 34.58259963989258, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.008975, |
| "grad_norm": 1.4869511127471924, |
| "grad_norm_var": 0.14415693080625397, |
| "learning_rate": 2e-05, |
| "loss": 34.5381, |
| "loss/crossentropy": 2.6061604022979736, |
| "loss/hidden": 0.0888671875, |
| "loss/logits": 0.026232272386550903, |
| "loss/reg": 34.4229621887207, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 1.736647367477417, |
| "grad_norm_var": 0.1516660297286099, |
| "learning_rate": 2e-05, |
| "loss": 34.4049, |
| "loss/crossentropy": 2.993236780166626, |
| "loss/hidden": 0.107421875, |
| "loss/logits": 0.033529821783304214, |
| "loss/reg": 34.263916015625, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.009025, |
| "grad_norm": 1.8005791902542114, |
| "grad_norm_var": 0.146044611712044, |
| "learning_rate": 2e-05, |
| "loss": 34.2384, |
| "loss/crossentropy": 2.6941475868225098, |
| "loss/hidden": 0.1025390625, |
| "loss/logits": 0.029957549646496773, |
| "loss/reg": 34.105873107910156, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.00905, |
| "grad_norm": 2.542261838912964, |
| "grad_norm_var": 0.1524279525586619, |
| "learning_rate": 2e-05, |
| "loss": 34.0847, |
| "loss/crossentropy": 2.9499382972717285, |
| "loss/hidden": 0.10888671875, |
| "loss/logits": 0.03367718309164047, |
| "loss/reg": 33.942176818847656, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.009075, |
| "grad_norm": 2.2937710285186768, |
| "grad_norm_var": 0.15173689243287544, |
| "learning_rate": 2e-05, |
| "loss": 33.9802, |
| "loss/crossentropy": 2.7008249759674072, |
| "loss/hidden": 0.1611328125, |
| "loss/logits": 0.03337787464261055, |
| "loss/reg": 33.78564453125, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.0091, |
| "grad_norm": 2.6358730792999268, |
| "grad_norm_var": 0.16364591458108368, |
| "learning_rate": 2e-05, |
| "loss": 33.7769, |
| "loss/crossentropy": 2.773253917694092, |
| "loss/hidden": 0.1142578125, |
| "loss/logits": 0.033234164118766785, |
| "loss/reg": 33.6294059753418, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.009125, |
| "grad_norm": 2.1610424518585205, |
| "grad_norm_var": 0.16119191833999386, |
| "learning_rate": 2e-05, |
| "loss": 33.6193, |
| "loss/crossentropy": 3.2032077312469482, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.03599859029054642, |
| "loss/reg": 33.4738883972168, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.00915, |
| "grad_norm": 1.6486601829528809, |
| "grad_norm_var": 0.17621295368330056, |
| "learning_rate": 2e-05, |
| "loss": 33.4549, |
| "loss/crossentropy": 2.7943410873413086, |
| "loss/hidden": 0.10986328125, |
| "loss/logits": 0.0257001630961895, |
| "loss/reg": 33.31929397583008, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.009175, |
| "grad_norm": 1.789995551109314, |
| "grad_norm_var": 0.17265670694948476, |
| "learning_rate": 2e-05, |
| "loss": 33.3092, |
| "loss/crossentropy": 2.974548101425171, |
| "loss/hidden": 0.11181640625, |
| "loss/logits": 0.032162584364414215, |
| "loss/reg": 33.16526794433594, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.0092, |
| "grad_norm": 2.0894887447357178, |
| "grad_norm_var": 0.17193594057413286, |
| "learning_rate": 2e-05, |
| "loss": 33.1586, |
| "loss/crossentropy": 2.8424057960510254, |
| "loss/hidden": 0.11376953125, |
| "loss/logits": 0.032967403531074524, |
| "loss/reg": 33.01184844970703, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.009225, |
| "grad_norm": 2.1341543197631836, |
| "grad_norm_var": 0.1629634187131666, |
| "learning_rate": 2e-05, |
| "loss": 32.9976, |
| "loss/crossentropy": 2.5876452922821045, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.031542833894491196, |
| "loss/reg": 32.859153747558594, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 1.9316920042037964, |
| "grad_norm_var": 0.16090780810081545, |
| "learning_rate": 2e-05, |
| "loss": 32.8487, |
| "loss/crossentropy": 2.521399974822998, |
| "loss/hidden": 0.11181640625, |
| "loss/logits": 0.029829028993844986, |
| "loss/reg": 32.70704650878906, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.009275, |
| "grad_norm": 1.597779631614685, |
| "grad_norm_var": 0.1762070126850133, |
| "learning_rate": 2e-05, |
| "loss": 32.7608, |
| "loss/crossentropy": 2.5190658569335938, |
| "loss/hidden": 0.1708984375, |
| "loss/logits": 0.03421615809202194, |
| "loss/reg": 32.55567932128906, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.0093, |
| "grad_norm": 1.9022934436798096, |
| "grad_norm_var": 0.13656557084311008, |
| "learning_rate": 2e-05, |
| "loss": 32.5483, |
| "loss/crossentropy": 2.6691269874572754, |
| "loss/hidden": 0.119140625, |
| "loss/logits": 0.02910671941936016, |
| "loss/reg": 32.40000534057617, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.009325, |
| "grad_norm": 1.3260115385055542, |
| "grad_norm_var": 0.1338758554160241, |
| "learning_rate": 2e-05, |
| "loss": 32.3699, |
| "loss/crossentropy": 2.8744542598724365, |
| "loss/hidden": 0.09326171875, |
| "loss/logits": 0.026506535708904266, |
| "loss/reg": 32.2501220703125, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.00935, |
| "grad_norm": 2.3821325302124023, |
| "grad_norm_var": 0.14230117723275787, |
| "learning_rate": 2e-05, |
| "loss": 32.2718, |
| "loss/crossentropy": 2.809650421142578, |
| "loss/hidden": 0.13671875, |
| "loss/logits": 0.03423428162932396, |
| "loss/reg": 32.1008415222168, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.009375, |
| "grad_norm": 2.7512526512145996, |
| "grad_norm_var": 0.16141473329477468, |
| "learning_rate": 2e-05, |
| "loss": 32.106, |
| "loss/crossentropy": 3.2307400703430176, |
| "loss/hidden": 0.12255859375, |
| "loss/logits": 0.03138088434934616, |
| "loss/reg": 31.952072143554688, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.0094, |
| "grad_norm": 3.3555221557617188, |
| "grad_norm_var": 0.2586050041345979, |
| "learning_rate": 2e-05, |
| "loss": 31.9938, |
| "loss/crossentropy": 3.005693197250366, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.036577533930540085, |
| "loss/reg": 31.803926467895508, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.009425, |
| "grad_norm": 3.3253965377807617, |
| "grad_norm_var": 0.3336118725112632, |
| "learning_rate": 2e-05, |
| "loss": 31.8469, |
| "loss/crossentropy": 3.547780990600586, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.03727081045508385, |
| "loss/reg": 31.656269073486328, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.00945, |
| "grad_norm": 1.6626850366592407, |
| "grad_norm_var": 0.3467173050765467, |
| "learning_rate": 2e-05, |
| "loss": 31.6552, |
| "loss/crossentropy": 2.612881898880005, |
| "loss/hidden": 0.1162109375, |
| "loss/logits": 0.029575737193226814, |
| "loss/reg": 31.50940704345703, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.009475, |
| "grad_norm": 3.107280969619751, |
| "grad_norm_var": 0.39968975146762575, |
| "learning_rate": 2e-05, |
| "loss": 31.5286, |
| "loss/crossentropy": 2.9622950553894043, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.03570305183529854, |
| "loss/reg": 31.363008499145508, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 2.105592727661133, |
| "grad_norm_var": 0.38910356240631966, |
| "learning_rate": 2e-05, |
| "loss": 31.3657, |
| "loss/crossentropy": 2.7373220920562744, |
| "loss/hidden": 0.11865234375, |
| "loss/logits": 0.02971404604613781, |
| "loss/reg": 31.217342376708984, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.009525, |
| "grad_norm": 6.658995628356934, |
| "grad_norm_var": 1.6275530318456253, |
| "learning_rate": 2e-05, |
| "loss": 31.2326, |
| "loss/crossentropy": 2.7819790840148926, |
| "loss/hidden": 0.1259765625, |
| "loss/logits": 0.03413031995296478, |
| "loss/reg": 31.072481155395508, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.00955, |
| "grad_norm": 17.702425003051758, |
| "grad_norm_var": 15.943881150548076, |
| "learning_rate": 2e-05, |
| "loss": 31.1402, |
| "loss/crossentropy": 2.9263079166412354, |
| "loss/hidden": 0.1787109375, |
| "loss/logits": 0.037496551871299744, |
| "loss/reg": 30.92399787902832, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.009575, |
| "grad_norm": 9.755738258361816, |
| "grad_norm_var": 18.105272629168468, |
| "learning_rate": 2e-05, |
| "loss": 30.9569, |
| "loss/crossentropy": 2.610964298248291, |
| "loss/hidden": 0.146484375, |
| "loss/logits": 0.03007764369249344, |
| "loss/reg": 30.780288696289062, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.0096, |
| "grad_norm": 1.4994475841522217, |
| "grad_norm_var": 18.276295715224364, |
| "learning_rate": 2e-05, |
| "loss": 30.7982, |
| "loss/crossentropy": 2.600332498550415, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.031154906377196312, |
| "loss/reg": 30.637126922607422, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.009625, |
| "grad_norm": 1.7103643417358398, |
| "grad_norm_var": 18.39011989648016, |
| "learning_rate": 2e-05, |
| "loss": 30.6776, |
| "loss/crossentropy": 3.032283306121826, |
| "loss/hidden": 0.1396484375, |
| "loss/logits": 0.04348542541265488, |
| "loss/reg": 30.494457244873047, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.00965, |
| "grad_norm": 2.2822492122650146, |
| "grad_norm_var": 18.304705584234476, |
| "learning_rate": 2e-05, |
| "loss": 30.5332, |
| "loss/crossentropy": 2.367107629776001, |
| "loss/hidden": 0.1484375, |
| "loss/logits": 0.03226654976606369, |
| "loss/reg": 30.352514266967773, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.009675, |
| "grad_norm": 2.002864122390747, |
| "grad_norm_var": 18.188167639061543, |
| "learning_rate": 2e-05, |
| "loss": 30.3683, |
| "loss/crossentropy": 2.7061312198638916, |
| "loss/hidden": 0.12353515625, |
| "loss/logits": 0.03370695561170578, |
| "loss/reg": 30.211055755615234, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.0097, |
| "grad_norm": 5.602196216583252, |
| "grad_norm_var": 18.023389822229017, |
| "learning_rate": 2e-05, |
| "loss": 30.2394, |
| "loss/crossentropy": 2.397667169570923, |
| "loss/hidden": 0.138671875, |
| "loss/logits": 0.03049187734723091, |
| "loss/reg": 30.070276260375977, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.009725, |
| "grad_norm": 1.5408035516738892, |
| "grad_norm_var": 17.943911354217352, |
| "learning_rate": 2e-05, |
| "loss": 30.0907, |
| "loss/crossentropy": 2.834867000579834, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.03077739104628563, |
| "loss/reg": 29.930038452148438, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 6.018068313598633, |
| "grad_norm_var": 17.881454834941156, |
| "learning_rate": 2e-05, |
| "loss": 29.9551, |
| "loss/crossentropy": 2.8220815658569336, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.031766001135110855, |
| "loss/reg": 29.790555953979492, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.009775, |
| "grad_norm": 10.923192977905273, |
| "grad_norm_var": 20.212413139020512, |
| "learning_rate": 2e-05, |
| "loss": 29.8563, |
| "loss/crossentropy": 2.811016321182251, |
| "loss/hidden": 0.1689453125, |
| "loss/logits": 0.03583759814500809, |
| "loss/reg": 29.651498794555664, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.0098, |
| "grad_norm": 1.9290435314178467, |
| "grad_norm_var": 20.64348377939481, |
| "learning_rate": 2e-05, |
| "loss": 29.6891, |
| "loss/crossentropy": 2.863645076751709, |
| "loss/hidden": 0.1435546875, |
| "loss/logits": 0.036813415586948395, |
| "loss/reg": 29.50878143310547, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.009825, |
| "grad_norm": 3.237116813659668, |
| "grad_norm_var": 20.662082917547686, |
| "learning_rate": 2e-05, |
| "loss": 29.5663, |
| "loss/crossentropy": 2.7489192485809326, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.03718395531177521, |
| "loss/reg": 29.370952606201172, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.00985, |
| "grad_norm": 1.6290416717529297, |
| "grad_norm_var": 20.676489967742715, |
| "learning_rate": 2e-05, |
| "loss": 29.3821, |
| "loss/crossentropy": 2.96504282951355, |
| "loss/hidden": 0.11181640625, |
| "loss/logits": 0.03646458685398102, |
| "loss/reg": 29.2337703704834, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.009875, |
| "grad_norm": 2.237454652786255, |
| "grad_norm_var": 20.926649282212274, |
| "learning_rate": 2e-05, |
| "loss": 29.269, |
| "loss/crossentropy": 2.8521690368652344, |
| "loss/hidden": 0.1376953125, |
| "loss/logits": 0.03403034061193466, |
| "loss/reg": 29.097240447998047, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.0099, |
| "grad_norm": 1.9784162044525146, |
| "grad_norm_var": 20.97338552568199, |
| "learning_rate": 2e-05, |
| "loss": 29.1487, |
| "loss/crossentropy": 2.748701572418213, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.034278061240911484, |
| "loss/reg": 28.961105346679688, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.009925, |
| "grad_norm": 2.4967384338378906, |
| "grad_norm_var": 21.021265946892495, |
| "learning_rate": 2e-05, |
| "loss": 29.011, |
| "loss/crossentropy": 3.1087985038757324, |
| "loss/hidden": 0.1455078125, |
| "loss/logits": 0.03961321711540222, |
| "loss/reg": 28.825891494750977, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.00995, |
| "grad_norm": 2.17684006690979, |
| "grad_norm_var": 8.826986086885336, |
| "learning_rate": 2e-05, |
| "loss": 28.8691, |
| "loss/crossentropy": 2.919372320175171, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.03645738214254379, |
| "loss/reg": 28.69107437133789, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.009975, |
| "grad_norm": 2.1314728260040283, |
| "grad_norm_var": 6.165466501526623, |
| "learning_rate": 2e-05, |
| "loss": 28.7475, |
| "loss/crossentropy": 2.795255422592163, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.037487540394067764, |
| "loss/reg": 28.556673049926758, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 1.821183204650879, |
| "grad_norm_var": 6.103824283803898, |
| "learning_rate": 2e-05, |
| "loss": 28.5859, |
| "loss/crossentropy": 2.6662023067474365, |
| "loss/hidden": 0.1279296875, |
| "loss/logits": 0.034927770495414734, |
| "loss/reg": 28.423030853271484, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.010025, |
| "grad_norm": 1.682054042816162, |
| "grad_norm_var": 6.109147456078416, |
| "learning_rate": 2e-05, |
| "loss": 28.4353, |
| "loss/crossentropy": 3.024109125137329, |
| "loss/hidden": 0.11181640625, |
| "loss/logits": 0.03378160297870636, |
| "loss/reg": 28.28972625732422, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.01005, |
| "grad_norm": 2.1850829124450684, |
| "grad_norm_var": 6.120403763567225, |
| "learning_rate": 2e-05, |
| "loss": 28.3327, |
| "loss/crossentropy": 3.1913986206054688, |
| "loss/hidden": 0.1435546875, |
| "loss/logits": 0.03758620843291283, |
| "loss/reg": 28.151596069335938, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.010075, |
| "grad_norm": 2.2850840091705322, |
| "grad_norm_var": 6.084117119532081, |
| "learning_rate": 2e-05, |
| "loss": 28.2094, |
| "loss/crossentropy": 2.844414710998535, |
| "loss/hidden": 0.1484375, |
| "loss/logits": 0.04106660932302475, |
| "loss/reg": 28.019901275634766, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.0101, |
| "grad_norm": 3.0755233764648438, |
| "grad_norm_var": 5.6459224869144, |
| "learning_rate": 2e-05, |
| "loss": 28.1054, |
| "loss/crossentropy": 2.7236874103546143, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.04699024558067322, |
| "loss/reg": 27.88851547241211, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.010125, |
| "grad_norm": 2.632650852203369, |
| "grad_norm_var": 5.513941759398255, |
| "learning_rate": 2e-05, |
| "loss": 27.9764, |
| "loss/crossentropy": 2.971388578414917, |
| "loss/hidden": 0.1787109375, |
| "loss/logits": 0.039863601326942444, |
| "loss/reg": 27.757869720458984, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.01015, |
| "grad_norm": 2.490246057510376, |
| "grad_norm_var": 4.8850644555352325, |
| "learning_rate": 2e-05, |
| "loss": 27.8153, |
| "loss/crossentropy": 3.0535378456115723, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.03618897125124931, |
| "loss/reg": 27.62775421142578, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.010175, |
| "grad_norm": 3.116687536239624, |
| "grad_norm_var": 0.24597344619940412, |
| "learning_rate": 2e-05, |
| "loss": 27.723, |
| "loss/crossentropy": 3.0417606830596924, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.049759428948163986, |
| "loss/reg": 27.49844741821289, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.0102, |
| "grad_norm": 2.1385886669158936, |
| "grad_norm_var": 0.2378215272062145, |
| "learning_rate": 2e-05, |
| "loss": 27.5528, |
| "loss/crossentropy": 3.0288665294647217, |
| "loss/hidden": 0.1455078125, |
| "loss/logits": 0.03794676810503006, |
| "loss/reg": 27.369342803955078, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.010225, |
| "grad_norm": 2.3155360221862793, |
| "grad_norm_var": 0.1797017907210517, |
| "learning_rate": 2e-05, |
| "loss": 27.4314, |
| "loss/crossentropy": 2.7471296787261963, |
| "loss/hidden": 0.1484375, |
| "loss/logits": 0.041661813855171204, |
| "loss/reg": 27.241291046142578, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 2.1539342403411865, |
| "grad_norm_var": 0.15174583963641694, |
| "learning_rate": 2e-05, |
| "loss": 27.3053, |
| "loss/crossentropy": 3.1815056800842285, |
| "loss/hidden": 0.1484375, |
| "loss/logits": 0.0430469810962677, |
| "loss/reg": 27.11379051208496, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.010275, |
| "grad_norm": 3.004754066467285, |
| "grad_norm_var": 0.1813925430056429, |
| "learning_rate": 2e-05, |
| "loss": 27.1907, |
| "loss/crossentropy": 2.7185699939727783, |
| "loss/hidden": 0.166015625, |
| "loss/logits": 0.038052164018154144, |
| "loss/reg": 26.986587524414062, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.0103, |
| "grad_norm": 1.7727258205413818, |
| "grad_norm_var": 0.19437299657180346, |
| "learning_rate": 2e-05, |
| "loss": 27.031, |
| "loss/crossentropy": 2.5481350421905518, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.03450712189078331, |
| "loss/reg": 26.85489273071289, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.010325, |
| "grad_norm": 2.942481756210327, |
| "grad_norm_var": 0.21596104298213442, |
| "learning_rate": 2e-05, |
| "loss": 26.9305, |
| "loss/crossentropy": 3.015340566635132, |
| "loss/hidden": 0.1630859375, |
| "loss/logits": 0.038723111152648926, |
| "loss/reg": 26.728679656982422, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.01035, |
| "grad_norm": 6.163323879241943, |
| "grad_norm_var": 1.1063828714438992, |
| "learning_rate": 2e-05, |
| "loss": 26.8133, |
| "loss/crossentropy": 2.55928373336792, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.03731565177440643, |
| "loss/reg": 26.603172302246094, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.010375, |
| "grad_norm": 1.9960532188415527, |
| "grad_norm_var": 1.1163400619934953, |
| "learning_rate": 2e-05, |
| "loss": 26.6719, |
| "loss/crossentropy": 2.80226731300354, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.03723851963877678, |
| "loss/reg": 26.47838592529297, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.0104, |
| "grad_norm": 3.301450729370117, |
| "grad_norm_var": 1.0974053192222506, |
| "learning_rate": 2e-05, |
| "loss": 26.5614, |
| "loss/crossentropy": 2.791954517364502, |
| "loss/hidden": 0.166015625, |
| "loss/logits": 0.04129061847925186, |
| "loss/reg": 26.354053497314453, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.010425, |
| "grad_norm": 1.9505646228790283, |
| "grad_norm_var": 1.0653418372806414, |
| "learning_rate": 2e-05, |
| "loss": 26.4048, |
| "loss/crossentropy": 2.7842905521392822, |
| "loss/hidden": 0.13671875, |
| "loss/logits": 0.0375644825398922, |
| "loss/reg": 26.230472564697266, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.01045, |
| "grad_norm": 2.055081605911255, |
| "grad_norm_var": 1.075675176346497, |
| "learning_rate": 2e-05, |
| "loss": 26.296, |
| "loss/crossentropy": 2.7261054515838623, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.03734801337122917, |
| "loss/reg": 26.1073055267334, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.010475, |
| "grad_norm": 3.4958529472351074, |
| "grad_norm_var": 1.09835111004395, |
| "learning_rate": 2e-05, |
| "loss": 26.1998, |
| "loss/crossentropy": 3.256852626800537, |
| "loss/hidden": 0.1689453125, |
| "loss/logits": 0.04601683095097542, |
| "loss/reg": 25.984798431396484, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 2.4832944869995117, |
| "grad_norm_var": 1.097555539592861, |
| "learning_rate": 2e-05, |
| "loss": 26.0394, |
| "loss/crossentropy": 2.6658496856689453, |
| "loss/hidden": 0.13671875, |
| "loss/logits": 0.03990530967712402, |
| "loss/reg": 25.862812042236328, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.010525, |
| "grad_norm": 1.888179898262024, |
| "grad_norm_var": 1.143925812704361, |
| "learning_rate": 2e-05, |
| "loss": 25.9507, |
| "loss/crossentropy": 2.893690347671509, |
| "loss/hidden": 0.166015625, |
| "loss/logits": 0.04318728297948837, |
| "loss/reg": 25.74148941040039, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.01055, |
| "grad_norm": 1.8328545093536377, |
| "grad_norm_var": 1.1896980975503562, |
| "learning_rate": 2e-05, |
| "loss": 25.8021, |
| "loss/crossentropy": 2.862842559814453, |
| "loss/hidden": 0.1484375, |
| "loss/logits": 0.037468597292900085, |
| "loss/reg": 25.616188049316406, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.010575, |
| "grad_norm": 2.6049251556396484, |
| "grad_norm_var": 1.1751238780603264, |
| "learning_rate": 2e-05, |
| "loss": 25.7268, |
| "loss/crossentropy": 2.9560859203338623, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.054243914783000946, |
| "loss/reg": 25.49581527709961, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.0106, |
| "grad_norm": 2.8561511039733887, |
| "grad_norm_var": 1.1601718819937763, |
| "learning_rate": 2e-05, |
| "loss": 25.5684, |
| "loss/crossentropy": 2.6629557609558105, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.039182234555482864, |
| "loss/reg": 25.375917434692383, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.010625, |
| "grad_norm": 1.8477426767349243, |
| "grad_norm_var": 1.1963363532274474, |
| "learning_rate": 2e-05, |
| "loss": 25.4692, |
| "loss/crossentropy": 3.0463385581970215, |
| "loss/hidden": 0.166015625, |
| "loss/logits": 0.046355295926332474, |
| "loss/reg": 25.256792068481445, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.01065, |
| "grad_norm": 2.3839778900146484, |
| "grad_norm_var": 1.1845253457707392, |
| "learning_rate": 2e-05, |
| "loss": 25.3301, |
| "loss/crossentropy": 2.812811851501465, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.03877168148756027, |
| "loss/reg": 25.13796615600586, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.010675, |
| "grad_norm": 3.649106979370117, |
| "grad_norm_var": 1.2399896204357086, |
| "learning_rate": 2e-05, |
| "loss": 25.2656, |
| "loss/crossentropy": 2.817918300628662, |
| "loss/hidden": 0.201171875, |
| "loss/logits": 0.04450376331806183, |
| "loss/reg": 25.01987648010254, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.0107, |
| "grad_norm": 2.212465524673462, |
| "grad_norm_var": 1.197620310146587, |
| "learning_rate": 2e-05, |
| "loss": 25.1043, |
| "loss/crossentropy": 3.138951301574707, |
| "loss/hidden": 0.1611328125, |
| "loss/logits": 0.04081891477108002, |
| "loss/reg": 24.902353286743164, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.010725, |
| "grad_norm": 2.2249510288238525, |
| "grad_norm_var": 1.2093715461039343, |
| "learning_rate": 2e-05, |
| "loss": 25.003, |
| "loss/crossentropy": 2.6012675762176514, |
| "loss/hidden": 0.177734375, |
| "loss/logits": 0.04012144356966019, |
| "loss/reg": 24.785186767578125, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 2.0124897956848145, |
| "grad_norm_var": 0.36066606030676096, |
| "learning_rate": 2e-05, |
| "loss": 24.862, |
| "loss/crossentropy": 2.6634442806243896, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.03708156198263168, |
| "loss/reg": 24.668659210205078, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.010775, |
| "grad_norm": 5.937109470367432, |
| "grad_norm_var": 1.1061704228386418, |
| "learning_rate": 2e-05, |
| "loss": 24.7883, |
| "loss/crossentropy": 2.98293137550354, |
| "loss/hidden": 0.1904296875, |
| "loss/logits": 0.04528553783893585, |
| "loss/reg": 24.552602767944336, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.0108, |
| "grad_norm": 2.0511343479156494, |
| "grad_norm_var": 1.098776464532748, |
| "learning_rate": 2e-05, |
| "loss": 24.6483, |
| "loss/crossentropy": 3.482454299926758, |
| "loss/hidden": 0.173828125, |
| "loss/logits": 0.04099034145474434, |
| "loss/reg": 24.43346405029297, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.010825, |
| "grad_norm": 2.0217044353485107, |
| "grad_norm_var": 1.0930003270482167, |
| "learning_rate": 2e-05, |
| "loss": 24.4952, |
| "loss/crossentropy": 2.766650915145874, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.041929375380277634, |
| "loss/reg": 24.31853485107422, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.01085, |
| "grad_norm": 2.1080517768859863, |
| "grad_norm_var": 1.0893460739164103, |
| "learning_rate": 2e-05, |
| "loss": 24.3975, |
| "loss/crossentropy": 2.829714059829712, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.03715629130601883, |
| "loss/reg": 24.20409393310547, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.010875, |
| "grad_norm": 2.590503215789795, |
| "grad_norm_var": 1.0325087297316353, |
| "learning_rate": 2e-05, |
| "loss": 24.2874, |
| "loss/crossentropy": 2.812587261199951, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.03894827514886856, |
| "loss/reg": 24.090213775634766, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.0109, |
| "grad_norm": 2.045022964477539, |
| "grad_norm_var": 1.0480635957117082, |
| "learning_rate": 2e-05, |
| "loss": 24.1827, |
| "loss/crossentropy": 3.086454391479492, |
| "loss/hidden": 0.166015625, |
| "loss/logits": 0.040063485503196716, |
| "loss/reg": 23.976669311523438, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.010925, |
| "grad_norm": 2.0887551307678223, |
| "grad_norm_var": 1.033770641152501, |
| "learning_rate": 2e-05, |
| "loss": 24.0595, |
| "loss/crossentropy": 2.839022636413574, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.042500950396060944, |
| "loss/reg": 23.863662719726562, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.01095, |
| "grad_norm": 2.314814567565918, |
| "grad_norm_var": 1.0035414097905393, |
| "learning_rate": 2e-05, |
| "loss": 23.9523, |
| "loss/crossentropy": 2.9472670555114746, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.044826384633779526, |
| "loss/reg": 23.751253128051758, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.010975, |
| "grad_norm": 3.2724692821502686, |
| "grad_norm_var": 1.0354526746118948, |
| "learning_rate": 2e-05, |
| "loss": 23.853, |
| "loss/crossentropy": 2.7853267192840576, |
| "loss/hidden": 0.173828125, |
| "loss/logits": 0.03991552069783211, |
| "loss/reg": 23.63922882080078, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 1.8264756202697754, |
| "grad_norm_var": 1.0666913011185395, |
| "learning_rate": 2e-05, |
| "loss": 23.7176, |
| "loss/crossentropy": 2.5802700519561768, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.03871052712202072, |
| "loss/reg": 23.527509689331055, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.011025, |
| "grad_norm": 2.1698477268218994, |
| "grad_norm_var": 1.043588037234136, |
| "learning_rate": 2e-05, |
| "loss": 23.6232, |
| "loss/crossentropy": 2.9302892684936523, |
| "loss/hidden": 0.1630859375, |
| "loss/logits": 0.043618083000183105, |
| "loss/reg": 23.41650390625, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.01105, |
| "grad_norm": 2.231228828430176, |
| "grad_norm_var": 1.0485661946787095, |
| "learning_rate": 2e-05, |
| "loss": 23.4972, |
| "loss/crossentropy": 2.867483615875244, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.04387916252017021, |
| "loss/reg": 23.30193328857422, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.011075, |
| "grad_norm": 1.5965532064437866, |
| "grad_norm_var": 1.0103300653968241, |
| "learning_rate": 2e-05, |
| "loss": 23.3734, |
| "loss/crossentropy": 2.6820480823516846, |
| "loss/hidden": 0.1455078125, |
| "loss/logits": 0.03568604588508606, |
| "loss/reg": 23.192241668701172, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.0111, |
| "grad_norm": 1.9236520528793335, |
| "grad_norm_var": 1.0234956986676196, |
| "learning_rate": 2e-05, |
| "loss": 23.299, |
| "loss/crossentropy": 2.791132688522339, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.04304938763380051, |
| "loss/reg": 23.083120346069336, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.011125, |
| "grad_norm": 2.615415096282959, |
| "grad_norm_var": 1.0238631693360027, |
| "learning_rate": 2e-05, |
| "loss": 23.2208, |
| "loss/crossentropy": 2.7200093269348145, |
| "loss/hidden": 0.20703125, |
| "loss/logits": 0.039411187171936035, |
| "loss/reg": 22.974393844604492, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.01115, |
| "grad_norm": 3.7206993103027344, |
| "grad_norm_var": 1.11220864186432, |
| "learning_rate": 2e-05, |
| "loss": 23.1272, |
| "loss/crossentropy": 2.934048652648926, |
| "loss/hidden": 0.2109375, |
| "loss/logits": 0.050051335245370865, |
| "loss/reg": 22.866172790527344, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.011175, |
| "grad_norm": 2.2452712059020996, |
| "grad_norm_var": 0.2879587549997645, |
| "learning_rate": 2e-05, |
| "loss": 22.9826, |
| "loss/crossentropy": 3.0915427207946777, |
| "loss/hidden": 0.177734375, |
| "loss/logits": 0.04651949554681778, |
| "loss/reg": 22.75834846496582, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.0112, |
| "grad_norm": 1.6305818557739258, |
| "grad_norm_var": 0.3130432844059186, |
| "learning_rate": 2e-05, |
| "loss": 22.8382, |
| "loss/crossentropy": 2.8177685737609863, |
| "loss/hidden": 0.1484375, |
| "loss/logits": 0.038578279316425323, |
| "loss/reg": 22.65113639831543, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.011225, |
| "grad_norm": 2.833962917327881, |
| "grad_norm_var": 0.3268392463359208, |
| "learning_rate": 2e-05, |
| "loss": 22.7767, |
| "loss/crossentropy": 2.2847812175750732, |
| "loss/hidden": 0.189453125, |
| "loss/logits": 0.04296421259641647, |
| "loss/reg": 22.544269561767578, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 1.8361362218856812, |
| "grad_norm_var": 0.33935606993982687, |
| "learning_rate": 2e-05, |
| "loss": 22.621, |
| "loss/crossentropy": 3.0569963455200195, |
| "loss/hidden": 0.138671875, |
| "loss/logits": 0.044218260794878006, |
| "loss/reg": 22.43810272216797, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.011275, |
| "grad_norm": 2.2452023029327393, |
| "grad_norm_var": 0.3338401600601277, |
| "learning_rate": 2e-05, |
| "loss": 22.5931, |
| "loss/crossentropy": 2.7638001441955566, |
| "loss/hidden": 0.21484375, |
| "loss/logits": 0.046010829508304596, |
| "loss/reg": 22.332237243652344, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.0113, |
| "grad_norm": 2.50168776512146, |
| "grad_norm_var": 0.3321248705423902, |
| "learning_rate": 2e-05, |
| "loss": 22.4375, |
| "loss/crossentropy": 2.8486430644989014, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.04485378414392471, |
| "loss/reg": 22.22277069091797, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.011325, |
| "grad_norm": 2.492642402648926, |
| "grad_norm_var": 0.3300935987394846, |
| "learning_rate": 2e-05, |
| "loss": 22.3492, |
| "loss/crossentropy": 2.71463942527771, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.05168076232075691, |
| "loss/reg": 22.117813110351562, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.01135, |
| "grad_norm": 1.9064103364944458, |
| "grad_norm_var": 0.341946302980373, |
| "learning_rate": 2e-05, |
| "loss": 22.2137, |
| "loss/crossentropy": 2.8734917640686035, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.042219504714012146, |
| "loss/reg": 22.013267517089844, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.011375, |
| "grad_norm": 2.5244898796081543, |
| "grad_norm_var": 0.2814757407362549, |
| "learning_rate": 2e-05, |
| "loss": 22.1323, |
| "loss/crossentropy": 2.5757384300231934, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.043334104120731354, |
| "loss/reg": 21.90932273864746, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.0114, |
| "grad_norm": 1.49411940574646, |
| "grad_norm_var": 0.3079792610992302, |
| "learning_rate": 2e-05, |
| "loss": 21.9885, |
| "loss/crossentropy": 2.7647297382354736, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.04113258421421051, |
| "loss/reg": 21.80577278137207, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.011425, |
| "grad_norm": 2.4946882724761963, |
| "grad_norm_var": 0.31118967972119144, |
| "learning_rate": 2e-05, |
| "loss": 21.9178, |
| "loss/crossentropy": 2.653733968734741, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.04473033547401428, |
| "loss/reg": 21.7031192779541, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.01145, |
| "grad_norm": 1.9149430990219116, |
| "grad_norm_var": 0.31900516012573277, |
| "learning_rate": 2e-05, |
| "loss": 21.825, |
| "loss/crossentropy": 2.6428985595703125, |
| "loss/hidden": 0.1826171875, |
| "loss/logits": 0.04166660085320473, |
| "loss/reg": 21.600685119628906, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.011475, |
| "grad_norm": 1.7886494398117065, |
| "grad_norm_var": 0.3046125382910664, |
| "learning_rate": 2e-05, |
| "loss": 21.7068, |
| "loss/crossentropy": 2.5072784423828125, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.03796003758907318, |
| "loss/reg": 21.498947143554688, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 2.5755653381347656, |
| "grad_norm_var": 0.30189205483048126, |
| "learning_rate": 2e-05, |
| "loss": 21.6383, |
| "loss/crossentropy": 2.8383917808532715, |
| "loss/hidden": 0.19140625, |
| "loss/logits": 0.0494539812207222, |
| "loss/reg": 21.397401809692383, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.011525, |
| "grad_norm": 3.0548369884490967, |
| "grad_norm_var": 0.33236538038720104, |
| "learning_rate": 2e-05, |
| "loss": 21.4977, |
| "loss/crossentropy": 2.717653751373291, |
| "loss/hidden": 0.1630859375, |
| "loss/logits": 0.03829260170459747, |
| "loss/reg": 21.29631805419922, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.01155, |
| "grad_norm": 2.220721483230591, |
| "grad_norm_var": 0.19459906037036465, |
| "learning_rate": 2e-05, |
| "loss": 21.4131, |
| "loss/crossentropy": 2.909445285797119, |
| "loss/hidden": 0.177734375, |
| "loss/logits": 0.04297621548175812, |
| "loss/reg": 21.192386627197266, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.011575, |
| "grad_norm": 2.5778627395629883, |
| "grad_norm_var": 0.2019683654886343, |
| "learning_rate": 2e-05, |
| "loss": 21.3024, |
| "loss/crossentropy": 2.7236719131469727, |
| "loss/hidden": 0.1650390625, |
| "loss/logits": 0.044891439378261566, |
| "loss/reg": 21.092477798461914, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.0116, |
| "grad_norm": 2.302924156188965, |
| "grad_norm_var": 0.17417472367764025, |
| "learning_rate": 2e-05, |
| "loss": 21.2079, |
| "loss/crossentropy": 2.661893129348755, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.042085111141204834, |
| "loss/reg": 20.99297523498535, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.011625, |
| "grad_norm": 3.1231939792633057, |
| "grad_norm_var": 0.2000796962515011, |
| "learning_rate": 2e-05, |
| "loss": 21.158, |
| "loss/crossentropy": 2.9192662239074707, |
| "loss/hidden": 0.1923828125, |
| "loss/logits": 0.07160253822803497, |
| "loss/reg": 20.89402961730957, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.01165, |
| "grad_norm": 2.0219297409057617, |
| "grad_norm_var": 0.19035271984780966, |
| "learning_rate": 2e-05, |
| "loss": 21.0286, |
| "loss/crossentropy": 3.011322021484375, |
| "loss/hidden": 0.1826171875, |
| "loss/logits": 0.05049794167280197, |
| "loss/reg": 20.7955265045166, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.011675, |
| "grad_norm": 2.3812074661254883, |
| "grad_norm_var": 0.1900165697599144, |
| "learning_rate": 2e-05, |
| "loss": 20.9126, |
| "loss/crossentropy": 2.594330072402954, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.04238292574882507, |
| "loss/reg": 20.69733428955078, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.0117, |
| "grad_norm": 3.212700128555298, |
| "grad_norm_var": 0.23732095982581564, |
| "learning_rate": 2e-05, |
| "loss": 20.8329, |
| "loss/crossentropy": 3.0930912494659424, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.05329480022192001, |
| "loss/reg": 20.599882125854492, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.011725, |
| "grad_norm": 2.802736759185791, |
| "grad_norm_var": 0.24797037852165235, |
| "learning_rate": 2e-05, |
| "loss": 20.7412, |
| "loss/crossentropy": 2.682535409927368, |
| "loss/hidden": 0.1865234375, |
| "loss/logits": 0.05204048752784729, |
| "loss/reg": 20.502676010131836, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 6.470301151275635, |
| "grad_norm_var": 1.249545399419666, |
| "learning_rate": 2e-05, |
| "loss": 20.6501, |
| "loss/crossentropy": 3.1839163303375244, |
| "loss/hidden": 0.1923828125, |
| "loss/logits": 0.052045367658138275, |
| "loss/reg": 20.405675888061523, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.011775, |
| "grad_norm": 2.094099283218384, |
| "grad_norm_var": 1.2703367134671737, |
| "learning_rate": 2e-05, |
| "loss": 20.5315, |
| "loss/crossentropy": 2.884645462036133, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.0492381826043129, |
| "loss/reg": 20.309377670288086, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.0118, |
| "grad_norm": 2.2063822746276855, |
| "grad_norm_var": 1.1914975389137128, |
| "learning_rate": 2e-05, |
| "loss": 20.4294, |
| "loss/crossentropy": 3.0197482109069824, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.0465642511844635, |
| "loss/reg": 20.20997428894043, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.011825, |
| "grad_norm": 2.1600499153137207, |
| "grad_norm_var": 1.2077763497968268, |
| "learning_rate": 2e-05, |
| "loss": 20.3465, |
| "loss/crossentropy": 2.7459301948547363, |
| "loss/hidden": 0.1845703125, |
| "loss/logits": 0.04744536057114601, |
| "loss/reg": 20.114482879638672, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.01185, |
| "grad_norm": 2.352337121963501, |
| "grad_norm_var": 1.1750134992717125, |
| "learning_rate": 2e-05, |
| "loss": 20.2671, |
| "loss/crossentropy": 3.246368646621704, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.048278048634529114, |
| "loss/reg": 20.019636154174805, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.011875, |
| "grad_norm": 1.7839800119400024, |
| "grad_norm_var": 1.1755879216903617, |
| "learning_rate": 2e-05, |
| "loss": 20.1489, |
| "loss/crossentropy": 2.9967947006225586, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.048982493579387665, |
| "loss/reg": 19.92508888244629, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.0119, |
| "grad_norm": 2.4864652156829834, |
| "grad_norm_var": 1.1776669498268881, |
| "learning_rate": 2e-05, |
| "loss": 20.0598, |
| "loss/crossentropy": 2.60671329498291, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.052169423550367355, |
| "loss/reg": 19.830875396728516, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.011925, |
| "grad_norm": 2.363190174102783, |
| "grad_norm_var": 1.175140638989229, |
| "learning_rate": 2e-05, |
| "loss": 20.0069, |
| "loss/crossentropy": 2.858886480331421, |
| "loss/hidden": 0.220703125, |
| "loss/logits": 0.04880058765411377, |
| "loss/reg": 19.737428665161133, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.01195, |
| "grad_norm": 2.021653652191162, |
| "grad_norm_var": 1.189277020649322, |
| "learning_rate": 2e-05, |
| "loss": 19.8696, |
| "loss/crossentropy": 2.7821409702301025, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.04575974866747856, |
| "loss/reg": 19.64413833618164, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.011975, |
| "grad_norm": 1.9459588527679443, |
| "grad_norm_var": 1.2201059740655162, |
| "learning_rate": 2e-05, |
| "loss": 19.763, |
| "loss/crossentropy": 2.772062301635742, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.041855379939079285, |
| "loss/reg": 19.551227569580078, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 2.5396687984466553, |
| "grad_norm_var": 1.2139767764762164, |
| "learning_rate": 2e-05, |
| "loss": 19.705, |
| "loss/crossentropy": 2.853855848312378, |
| "loss/hidden": 0.197265625, |
| "loss/logits": 0.0488080270588398, |
| "loss/reg": 19.45893096923828, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.012025, |
| "grad_norm": 2.137702226638794, |
| "grad_norm_var": 1.208933842733709, |
| "learning_rate": 2e-05, |
| "loss": 19.6405, |
| "loss/crossentropy": 2.817518949508667, |
| "loss/hidden": 0.224609375, |
| "loss/logits": 0.04868399724364281, |
| "loss/reg": 19.367197036743164, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.01205, |
| "grad_norm": 2.0161988735198975, |
| "grad_norm_var": 1.2093480157874186, |
| "learning_rate": 2e-05, |
| "loss": 19.4942, |
| "loss/crossentropy": 3.041217565536499, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.049482930451631546, |
| "loss/reg": 19.271909713745117, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.012075, |
| "grad_norm": 2.4689245223999023, |
| "grad_norm_var": 1.2077271255452753, |
| "learning_rate": 2e-05, |
| "loss": 19.4371, |
| "loss/crossentropy": 2.8089406490325928, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.05119004100561142, |
| "loss/reg": 19.180824279785156, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.0121, |
| "grad_norm": 2.2316370010375977, |
| "grad_norm_var": 1.1833405153175727, |
| "learning_rate": 2e-05, |
| "loss": 19.3215, |
| "loss/crossentropy": 2.9401211738586426, |
| "loss/hidden": 0.1875, |
| "loss/logits": 0.04366878420114517, |
| "loss/reg": 19.090309143066406, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.012125, |
| "grad_norm": 2.047006607055664, |
| "grad_norm_var": 1.1890429642677636, |
| "learning_rate": 2e-05, |
| "loss": 19.2222, |
| "loss/crossentropy": 2.878319501876831, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.04722724109888077, |
| "loss/reg": 19.000150680541992, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.01215, |
| "grad_norm": 1.9564520120620728, |
| "grad_norm_var": 0.047588535415659786, |
| "learning_rate": 2e-05, |
| "loss": 19.1429, |
| "loss/crossentropy": 2.7814621925354004, |
| "loss/hidden": 0.1845703125, |
| "loss/logits": 0.04784928262233734, |
| "loss/reg": 18.91046142578125, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.012175, |
| "grad_norm": 2.660238265991211, |
| "grad_norm_var": 0.06145858363412131, |
| "learning_rate": 2e-05, |
| "loss": 19.042, |
| "loss/crossentropy": 2.7573916912078857, |
| "loss/hidden": 0.171875, |
| "loss/logits": 0.04883112385869026, |
| "loss/reg": 18.821319580078125, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.0122, |
| "grad_norm": 3.515303611755371, |
| "grad_norm_var": 0.16771224633902548, |
| "learning_rate": 2e-05, |
| "loss": 18.9393, |
| "loss/crossentropy": 2.9051125049591064, |
| "loss/hidden": 0.162109375, |
| "loss/logits": 0.04467906057834625, |
| "loss/reg": 18.73251724243164, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.012225, |
| "grad_norm": 2.64762806892395, |
| "grad_norm_var": 0.17393239434562133, |
| "learning_rate": 2e-05, |
| "loss": 18.9105, |
| "loss/crossentropy": 2.8703958988189697, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.04748106747865677, |
| "loss/reg": 18.644243240356445, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 2.6073079109191895, |
| "grad_norm_var": 0.17897939206579636, |
| "learning_rate": 2e-05, |
| "loss": 18.8279, |
| "loss/crossentropy": 3.221273422241211, |
| "loss/hidden": 0.2158203125, |
| "loss/logits": 0.05585157871246338, |
| "loss/reg": 18.556241989135742, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.012275, |
| "grad_norm": 1.5472153425216675, |
| "grad_norm_var": 0.20001469118204512, |
| "learning_rate": 2e-05, |
| "loss": 18.679, |
| "loss/crossentropy": 2.915916681289673, |
| "loss/hidden": 0.1669921875, |
| "loss/logits": 0.043436288833618164, |
| "loss/reg": 18.468524932861328, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.0123, |
| "grad_norm": 1.7620818614959717, |
| "grad_norm_var": 0.21717040004255922, |
| "learning_rate": 2e-05, |
| "loss": 18.6017, |
| "loss/crossentropy": 2.867325782775879, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.051028743386268616, |
| "loss/reg": 18.377832412719727, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.012325, |
| "grad_norm": 2.20041561126709, |
| "grad_norm_var": 0.21700482333929852, |
| "learning_rate": 2e-05, |
| "loss": 18.5379, |
| "loss/crossentropy": 2.9628453254699707, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.04741261899471283, |
| "loss/reg": 18.291229248046875, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.01235, |
| "grad_norm": 1.9837777614593506, |
| "grad_norm_var": 0.2183440529603916, |
| "learning_rate": 2e-05, |
| "loss": 18.4331, |
| "loss/crossentropy": 2.825575113296509, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.048515524715185165, |
| "loss/reg": 18.204925537109375, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.012375, |
| "grad_norm": 2.4184892177581787, |
| "grad_norm_var": 0.21209012166573088, |
| "learning_rate": 2e-05, |
| "loss": 18.4085, |
| "loss/crossentropy": 2.6464486122131348, |
| "loss/hidden": 0.2451171875, |
| "loss/logits": 0.04423639923334122, |
| "loss/reg": 18.119152069091797, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.0124, |
| "grad_norm": 1.9086065292358398, |
| "grad_norm_var": 0.21649869079500092, |
| "learning_rate": 2e-05, |
| "loss": 18.2784, |
| "loss/crossentropy": 3.1577796936035156, |
| "loss/hidden": 0.1953125, |
| "loss/logits": 0.04937343671917915, |
| "loss/reg": 18.033668518066406, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.012425, |
| "grad_norm": 1.8744006156921387, |
| "grad_norm_var": 0.22501323270567444, |
| "learning_rate": 2e-05, |
| "loss": 18.1669, |
| "loss/crossentropy": 2.6969730854034424, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.0435866042971611, |
| "loss/reg": 17.94854736328125, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.01245, |
| "grad_norm": 3.1248912811279297, |
| "grad_norm_var": 0.26870209982110016, |
| "learning_rate": 2e-05, |
| "loss": 18.1115, |
| "loss/crossentropy": 2.8137989044189453, |
| "loss/hidden": 0.20703125, |
| "loss/logits": 0.04066001996397972, |
| "loss/reg": 17.863773345947266, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.012475, |
| "grad_norm": 3.302873373031616, |
| "grad_norm_var": 0.32987942357099176, |
| "learning_rate": 2e-05, |
| "loss": 18.0577, |
| "loss/crossentropy": 3.0537073612213135, |
| "loss/hidden": 0.2197265625, |
| "loss/logits": 0.05833124369382858, |
| "loss/reg": 17.779624938964844, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 2.7058842182159424, |
| "grad_norm_var": 0.335707609950227, |
| "learning_rate": 2e-05, |
| "loss": 17.982, |
| "loss/crossentropy": 3.436614751815796, |
| "loss/hidden": 0.23046875, |
| "loss/logits": 0.055682770907878876, |
| "loss/reg": 17.69584846496582, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 40000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.227844083712e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|