| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.0625, |
| "eval_steps": 250, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.000125, |
| "grad_norm": 2.4410972595214844, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.3861, |
| "loss/crossentropy": 2.3799004554748535, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.20445775985717773, |
| "loss/reg": 0.00019296666141599417, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00025, |
| "grad_norm": 5.245810508728027, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.8002, |
| "loss/crossentropy": 3.2071006298065186, |
| "loss/hidden": 1.5234375, |
| "loss/logits": 0.27488040924072266, |
| "loss/reg": 0.00019296666141599417, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.000375, |
| "grad_norm": 2.093414545059204, |
| "learning_rate": 3e-06, |
| "loss": 1.1295, |
| "loss/crossentropy": 2.474642753601074, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.14706741273403168, |
| "loss/reg": 0.00019296648679301143, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 2.3313233852386475, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.1829, |
| "loss/crossentropy": 2.6595563888549805, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.15751230716705322, |
| "loss/reg": 0.00019296605023555458, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.000625, |
| "grad_norm": 2.8557937145233154, |
| "learning_rate": 5e-06, |
| "loss": 1.2347, |
| "loss/crossentropy": 2.4212286472320557, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1781189739704132, |
| "loss/reg": 0.00019296558457426727, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 3.0438766479492188, |
| "learning_rate": 6e-06, |
| "loss": 1.3852, |
| "loss/crossentropy": 2.5387792587280273, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.1879885196685791, |
| "loss/reg": 0.00019296500249765813, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.000875, |
| "grad_norm": 2.6279566287994385, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 1.3298, |
| "loss/crossentropy": 2.4866859912872314, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.17939803004264832, |
| "loss/reg": 0.00019296453683637083, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 4.1743645668029785, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.3617, |
| "loss/crossentropy": 2.6065773963928223, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20351174473762512, |
| "loss/reg": 0.00019296388200018555, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.001125, |
| "grad_norm": 2.4077372550964355, |
| "learning_rate": 9e-06, |
| "loss": 1.3601, |
| "loss/crossentropy": 2.5489792823791504, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20188993215560913, |
| "loss/reg": 0.00019296332902740687, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 2.785814046859741, |
| "learning_rate": 1e-05, |
| "loss": 1.3588, |
| "loss/crossentropy": 2.892237424850464, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.19282634556293488, |
| "loss/reg": 0.0001929624268086627, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.001375, |
| "grad_norm": 2.223435163497925, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 1.3473, |
| "loss/crossentropy": 2.6144230365753174, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.20472508668899536, |
| "loss/reg": 0.0001929616992129013, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 2.2602744102478027, |
| "learning_rate": 1.2e-05, |
| "loss": 1.4537, |
| "loss/crossentropy": 2.445138931274414, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.21741002798080444, |
| "loss/reg": 0.0001929609279613942, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.001625, |
| "grad_norm": 2.167941093444824, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 1.3285, |
| "loss/crossentropy": 2.4828319549560547, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18590743839740753, |
| "loss/reg": 0.00019296009850222617, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 3.135204315185547, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 1.2881, |
| "loss/crossentropy": 2.485008955001831, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.16897538304328918, |
| "loss/reg": 0.00019296004029456526, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.001875, |
| "grad_norm": 1.9516435861587524, |
| "learning_rate": 1.5e-05, |
| "loss": 1.3793, |
| "loss/crossentropy": 2.6037957668304443, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.22108706831932068, |
| "loss/reg": 0.0001929590798681602, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 2.4786994457244873, |
| "grad_norm_var": 0.7268455688420417, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.4938, |
| "loss/crossentropy": 2.6338419914245605, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.23409520089626312, |
| "loss/reg": 0.00019295798847451806, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.002125, |
| "grad_norm": 1.935854196548462, |
| "grad_norm_var": 0.7645541886139611, |
| "learning_rate": 1.7000000000000003e-05, |
| "loss": 1.1905, |
| "loss/crossentropy": 2.5458219051361084, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.17296427488327026, |
| "loss/reg": 0.00019295624224469066, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 4.175808906555176, |
| "grad_norm_var": 0.4775368463766857, |
| "learning_rate": 1.8e-05, |
| "loss": 1.4189, |
| "loss/crossentropy": 2.745898485183716, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.1982022523880005, |
| "loss/reg": 0.00019295603851787746, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.002375, |
| "grad_norm": 2.6384661197662354, |
| "grad_norm_var": 0.45452375883730595, |
| "learning_rate": 1.9e-05, |
| "loss": 1.546, |
| "loss/crossentropy": 2.5159096717834473, |
| "loss/hidden": 1.3203125, |
| "loss/logits": 0.22378988564014435, |
| "loss/reg": 0.00019295531092211604, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 1.930158019065857, |
| "grad_norm_var": 0.48428273913252134, |
| "learning_rate": 2e-05, |
| "loss": 1.1151, |
| "loss/crossentropy": 2.6261353492736816, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.1521872580051422, |
| "loss/reg": 0.00019295411766506732, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.002625, |
| "grad_norm": 2.75396728515625, |
| "grad_norm_var": 0.48247025151936307, |
| "learning_rate": 2.1e-05, |
| "loss": 1.4758, |
| "loss/crossentropy": 2.448762893676758, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.21609561145305634, |
| "loss/reg": 0.00019295368110761046, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 2.8601644039154053, |
| "grad_norm_var": 0.475377454219718, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 1.5402, |
| "loss/crossentropy": 2.4616007804870605, |
| "loss/hidden": 1.328125, |
| "loss/logits": 0.21012833714485168, |
| "loss/reg": 0.00019295132369734347, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.002875, |
| "grad_norm": 2.119006395339966, |
| "grad_norm_var": 0.493518604142709, |
| "learning_rate": 2.3000000000000003e-05, |
| "loss": 1.3082, |
| "loss/crossentropy": 2.633119821548462, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.18130342662334442, |
| "loss/reg": 0.0001929480058606714, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 2.163001537322998, |
| "grad_norm_var": 0.3308316653990078, |
| "learning_rate": 2.4e-05, |
| "loss": 1.2387, |
| "loss/crossentropy": 2.494175672531128, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.16641545295715332, |
| "loss/reg": 0.00019294496451038867, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.003125, |
| "grad_norm": 2.0063726902008057, |
| "grad_norm_var": 0.34579458432486, |
| "learning_rate": 2.5e-05, |
| "loss": 1.122, |
| "loss/crossentropy": 2.4526851177215576, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1435042917728424, |
| "loss/reg": 0.00019294198136776686, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 2.8159995079040527, |
| "grad_norm_var": 0.34710604301850645, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 1.4971, |
| "loss/crossentropy": 2.3794209957122803, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.2139449566602707, |
| "loss/reg": 0.00019293944933451712, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.003375, |
| "grad_norm": 4.617008209228516, |
| "grad_norm_var": 0.6245762786757856, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 1.5303, |
| "loss/crossentropy": 2.374497413635254, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.22372832894325256, |
| "loss/reg": 0.0001929359568748623, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 3.328347682952881, |
| "grad_norm_var": 0.6438493937520643, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 1.5591, |
| "loss/crossentropy": 2.732736587524414, |
| "loss/hidden": 1.3203125, |
| "loss/logits": 0.23687124252319336, |
| "loss/reg": 0.00019293361401651055, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.003625, |
| "grad_norm": 2.3474817276000977, |
| "grad_norm_var": 0.6333103119315846, |
| "learning_rate": 2.9e-05, |
| "loss": 1.2983, |
| "loss/crossentropy": 2.3827714920043945, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.20258934795856476, |
| "loss/reg": 0.00019292977231089026, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 2.560765504837036, |
| "grad_norm_var": 0.6208746981103727, |
| "learning_rate": 3e-05, |
| "loss": 1.2558, |
| "loss/crossentropy": 2.5275678634643555, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.16795387864112854, |
| "loss/reg": 0.00019292706565465778, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.003875, |
| "grad_norm": 2.21565580368042, |
| "grad_norm_var": 0.6000257496388314, |
| "learning_rate": 3.1e-05, |
| "loss": 1.1396, |
| "loss/crossentropy": 2.991466999053955, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.13766203820705414, |
| "loss/reg": 0.00019292418437544256, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 3.0804481506347656, |
| "grad_norm_var": 0.6061713539146301, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 1.4502, |
| "loss/crossentropy": 2.5434651374816895, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.21392151713371277, |
| "loss/reg": 0.00019291977514512837, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.004125, |
| "grad_norm": 3.4683053493499756, |
| "grad_norm_var": 0.5923607081005333, |
| "learning_rate": 3.3e-05, |
| "loss": 1.586, |
| "loss/crossentropy": 2.4229018688201904, |
| "loss/hidden": 1.359375, |
| "loss/logits": 0.2246609479188919, |
| "loss/reg": 0.0001929169666254893, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 2.0332539081573486, |
| "grad_norm_var": 0.4912531320085567, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 1.1758, |
| "loss/crossentropy": 2.4137802124023438, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.15825411677360535, |
| "loss/reg": 0.000192913124919869, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.004375, |
| "grad_norm": 2.101600408554077, |
| "grad_norm_var": 0.5125015485684291, |
| "learning_rate": 3.5e-05, |
| "loss": 1.2872, |
| "loss/crossentropy": 2.5844082832336426, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.19150416553020477, |
| "loss/reg": 0.0001929093268699944, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 3.8385353088378906, |
| "grad_norm_var": 0.556932092742056, |
| "learning_rate": 3.6e-05, |
| "loss": 1.1458, |
| "loss/crossentropy": 2.294477701187134, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.15560123324394226, |
| "loss/reg": 0.0001929069694597274, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.004625, |
| "grad_norm": 1.8931406736373901, |
| "grad_norm_var": 0.6050138278151498, |
| "learning_rate": 3.7e-05, |
| "loss": 1.0789, |
| "loss/crossentropy": 2.5690038204193115, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.14334949851036072, |
| "loss/reg": 0.00019290446653030813, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 1.720080018043518, |
| "grad_norm_var": 0.6642705659226048, |
| "learning_rate": 3.8e-05, |
| "loss": 1.2069, |
| "loss/crossentropy": 2.289663791656494, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.158083975315094, |
| "loss/reg": 0.0001929002464748919, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.004875, |
| "grad_norm": 1.7338802814483643, |
| "grad_norm_var": 0.7005152543709433, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 1.0516, |
| "loss/crossentropy": 2.6059696674346924, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.1277628391981125, |
| "loss/reg": 0.00019289416377432644, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 2.491640329360962, |
| "grad_norm_var": 0.6872298635287296, |
| "learning_rate": 4e-05, |
| "loss": 1.4446, |
| "loss/crossentropy": 2.4038562774658203, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.2316935658454895, |
| "loss/reg": 0.0001928891142597422, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.005125, |
| "grad_norm": 2.42004656791687, |
| "grad_norm_var": 0.6629334231954789, |
| "learning_rate": 4.1e-05, |
| "loss": 1.2198, |
| "loss/crossentropy": 2.4321351051330566, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.15540093183517456, |
| "loss/reg": 0.00019288310431875288, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 2.5629024505615234, |
| "grad_norm_var": 0.6618966221430126, |
| "learning_rate": 4.2e-05, |
| "loss": 1.5254, |
| "loss/crossentropy": 2.23368501663208, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.21881133317947388, |
| "loss/reg": 0.0001928747951751575, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.005375, |
| "grad_norm": 2.2626430988311768, |
| "grad_norm_var": 0.3911191161730377, |
| "learning_rate": 4.3e-05, |
| "loss": 1.4362, |
| "loss/crossentropy": 2.2880733013153076, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.22328704595565796, |
| "loss/reg": 0.0001928645942825824, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 2.928407907485962, |
| "grad_norm_var": 0.3571399417370721, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 1.1088, |
| "loss/crossentropy": 2.837071418762207, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.1420312374830246, |
| "loss/reg": 0.00019285624148324132, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.005625, |
| "grad_norm": 2.7014455795288086, |
| "grad_norm_var": 0.35877893903101093, |
| "learning_rate": 4.5e-05, |
| "loss": 1.3821, |
| "loss/crossentropy": 2.2957558631896973, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.18486037850379944, |
| "loss/reg": 0.0001928448909893632, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 1.8899732828140259, |
| "grad_norm_var": 0.38153805228543447, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 1.0258, |
| "loss/crossentropy": 2.3359339237213135, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.11766321957111359, |
| "loss/reg": 0.00019283634901512414, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.005875, |
| "grad_norm": 2.2188658714294434, |
| "grad_norm_var": 0.3814345973993899, |
| "learning_rate": 4.7e-05, |
| "loss": 1.0929, |
| "loss/crossentropy": 2.6772139072418213, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.12616246938705444, |
| "loss/reg": 0.00019282741413917392, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 2.1530601978302, |
| "grad_norm_var": 0.35835352199148307, |
| "learning_rate": 4.8e-05, |
| "loss": 1.2993, |
| "loss/crossentropy": 2.678344249725342, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.18795417249202728, |
| "loss/reg": 0.00019281756249256432, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.006125, |
| "grad_norm": 2.0299501419067383, |
| "grad_norm_var": 0.28299041785001716, |
| "learning_rate": 4.9e-05, |
| "loss": 1.1433, |
| "loss/crossentropy": 2.6076881885528564, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.14920485019683838, |
| "loss/reg": 0.00019280907872598618, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 2.6268253326416016, |
| "grad_norm_var": 0.28301229188597427, |
| "learning_rate": 5e-05, |
| "loss": 1.057, |
| "loss/crossentropy": 2.625675678253174, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.13711729645729065, |
| "loss/reg": 0.00019279817934148014, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.006375, |
| "grad_norm": 1.8832714557647705, |
| "grad_norm_var": 0.29317342698340093, |
| "learning_rate": 5.1000000000000006e-05, |
| "loss": 1.2667, |
| "loss/crossentropy": 2.4522390365600586, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.17878666520118713, |
| "loss/reg": 0.00019278968102298677, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 2.7895314693450928, |
| "grad_norm_var": 0.15160714498306468, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 1.38, |
| "loss/crossentropy": 2.143043041229248, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.15935076773166656, |
| "loss/reg": 0.00019278022227808833, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.006625, |
| "grad_norm": 3.2793004512786865, |
| "grad_norm_var": 0.20221103833160992, |
| "learning_rate": 5.300000000000001e-05, |
| "loss": 1.5103, |
| "loss/crossentropy": 2.9073903560638428, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.2583482563495636, |
| "loss/reg": 0.00019277248065918684, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 2.013625144958496, |
| "grad_norm_var": 0.18271730407283251, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 1.0809, |
| "loss/crossentropy": 2.50508975982666, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.1336328089237213, |
| "loss/reg": 0.00019276094099041075, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.006875, |
| "grad_norm": 2.584568738937378, |
| "grad_norm_var": 0.15533136257730426, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 1.2027, |
| "loss/crossentropy": 2.4901885986328125, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1539323329925537, |
| "loss/reg": 0.00019275395607110113, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 2.0262043476104736, |
| "grad_norm_var": 0.16487505994896826, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 1.3553, |
| "loss/crossentropy": 2.299118757247925, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.20495867729187012, |
| "loss/reg": 0.0001927466510096565, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.007125, |
| "grad_norm": 2.4767560958862305, |
| "grad_norm_var": 0.16524151904890463, |
| "learning_rate": 5.6999999999999996e-05, |
| "loss": 1.2841, |
| "loss/crossentropy": 2.5054197311401367, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1650199145078659, |
| "loss/reg": 0.00019273380166850984, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 1.8864933252334595, |
| "grad_norm_var": 0.17929933439751622, |
| "learning_rate": 5.8e-05, |
| "loss": 1.0831, |
| "loss/crossentropy": 2.6685423851013184, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.1241319477558136, |
| "loss/reg": 0.00019271954079158604, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.007375, |
| "grad_norm": 1.927100658416748, |
| "grad_norm_var": 0.19066639705673746, |
| "learning_rate": 5.9e-05, |
| "loss": 1.2188, |
| "loss/crossentropy": 2.4530575275421143, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.1622183918952942, |
| "loss/reg": 0.0001927079283632338, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 1.9683936834335327, |
| "grad_norm_var": 0.17275381294839126, |
| "learning_rate": 6e-05, |
| "loss": 1.1956, |
| "loss/crossentropy": 2.699939250946045, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.15460126101970673, |
| "loss/reg": 0.00019269227050244808, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.007625, |
| "grad_norm": 2.430546998977661, |
| "grad_norm_var": 0.1620622944705805, |
| "learning_rate": 6.1e-05, |
| "loss": 1.2826, |
| "loss/crossentropy": 2.2341370582580566, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.16349473595619202, |
| "loss/reg": 0.0001926780241774395, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 2.28157377243042, |
| "grad_norm_var": 0.15224653123686924, |
| "learning_rate": 6.2e-05, |
| "loss": 1.0961, |
| "loss/crossentropy": 2.3603291511535645, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.14104235172271729, |
| "loss/reg": 0.00019266277377028018, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.007875, |
| "grad_norm": 1.9362468719482422, |
| "grad_norm_var": 0.1597685683605616, |
| "learning_rate": 6.3e-05, |
| "loss": 1.1251, |
| "loss/crossentropy": 2.6113460063934326, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.1387864351272583, |
| "loss/reg": 0.00019264982256572694, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 3.4527783393859863, |
| "grad_norm_var": 0.245370177212871, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 1.5699, |
| "loss/crossentropy": 2.3186392784118652, |
| "loss/hidden": 1.3359375, |
| "loss/logits": 0.2319989800453186, |
| "loss/reg": 0.00019263311696704477, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.008125, |
| "grad_norm": 2.002469062805176, |
| "grad_norm_var": 0.24658852169075185, |
| "learning_rate": 6.500000000000001e-05, |
| "loss": 1.2089, |
| "loss/crossentropy": 2.1945531368255615, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1600569784641266, |
| "loss/reg": 0.0001926190307131037, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 1.8942357301712036, |
| "grad_norm_var": 0.25288209179575216, |
| "learning_rate": 6.6e-05, |
| "loss": 1.1301, |
| "loss/crossentropy": 2.3987529277801514, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.1477484107017517, |
| "loss/reg": 0.0001926012773765251, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.008375, |
| "grad_norm": 2.9109678268432617, |
| "grad_norm_var": 0.2615059196420252, |
| "learning_rate": 6.7e-05, |
| "loss": 1.2255, |
| "loss/crossentropy": 2.5337891578674316, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.16106070578098297, |
| "loss/reg": 0.00019258313113823533, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 2.1384077072143555, |
| "grad_norm_var": 0.25126003810038394, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 1.3278, |
| "loss/crossentropy": 2.577103614807129, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.20084746181964874, |
| "loss/reg": 0.00019256297673564404, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.008625, |
| "grad_norm": 2.1597537994384766, |
| "grad_norm_var": 0.18723560405016518, |
| "learning_rate": 6.9e-05, |
| "loss": 1.1538, |
| "loss/crossentropy": 2.6212854385375977, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.15191976726055145, |
| "loss/reg": 0.00019254189101047814, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 1.8894274234771729, |
| "grad_norm_var": 0.19220724163081773, |
| "learning_rate": 7e-05, |
| "loss": 1.1419, |
| "loss/crossentropy": 2.439669132232666, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.17508690059185028, |
| "loss/reg": 0.00019252618949394673, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.008875, |
| "grad_norm": 1.9923198223114014, |
| "grad_norm_var": 0.1875417585129336, |
| "learning_rate": 7.1e-05, |
| "loss": 1.159, |
| "loss/crossentropy": 2.593405246734619, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.16099245846271515, |
| "loss/reg": 0.00019250869809184223, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 2.1258373260498047, |
| "grad_norm_var": 0.18570921033151055, |
| "learning_rate": 7.2e-05, |
| "loss": 1.2886, |
| "loss/crossentropy": 2.31851863861084, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.16945742070674896, |
| "loss/reg": 0.00019249116303399205, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.009125, |
| "grad_norm": 2.2625362873077393, |
| "grad_norm_var": 0.1811603588675789, |
| "learning_rate": 7.3e-05, |
| "loss": 1.0782, |
| "loss/crossentropy": 2.4330179691314697, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.14271824061870575, |
| "loss/reg": 0.0001924755924846977, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 2.4275050163269043, |
| "grad_norm_var": 0.17657254479349263, |
| "learning_rate": 7.4e-05, |
| "loss": 1.2584, |
| "loss/crossentropy": 2.5500473976135254, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.186170294880867, |
| "loss/reg": 0.0001924607204273343, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.009375, |
| "grad_norm": 2.333041191101074, |
| "grad_norm_var": 0.17007094778421958, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 1.0767, |
| "loss/crossentropy": 2.6195261478424072, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.12556512653827667, |
| "loss/reg": 0.0001924492244143039, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 2.6031246185302734, |
| "grad_norm_var": 0.1703287548027977, |
| "learning_rate": 7.6e-05, |
| "loss": 1.2179, |
| "loss/crossentropy": 2.488292932510376, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.1925877332687378, |
| "loss/reg": 0.000192438907106407, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.009625, |
| "grad_norm": 2.139622211456299, |
| "grad_norm_var": 0.17065351345722057, |
| "learning_rate": 7.7e-05, |
| "loss": 1.0607, |
| "loss/crossentropy": 2.934234619140625, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.1368916928768158, |
| "loss/reg": 0.0001924296229844913, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 1.9406142234802246, |
| "grad_norm_var": 0.17804626450119793, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 1.2752, |
| "loss/crossentropy": 2.2727177143096924, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.17168399691581726, |
| "loss/reg": 0.0001924206007970497, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.009875, |
| "grad_norm": 3.055265426635742, |
| "grad_norm_var": 0.20754827159903322, |
| "learning_rate": 7.900000000000001e-05, |
| "loss": 1.1607, |
| "loss/crossentropy": 2.6755881309509277, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.15880770981311798, |
| "loss/reg": 0.00019241031259298325, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 2.129188060760498, |
| "grad_norm_var": 0.11942340663251176, |
| "learning_rate": 8e-05, |
| "loss": 1.2684, |
| "loss/crossentropy": 2.5891458988189697, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.18836861848831177, |
| "loss/reg": 0.00019238927052356303, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.010125, |
| "grad_norm": 3.2390594482421875, |
| "grad_norm_var": 0.17413858607061497, |
| "learning_rate": 8.1e-05, |
| "loss": 2.3912, |
| "loss/crossentropy": 2.566899061203003, |
| "loss/hidden": 1.7265625, |
| "loss/logits": 0.6627247333526611, |
| "loss/reg": 0.00019236840307712555, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 2.6307613849639893, |
| "grad_norm_var": 0.16548936874203926, |
| "learning_rate": 8.2e-05, |
| "loss": 1.2744, |
| "loss/crossentropy": 1.8344066143035889, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.1318967193365097, |
| "loss/reg": 0.00019235462241340429, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.010375, |
| "grad_norm": 3.545522928237915, |
| "grad_norm_var": 0.23612178547081322, |
| "learning_rate": 8.3e-05, |
| "loss": 1.2902, |
| "loss/crossentropy": 2.7840187549591064, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1788717359304428, |
| "loss/reg": 0.00019232937484048307, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 2.2964553833007812, |
| "grad_norm_var": 0.23189123641267292, |
| "learning_rate": 8.4e-05, |
| "loss": 1.4269, |
| "loss/crossentropy": 2.6213274002075195, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.19838152825832367, |
| "loss/reg": 0.00019230577163398266, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.010625, |
| "grad_norm": 2.2252533435821533, |
| "grad_norm_var": 0.229859261969087, |
| "learning_rate": 8.5e-05, |
| "loss": 1.1939, |
| "loss/crossentropy": 2.493159055709839, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.16855394840240479, |
| "loss/reg": 0.00019228595192544162, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 2.2857768535614014, |
| "grad_norm_var": 0.21125701567141184, |
| "learning_rate": 8.6e-05, |
| "loss": 1.2406, |
| "loss/crossentropy": 2.486485719680786, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1917887181043625, |
| "loss/reg": 0.00019227097800467163, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.010875, |
| "grad_norm": 4.509547710418701, |
| "grad_norm_var": 0.4530040889277906, |
| "learning_rate": 8.7e-05, |
| "loss": 1.1073, |
| "loss/crossentropy": 2.630571126937866, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.14057384431362152, |
| "loss/reg": 0.00019225555297452956, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 2.0168137550354004, |
| "grad_norm_var": 0.4607750991685954, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 1.053, |
| "loss/crossentropy": 2.3603971004486084, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.13705039024353027, |
| "loss/reg": 0.0001922310038935393, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.011125, |
| "grad_norm": 2.73807430267334, |
| "grad_norm_var": 0.45335285375272844, |
| "learning_rate": 8.900000000000001e-05, |
| "loss": 1.3178, |
| "loss/crossentropy": 2.93936824798584, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.19865703582763672, |
| "loss/reg": 0.00019220533431507647, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 2.305682420730591, |
| "grad_norm_var": 0.4576056958578417, |
| "learning_rate": 9e-05, |
| "loss": 1.0605, |
| "loss/crossentropy": 2.4470906257629395, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.13670536875724792, |
| "loss/reg": 0.00019217943190596998, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.011375, |
| "grad_norm": 2.333709239959717, |
| "grad_norm_var": 0.45757975254874145, |
| "learning_rate": 9.1e-05, |
| "loss": 1.1485, |
| "loss/crossentropy": 2.3914265632629395, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.14661094546318054, |
| "loss/reg": 0.00019215639622416347, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 3.1051077842712402, |
| "grad_norm_var": 0.4718879306887779, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 1.1946, |
| "loss/crossentropy": 2.3768973350524902, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1458442062139511, |
| "loss/reg": 0.00019213555788155645, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.011625, |
| "grad_norm": 1.9250227212905884, |
| "grad_norm_var": 0.48954230695473144, |
| "learning_rate": 9.300000000000001e-05, |
| "loss": 1.0554, |
| "loss/crossentropy": 2.565377712249756, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.13942840695381165, |
| "loss/reg": 0.00019211515609640628, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 2.176602840423584, |
| "grad_norm_var": 0.4709343827101543, |
| "learning_rate": 9.4e-05, |
| "loss": 1.1256, |
| "loss/crossentropy": 2.4452078342437744, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1470910608768463, |
| "loss/reg": 0.00019209457968827337, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.011875, |
| "grad_norm": 3.14528489112854, |
| "grad_norm_var": 0.4762166867826877, |
| "learning_rate": 9.5e-05, |
| "loss": 1.6721, |
| "loss/crossentropy": 2.0026628971099854, |
| "loss/hidden": 1.421875, |
| "loss/logits": 0.24830523133277893, |
| "loss/reg": 0.0001920641807373613, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 2.5958688259124756, |
| "grad_norm_var": 0.4566131842781544, |
| "learning_rate": 9.6e-05, |
| "loss": 1.2013, |
| "loss/crossentropy": 2.620640993118286, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.1759262979030609, |
| "loss/reg": 0.00019204463751520962, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.012125, |
| "grad_norm": 2.195146322250366, |
| "grad_norm_var": 0.4486006387079304, |
| "learning_rate": 9.7e-05, |
| "loss": 1.1398, |
| "loss/crossentropy": 2.547090768814087, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.16133451461791992, |
| "loss/reg": 0.0001920342183439061, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 2.252614974975586, |
| "grad_norm_var": 0.4573438457489293, |
| "learning_rate": 9.8e-05, |
| "loss": 1.1583, |
| "loss/crossentropy": 2.4780287742614746, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.15635529160499573, |
| "loss/reg": 0.00019200837414246053, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.012375, |
| "grad_norm": 2.421096086502075, |
| "grad_norm_var": 0.3951004366779675, |
| "learning_rate": 9.900000000000001e-05, |
| "loss": 1.1038, |
| "loss/crossentropy": 2.2537543773651123, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.13705003261566162, |
| "loss/reg": 0.0001919834321597591, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 2.174814462661743, |
| "grad_norm_var": 0.3998617443443601, |
| "learning_rate": 0.0001, |
| "loss": 1.1691, |
| "loss/crossentropy": 2.458591938018799, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.17106780409812927, |
| "loss/reg": 0.0001919578353408724, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.012625, |
| "grad_norm": 2.622562885284424, |
| "grad_norm_var": 0.39382746835866644, |
| "learning_rate": 0.0001, |
| "loss": 1.1769, |
| "loss/crossentropy": 2.5160534381866455, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.16712725162506104, |
| "loss/reg": 0.00019193820480722934, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.01275, |
| "grad_norm": 2.24092960357666, |
| "grad_norm_var": 0.3955345231673798, |
| "learning_rate": 0.0001, |
| "loss": 1.2354, |
| "loss/crossentropy": 2.1451447010040283, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.1553465873003006, |
| "loss/reg": 0.00019191819592379034, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.012875, |
| "grad_norm": 2.344604730606079, |
| "grad_norm_var": 0.12208757192350952, |
| "learning_rate": 0.0001, |
| "loss": 1.4627, |
| "loss/crossentropy": 2.641226053237915, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.2576836943626404, |
| "loss/reg": 0.00019189789600204676, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 2.6583104133605957, |
| "grad_norm_var": 0.1139956751841869, |
| "learning_rate": 0.0001, |
| "loss": 1.2235, |
| "loss/crossentropy": 2.2702269554138184, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1746675968170166, |
| "loss/reg": 0.00019186925783287734, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.013125, |
| "grad_norm": 2.452503204345703, |
| "grad_norm_var": 0.10820816494096158, |
| "learning_rate": 0.0001, |
| "loss": 1.2496, |
| "loss/crossentropy": 2.344975709915161, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1617823839187622, |
| "loss/reg": 0.000191839542821981, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.01325, |
| "grad_norm": 2.3220486640930176, |
| "grad_norm_var": 0.1079440961702573, |
| "learning_rate": 0.0001, |
| "loss": 1.3184, |
| "loss/crossentropy": 2.382570505142212, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.1758350431919098, |
| "loss/reg": 0.00019180966774001718, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.013375, |
| "grad_norm": 2.275015115737915, |
| "grad_norm_var": 0.1089551443983727, |
| "learning_rate": 0.0001, |
| "loss": 1.1398, |
| "loss/crossentropy": 2.288180112838745, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1613542139530182, |
| "loss/reg": 0.00019177970534656197, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 2.281081199645996, |
| "grad_norm_var": 0.0774087174098435, |
| "learning_rate": 0.0001, |
| "loss": 1.1199, |
| "loss/crossentropy": 2.566871404647827, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.14925265312194824, |
| "loss/reg": 0.00019174529006704688, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.013625, |
| "grad_norm": 2.281104564666748, |
| "grad_norm_var": 0.06372173379328742, |
| "learning_rate": 0.0001, |
| "loss": 1.1916, |
| "loss/crossentropy": 2.335575580596924, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1740630865097046, |
| "loss/reg": 0.00019171558960806578, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 3.2676873207092285, |
| "grad_norm_var": 0.10526650532868657, |
| "learning_rate": 0.0001, |
| "loss": 1.4065, |
| "loss/crossentropy": 2.33091402053833, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.2248753160238266, |
| "loss/reg": 0.00019168361905030906, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.013875, |
| "grad_norm": 2.448396921157837, |
| "grad_norm_var": 0.07293540299188876, |
| "learning_rate": 0.0001, |
| "loss": 1.1487, |
| "loss/crossentropy": 2.759523391723633, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.1585177183151245, |
| "loss/reg": 0.0001916515757329762, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 2.745311975479126, |
| "grad_norm_var": 0.07769384665264147, |
| "learning_rate": 0.0001, |
| "loss": 1.3054, |
| "loss/crossentropy": 2.6752800941467285, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.17071780562400818, |
| "loss/reg": 0.00019161647651344538, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.014125, |
| "grad_norm": 6.7866058349609375, |
| "grad_norm_var": 1.2475617279497555, |
| "learning_rate": 0.0001, |
| "loss": 1.527, |
| "loss/crossentropy": 2.536259174346924, |
| "loss/hidden": 1.328125, |
| "loss/logits": 0.19696056842803955, |
| "loss/reg": 0.0001915794564411044, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.01425, |
| "grad_norm": 2.572187900543213, |
| "grad_norm_var": 1.2338838698080574, |
| "learning_rate": 0.0001, |
| "loss": 1.0705, |
| "loss/crossentropy": 3.121675968170166, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.1427587866783142, |
| "loss/reg": 0.0001915483589982614, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.014375, |
| "grad_norm": 2.3879568576812744, |
| "grad_norm_var": 1.2353765898385585, |
| "learning_rate": 0.0001, |
| "loss": 1.1787, |
| "loss/crossentropy": 2.59924578666687, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.16120225191116333, |
| "loss/reg": 0.0001915154862217605, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 2.207024097442627, |
| "grad_norm_var": 1.2330085058190583, |
| "learning_rate": 0.0001, |
| "loss": 1.1257, |
| "loss/crossentropy": 2.3122026920318604, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.15894125401973724, |
| "loss/reg": 0.0001914859312819317, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.014625, |
| "grad_norm": 2.5943527221679688, |
| "grad_norm_var": 1.233512504208524, |
| "learning_rate": 0.0001, |
| "loss": 1.0578, |
| "loss/crossentropy": 2.4962315559387207, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.12624415755271912, |
| "loss/reg": 0.0001914564927574247, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.01475, |
| "grad_norm": 2.5308315753936768, |
| "grad_norm_var": 1.2194136468208996, |
| "learning_rate": 0.0001, |
| "loss": 1.3378, |
| "loss/crossentropy": 2.2406933307647705, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.17962361872196198, |
| "loss/reg": 0.00019143095414619893, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.014875, |
| "grad_norm": 3.8317177295684814, |
| "grad_norm_var": 1.2753290966219148, |
| "learning_rate": 0.0001, |
| "loss": 1.5298, |
| "loss/crossentropy": 2.7381229400634766, |
| "loss/hidden": 1.296875, |
| "loss/logits": 0.2309999018907547, |
| "loss/reg": 0.00019139735377393663, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 2.552171468734741, |
| "grad_norm_var": 1.2787832219082094, |
| "learning_rate": 0.0001, |
| "loss": 1.1698, |
| "loss/crossentropy": 2.678725481033325, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.16004806756973267, |
| "loss/reg": 0.0001913599990075454, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.015125, |
| "grad_norm": 2.930230140686035, |
| "grad_norm_var": 1.2679826365318356, |
| "learning_rate": 0.0001, |
| "loss": 1.3195, |
| "loss/crossentropy": 2.5161564350128174, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.21604904532432556, |
| "loss/reg": 0.00019133626483380795, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.01525, |
| "grad_norm": 2.184554100036621, |
| "grad_norm_var": 1.2793169490082976, |
| "learning_rate": 0.0001, |
| "loss": 1.3258, |
| "loss/crossentropy": 2.1544501781463623, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18321675062179565, |
| "loss/reg": 0.0001913021260406822, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.015375, |
| "grad_norm": 2.3296663761138916, |
| "grad_norm_var": 1.2751879992777064, |
| "learning_rate": 0.0001, |
| "loss": 1.2465, |
| "loss/crossentropy": 2.3694839477539062, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.16649125516414642, |
| "loss/reg": 0.00019128025451209396, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 2.683415174484253, |
| "grad_norm_var": 1.2536762853317933, |
| "learning_rate": 0.0001, |
| "loss": 1.3152, |
| "loss/crossentropy": 2.480109930038452, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1961439996957779, |
| "loss/reg": 0.00019124921527691185, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.015625, |
| "grad_norm": 2.2008211612701416, |
| "grad_norm_var": 1.2606593807518112, |
| "learning_rate": 0.0001, |
| "loss": 1.2683, |
| "loss/crossentropy": 2.3611254692077637, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.17260757088661194, |
| "loss/reg": 0.00019121899094898254, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.01575, |
| "grad_norm": 2.5672714710235596, |
| "grad_norm_var": 1.2561244980458308, |
| "learning_rate": 0.0001, |
| "loss": 1.1256, |
| "loss/crossentropy": 2.4654054641723633, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.1510714888572693, |
| "loss/reg": 0.00019119179341942072, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.015875, |
| "grad_norm": 5.104769706726074, |
| "grad_norm_var": 1.5559544106053274, |
| "learning_rate": 0.0001, |
| "loss": 1.7662, |
| "loss/crossentropy": 2.8167941570281982, |
| "loss/hidden": 1.46875, |
| "loss/logits": 0.2955778241157532, |
| "loss/reg": 0.00019115611212328076, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 2.780869483947754, |
| "grad_norm_var": 1.5547640591921663, |
| "learning_rate": 0.0001, |
| "loss": 1.3813, |
| "loss/crossentropy": 2.4478983879089355, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.19183939695358276, |
| "loss/reg": 0.00019112625159323215, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.016125, |
| "grad_norm": 2.753885269165039, |
| "grad_norm_var": 0.5433630068432194, |
| "learning_rate": 0.0001, |
| "loss": 1.3723, |
| "loss/crossentropy": 2.6112186908721924, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.2141554355621338, |
| "loss/reg": 0.0001910965656861663, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 2.9186818599700928, |
| "grad_norm_var": 0.5420405140966994, |
| "learning_rate": 0.0001, |
| "loss": 1.4122, |
| "loss/crossentropy": 2.4329094886779785, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.199338898062706, |
| "loss/reg": 0.00019106207764707506, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.016375, |
| "grad_norm": 3.3282015323638916, |
| "grad_norm_var": 0.5475325270303432, |
| "learning_rate": 0.0001, |
| "loss": 1.3236, |
| "loss/crossentropy": 2.808492660522461, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.20454144477844238, |
| "loss/reg": 0.00019102977239526808, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 2.5251357555389404, |
| "grad_norm_var": 0.5268546307130834, |
| "learning_rate": 0.0001, |
| "loss": 1.0806, |
| "loss/crossentropy": 2.606909990310669, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.1372692584991455, |
| "loss/reg": 0.00019099873316008598, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.016625, |
| "grad_norm": 2.695373773574829, |
| "grad_norm_var": 0.5238667023797914, |
| "learning_rate": 0.0001, |
| "loss": 1.1349, |
| "loss/crossentropy": 2.5870134830474854, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.16037797927856445, |
| "loss/reg": 0.00019096150936093181, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.01675, |
| "grad_norm": 2.967905282974243, |
| "grad_norm_var": 0.5160494986535296, |
| "learning_rate": 0.0001, |
| "loss": 1.3901, |
| "loss/crossentropy": 2.3568673133850098, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.20067735016345978, |
| "loss/reg": 0.0001909265120048076, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.016875, |
| "grad_norm": 2.248267412185669, |
| "grad_norm_var": 0.4754480378524304, |
| "learning_rate": 0.0001, |
| "loss": 1.2149, |
| "loss/crossentropy": 2.513349771499634, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1660783886909485, |
| "loss/reg": 0.00019089688430540264, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 2.6341240406036377, |
| "grad_norm_var": 0.4731794320985157, |
| "learning_rate": 0.0001, |
| "loss": 1.4788, |
| "loss/crossentropy": 2.307194709777832, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.23469889163970947, |
| "loss/reg": 0.00019086015527136624, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.017125, |
| "grad_norm": 26.939998626708984, |
| "grad_norm_var": 36.90875808790156, |
| "learning_rate": 0.0001, |
| "loss": 1.9603, |
| "loss/crossentropy": 2.4554264545440674, |
| "loss/hidden": 1.7265625, |
| "loss/logits": 0.23182228207588196, |
| "loss/reg": 0.00019083071674685925, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.01725, |
| "grad_norm": 2.5422139167785645, |
| "grad_norm_var": 36.81568419391185, |
| "learning_rate": 0.0001, |
| "loss": 1.0683, |
| "loss/crossentropy": 2.4824135303497314, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.14447075128555298, |
| "loss/reg": 0.00019080075435340405, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.017375, |
| "grad_norm": 8.87843132019043, |
| "grad_norm_var": 37.75269230148678, |
| "learning_rate": 0.0001, |
| "loss": 2.5968, |
| "loss/crossentropy": 3.632392406463623, |
| "loss/hidden": 2.0, |
| "loss/logits": 0.594857931137085, |
| "loss/reg": 0.0001907727710204199, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 2.6119391918182373, |
| "grad_norm_var": 37.77256905325698, |
| "learning_rate": 0.0001, |
| "loss": 1.2831, |
| "loss/crossentropy": 2.694307804107666, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.19521979987621307, |
| "loss/reg": 0.00019074990996159613, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.017625, |
| "grad_norm": 2.630040407180786, |
| "grad_norm_var": 37.63927642256101, |
| "learning_rate": 0.0001, |
| "loss": 1.2854, |
| "loss/crossentropy": 2.7011289596557617, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.17411190271377563, |
| "loss/reg": 0.0001907304977066815, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.01775, |
| "grad_norm": 2.6480321884155273, |
| "grad_norm_var": 37.616094691169124, |
| "learning_rate": 0.0001, |
| "loss": 1.5096, |
| "loss/crossentropy": 2.260585069656372, |
| "loss/hidden": 1.2890625, |
| "loss/logits": 0.218610018491745, |
| "loss/reg": 0.00019069462723564357, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.017875, |
| "grad_norm": 6.334838390350342, |
| "grad_norm_var": 37.766716198289636, |
| "learning_rate": 0.0001, |
| "loss": 1.4905, |
| "loss/crossentropy": 2.702542543411255, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.22292816638946533, |
| "loss/reg": 0.00019065497326664627, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 2.5124268531799316, |
| "grad_norm_var": 37.84491654864667, |
| "learning_rate": 0.0001, |
| "loss": 1.4406, |
| "loss/crossentropy": 2.3640494346618652, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.20432478189468384, |
| "loss/reg": 0.00019061053171753883, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.018125, |
| "grad_norm": 2.5983355045318604, |
| "grad_norm_var": 37.889344095265606, |
| "learning_rate": 0.0001, |
| "loss": 1.2585, |
| "loss/crossentropy": 2.386502265930176, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1472403109073639, |
| "loss/reg": 0.0001905607496155426, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.01825, |
| "grad_norm": 2.3283042907714844, |
| "grad_norm_var": 38.06027251189639, |
| "learning_rate": 0.0001, |
| "loss": 1.0838, |
| "loss/crossentropy": 2.589846134185791, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.14048275351524353, |
| "loss/reg": 0.00019053251889999956, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.018375, |
| "grad_norm": 3.144895315170288, |
| "grad_norm_var": 38.09776954094634, |
| "learning_rate": 0.0001, |
| "loss": 1.44, |
| "loss/crossentropy": 2.7745416164398193, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.2037464678287506, |
| "loss/reg": 0.00019048065587412566, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 26.648195266723633, |
| "grad_norm_var": 67.26352470044017, |
| "learning_rate": 0.0001, |
| "loss": 1.5186, |
| "loss/crossentropy": 2.64324951171875, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.20418909192085266, |
| "loss/reg": 0.00019045177032239735, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.018625, |
| "grad_norm": 3.2434163093566895, |
| "grad_norm_var": 67.02089246655065, |
| "learning_rate": 0.0001, |
| "loss": 1.4708, |
| "loss/crossentropy": 2.3727641105651855, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.2188955843448639, |
| "loss/reg": 0.0001904223026940599, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 2.80842661857605, |
| "grad_norm_var": 67.09348312744588, |
| "learning_rate": 0.0001, |
| "loss": 1.4085, |
| "loss/crossentropy": 2.262383460998535, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.18781118094921112, |
| "loss/reg": 0.0001903773081721738, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.018875, |
| "grad_norm": 2.683870553970337, |
| "grad_norm_var": 66.87019083886788, |
| "learning_rate": 0.0001, |
| "loss": 1.2521, |
| "loss/crossentropy": 2.5074141025543213, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.16421037912368774, |
| "loss/reg": 0.00019033256103284657, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 2.229814291000366, |
| "grad_norm_var": 67.0793329518605, |
| "learning_rate": 0.0001, |
| "loss": 1.1402, |
| "loss/crossentropy": 2.541126012802124, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.1382928341627121, |
| "loss/reg": 0.00019028309907298535, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.019125, |
| "grad_norm": 3.54891300201416, |
| "grad_norm_var": 36.90022117788916, |
| "learning_rate": 0.0001, |
| "loss": 1.6011, |
| "loss/crossentropy": 2.307600975036621, |
| "loss/hidden": 1.3671875, |
| "loss/logits": 0.23196424543857574, |
| "loss/reg": 0.000190236751222983, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.01925, |
| "grad_norm": 2.360466718673706, |
| "grad_norm_var": 36.95789528091398, |
| "learning_rate": 0.0001, |
| "loss": 1.1651, |
| "loss/crossentropy": 2.8502321243286133, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.16714423894882202, |
| "loss/reg": 0.00019020496984012425, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.019375, |
| "grad_norm": 3.1953799724578857, |
| "grad_norm_var": 35.90550452702474, |
| "learning_rate": 0.0001, |
| "loss": 1.0914, |
| "loss/crossentropy": 2.809048652648926, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.14417339861392975, |
| "loss/reg": 0.00019016550504602492, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 2.6717631816864014, |
| "grad_norm_var": 35.890903690685896, |
| "learning_rate": 0.0001, |
| "loss": 1.2632, |
| "loss/crossentropy": 2.7067086696624756, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.21447248756885529, |
| "loss/reg": 0.00019013263226952404, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.019625, |
| "grad_norm": 2.3536930084228516, |
| "grad_norm_var": 35.96362699082104, |
| "learning_rate": 0.0001, |
| "loss": 1.1418, |
| "loss/crossentropy": 2.5986528396606445, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.15946394205093384, |
| "loss/reg": 0.00019009722745977342, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.01975, |
| "grad_norm": 2.0697736740112305, |
| "grad_norm_var": 36.1239934744858, |
| "learning_rate": 0.0001, |
| "loss": 1.1946, |
| "loss/crossentropy": 2.4766931533813477, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.16143161058425903, |
| "loss/reg": 0.0001900633069453761, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.019875, |
| "grad_norm": 2.4079439640045166, |
| "grad_norm_var": 36.08560176253508, |
| "learning_rate": 0.0001, |
| "loss": 1.2239, |
| "loss/crossentropy": 2.5432755947113037, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.18291215598583221, |
| "loss/reg": 0.00019003944180440158, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 1.9956291913986206, |
| "grad_norm_var": 36.21688030379835, |
| "learning_rate": 0.0001, |
| "loss": 1.2381, |
| "loss/crossentropy": 2.6434080600738525, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1736569106578827, |
| "loss/reg": 0.0001900054921861738, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.020125, |
| "grad_norm": 2.3660035133361816, |
| "grad_norm_var": 36.268105524765524, |
| "learning_rate": 0.0001, |
| "loss": 1.3402, |
| "loss/crossentropy": 2.7457640171051025, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19771495461463928, |
| "loss/reg": 0.00018996208382304758, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.02025, |
| "grad_norm": 2.2741127014160156, |
| "grad_norm_var": 36.28129668661176, |
| "learning_rate": 0.0001, |
| "loss": 1.1122, |
| "loss/crossentropy": 2.694679021835327, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.16500751674175262, |
| "loss/reg": 0.00018992851255461574, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.020375, |
| "grad_norm": 4.299161911010742, |
| "grad_norm_var": 36.213705020452686, |
| "learning_rate": 0.0001, |
| "loss": 1.5997, |
| "loss/crossentropy": 2.41153621673584, |
| "loss/hidden": 1.3515625, |
| "loss/logits": 0.24626833200454712, |
| "loss/reg": 0.00018988759256899357, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 2.6989567279815674, |
| "grad_norm_var": 0.37062173740110177, |
| "learning_rate": 0.0001, |
| "loss": 1.4101, |
| "loss/crossentropy": 2.6252450942993164, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.2285255789756775, |
| "loss/reg": 0.00018985987117048353, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.020625, |
| "grad_norm": 3.6266448497772217, |
| "grad_norm_var": 0.40754436908909636, |
| "learning_rate": 0.0001, |
| "loss": 1.3041, |
| "loss/crossentropy": 2.768099308013916, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.18500390648841858, |
| "loss/reg": 0.00018983366317115724, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.02075, |
| "grad_norm": 3.0785396099090576, |
| "grad_norm_var": 0.4151303111429129, |
| "learning_rate": 0.0001, |
| "loss": 1.4192, |
| "loss/crossentropy": 2.621722459793091, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.21420907974243164, |
| "loss/reg": 0.00018979469314217567, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.020875, |
| "grad_norm": 2.837559938430786, |
| "grad_norm_var": 0.4154299188334017, |
| "learning_rate": 0.0001, |
| "loss": 1.3458, |
| "loss/crossentropy": 2.430216073989868, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.17987295985221863, |
| "loss/reg": 0.00018976339197251946, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 2.8974390029907227, |
| "grad_norm_var": 0.396902541608811, |
| "learning_rate": 0.0001, |
| "loss": 1.2952, |
| "loss/crossentropy": 2.8876166343688965, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.19960089027881622, |
| "loss/reg": 0.00018972392717842013, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.021125, |
| "grad_norm": 2.533341646194458, |
| "grad_norm_var": 0.3589553633283116, |
| "learning_rate": 0.0001, |
| "loss": 1.2505, |
| "loss/crossentropy": 2.7849295139312744, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.17050783336162567, |
| "loss/reg": 0.0001896892354125157, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 2.5643837451934814, |
| "grad_norm_var": 0.35153012514085774, |
| "learning_rate": 0.0001, |
| "loss": 1.4785, |
| "loss/crossentropy": 2.497753381729126, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.2422376275062561, |
| "loss/reg": 0.00018963789625559002, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.021375, |
| "grad_norm": 2.6845455169677734, |
| "grad_norm_var": 0.3369522102595737, |
| "learning_rate": 0.0001, |
| "loss": 1.2292, |
| "loss/crossentropy": 2.5292482376098633, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.18040287494659424, |
| "loss/reg": 0.00018960374291054904, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 2.3379464149475098, |
| "grad_norm_var": 0.3456172785279794, |
| "learning_rate": 0.0001, |
| "loss": 1.3626, |
| "loss/crossentropy": 2.5603489875793457, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20441675186157227, |
| "loss/reg": 0.00018956881831400096, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.021625, |
| "grad_norm": 2.594144582748413, |
| "grad_norm_var": 0.33847746883165203, |
| "learning_rate": 0.0001, |
| "loss": 1.2032, |
| "loss/crossentropy": 2.6486992835998535, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.17790886759757996, |
| "loss/reg": 0.00018953454855363816, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.02175, |
| "grad_norm": 2.9685492515563965, |
| "grad_norm_var": 0.3129452666235446, |
| "learning_rate": 0.0001, |
| "loss": 1.4619, |
| "loss/crossentropy": 2.455124855041504, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.21000754833221436, |
| "loss/reg": 0.0001895053283078596, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.021875, |
| "grad_norm": 2.533724308013916, |
| "grad_norm_var": 0.3080246907592235, |
| "learning_rate": 0.0001, |
| "loss": 1.3506, |
| "loss/crossentropy": 2.4606072902679443, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.1768466681241989, |
| "loss/reg": 0.0001894600281957537, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 2.629793643951416, |
| "grad_norm_var": 0.2678377821192363, |
| "learning_rate": 0.0001, |
| "loss": 1.3554, |
| "loss/crossentropy": 2.50494647026062, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.2129271924495697, |
| "loss/reg": 0.0001894147862913087, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.022125, |
| "grad_norm": 2.426023006439209, |
| "grad_norm_var": 0.2645273844934496, |
| "learning_rate": 0.0001, |
| "loss": 1.2205, |
| "loss/crossentropy": 2.4932215213775635, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1717696189880371, |
| "loss/reg": 0.0001893793960334733, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.02225, |
| "grad_norm": 12.438695907592773, |
| "grad_norm_var": 5.993566887433531, |
| "learning_rate": 0.0001, |
| "loss": 1.4933, |
| "loss/crossentropy": 2.6111667156219482, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.17892810702323914, |
| "loss/reg": 0.00018933985847979784, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.022375, |
| "grad_norm": 2.2316975593566895, |
| "grad_norm_var": 6.025764924701703, |
| "learning_rate": 0.0001, |
| "loss": 1.2337, |
| "loss/crossentropy": 2.452789306640625, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.19278019666671753, |
| "loss/reg": 0.0001893048029160127, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 2.285240650177002, |
| "grad_norm_var": 6.070589505634923, |
| "learning_rate": 0.0001, |
| "loss": 1.3731, |
| "loss/crossentropy": 2.4663660526275635, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.18373644351959229, |
| "loss/reg": 0.0001892724831122905, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.022625, |
| "grad_norm": 2.4717960357666016, |
| "grad_norm_var": 6.102379780965068, |
| "learning_rate": 0.0001, |
| "loss": 1.1602, |
| "loss/crossentropy": 2.7595460414886475, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.16613003611564636, |
| "loss/reg": 0.00018924209871329367, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.02275, |
| "grad_norm": 2.4991321563720703, |
| "grad_norm_var": 6.134258503662537, |
| "learning_rate": 0.0001, |
| "loss": 1.1269, |
| "loss/crossentropy": 2.270359516143799, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.14847519993782043, |
| "loss/reg": 0.00018921452283393592, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.022875, |
| "grad_norm": 2.3847415447235107, |
| "grad_norm_var": 6.167952691299849, |
| "learning_rate": 0.0001, |
| "loss": 1.2154, |
| "loss/crossentropy": 2.49910831451416, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.158855140209198, |
| "loss/reg": 0.00018917533452622592, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 2.5306761264801025, |
| "grad_norm_var": 6.188958706490437, |
| "learning_rate": 0.0001, |
| "loss": 1.4351, |
| "loss/crossentropy": 2.3558568954467773, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.2066374272108078, |
| "loss/reg": 0.00018914024985861033, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.023125, |
| "grad_norm": 2.2277228832244873, |
| "grad_norm_var": 6.219197407448101, |
| "learning_rate": 0.0001, |
| "loss": 1.0945, |
| "loss/crossentropy": 2.6959574222564697, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.1550855189561844, |
| "loss/reg": 0.00018909583741333336, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 3.3319711685180664, |
| "grad_norm_var": 6.199868483198406, |
| "learning_rate": 0.0001, |
| "loss": 1.2316, |
| "loss/crossentropy": 2.3768630027770996, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.15154394507408142, |
| "loss/reg": 0.00018906217883341014, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.023375, |
| "grad_norm": 2.5634615421295166, |
| "grad_norm_var": 6.208477354320166, |
| "learning_rate": 0.0001, |
| "loss": 1.2211, |
| "loss/crossentropy": 2.679447650909424, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.18797855079174042, |
| "loss/reg": 0.00018902822921518236, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 2.460848093032837, |
| "grad_norm_var": 6.196057718240725, |
| "learning_rate": 0.0001, |
| "loss": 1.2703, |
| "loss/crossentropy": 2.6185357570648193, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.18242943286895752, |
| "loss/reg": 0.000188985766726546, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.023625, |
| "grad_norm": 2.464927911758423, |
| "grad_norm_var": 6.206869955671459, |
| "learning_rate": 0.0001, |
| "loss": 1.1848, |
| "loss/crossentropy": 2.4472947120666504, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.16724231839179993, |
| "loss/reg": 0.00018895140965469182, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 2.6658761501312256, |
| "grad_norm_var": 6.220041941034168, |
| "learning_rate": 0.0001, |
| "loss": 1.2639, |
| "loss/crossentropy": 2.3520965576171875, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1682794690132141, |
| "loss/reg": 0.0001889086706796661, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.023875, |
| "grad_norm": 2.450937509536743, |
| "grad_norm_var": 6.227097887980031, |
| "learning_rate": 0.0001, |
| "loss": 1.3416, |
| "loss/crossentropy": 2.4653656482696533, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.19132134318351746, |
| "loss/reg": 0.00018886124598793685, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 2.5486769676208496, |
| "grad_norm_var": 6.232908017729262, |
| "learning_rate": 0.0001, |
| "loss": 1.408, |
| "loss/crossentropy": 2.4828147888183594, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.22640305757522583, |
| "loss/reg": 0.00018882614676840603, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.024125, |
| "grad_norm": 2.733919858932495, |
| "grad_norm_var": 6.210183098557901, |
| "learning_rate": 0.0001, |
| "loss": 1.3418, |
| "loss/crossentropy": 2.7277894020080566, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.17583458125591278, |
| "loss/reg": 0.00018878061382565647, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 4.0347490310668945, |
| "grad_norm_var": 0.20841963510717557, |
| "learning_rate": 0.0001, |
| "loss": 1.2188, |
| "loss/crossentropy": 2.671088933944702, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.16220757365226746, |
| "loss/reg": 0.00018873742374125868, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.024375, |
| "grad_norm": 2.1080739498138428, |
| "grad_norm_var": 0.21574061631260597, |
| "learning_rate": 0.0001, |
| "loss": 1.0983, |
| "loss/crossentropy": 2.465228319168091, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.1432487666606903, |
| "loss/reg": 0.00018869389896281064, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 3.3122878074645996, |
| "grad_norm_var": 0.23717126048259918, |
| "learning_rate": 0.0001, |
| "loss": 1.4535, |
| "loss/crossentropy": 2.543299913406372, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.2094225287437439, |
| "loss/reg": 0.00018865260062739253, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.024625, |
| "grad_norm": 3.8393805027008057, |
| "grad_norm_var": 0.3171273295177985, |
| "learning_rate": 0.0001, |
| "loss": 1.4855, |
| "loss/crossentropy": 2.7230522632598877, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.25702908635139465, |
| "loss/reg": 0.00018861188436858356, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 2.3128864765167236, |
| "grad_norm_var": 0.32576930180668173, |
| "learning_rate": 0.0001, |
| "loss": 1.2876, |
| "loss/crossentropy": 2.470360517501831, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.19192585349082947, |
| "loss/reg": 0.0001885706151369959, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.024875, |
| "grad_norm": 2.222193956375122, |
| "grad_norm_var": 0.33529781396605357, |
| "learning_rate": 0.0001, |
| "loss": 1.2726, |
| "loss/crossentropy": 2.4833052158355713, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.1925581395626068, |
| "loss/reg": 0.00018852519860956818, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 2.148888111114502, |
| "grad_norm_var": 0.35496365745480013, |
| "learning_rate": 0.0001, |
| "loss": 1.1089, |
| "loss/crossentropy": 2.445056915283203, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.14994922280311584, |
| "loss/reg": 0.00018848305626306683, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.025125, |
| "grad_norm": 3.009246587753296, |
| "grad_norm_var": 0.34244750319667006, |
| "learning_rate": 0.0001, |
| "loss": 1.2831, |
| "loss/crossentropy": 2.5913355350494385, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.18746912479400635, |
| "loss/reg": 0.00018844057922251523, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.02525, |
| "grad_norm": 2.6826765537261963, |
| "grad_norm_var": 0.31954091153954683, |
| "learning_rate": 0.0001, |
| "loss": 1.2207, |
| "loss/crossentropy": 2.2942862510681152, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.14849132299423218, |
| "loss/reg": 0.00018840315169654787, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.025375, |
| "grad_norm": 2.089634418487549, |
| "grad_norm_var": 0.34361665903956173, |
| "learning_rate": 0.0001, |
| "loss": 1.2152, |
| "loss/crossentropy": 2.565154790878296, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.17425358295440674, |
| "loss/reg": 0.00018837135576177388, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 2.1764748096466064, |
| "grad_norm_var": 0.35746666647812214, |
| "learning_rate": 0.0001, |
| "loss": 1.3562, |
| "loss/crossentropy": 2.24812912940979, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.21368308365345, |
| "loss/reg": 0.00018833015928976238, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.025625, |
| "grad_norm": 2.8530831336975098, |
| "grad_norm_var": 0.35600843248713165, |
| "learning_rate": 0.0001, |
| "loss": 1.475, |
| "loss/crossentropy": 2.0817508697509766, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.23092345893383026, |
| "loss/reg": 0.00018830793851520866, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.02575, |
| "grad_norm": 2.663485050201416, |
| "grad_norm_var": 0.3560194494934476, |
| "learning_rate": 0.0001, |
| "loss": 1.1357, |
| "loss/crossentropy": 2.7369191646575928, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.1611793339252472, |
| "loss/reg": 0.0001882883079815656, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.025875, |
| "grad_norm": 2.7267813682556152, |
| "grad_norm_var": 0.35164556437612193, |
| "learning_rate": 0.0001, |
| "loss": 1.4018, |
| "loss/crossentropy": 2.521235704421997, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.21241608262062073, |
| "loss/reg": 0.0001882474316516891, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 3.3730874061584473, |
| "grad_norm_var": 0.37568723584622227, |
| "learning_rate": 0.0001, |
| "loss": 1.6168, |
| "loss/crossentropy": 2.67789626121521, |
| "loss/hidden": 1.375, |
| "loss/logits": 0.2398722767829895, |
| "loss/reg": 0.0001882062351796776, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.026125, |
| "grad_norm": 2.58087158203125, |
| "grad_norm_var": 0.37784520807643934, |
| "learning_rate": 0.0001, |
| "loss": 1.4245, |
| "loss/crossentropy": 2.64697003364563, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.2038642168045044, |
| "loss/reg": 0.0001881623174995184, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 2.222271680831909, |
| "grad_norm_var": 0.27470612970490343, |
| "learning_rate": 0.0001, |
| "loss": 1.3277, |
| "loss/crossentropy": 2.5434417724609375, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18515345454216003, |
| "loss/reg": 0.00018812257621902972, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.026375, |
| "grad_norm": 1.9148452281951904, |
| "grad_norm_var": 0.29087511560338725, |
| "learning_rate": 0.0001, |
| "loss": 1.097, |
| "loss/crossentropy": 2.769158363342285, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.14593489468097687, |
| "loss/reg": 0.0001880890631582588, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 2.2211508750915527, |
| "grad_norm_var": 0.26646107901443655, |
| "learning_rate": 0.0001, |
| "loss": 1.3446, |
| "loss/crossentropy": 2.4616775512695312, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.19431401789188385, |
| "loss/reg": 0.00018806415027938783, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.026625, |
| "grad_norm": 2.763000965118408, |
| "grad_norm_var": 0.15595023444909822, |
| "learning_rate": 0.0001, |
| "loss": 1.3695, |
| "loss/crossentropy": 2.598646640777588, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.22699865698814392, |
| "loss/reg": 0.00018803446437232196, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.02675, |
| "grad_norm": 2.321552276611328, |
| "grad_norm_var": 0.15574157634795635, |
| "learning_rate": 0.0001, |
| "loss": 1.3229, |
| "loss/crossentropy": 2.574349880218506, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18040841817855835, |
| "loss/reg": 0.00018799355893861502, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.026875, |
| "grad_norm": 2.252286195755005, |
| "grad_norm_var": 0.15469124462205552, |
| "learning_rate": 0.0001, |
| "loss": 1.2042, |
| "loss/crossentropy": 2.391897439956665, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.17103314399719238, |
| "loss/reg": 0.00018794478091876954, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 2.9572079181671143, |
| "grad_norm_var": 0.15769059669121527, |
| "learning_rate": 0.0001, |
| "loss": 1.4114, |
| "loss/crossentropy": 2.4063875675201416, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.1986089050769806, |
| "loss/reg": 0.00018789219029713422, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.027125, |
| "grad_norm": 2.3973214626312256, |
| "grad_norm_var": 0.14366297343363474, |
| "learning_rate": 0.0001, |
| "loss": 1.2443, |
| "loss/crossentropy": 2.5074877738952637, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1798933446407318, |
| "loss/reg": 0.00018785115389619023, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.02725, |
| "grad_norm": 3.122760534286499, |
| "grad_norm_var": 0.16576884575759981, |
| "learning_rate": 0.0001, |
| "loss": 1.3388, |
| "loss/crossentropy": 2.428725242614746, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19626876711845398, |
| "loss/reg": 0.00018780773098114878, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.027375, |
| "grad_norm": 2.321108341217041, |
| "grad_norm_var": 0.15522596127473184, |
| "learning_rate": 0.0001, |
| "loss": 1.2505, |
| "loss/crossentropy": 2.409878969192505, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.2017081379890442, |
| "loss/reg": 0.00018775733769871294, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 2.365797281265259, |
| "grad_norm_var": 0.14793109297261797, |
| "learning_rate": 0.0001, |
| "loss": 1.1702, |
| "loss/crossentropy": 2.588578224182129, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.15268871188163757, |
| "loss/reg": 0.00018771766917780042, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.027625, |
| "grad_norm": 2.1108896732330322, |
| "grad_norm_var": 0.153953573032076, |
| "learning_rate": 0.0001, |
| "loss": 1.255, |
| "loss/crossentropy": 2.527043581008911, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.16717128455638885, |
| "loss/reg": 0.00018766772700473666, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.02775, |
| "grad_norm": 2.5774085521698, |
| "grad_norm_var": 0.15276588289228218, |
| "learning_rate": 0.0001, |
| "loss": 1.2479, |
| "loss/crossentropy": 2.713797092437744, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1835174411535263, |
| "loss/reg": 0.00018762832041829824, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.027875, |
| "grad_norm": 2.253450393676758, |
| "grad_norm_var": 0.1533568435494087, |
| "learning_rate": 0.0001, |
| "loss": 1.4177, |
| "loss/crossentropy": 2.6281967163085938, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.22053499519824982, |
| "loss/reg": 0.0001875831076176837, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 2.454873561859131, |
| "grad_norm_var": 0.09728623528139314, |
| "learning_rate": 0.0001, |
| "loss": 1.4309, |
| "loss/crossentropy": 2.572938919067383, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.21031853556632996, |
| "loss/reg": 0.00018754607299342752, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.028125, |
| "grad_norm": 2.402453899383545, |
| "grad_norm_var": 0.0956224663481746, |
| "learning_rate": 0.0001, |
| "loss": 1.2624, |
| "loss/crossentropy": 2.50075101852417, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.17460693418979645, |
| "loss/reg": 0.0001875244197435677, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.02825, |
| "grad_norm": 1.8176156282424927, |
| "grad_norm_var": 0.1163170905904891, |
| "learning_rate": 0.0001, |
| "loss": 1.203, |
| "loss/crossentropy": 2.2213640213012695, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.16985741257667542, |
| "loss/reg": 0.00018748667207546532, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.028375, |
| "grad_norm": 2.240485906600952, |
| "grad_norm_var": 0.10227683752628369, |
| "learning_rate": 0.0001, |
| "loss": 1.2129, |
| "loss/crossentropy": 2.7936031818389893, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.17974941432476044, |
| "loss/reg": 0.00018747476860880852, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 2.478255033493042, |
| "grad_norm_var": 0.09989290718763275, |
| "learning_rate": 0.0001, |
| "loss": 1.3474, |
| "loss/crossentropy": 2.237700939178467, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.18926523625850677, |
| "loss/reg": 0.00018743629334494472, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.028625, |
| "grad_norm": 2.2553858757019043, |
| "grad_norm_var": 0.09327515190840353, |
| "learning_rate": 0.0001, |
| "loss": 1.302, |
| "loss/crossentropy": 2.5399932861328125, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1829543262720108, |
| "loss/reg": 0.0001873974542832002, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 2.5666871070861816, |
| "grad_norm_var": 0.09461214816090115, |
| "learning_rate": 0.0001, |
| "loss": 1.1668, |
| "loss/crossentropy": 2.5342864990234375, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.16493582725524902, |
| "loss/reg": 0.0001873665169114247, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.028875, |
| "grad_norm": 2.3809237480163574, |
| "grad_norm_var": 0.09292632453379727, |
| "learning_rate": 0.0001, |
| "loss": 1.2757, |
| "loss/crossentropy": 2.565258026123047, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.2034991830587387, |
| "loss/reg": 0.00018733744218479842, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 11.484387397766113, |
| "grad_norm_var": 5.249492807099011, |
| "learning_rate": 0.0001, |
| "loss": 1.2536, |
| "loss/crossentropy": 2.610589027404785, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.15799307823181152, |
| "loss/reg": 0.00018729745352175087, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.029125, |
| "grad_norm": 2.276120901107788, |
| "grad_norm_var": 5.259372334728978, |
| "learning_rate": 0.0001, |
| "loss": 1.2413, |
| "loss/crossentropy": 2.6497323513031006, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.16128413379192352, |
| "loss/reg": 0.0001872599241323769, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.02925, |
| "grad_norm": 2.5592195987701416, |
| "grad_norm_var": 5.265810753770302, |
| "learning_rate": 0.0001, |
| "loss": 1.3953, |
| "loss/crossentropy": 2.6213786602020264, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.22152823209762573, |
| "loss/reg": 0.0001872186257969588, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.029375, |
| "grad_norm": 2.91487455368042, |
| "grad_norm_var": 5.241297695369635, |
| "learning_rate": 0.0001, |
| "loss": 1.4322, |
| "loss/crossentropy": 2.8353567123413086, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.2038136124610901, |
| "loss/reg": 0.0001871798885986209, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 2.8779571056365967, |
| "grad_norm_var": 5.218058981409527, |
| "learning_rate": 0.0001, |
| "loss": 1.1964, |
| "loss/crossentropy": 2.818354606628418, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.16329507529735565, |
| "loss/reg": 0.0001871388085419312, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.029625, |
| "grad_norm": 2.4912526607513428, |
| "grad_norm_var": 5.183116200958803, |
| "learning_rate": 0.0001, |
| "loss": 1.1612, |
| "loss/crossentropy": 2.485246419906616, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.15928974747657776, |
| "loss/reg": 0.0001870945852715522, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.02975, |
| "grad_norm": 2.1257100105285645, |
| "grad_norm_var": 5.221437379820037, |
| "learning_rate": 0.0001, |
| "loss": 1.1011, |
| "loss/crossentropy": 2.5041284561157227, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.1343885362148285, |
| "loss/reg": 0.00018704126705415547, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.029875, |
| "grad_norm": 2.609513998031616, |
| "grad_norm_var": 5.195165909077186, |
| "learning_rate": 0.0001, |
| "loss": 1.2685, |
| "loss/crossentropy": 2.5717175006866455, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1728888750076294, |
| "loss/reg": 0.00018699871725402772, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 3.054771900177002, |
| "grad_norm_var": 5.174376919782516, |
| "learning_rate": 0.0001, |
| "loss": 1.3059, |
| "loss/crossentropy": 2.3587000370025635, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.19463828206062317, |
| "loss/reg": 0.00018695260223466903, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.030125, |
| "grad_norm": 4.069124221801758, |
| "grad_norm_var": 5.207761360833089, |
| "learning_rate": 0.0001, |
| "loss": 1.5327, |
| "loss/crossentropy": 2.519747257232666, |
| "loss/hidden": 1.3046875, |
| "loss/logits": 0.22609561681747437, |
| "loss/reg": 0.00018690834986045957, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.03025, |
| "grad_norm": 2.6919775009155273, |
| "grad_norm_var": 5.101652290115754, |
| "learning_rate": 0.0001, |
| "loss": 1.4243, |
| "loss/crossentropy": 2.5264573097229004, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.21935243904590607, |
| "loss/reg": 0.00018685661780182272, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.030375, |
| "grad_norm": 2.9397759437561035, |
| "grad_norm_var": 5.043470206735567, |
| "learning_rate": 0.0001, |
| "loss": 1.1593, |
| "loss/crossentropy": 2.649984836578369, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.16523152589797974, |
| "loss/reg": 0.00018679779896046966, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 2.597238063812256, |
| "grad_norm_var": 5.032333906433272, |
| "learning_rate": 0.0001, |
| "loss": 1.3144, |
| "loss/crossentropy": 2.7007710933685303, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.20317299664020538, |
| "loss/reg": 0.00018673852900974452, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.030625, |
| "grad_norm": 4.779845237731934, |
| "grad_norm_var": 5.0980686958710555, |
| "learning_rate": 0.0001, |
| "loss": 1.9743, |
| "loss/crossentropy": 2.7656006813049316, |
| "loss/hidden": 1.5625, |
| "loss/logits": 0.4099583327770233, |
| "loss/reg": 0.0001866959355538711, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.03075, |
| "grad_norm": 3.3142945766448975, |
| "grad_norm_var": 5.049814806516395, |
| "learning_rate": 0.0001, |
| "loss": 1.3097, |
| "loss/crossentropy": 2.7472550868988037, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.1828756332397461, |
| "loss/reg": 0.00018664947128854692, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.030875, |
| "grad_norm": 2.6288223266601562, |
| "grad_norm_var": 5.01838753008573, |
| "learning_rate": 0.0001, |
| "loss": 1.2715, |
| "loss/crossentropy": 2.5212960243225098, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.16810457408428192, |
| "loss/reg": 0.00018659804482012987, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 2.1690521240234375, |
| "grad_norm_var": 0.4794672993561553, |
| "learning_rate": 0.0001, |
| "loss": 1.081, |
| "loss/crossentropy": 2.517190933227539, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.13774770498275757, |
| "loss/reg": 0.0001865396770881489, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.031125, |
| "grad_norm": 2.642812490463257, |
| "grad_norm_var": 0.45828649220525125, |
| "learning_rate": 0.0001, |
| "loss": 1.2944, |
| "loss/crossentropy": 2.411734104156494, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.16756606101989746, |
| "loss/reg": 0.00018649944104254246, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 2.553938627243042, |
| "grad_norm_var": 0.4585311039907272, |
| "learning_rate": 0.0001, |
| "loss": 1.1273, |
| "loss/crossentropy": 2.9807546138763428, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1488645374774933, |
| "loss/reg": 0.00018645105592440814, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.031375, |
| "grad_norm": 2.010117292404175, |
| "grad_norm_var": 0.5083579557676783, |
| "learning_rate": 0.0001, |
| "loss": 1.2306, |
| "loss/crossentropy": 2.562990427017212, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.18183979392051697, |
| "loss/reg": 0.00018639408517628908, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 2.712961196899414, |
| "grad_norm_var": 0.5093841749170441, |
| "learning_rate": 0.0001, |
| "loss": 1.4763, |
| "loss/crossentropy": 2.573420524597168, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.21662938594818115, |
| "loss/reg": 0.0001863273064373061, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.031625, |
| "grad_norm": 2.6070237159729004, |
| "grad_norm_var": 0.504885617842933, |
| "learning_rate": 0.0001, |
| "loss": 1.1148, |
| "loss/crossentropy": 2.421952486038208, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.13251450657844543, |
| "loss/reg": 0.00018625622033141553, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.03175, |
| "grad_norm": 2.6277847290039062, |
| "grad_norm_var": 0.4725433925882648, |
| "learning_rate": 0.0001, |
| "loss": 1.3581, |
| "loss/crossentropy": 2.6136345863342285, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.1999516785144806, |
| "loss/reg": 0.00018621186609379947, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.031875, |
| "grad_norm": 4.751771926879883, |
| "grad_norm_var": 0.683379142444816, |
| "learning_rate": 0.0001, |
| "loss": 1.5569, |
| "loss/crossentropy": 2.437589645385742, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.2425535023212433, |
| "loss/reg": 0.00018616624583955854, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 2.601804494857788, |
| "grad_norm_var": 0.693466035829209, |
| "learning_rate": 0.0001, |
| "loss": 1.477, |
| "loss/crossentropy": 2.3009605407714844, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.22516345977783203, |
| "loss/reg": 0.00018612167332321405, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.032125, |
| "grad_norm": 2.594728708267212, |
| "grad_norm_var": 0.6154499118248385, |
| "learning_rate": 0.0001, |
| "loss": 1.3481, |
| "loss/crossentropy": 2.5490729808807373, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.1899745911359787, |
| "loss/reg": 0.0001860785996541381, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.03225, |
| "grad_norm": 1.9762139320373535, |
| "grad_norm_var": 0.666272320547633, |
| "learning_rate": 0.0001, |
| "loss": 1.1268, |
| "loss/crossentropy": 2.6095380783081055, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.15619653463363647, |
| "loss/reg": 0.00018601951887831092, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.032375, |
| "grad_norm": 2.5357439517974854, |
| "grad_norm_var": 0.6713294887447486, |
| "learning_rate": 0.0001, |
| "loss": 1.1937, |
| "loss/crossentropy": 2.7029552459716797, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.1605946123600006, |
| "loss/reg": 0.00018597528105601668, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 2.8872361183166504, |
| "grad_norm_var": 0.6680105601783904, |
| "learning_rate": 0.0001, |
| "loss": 1.1618, |
| "loss/crossentropy": 2.5793302059173584, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.15989942848682404, |
| "loss/reg": 0.00018592287960927933, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.032625, |
| "grad_norm": 2.6888515949249268, |
| "grad_norm_var": 0.39965034448394665, |
| "learning_rate": 0.0001, |
| "loss": 1.1163, |
| "loss/crossentropy": 2.501629114151001, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.15351587533950806, |
| "loss/reg": 0.00018586948863230646, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.03275, |
| "grad_norm": 2.794921398162842, |
| "grad_norm_var": 0.3744163537172878, |
| "learning_rate": 0.0001, |
| "loss": 1.3868, |
| "loss/crossentropy": 2.473355293273926, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.19742509722709656, |
| "loss/reg": 0.00018582609482109547, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.032875, |
| "grad_norm": 11.51866340637207, |
| "grad_norm_var": 5.260212315476766, |
| "learning_rate": 0.0001, |
| "loss": 1.9065, |
| "loss/crossentropy": 2.5873212814331055, |
| "loss/hidden": 1.4765625, |
| "loss/logits": 0.4280346632003784, |
| "loss/reg": 0.00018578370509203523, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 3.3071396350860596, |
| "grad_norm_var": 5.180231931586934, |
| "learning_rate": 0.0001, |
| "loss": 1.237, |
| "loss/crossentropy": 2.6701910495758057, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.17262707650661469, |
| "loss/reg": 0.0001857366441981867, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.033125, |
| "grad_norm": 2.6719601154327393, |
| "grad_norm_var": 5.177728124810292, |
| "learning_rate": 0.0001, |
| "loss": 1.4876, |
| "loss/crossentropy": 2.741184711456299, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.22007906436920166, |
| "loss/reg": 0.00018568705127108842, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.03325, |
| "grad_norm": 4.4371514320373535, |
| "grad_norm_var": 5.211410221157409, |
| "learning_rate": 0.0001, |
| "loss": 1.8731, |
| "loss/crossentropy": 2.4437432289123535, |
| "loss/hidden": 1.6015625, |
| "loss/logits": 0.269656777381897, |
| "loss/reg": 0.00018562644254416227, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.033375, |
| "grad_norm": 3.256206750869751, |
| "grad_norm_var": 5.074168773112569, |
| "learning_rate": 0.0001, |
| "loss": 1.3223, |
| "loss/crossentropy": 2.6366071701049805, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.1875854730606079, |
| "loss/reg": 0.00018558187002781779, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 2.422538995742798, |
| "grad_norm_var": 5.109844600456275, |
| "learning_rate": 0.0001, |
| "loss": 1.3075, |
| "loss/crossentropy": 2.5296108722686768, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.1884431093931198, |
| "loss/reg": 0.00018553413974586874, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.033625, |
| "grad_norm": 4.04595422744751, |
| "grad_norm_var": 5.071768309380559, |
| "learning_rate": 0.0001, |
| "loss": 1.8971, |
| "loss/crossentropy": 2.3316025733947754, |
| "loss/hidden": 1.625, |
| "loss/logits": 0.2702447175979614, |
| "loss/reg": 0.00018548894149716944, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 3.397867441177368, |
| "grad_norm_var": 5.012096554664663, |
| "learning_rate": 0.0001, |
| "loss": 1.43, |
| "loss/crossentropy": 2.2132034301757812, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.19377078115940094, |
| "loss/reg": 0.00018543725309427828, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.033875, |
| "grad_norm": 2.360687494277954, |
| "grad_norm_var": 5.007982625032073, |
| "learning_rate": 0.0001, |
| "loss": 1.2015, |
| "loss/crossentropy": 2.6397757530212402, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.1683761179447174, |
| "loss/reg": 0.00018539318989496678, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 3.795145273208618, |
| "grad_norm_var": 4.95906816389108, |
| "learning_rate": 0.0001, |
| "loss": 1.7341, |
| "loss/crossentropy": 2.2904622554779053, |
| "loss/hidden": 1.5, |
| "loss/logits": 0.23220419883728027, |
| "loss/reg": 0.000185342927579768, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.034125, |
| "grad_norm": 3.19511079788208, |
| "grad_norm_var": 4.905671754488361, |
| "learning_rate": 0.0001, |
| "loss": 1.4552, |
| "loss/crossentropy": 2.4893336296081543, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.20336630940437317, |
| "loss/reg": 0.00018529384396970272, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.03425, |
| "grad_norm": 2.818743944168091, |
| "grad_norm_var": 4.769792764968277, |
| "learning_rate": 0.0001, |
| "loss": 1.1492, |
| "loss/crossentropy": 2.5468082427978516, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.1551104187965393, |
| "loss/reg": 0.00018524785991758108, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.034375, |
| "grad_norm": 2.7294118404388428, |
| "grad_norm_var": 4.743793641432837, |
| "learning_rate": 0.0001, |
| "loss": 1.2939, |
| "loss/crossentropy": 2.7086288928985596, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.18263299763202667, |
| "loss/reg": 0.00018520389858167619, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 4.2614545822143555, |
| "grad_norm_var": 4.722892075276445, |
| "learning_rate": 0.0001, |
| "loss": 2.152, |
| "loss/crossentropy": 2.3208584785461426, |
| "loss/hidden": 1.7578125, |
| "loss/logits": 0.3923119604587555, |
| "loss/reg": 0.0001851657871156931, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.034625, |
| "grad_norm": 3.1579630374908447, |
| "grad_norm_var": 4.671438964356158, |
| "learning_rate": 0.0001, |
| "loss": 1.5186, |
| "loss/crossentropy": 2.52303409576416, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.23553629219532013, |
| "loss/reg": 0.00018512809765525162, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.03475, |
| "grad_norm": 2.9455530643463135, |
| "grad_norm_var": 4.653460522047108, |
| "learning_rate": 0.0001, |
| "loss": 1.2198, |
| "loss/crossentropy": 2.64249324798584, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1554383635520935, |
| "loss/reg": 0.00018509359506424516, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.034875, |
| "grad_norm": 2.588207721710205, |
| "grad_norm_var": 0.4115949243025471, |
| "learning_rate": 0.0001, |
| "loss": 1.3877, |
| "loss/crossentropy": 2.829423427581787, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.2140059471130371, |
| "loss/reg": 0.00018506502965465188, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 3.4081485271453857, |
| "grad_norm_var": 0.41351468625660626, |
| "learning_rate": 0.0001, |
| "loss": 1.436, |
| "loss/crossentropy": 2.624629259109497, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.2076188027858734, |
| "loss/reg": 0.00018502252351026982, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.035125, |
| "grad_norm": 3.1615681648254395, |
| "grad_norm_var": 0.39283411950296265, |
| "learning_rate": 0.0001, |
| "loss": 1.3441, |
| "loss/crossentropy": 2.465009927749634, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.19385108351707458, |
| "loss/reg": 0.00018499059660825878, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.03525, |
| "grad_norm": 3.2208797931671143, |
| "grad_norm_var": 0.2925862508398, |
| "learning_rate": 0.0001, |
| "loss": 1.9994, |
| "loss/crossentropy": 2.230750322341919, |
| "loss/hidden": 1.578125, |
| "loss/logits": 0.41937941312789917, |
| "loss/reg": 0.00018494235700927675, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.035375, |
| "grad_norm": 3.2623984813690186, |
| "grad_norm_var": 0.292657471443624, |
| "learning_rate": 0.0001, |
| "loss": 1.5126, |
| "loss/crossentropy": 2.5379672050476074, |
| "loss/hidden": 1.2734375, |
| "loss/logits": 0.23736032843589783, |
| "loss/reg": 0.00018488496425561607, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 2.368196487426758, |
| "grad_norm_var": 0.2982812772165668, |
| "learning_rate": 0.0001, |
| "loss": 1.3005, |
| "loss/crossentropy": 2.516019344329834, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.18929743766784668, |
| "loss/reg": 0.0001848430110840127, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.035625, |
| "grad_norm": 3.191399335861206, |
| "grad_norm_var": 0.24409669271128276, |
| "learning_rate": 0.0001, |
| "loss": 1.2823, |
| "loss/crossentropy": 2.751084089279175, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.17108574509620667, |
| "loss/reg": 0.00018479586287867278, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.03575, |
| "grad_norm": 2.592566967010498, |
| "grad_norm_var": 0.2544086356402346, |
| "learning_rate": 0.0001, |
| "loss": 1.3514, |
| "loss/crossentropy": 2.5993878841400146, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.23240423202514648, |
| "loss/reg": 0.0001847518578870222, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.035875, |
| "grad_norm": 3.8178586959838867, |
| "grad_norm_var": 0.2500656389811674, |
| "learning_rate": 0.0001, |
| "loss": 1.3704, |
| "loss/crossentropy": 2.688004970550537, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.18104243278503418, |
| "loss/reg": 0.00018470516079105437, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 2.0925023555755615, |
| "grad_norm_var": 0.2864185440912022, |
| "learning_rate": 0.0001, |
| "loss": 1.2565, |
| "loss/crossentropy": 2.4902563095092773, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.16092851758003235, |
| "loss/reg": 0.0001846601371653378, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.036125, |
| "grad_norm": 2.821599245071411, |
| "grad_norm_var": 0.28794847130561624, |
| "learning_rate": 0.0001, |
| "loss": 1.5127, |
| "loss/crossentropy": 2.331846237182617, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.25302040576934814, |
| "loss/reg": 0.00018461896979715675, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 2.387387275695801, |
| "grad_norm_var": 0.31157863588131224, |
| "learning_rate": 0.0001, |
| "loss": 1.2607, |
| "loss/crossentropy": 2.76906681060791, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.18852418661117554, |
| "loss/reg": 0.00018456355610396713, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.036375, |
| "grad_norm": 3.449770450592041, |
| "grad_norm_var": 0.3179789227700856, |
| "learning_rate": 0.0001, |
| "loss": 1.7591, |
| "loss/crossentropy": 1.8213207721710205, |
| "loss/hidden": 1.5390625, |
| "loss/logits": 0.21822357177734375, |
| "loss/reg": 0.00018452556105330586, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 2.499556303024292, |
| "grad_norm_var": 0.2263369840310491, |
| "learning_rate": 0.0001, |
| "loss": 1.2925, |
| "loss/crossentropy": 2.6276936531066895, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.18123921751976013, |
| "loss/reg": 0.0001844725920818746, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.036625, |
| "grad_norm": 2.115734815597534, |
| "grad_norm_var": 0.2632914348592083, |
| "learning_rate": 0.0001, |
| "loss": 1.2089, |
| "loss/crossentropy": 2.6672511100769043, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.1757870316505432, |
| "loss/reg": 0.0001844176003942266, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.03675, |
| "grad_norm": 2.684098958969116, |
| "grad_norm_var": 0.2649372545619843, |
| "learning_rate": 0.0001, |
| "loss": 1.2995, |
| "loss/crossentropy": 2.5663037300109863, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.172622948884964, |
| "loss/reg": 0.00018438571714796126, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.036875, |
| "grad_norm": 7.823673248291016, |
| "grad_norm_var": 1.792621724898212, |
| "learning_rate": 0.0001, |
| "loss": 1.3821, |
| "loss/crossentropy": 2.6714284420013428, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.22403180599212646, |
| "loss/reg": 0.00018433824880048633, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 2.6135120391845703, |
| "grad_norm_var": 1.808029304785425, |
| "learning_rate": 0.0001, |
| "loss": 1.2104, |
| "loss/crossentropy": 2.68971848487854, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16948801279067993, |
| "loss/reg": 0.00018430246564093977, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.037125, |
| "grad_norm": 8.026154518127441, |
| "grad_norm_var": 3.3065969805558173, |
| "learning_rate": 0.0001, |
| "loss": 2.1847, |
| "loss/crossentropy": 2.3800106048583984, |
| "loss/hidden": 1.9453125, |
| "loss/logits": 0.2375316321849823, |
| "loss/reg": 0.0001842674391809851, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.03725, |
| "grad_norm": 2.896569013595581, |
| "grad_norm_var": 3.3224491377570446, |
| "learning_rate": 0.0001, |
| "loss": 1.3748, |
| "loss/crossentropy": 2.3903732299804688, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.17761950194835663, |
| "loss/reg": 0.00018421628919895738, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.037375, |
| "grad_norm": 3.096365451812744, |
| "grad_norm_var": 3.327554446166753, |
| "learning_rate": 0.0001, |
| "loss": 1.4918, |
| "loss/crossentropy": 2.355905532836914, |
| "loss/hidden": 1.2734375, |
| "loss/logits": 0.2165393829345703, |
| "loss/reg": 0.00018417388491798192, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 5.26882266998291, |
| "grad_norm_var": 3.452496381081827, |
| "learning_rate": 0.0001, |
| "loss": 1.9899, |
| "loss/crossentropy": 2.9516849517822266, |
| "loss/hidden": 1.6875, |
| "loss/logits": 0.3005185127258301, |
| "loss/reg": 0.0001841239572968334, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.037625, |
| "grad_norm": 3.068439483642578, |
| "grad_norm_var": 3.4599122750924143, |
| "learning_rate": 0.0001, |
| "loss": 1.3886, |
| "loss/crossentropy": 2.6546504497528076, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.20704253017902374, |
| "loss/reg": 0.00018408475443720818, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.03775, |
| "grad_norm": 2.2956972122192383, |
| "grad_norm_var": 3.504442894615314, |
| "learning_rate": 0.0001, |
| "loss": 1.1094, |
| "loss/crossentropy": 2.4977052211761475, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.13884802162647247, |
| "loss/reg": 0.00018404283036943525, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.037875, |
| "grad_norm": 2.5937392711639404, |
| "grad_norm_var": 3.5559874858295015, |
| "learning_rate": 0.0001, |
| "loss": 1.3949, |
| "loss/crossentropy": 2.379765033721924, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.17428620159626007, |
| "loss/reg": 0.00018399336840957403, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 3.512799024581909, |
| "grad_norm_var": 3.4186760491291084, |
| "learning_rate": 0.0001, |
| "loss": 1.381, |
| "loss/crossentropy": 2.4689581394195557, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.2151029407978058, |
| "loss/reg": 0.00018394750077277422, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.038125, |
| "grad_norm": 2.610177755355835, |
| "grad_norm_var": 3.442626566538619, |
| "learning_rate": 0.0001, |
| "loss": 1.2278, |
| "loss/crossentropy": 2.4508001804351807, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1634678691625595, |
| "loss/reg": 0.00018390185141470283, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.03825, |
| "grad_norm": 2.4446496963500977, |
| "grad_norm_var": 3.433886969311868, |
| "learning_rate": 0.0001, |
| "loss": 1.5023, |
| "loss/crossentropy": 2.110161066055298, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.18798431754112244, |
| "loss/reg": 0.00018385711882729083, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.038375, |
| "grad_norm": 2.9203567504882812, |
| "grad_norm_var": 3.4593607482629065, |
| "learning_rate": 0.0001, |
| "loss": 1.2248, |
| "loss/crossentropy": 2.6589579582214355, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.16043730080127716, |
| "loss/reg": 0.00018381779955234379, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 3.625650644302368, |
| "grad_norm_var": 3.3839899608280857, |
| "learning_rate": 0.0001, |
| "loss": 1.3888, |
| "loss/crossentropy": 2.7281792163848877, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.269817590713501, |
| "loss/reg": 0.00018376816296949983, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.038625, |
| "grad_norm": 2.889061689376831, |
| "grad_norm_var": 3.268347098659014, |
| "learning_rate": 0.0001, |
| "loss": 1.1888, |
| "loss/crossentropy": 2.523449420928955, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.17915961146354675, |
| "loss/reg": 0.00018371775513514876, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 2.768836736679077, |
| "grad_norm_var": 3.257904120325863, |
| "learning_rate": 0.0001, |
| "loss": 1.3947, |
| "loss/crossentropy": 2.3002350330352783, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.20538243651390076, |
| "loss/reg": 0.00018366229778621346, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.038875, |
| "grad_norm": 2.4714229106903076, |
| "grad_norm_var": 2.0722741056598655, |
| "learning_rate": 0.0001, |
| "loss": 1.3588, |
| "loss/crossentropy": 2.5036730766296387, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.2085750699043274, |
| "loss/reg": 0.00018361561524216086, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 2.3803367614746094, |
| "grad_norm_var": 2.0976025308533433, |
| "learning_rate": 0.0001, |
| "loss": 1.3446, |
| "loss/crossentropy": 2.5774199962615967, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.18654365837574005, |
| "loss/reg": 0.00018356091459281743, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.039125, |
| "grad_norm": 2.1344940662384033, |
| "grad_norm_var": 0.557820051408968, |
| "learning_rate": 0.0001, |
| "loss": 1.1236, |
| "loss/crossentropy": 2.492638349533081, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.15305551886558533, |
| "loss/reg": 0.00018350353639107198, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.03925, |
| "grad_norm": 2.3380954265594482, |
| "grad_norm_var": 0.5802561079704179, |
| "learning_rate": 0.0001, |
| "loss": 1.2665, |
| "loss/crossentropy": 2.538328170776367, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1786879152059555, |
| "loss/reg": 0.00018345742137171328, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.039375, |
| "grad_norm": 2.4094135761260986, |
| "grad_norm_var": 0.5918726782285414, |
| "learning_rate": 0.0001, |
| "loss": 1.3272, |
| "loss/crossentropy": 2.4620587825775146, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.1925506591796875, |
| "loss/reg": 0.00018340005772188306, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 2.4094653129577637, |
| "grad_norm_var": 0.18384264866255365, |
| "learning_rate": 0.0001, |
| "loss": 1.6038, |
| "loss/crossentropy": 2.343863010406494, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.28949666023254395, |
| "loss/reg": 0.00018334249034523964, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.039625, |
| "grad_norm": 2.327509880065918, |
| "grad_norm_var": 0.17973404957112868, |
| "learning_rate": 0.0001, |
| "loss": 1.2969, |
| "loss/crossentropy": 2.6910533905029297, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.17789438366889954, |
| "loss/reg": 0.0001832786510931328, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.03975, |
| "grad_norm": 1.9041651487350464, |
| "grad_norm_var": 0.2069358760498758, |
| "learning_rate": 0.0001, |
| "loss": 1.1054, |
| "loss/crossentropy": 2.636301279067993, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.13483907282352448, |
| "loss/reg": 0.00018322949472349137, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.039875, |
| "grad_norm": 2.493028163909912, |
| "grad_norm_var": 0.2077715093556349, |
| "learning_rate": 0.0001, |
| "loss": 1.3427, |
| "loss/crossentropy": 2.3860602378845215, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.18460425734519958, |
| "loss/reg": 0.00018318284128326923, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 2.566598892211914, |
| "grad_norm_var": 0.14887985654726912, |
| "learning_rate": 0.0001, |
| "loss": 1.1799, |
| "loss/crossentropy": 2.562800884246826, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.15464608371257782, |
| "loss/reg": 0.00018314005865249783, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.040125, |
| "grad_norm": 2.3247175216674805, |
| "grad_norm_var": 0.15142847186754452, |
| "learning_rate": 0.0001, |
| "loss": 1.138, |
| "loss/crossentropy": 2.6008517742156982, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.16352114081382751, |
| "loss/reg": 0.00018309340521227568, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.04025, |
| "grad_norm": 2.961496114730835, |
| "grad_norm_var": 0.1625533330376787, |
| "learning_rate": 0.0001, |
| "loss": 1.4689, |
| "loss/crossentropy": 1.9118317365646362, |
| "loss/hidden": 1.296875, |
| "loss/logits": 0.17024032771587372, |
| "loss/reg": 0.00018305043340660632, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.040375, |
| "grad_norm": 3.0667834281921387, |
| "grad_norm_var": 0.17097196220394337, |
| "learning_rate": 0.0001, |
| "loss": 1.7235, |
| "loss/crossentropy": 2.053375482559204, |
| "loss/hidden": 1.421875, |
| "loss/logits": 0.2998199462890625, |
| "loss/reg": 0.00018301077943760902, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 2.930929183959961, |
| "grad_norm_var": 0.10306917410381727, |
| "learning_rate": 0.0001, |
| "loss": 1.2964, |
| "loss/crossentropy": 2.5874621868133545, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.19297093152999878, |
| "loss/reg": 0.00018295719928573817, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.040625, |
| "grad_norm": 2.871757745742798, |
| "grad_norm_var": 0.1022445182394837, |
| "learning_rate": 0.0001, |
| "loss": 1.2449, |
| "loss/crossentropy": 2.6085433959960938, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.16499356925487518, |
| "loss/reg": 0.0001828953973017633, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.04075, |
| "grad_norm": 2.489987850189209, |
| "grad_norm_var": 0.09794334325425678, |
| "learning_rate": 0.0001, |
| "loss": 1.3676, |
| "loss/crossentropy": 2.4801414012908936, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.1939350813627243, |
| "loss/reg": 0.00018283820827491581, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.040875, |
| "grad_norm": 2.536003828048706, |
| "grad_norm_var": 0.09791477775173399, |
| "learning_rate": 0.0001, |
| "loss": 1.3227, |
| "loss/crossentropy": 2.7262425422668457, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.21148526668548584, |
| "loss/reg": 0.0001827785454224795, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 2.0945851802825928, |
| "grad_norm_var": 0.10792211144253508, |
| "learning_rate": 0.0001, |
| "loss": 1.1186, |
| "loss/crossentropy": 2.4119601249694824, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.1441338062286377, |
| "loss/reg": 0.0001827288360800594, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.041125, |
| "grad_norm": 2.836939811706543, |
| "grad_norm_var": 0.10535360002502277, |
| "learning_rate": 0.0001, |
| "loss": 1.4351, |
| "loss/crossentropy": 2.3377010822296143, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.21447591483592987, |
| "loss/reg": 0.00018267397535964847, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 2.326241970062256, |
| "grad_norm_var": 0.10567372742739559, |
| "learning_rate": 0.0001, |
| "loss": 1.1899, |
| "loss/crossentropy": 2.7874836921691895, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.17246156930923462, |
| "loss/reg": 0.00018262714729644358, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.041375, |
| "grad_norm": 2.6195242404937744, |
| "grad_norm_var": 0.10493277845914116, |
| "learning_rate": 0.0001, |
| "loss": 1.4583, |
| "loss/crossentropy": 2.6836705207824707, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.20644301176071167, |
| "loss/reg": 0.0001825743674999103, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 2.837834119796753, |
| "grad_norm_var": 0.10851849947722905, |
| "learning_rate": 0.0001, |
| "loss": 1.3683, |
| "loss/crossentropy": 2.336994171142578, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.21025338768959045, |
| "loss/reg": 0.0001825322542572394, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.041625, |
| "grad_norm": 2.7226598262786865, |
| "grad_norm_var": 0.10527721486086339, |
| "learning_rate": 0.0001, |
| "loss": 1.1225, |
| "loss/crossentropy": 2.621798276901245, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.1480618715286255, |
| "loss/reg": 0.00018248235573992133, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.04175, |
| "grad_norm": 1.9014396667480469, |
| "grad_norm_var": 0.10553016347722419, |
| "learning_rate": 0.0001, |
| "loss": 1.121, |
| "loss/crossentropy": 2.642059087753296, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.15041938424110413, |
| "loss/reg": 0.00018242868827655911, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.041875, |
| "grad_norm": 2.2809882164001465, |
| "grad_norm_var": 0.11133012136604989, |
| "learning_rate": 0.0001, |
| "loss": 1.3509, |
| "loss/crossentropy": 2.59220027923584, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.20842507481575012, |
| "loss/reg": 0.00018237272161059082, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 2.722670793533325, |
| "grad_norm_var": 0.11245856535336524, |
| "learning_rate": 0.0001, |
| "loss": 1.3534, |
| "loss/crossentropy": 2.7409770488739014, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.195334792137146, |
| "loss/reg": 0.00018231497961096466, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.042125, |
| "grad_norm": 2.7385120391845703, |
| "grad_norm_var": 0.1082322741184413, |
| "learning_rate": 0.0001, |
| "loss": 1.2894, |
| "loss/crossentropy": 2.5753767490386963, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.17039254307746887, |
| "loss/reg": 0.00018226687097921968, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.04225, |
| "grad_norm": 2.278662919998169, |
| "grad_norm_var": 0.106386719047498, |
| "learning_rate": 0.0001, |
| "loss": 1.4188, |
| "loss/crossentropy": 2.519120693206787, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.21385297179222107, |
| "loss/reg": 0.00018221234495285898, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.042375, |
| "grad_norm": 2.7543537616729736, |
| "grad_norm_var": 0.09214567617970199, |
| "learning_rate": 0.0001, |
| "loss": 1.2684, |
| "loss/crossentropy": 2.615980625152588, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.17285585403442383, |
| "loss/reg": 0.00018216087482869625, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 2.740844249725342, |
| "grad_norm_var": 0.08497608623963113, |
| "learning_rate": 0.0001, |
| "loss": 1.3717, |
| "loss/crossentropy": 2.3156495094299316, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.18235422670841217, |
| "loss/reg": 0.000182102681719698, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.042625, |
| "grad_norm": 2.183875799179077, |
| "grad_norm_var": 0.08476970381204461, |
| "learning_rate": 0.0001, |
| "loss": 1.2376, |
| "loss/crossentropy": 2.5132007598876953, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1733236014842987, |
| "loss/reg": 0.00018205813830718398, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.04275, |
| "grad_norm": 2.5386435985565186, |
| "grad_norm_var": 0.0848263064399248, |
| "learning_rate": 0.0001, |
| "loss": 1.3462, |
| "loss/crossentropy": 2.4127066135406494, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.16472230851650238, |
| "loss/reg": 0.00018201676721218973, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.042875, |
| "grad_norm": 2.3057498931884766, |
| "grad_norm_var": 0.08725284383438421, |
| "learning_rate": 0.0001, |
| "loss": 1.2497, |
| "loss/crossentropy": 2.184678316116333, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.18537552654743195, |
| "loss/reg": 0.0001819680182961747, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 2.218367099761963, |
| "grad_norm_var": 0.0816395413206275, |
| "learning_rate": 0.0001, |
| "loss": 1.1513, |
| "loss/crossentropy": 2.449817419052124, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.15338429808616638, |
| "loss/reg": 0.00018192335846833885, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.043125, |
| "grad_norm": 2.2711985111236572, |
| "grad_norm_var": 0.07626184388881233, |
| "learning_rate": 0.0001, |
| "loss": 1.0171, |
| "loss/crossentropy": 2.509692907333374, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.12079230695962906, |
| "loss/reg": 0.00018188220565207303, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.04325, |
| "grad_norm": 2.787581443786621, |
| "grad_norm_var": 0.08102267837075464, |
| "learning_rate": 0.0001, |
| "loss": 1.2428, |
| "loss/crossentropy": 2.3301100730895996, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.18630483746528625, |
| "loss/reg": 0.00018185042426921427, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.043375, |
| "grad_norm": 2.0392682552337646, |
| "grad_norm_var": 0.09234946001928937, |
| "learning_rate": 0.0001, |
| "loss": 1.0241, |
| "loss/crossentropy": 2.588428020477295, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.1277570128440857, |
| "loss/reg": 0.00018179781909566373, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 2.7076616287231445, |
| "grad_norm_var": 0.08681018440338353, |
| "learning_rate": 0.0001, |
| "loss": 1.4478, |
| "loss/crossentropy": 2.6002542972564697, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.2193761169910431, |
| "loss/reg": 0.0001817511219996959, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.043625, |
| "grad_norm": 2.353895425796509, |
| "grad_norm_var": 0.08187996873415007, |
| "learning_rate": 0.0001, |
| "loss": 1.2846, |
| "loss/crossentropy": 2.441397190093994, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.18899187445640564, |
| "loss/reg": 0.0001816989533836022, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 2.078312397003174, |
| "grad_norm_var": 0.07145312501918302, |
| "learning_rate": 0.0001, |
| "loss": 1.2337, |
| "loss/crossentropy": 2.320770263671875, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.1694108545780182, |
| "loss/reg": 0.00018164912762586027, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.043875, |
| "grad_norm": 2.455012321472168, |
| "grad_norm_var": 0.06971347306554586, |
| "learning_rate": 0.0001, |
| "loss": 1.3011, |
| "loss/crossentropy": 2.6782124042510986, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.1899489462375641, |
| "loss/reg": 0.00018160381296183914, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 3.1685619354248047, |
| "grad_norm_var": 0.09844486312005239, |
| "learning_rate": 0.0001, |
| "loss": 1.3515, |
| "loss/crossentropy": 2.8243625164031982, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.20903226733207703, |
| "loss/reg": 0.00018155867292080075, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.044125, |
| "grad_norm": 3.2489728927612305, |
| "grad_norm_var": 0.13257830736299236, |
| "learning_rate": 0.0001, |
| "loss": 1.5585, |
| "loss/crossentropy": 2.8111252784729004, |
| "loss/hidden": 1.3125, |
| "loss/logits": 0.24414658546447754, |
| "loss/reg": 0.0001815159630496055, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.04425, |
| "grad_norm": 3.3230886459350586, |
| "grad_norm_var": 0.16879235535394155, |
| "learning_rate": 0.0001, |
| "loss": 1.4469, |
| "loss/crossentropy": 2.5517845153808594, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.23417417705059052, |
| "loss/reg": 0.00018148007802665234, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.044375, |
| "grad_norm": 2.2562191486358643, |
| "grad_norm_var": 0.17228650926600247, |
| "learning_rate": 0.0001, |
| "loss": 1.1829, |
| "loss/crossentropy": 2.422496795654297, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.16546528041362762, |
| "loss/reg": 0.00018143345369026065, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 2.7640106678009033, |
| "grad_norm_var": 0.17293323899421179, |
| "learning_rate": 0.0001, |
| "loss": 1.167, |
| "loss/crossentropy": 2.828662157058716, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.16519883275032043, |
| "loss/reg": 0.00018138332234229892, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.044625, |
| "grad_norm": 2.3983893394470215, |
| "grad_norm_var": 0.16551544063244136, |
| "learning_rate": 0.0001, |
| "loss": 1.1707, |
| "loss/crossentropy": 2.605727434158325, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.16108599305152893, |
| "loss/reg": 0.0001813305716495961, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.04475, |
| "grad_norm": 3.6282835006713867, |
| "grad_norm_var": 0.2370290852634845, |
| "learning_rate": 0.0001, |
| "loss": 1.3762, |
| "loss/crossentropy": 2.6275181770324707, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.21812118589878082, |
| "loss/reg": 0.00018128403462469578, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.044875, |
| "grad_norm": 3.763258934020996, |
| "grad_norm_var": 0.30770300622119723, |
| "learning_rate": 0.0001, |
| "loss": 1.5858, |
| "loss/crossentropy": 2.6596243381500244, |
| "loss/hidden": 1.2890625, |
| "loss/logits": 0.29489603638648987, |
| "loss/reg": 0.00018124622874893248, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 2.6321170330047607, |
| "grad_norm_var": 0.29092860453977115, |
| "learning_rate": 0.0001, |
| "loss": 1.1651, |
| "loss/crossentropy": 2.679645299911499, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1476357877254486, |
| "loss/reg": 0.00018119592277798802, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.045125, |
| "grad_norm": 2.794433832168579, |
| "grad_norm_var": 0.27517751652292094, |
| "learning_rate": 0.0001, |
| "loss": 1.291, |
| "loss/crossentropy": 2.46774959564209, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1954488754272461, |
| "loss/reg": 0.00018115356215275824, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.04525, |
| "grad_norm": 3.043492078781128, |
| "grad_norm_var": 0.2797019428924839, |
| "learning_rate": 0.0001, |
| "loss": 1.5628, |
| "loss/crossentropy": 2.5124197006225586, |
| "loss/hidden": 1.328125, |
| "loss/logits": 0.23281781375408173, |
| "loss/reg": 0.00018111053213942796, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.045375, |
| "grad_norm": 2.155243158340454, |
| "grad_norm_var": 0.2689192978759799, |
| "learning_rate": 0.0001, |
| "loss": 1.2775, |
| "loss/crossentropy": 2.5151214599609375, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.1897062063217163, |
| "loss/reg": 0.00018105284834746271, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 2.609884023666382, |
| "grad_norm_var": 0.2706969753359004, |
| "learning_rate": 0.0001, |
| "loss": 1.3514, |
| "loss/crossentropy": 2.506584882736206, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.20117563009262085, |
| "loss/reg": 0.00018099797307513654, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.045625, |
| "grad_norm": 2.342069387435913, |
| "grad_norm_var": 0.2713966376478879, |
| "learning_rate": 0.0001, |
| "loss": 1.3665, |
| "loss/crossentropy": 2.7815566062927246, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20847788453102112, |
| "loss/reg": 0.00018094982078764588, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.04575, |
| "grad_norm": 3.0326218605041504, |
| "grad_norm_var": 0.23758998657840114, |
| "learning_rate": 0.0001, |
| "loss": 1.3364, |
| "loss/crossentropy": 2.523134469985962, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.20178905129432678, |
| "loss/reg": 0.0001809141831472516, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.045875, |
| "grad_norm": 10.035235404968262, |
| "grad_norm_var": 3.4286245913836826, |
| "learning_rate": 0.0001, |
| "loss": 1.7972, |
| "loss/crossentropy": 2.548880100250244, |
| "loss/hidden": 1.453125, |
| "loss/logits": 0.34224259853363037, |
| "loss/reg": 0.0001808816014090553, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 2.35805606842041, |
| "grad_norm_var": 3.4865601240502655, |
| "learning_rate": 0.0001, |
| "loss": 1.3183, |
| "loss/crossentropy": 2.6983084678649902, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.1837082803249359, |
| "loss/reg": 0.00018084091425407678, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.046125, |
| "grad_norm": 2.228501319885254, |
| "grad_norm_var": 3.555062224897286, |
| "learning_rate": 0.0001, |
| "loss": 1.182, |
| "loss/crossentropy": 2.2725465297698975, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.16455045342445374, |
| "loss/reg": 0.0001807970111258328, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 3.036968469619751, |
| "grad_norm_var": 3.555876206735041, |
| "learning_rate": 0.0001, |
| "loss": 1.3473, |
| "loss/crossentropy": 2.462448835372925, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.1736328899860382, |
| "loss/reg": 0.00018076066044159234, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.046375, |
| "grad_norm": 2.4165427684783936, |
| "grad_norm_var": 3.5374699186157517, |
| "learning_rate": 0.0001, |
| "loss": 1.3649, |
| "loss/crossentropy": 2.7596888542175293, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20686297118663788, |
| "loss/reg": 0.00018071448721457273, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 3.086303949356079, |
| "grad_norm_var": 3.525121419257785, |
| "learning_rate": 0.0001, |
| "loss": 1.5493, |
| "loss/crossentropy": 2.500018835067749, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.26621347665786743, |
| "loss/reg": 0.00018066992925014347, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.046625, |
| "grad_norm": 2.4726622104644775, |
| "grad_norm_var": 3.5173041221135075, |
| "learning_rate": 0.0001, |
| "loss": 1.136, |
| "loss/crossentropy": 2.5688652992248535, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.1419672667980194, |
| "loss/reg": 0.00018063545576296747, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.04675, |
| "grad_norm": 2.6667373180389404, |
| "grad_norm_var": 3.523672237020999, |
| "learning_rate": 0.0001, |
| "loss": 1.361, |
| "loss/crossentropy": 2.5163731575012207, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.21853341162204742, |
| "loss/reg": 0.0001806024374673143, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.046875, |
| "grad_norm": 2.3773033618927, |
| "grad_norm_var": 3.5335662465775757, |
| "learning_rate": 0.0001, |
| "loss": 1.1721, |
| "loss/crossentropy": 2.8756470680236816, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.16248445212841034, |
| "loss/reg": 0.00018057989655062556, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 2.7833797931671143, |
| "grad_norm_var": 3.5259529031790064, |
| "learning_rate": 0.0001, |
| "loss": 1.3487, |
| "loss/crossentropy": 2.7149651050567627, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.20630021393299103, |
| "loss/reg": 0.0001805323117878288, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.047125, |
| "grad_norm": 2.4678053855895996, |
| "grad_norm_var": 3.545491291634365, |
| "learning_rate": 0.0001, |
| "loss": 1.4444, |
| "loss/crossentropy": 2.2349612712860107, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.21606594324111938, |
| "loss/reg": 0.00018048338824883103, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.04725, |
| "grad_norm": 2.368433952331543, |
| "grad_norm_var": 3.5763182105236933, |
| "learning_rate": 0.0001, |
| "loss": 1.2423, |
| "loss/crossentropy": 2.3095428943634033, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.1624036729335785, |
| "loss/reg": 0.00018044162425212562, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.047375, |
| "grad_norm": 4.887925148010254, |
| "grad_norm_var": 3.7252780043467792, |
| "learning_rate": 0.0001, |
| "loss": 1.357, |
| "loss/crossentropy": 2.2361862659454346, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.1989278942346573, |
| "loss/reg": 0.0001804078638087958, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 2.6829822063446045, |
| "grad_norm_var": 3.7198784549945154, |
| "learning_rate": 0.0001, |
| "loss": 1.3576, |
| "loss/crossentropy": 2.5164873600006104, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.20736883580684662, |
| "loss/reg": 0.00018035774701274931, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.047625, |
| "grad_norm": 2.9376659393310547, |
| "grad_norm_var": 3.673702627279779, |
| "learning_rate": 0.0001, |
| "loss": 1.2767, |
| "loss/crossentropy": 2.5471227169036865, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.18117238581180573, |
| "loss/reg": 0.000180321978405118, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.04775, |
| "grad_norm": 3.1138343811035156, |
| "grad_norm_var": 3.671869876252352, |
| "learning_rate": 0.0001, |
| "loss": 1.5706, |
| "loss/crossentropy": 2.5659494400024414, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.2875041365623474, |
| "loss/reg": 0.00018027149781119078, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.047875, |
| "grad_norm": 2.51397705078125, |
| "grad_norm_var": 0.3979920239186678, |
| "learning_rate": 0.0001, |
| "loss": 1.3404, |
| "loss/crossentropy": 2.6721858978271484, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19799375534057617, |
| "loss/reg": 0.0001802304177545011, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 2.6631741523742676, |
| "grad_norm_var": 0.3868506457320857, |
| "learning_rate": 0.0001, |
| "loss": 1.1959, |
| "loss/crossentropy": 2.8570642471313477, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.1472700834274292, |
| "loss/reg": 0.00018018792616203427, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.048125, |
| "grad_norm": 2.225167989730835, |
| "grad_norm_var": 0.38710267816574984, |
| "learning_rate": 0.0001, |
| "loss": 1.3387, |
| "loss/crossentropy": 2.449850082397461, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19622622430324554, |
| "loss/reg": 0.00018013913359027356, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.04825, |
| "grad_norm": 3.642057418823242, |
| "grad_norm_var": 0.429604118678219, |
| "learning_rate": 0.0001, |
| "loss": 1.5255, |
| "loss/crossentropy": 3.060318946838379, |
| "loss/hidden": 1.296875, |
| "loss/logits": 0.2267942726612091, |
| "loss/reg": 0.00018009122868534178, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.048375, |
| "grad_norm": 2.4205105304718018, |
| "grad_norm_var": 0.4293855111770417, |
| "learning_rate": 0.0001, |
| "loss": 1.4919, |
| "loss/crossentropy": 2.5439326763153076, |
| "loss/hidden": 1.2578125, |
| "loss/logits": 0.23233534395694733, |
| "loss/reg": 0.0001800364989321679, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 3.090855598449707, |
| "grad_norm_var": 0.429541218532165, |
| "learning_rate": 0.0001, |
| "loss": 1.3946, |
| "loss/crossentropy": 2.3761212825775146, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.20534038543701172, |
| "loss/reg": 0.0001799749006750062, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.048625, |
| "grad_norm": 2.829037666320801, |
| "grad_norm_var": 0.4203970366893562, |
| "learning_rate": 0.0001, |
| "loss": 1.3953, |
| "loss/crossentropy": 2.8099021911621094, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.19039469957351685, |
| "loss/reg": 0.0001799268211470917, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 2.4180996417999268, |
| "grad_norm_var": 0.43048309318027406, |
| "learning_rate": 0.0001, |
| "loss": 1.2337, |
| "loss/crossentropy": 2.5344197750091553, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.18504786491394043, |
| "loss/reg": 0.0001798665034584701, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.048875, |
| "grad_norm": 6.686630725860596, |
| "grad_norm_var": 1.3259110009600303, |
| "learning_rate": 0.0001, |
| "loss": 1.4048, |
| "loss/crossentropy": 1.59429132938385, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.16859778761863708, |
| "loss/reg": 0.00017980851407628506, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 2.47430682182312, |
| "grad_norm_var": 1.3452680046498242, |
| "learning_rate": 0.0001, |
| "loss": 1.2505, |
| "loss/crossentropy": 2.4976658821105957, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.17843782901763916, |
| "loss/reg": 0.00017974060028791428, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.049125, |
| "grad_norm": 2.3827688694000244, |
| "grad_norm_var": 1.3527620972999594, |
| "learning_rate": 0.0001, |
| "loss": 1.2697, |
| "loss/crossentropy": 2.6137402057647705, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.17413964867591858, |
| "loss/reg": 0.00017966754967346787, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.04925, |
| "grad_norm": 3.5299346446990967, |
| "grad_norm_var": 1.3263260544293964, |
| "learning_rate": 0.0001, |
| "loss": 1.6096, |
| "loss/crossentropy": 2.1338679790496826, |
| "loss/hidden": 1.359375, |
| "loss/logits": 0.24847310781478882, |
| "loss/reg": 0.00017959915567189455, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.049375, |
| "grad_norm": 2.1859142780303955, |
| "grad_norm_var": 1.1587385123986338, |
| "learning_rate": 0.0001, |
| "loss": 1.1247, |
| "loss/crossentropy": 2.686222791671753, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.15412843227386475, |
| "loss/reg": 0.00017955088696908206, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 2.372267484664917, |
| "grad_norm_var": 1.1773802642484006, |
| "learning_rate": 0.0001, |
| "loss": 1.1605, |
| "loss/crossentropy": 2.648873805999756, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.16648846864700317, |
| "loss/reg": 0.00017949036555364728, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.049625, |
| "grad_norm": 2.343231439590454, |
| "grad_norm_var": 1.2018601019134032, |
| "learning_rate": 0.0001, |
| "loss": 1.1771, |
| "loss/crossentropy": 2.3791165351867676, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.15182971954345703, |
| "loss/reg": 0.00017943643615581095, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.04975, |
| "grad_norm": 2.7274091243743896, |
| "grad_norm_var": 1.2017590131362377, |
| "learning_rate": 0.0001, |
| "loss": 1.324, |
| "loss/crossentropy": 2.57863712310791, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18160942196846008, |
| "loss/reg": 0.00017938419478014112, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.049875, |
| "grad_norm": 4.155604362487793, |
| "grad_norm_var": 1.2842575464972394, |
| "learning_rate": 0.0001, |
| "loss": 1.5653, |
| "loss/crossentropy": 2.892331123352051, |
| "loss/hidden": 1.34375, |
| "loss/logits": 0.2197520136833191, |
| "loss/reg": 0.0001793357077986002, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 2.741466522216797, |
| "grad_norm_var": 1.2810286441991583, |
| "learning_rate": 0.0001, |
| "loss": 1.3348, |
| "loss/crossentropy": 2.546194314956665, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19236674904823303, |
| "loss/reg": 0.00017928793386090547, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.050125, |
| "grad_norm": 3.055189847946167, |
| "grad_norm_var": 1.236778717086356, |
| "learning_rate": 0.0001, |
| "loss": 1.3152, |
| "loss/crossentropy": 2.6719818115234375, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.23531492054462433, |
| "loss/reg": 0.00017924165877047926, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.05025, |
| "grad_norm": 3.778313636779785, |
| "grad_norm_var": 1.2484054094760106, |
| "learning_rate": 0.0001, |
| "loss": 1.5425, |
| "loss/crossentropy": 2.7033426761627197, |
| "loss/hidden": 1.28125, |
| "loss/logits": 0.25942352414131165, |
| "loss/reg": 0.00017917431250680238, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.050375, |
| "grad_norm": 3.0340867042541504, |
| "grad_norm_var": 1.2184345071185567, |
| "learning_rate": 0.0001, |
| "loss": 1.455, |
| "loss/crossentropy": 2.5875117778778076, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.22662264108657837, |
| "loss/reg": 0.00017911198665387928, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 2.5151872634887695, |
| "grad_norm_var": 1.2408325162170568, |
| "learning_rate": 0.0001, |
| "loss": 1.253, |
| "loss/crossentropy": 2.6171019077301025, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.17306920886039734, |
| "loss/reg": 0.0001790501846699044, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.050625, |
| "grad_norm": 2.4294838905334473, |
| "grad_norm_var": 1.2640116286061014, |
| "learning_rate": 0.0001, |
| "loss": 1.1984, |
| "loss/crossentropy": 2.4346368312835693, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.18100124597549438, |
| "loss/reg": 0.00017898838268592954, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.05075, |
| "grad_norm": 2.7509233951568604, |
| "grad_norm_var": 1.242810414819528, |
| "learning_rate": 0.0001, |
| "loss": 1.62, |
| "loss/crossentropy": 2.047776937484741, |
| "loss/hidden": 1.421875, |
| "loss/logits": 0.19628939032554626, |
| "loss/reg": 0.00017893253243528306, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.050875, |
| "grad_norm": 2.4834845066070557, |
| "grad_norm_var": 0.3216287157750268, |
| "learning_rate": 0.0001, |
| "loss": 1.3514, |
| "loss/crossentropy": 2.6643404960632324, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.1933945268392563, |
| "loss/reg": 0.00017888467118609697, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 2.417241334915161, |
| "grad_norm_var": 0.32438624126894616, |
| "learning_rate": 0.0001, |
| "loss": 1.3021, |
| "loss/crossentropy": 2.600691556930542, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.19094930589199066, |
| "loss/reg": 0.00017882059910334647, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.051125, |
| "grad_norm": 2.3977980613708496, |
| "grad_norm_var": 0.32355143397302466, |
| "learning_rate": 0.0001, |
| "loss": 1.4318, |
| "loss/crossentropy": 2.6039021015167236, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.24255430698394775, |
| "loss/reg": 0.00017876985657494515, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 2.205808639526367, |
| "grad_norm_var": 0.3055601722416668, |
| "learning_rate": 0.0001, |
| "loss": 1.1411, |
| "loss/crossentropy": 2.5921945571899414, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.1510002166032791, |
| "loss/reg": 0.0001787317160051316, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.051375, |
| "grad_norm": 3.166574478149414, |
| "grad_norm_var": 0.2952319363017691, |
| "learning_rate": 0.0001, |
| "loss": 1.3997, |
| "loss/crossentropy": 2.4906835556030273, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.21822378039360046, |
| "loss/reg": 0.00017868747818283737, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 2.248802661895752, |
| "grad_norm_var": 0.30299352883237796, |
| "learning_rate": 0.0001, |
| "loss": 1.0924, |
| "loss/crossentropy": 2.599449396133423, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.14533644914627075, |
| "loss/reg": 0.0001786567154340446, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.051625, |
| "grad_norm": 3.7585434913635254, |
| "grad_norm_var": 0.3461126328200045, |
| "learning_rate": 0.0001, |
| "loss": 1.4199, |
| "loss/crossentropy": 2.7979090213775635, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.23058170080184937, |
| "loss/reg": 0.00017861124069895595, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.05175, |
| "grad_norm": 2.615172863006592, |
| "grad_norm_var": 0.34898320978636455, |
| "learning_rate": 0.0001, |
| "loss": 1.3951, |
| "loss/crossentropy": 1.9655026197433472, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.18238556385040283, |
| "loss/reg": 0.00017857542843557894, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.051875, |
| "grad_norm": 2.38726806640625, |
| "grad_norm_var": 0.2388532010949414, |
| "learning_rate": 0.0001, |
| "loss": 1.0998, |
| "loss/crossentropy": 2.581796646118164, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.1488036811351776, |
| "loss/reg": 0.00017854152247309685, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 2.1067628860473633, |
| "grad_norm_var": 0.264675897864124, |
| "learning_rate": 0.0001, |
| "loss": 1.3556, |
| "loss/crossentropy": 2.202812910079956, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.18191415071487427, |
| "loss/reg": 0.00017848903371486813, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.052125, |
| "grad_norm": 2.8035178184509277, |
| "grad_norm_var": 0.25703166277479733, |
| "learning_rate": 0.0001, |
| "loss": 1.367, |
| "loss/crossentropy": 2.37416934967041, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.24018406867980957, |
| "loss/reg": 0.00017843768000602722, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.05225, |
| "grad_norm": 2.9603078365325928, |
| "grad_norm_var": 0.1805549031603429, |
| "learning_rate": 0.0001, |
| "loss": 1.6557, |
| "loss/crossentropy": 2.2720189094543457, |
| "loss/hidden": 1.390625, |
| "loss/logits": 0.2632881999015808, |
| "loss/reg": 0.0001783809857442975, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.052375, |
| "grad_norm": 2.5229651927948, |
| "grad_norm_var": 0.17020038194862058, |
| "learning_rate": 0.0001, |
| "loss": 1.4756, |
| "loss/crossentropy": 2.63793683052063, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.24721495807170868, |
| "loss/reg": 0.00017833360470831394, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 2.355572462081909, |
| "grad_norm_var": 0.17382358098597403, |
| "learning_rate": 0.0001, |
| "loss": 1.2231, |
| "loss/crossentropy": 2.549771308898926, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.1823006570339203, |
| "loss/reg": 0.0001782874605851248, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.052625, |
| "grad_norm": 2.9690279960632324, |
| "grad_norm_var": 0.17970504092186654, |
| "learning_rate": 0.0001, |
| "loss": 1.3476, |
| "loss/crossentropy": 3.0549476146698, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.19741559028625488, |
| "loss/reg": 0.00017824411042965949, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.05275, |
| "grad_norm": 2.880521059036255, |
| "grad_norm_var": 0.18276892961250724, |
| "learning_rate": 0.0001, |
| "loss": 1.1749, |
| "loss/crossentropy": 2.4630181789398193, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.16526931524276733, |
| "loss/reg": 0.00017819386266637594, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.052875, |
| "grad_norm": 2.373995542526245, |
| "grad_norm_var": 0.18583898600397694, |
| "learning_rate": 0.0001, |
| "loss": 1.1957, |
| "loss/crossentropy": 2.5825746059417725, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.17832687497138977, |
| "loss/reg": 0.00017814231978263706, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 2.8790440559387207, |
| "grad_norm_var": 0.18572161644163973, |
| "learning_rate": 0.0001, |
| "loss": 1.3801, |
| "loss/crossentropy": 2.7846927642822266, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.22202245891094208, |
| "loss/reg": 0.00017809156270232052, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.053125, |
| "grad_norm": 2.737457275390625, |
| "grad_norm_var": 0.18085466780064482, |
| "learning_rate": 0.0001, |
| "loss": 1.0736, |
| "loss/crossentropy": 2.3963334560394287, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.1343117207288742, |
| "loss/reg": 0.00017803694936446846, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.05325, |
| "grad_norm": 2.428834915161133, |
| "grad_norm_var": 0.16969274721350278, |
| "learning_rate": 0.0001, |
| "loss": 1.157, |
| "loss/crossentropy": 2.741081476211548, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.15910392999649048, |
| "loss/reg": 0.00017797992040868849, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.053375, |
| "grad_norm": 2.404265880584717, |
| "grad_norm_var": 0.1585534584039614, |
| "learning_rate": 0.0001, |
| "loss": 1.2847, |
| "loss/crossentropy": 2.89072322845459, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1891355812549591, |
| "loss/reg": 0.00017792356084100902, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 4.136691093444824, |
| "grad_norm_var": 0.27981797299989897, |
| "learning_rate": 0.0001, |
| "loss": 1.4735, |
| "loss/crossentropy": 2.6474928855895996, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.2529890537261963, |
| "loss/reg": 0.0001778633304638788, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.053625, |
| "grad_norm": 2.81740140914917, |
| "grad_norm_var": 0.21112886078796553, |
| "learning_rate": 0.0001, |
| "loss": 1.4668, |
| "loss/crossentropy": 2.431791067123413, |
| "loss/hidden": 1.21875, |
| "loss/logits": 0.2462722361087799, |
| "loss/reg": 0.00017780056805349886, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 3.5630476474761963, |
| "grad_norm_var": 0.2551499062270144, |
| "learning_rate": 0.0001, |
| "loss": 1.4714, |
| "loss/crossentropy": 2.573620557785034, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.22747303545475006, |
| "loss/reg": 0.00017774660955183208, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.053875, |
| "grad_norm": 27.927080154418945, |
| "grad_norm_var": 39.71803281932496, |
| "learning_rate": 0.0001, |
| "loss": 1.2788, |
| "loss/crossentropy": 2.723278522491455, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.191043421626091, |
| "loss/reg": 0.00017769775877241045, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 2.35017991065979, |
| "grad_norm_var": 39.64838987108033, |
| "learning_rate": 0.0001, |
| "loss": 1.131, |
| "loss/crossentropy": 2.7018210887908936, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.14092248678207397, |
| "loss/reg": 0.00017764477524906397, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.054125, |
| "grad_norm": 4.339812278747559, |
| "grad_norm_var": 39.472594042649305, |
| "learning_rate": 0.0001, |
| "loss": 1.348, |
| "loss/crossentropy": 2.808999538421631, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.17429979145526886, |
| "loss/reg": 0.0001775865093804896, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.05425, |
| "grad_norm": 2.577270030975342, |
| "grad_norm_var": 39.559269314754324, |
| "learning_rate": 0.0001, |
| "loss": 1.3842, |
| "loss/crossentropy": 2.3989908695220947, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.19493895769119263, |
| "loss/reg": 0.00017753323481883854, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.054375, |
| "grad_norm": 3.111717462539673, |
| "grad_norm_var": 39.42935091258095, |
| "learning_rate": 0.0001, |
| "loss": 1.4152, |
| "loss/crossentropy": 2.464742660522461, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.2103499472141266, |
| "loss/reg": 0.00017748030950315297, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 2.0744073390960693, |
| "grad_norm_var": 39.51433658135766, |
| "learning_rate": 0.0001, |
| "loss": 1.2571, |
| "loss/crossentropy": 2.4160804748535156, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.18496504426002502, |
| "loss/reg": 0.00017741357441991568, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.054625, |
| "grad_norm": 2.2014148235321045, |
| "grad_norm_var": 39.705110235168064, |
| "learning_rate": 0.0001, |
| "loss": 1.2757, |
| "loss/crossentropy": 2.558213710784912, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.20358391106128693, |
| "loss/reg": 0.00017734503489919007, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.05475, |
| "grad_norm": 2.42464017868042, |
| "grad_norm_var": 39.81199116769601, |
| "learning_rate": 0.0001, |
| "loss": 1.2475, |
| "loss/crossentropy": 2.4901251792907715, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.16764749586582184, |
| "loss/reg": 0.00017729295359458774, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.054875, |
| "grad_norm": 2.230916976928711, |
| "grad_norm_var": 39.85185812679954, |
| "learning_rate": 0.0001, |
| "loss": 1.0936, |
| "loss/crossentropy": 2.447075366973877, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.1465335488319397, |
| "loss/reg": 0.0001772290706867352, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 2.749454975128174, |
| "grad_norm_var": 39.878976148047535, |
| "learning_rate": 0.0001, |
| "loss": 1.4814, |
| "loss/crossentropy": 2.2224342823028564, |
| "loss/hidden": 1.2734375, |
| "loss/logits": 0.20615623891353607, |
| "loss/reg": 0.00017717515584081411, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.055125, |
| "grad_norm": 3.0442757606506348, |
| "grad_norm_var": 39.81767857726663, |
| "learning_rate": 0.0001, |
| "loss": 1.8732, |
| "loss/crossentropy": 2.110996723175049, |
| "loss/hidden": 1.515625, |
| "loss/logits": 0.35577309131622314, |
| "loss/reg": 0.0001771111856214702, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.05525, |
| "grad_norm": 4.601929187774658, |
| "grad_norm_var": 39.54202437298279, |
| "learning_rate": 0.0001, |
| "loss": 1.6329, |
| "loss/crossentropy": 2.708014965057373, |
| "loss/hidden": 1.3203125, |
| "loss/logits": 0.3108658790588379, |
| "loss/reg": 0.00017704560013953596, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.055375, |
| "grad_norm": 3.026305913925171, |
| "grad_norm_var": 39.3895159629985, |
| "learning_rate": 0.0001, |
| "loss": 1.4032, |
| "loss/crossentropy": 2.738539934158325, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.21392673254013062, |
| "loss/reg": 0.00017697943258099258, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 3.2805826663970947, |
| "grad_norm_var": 39.48518822606245, |
| "learning_rate": 0.0001, |
| "loss": 1.4055, |
| "loss/crossentropy": 2.6368248462677, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.2005615532398224, |
| "loss/reg": 0.0001769265509210527, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.055625, |
| "grad_norm": 2.9948861598968506, |
| "grad_norm_var": 39.44686501090276, |
| "learning_rate": 0.0001, |
| "loss": 1.1039, |
| "loss/crossentropy": 2.6170451641082764, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.13730549812316895, |
| "loss/reg": 0.00017686416686046869, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.05575, |
| "grad_norm": 2.2163240909576416, |
| "grad_norm_var": 39.734049160677394, |
| "learning_rate": 0.0001, |
| "loss": 1.2018, |
| "loss/crossentropy": 2.454167127609253, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16098245978355408, |
| "loss/reg": 0.00017681345343589783, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.055875, |
| "grad_norm": 2.6603283882141113, |
| "grad_norm_var": 0.5323792649670357, |
| "learning_rate": 0.0001, |
| "loss": 1.3646, |
| "loss/crossentropy": 2.499635696411133, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.22998638451099396, |
| "loss/reg": 0.00017675201524980366, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 2.228543758392334, |
| "grad_norm_var": 0.5416984580164314, |
| "learning_rate": 0.0001, |
| "loss": 1.1798, |
| "loss/crossentropy": 2.4845035076141357, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.16240708529949188, |
| "loss/reg": 0.00017670769011601806, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.056125, |
| "grad_norm": 2.2279977798461914, |
| "grad_norm_var": 0.4038044026013947, |
| "learning_rate": 0.0001, |
| "loss": 1.325, |
| "loss/crossentropy": 2.3559677600860596, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.18264812231063843, |
| "loss/reg": 0.00017664962797425687, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 2.7545721530914307, |
| "grad_norm_var": 0.4022014302476805, |
| "learning_rate": 0.0001, |
| "loss": 1.4368, |
| "loss/crossentropy": 2.374678373336792, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.20067429542541504, |
| "loss/reg": 0.0001765888009686023, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.056375, |
| "grad_norm": 2.0431902408599854, |
| "grad_norm_var": 0.4204979320743064, |
| "learning_rate": 0.0001, |
| "loss": 1.0879, |
| "loss/crossentropy": 2.464282751083374, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.1447741687297821, |
| "loss/reg": 0.00017654395196586847, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 2.6901564598083496, |
| "grad_norm_var": 0.39509245912944924, |
| "learning_rate": 0.0001, |
| "loss": 1.2069, |
| "loss/crossentropy": 2.819383382797241, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16609331965446472, |
| "loss/reg": 0.00017649627989158034, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.056625, |
| "grad_norm": 2.1076624393463135, |
| "grad_norm_var": 0.402011404785177, |
| "learning_rate": 0.0001, |
| "loss": 1.2556, |
| "loss/crossentropy": 2.4384539127349854, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.16008606553077698, |
| "loss/reg": 0.0001764398330124095, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.05675, |
| "grad_norm": 5.258527755737305, |
| "grad_norm_var": 0.7979676690528414, |
| "learning_rate": 0.0001, |
| "loss": 1.6846, |
| "loss/crossentropy": 2.5452864170074463, |
| "loss/hidden": 1.4140625, |
| "loss/logits": 0.26873037219047546, |
| "loss/reg": 0.00017638294957578182, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.056875, |
| "grad_norm": 2.660983085632324, |
| "grad_norm_var": 0.7721798756654529, |
| "learning_rate": 0.0001, |
| "loss": 1.4255, |
| "loss/crossentropy": 2.585622549057007, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.23628488183021545, |
| "loss/reg": 0.00017633216339163482, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 2.7722721099853516, |
| "grad_norm_var": 0.7717267059376826, |
| "learning_rate": 0.0001, |
| "loss": 1.2022, |
| "loss/crossentropy": 2.800078868865967, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.15360021591186523, |
| "loss/reg": 0.0001762784959282726, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.057125, |
| "grad_norm": 2.398487091064453, |
| "grad_norm_var": 0.7862760060851559, |
| "learning_rate": 0.0001, |
| "loss": 1.6049, |
| "loss/crossentropy": 2.1760308742523193, |
| "loss/hidden": 1.328125, |
| "loss/logits": 0.2749764025211334, |
| "loss/reg": 0.00017622820450924337, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.05725, |
| "grad_norm": 2.1820383071899414, |
| "grad_norm_var": 0.5935128198963845, |
| "learning_rate": 0.0001, |
| "loss": 1.2135, |
| "loss/crossentropy": 2.6835989952087402, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.16489288210868835, |
| "loss/reg": 0.00017618124547880143, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.057375, |
| "grad_norm": 2.3541500568389893, |
| "grad_norm_var": 0.5942025229741127, |
| "learning_rate": 0.0001, |
| "loss": 1.3202, |
| "loss/crossentropy": 2.6355140209198, |
| "loss/hidden": 1.1328125, |
| "loss/logits": 0.1856687068939209, |
| "loss/reg": 0.00017612463852856308, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 2.4668965339660645, |
| "grad_norm_var": 0.5700904660282996, |
| "learning_rate": 0.0001, |
| "loss": 1.3106, |
| "loss/crossentropy": 2.656648874282837, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.18380066752433777, |
| "loss/reg": 0.00017607423069421202, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.057625, |
| "grad_norm": 2.6230361461639404, |
| "grad_norm_var": 0.5604462661929044, |
| "learning_rate": 0.0001, |
| "loss": 1.2208, |
| "loss/crossentropy": 2.62496280670166, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.16433754563331604, |
| "loss/reg": 0.0001760145096341148, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.05775, |
| "grad_norm": 2.7273848056793213, |
| "grad_norm_var": 0.5504336260767483, |
| "learning_rate": 0.0001, |
| "loss": 1.4464, |
| "loss/crossentropy": 2.558682441711426, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.19467273354530334, |
| "loss/reg": 0.00017597198893781751, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.057875, |
| "grad_norm": 2.6174404621124268, |
| "grad_norm_var": 0.5504024009310662, |
| "learning_rate": 0.0001, |
| "loss": 1.4354, |
| "loss/crossentropy": 2.1391024589538574, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.19923442602157593, |
| "loss/reg": 0.00017592695076018572, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 2.1421875953674316, |
| "grad_norm_var": 0.5555149090661641, |
| "learning_rate": 0.0001, |
| "loss": 1.2089, |
| "loss/crossentropy": 2.5548229217529297, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.17589880526065826, |
| "loss/reg": 0.00017588076298125088, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.058125, |
| "grad_norm": 2.4386532306671143, |
| "grad_norm_var": 0.5470902662726682, |
| "learning_rate": 0.0001, |
| "loss": 1.0766, |
| "loss/crossentropy": 2.741363286972046, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.13345757126808167, |
| "loss/reg": 0.00017582789587322623, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.05825, |
| "grad_norm": 2.7671115398406982, |
| "grad_norm_var": 0.5472918955756455, |
| "learning_rate": 0.0001, |
| "loss": 1.2597, |
| "loss/crossentropy": 2.585339307785034, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1642206907272339, |
| "loss/reg": 0.00017577498510945588, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.058375, |
| "grad_norm": 2.075833797454834, |
| "grad_norm_var": 0.5447581279211853, |
| "learning_rate": 0.0001, |
| "loss": 1.1296, |
| "loss/crossentropy": 2.4353983402252197, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.1513109803199768, |
| "loss/reg": 0.00017573049990460277, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 2.181549072265625, |
| "grad_norm_var": 0.557705888030068, |
| "learning_rate": 0.0001, |
| "loss": 1.234, |
| "loss/crossentropy": 2.389277696609497, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.16970399022102356, |
| "loss/reg": 0.0001756956335157156, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.058625, |
| "grad_norm": 2.239053249359131, |
| "grad_norm_var": 0.5499689577837131, |
| "learning_rate": 0.0001, |
| "loss": 1.3819, |
| "loss/crossentropy": 2.655245780944824, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.2238529622554779, |
| "loss/reg": 0.00017564196605235338, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 2.3605029582977295, |
| "grad_norm_var": 0.05499469594172955, |
| "learning_rate": 0.0001, |
| "loss": 1.3717, |
| "loss/crossentropy": 2.3876919746398926, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.19806072115898132, |
| "loss/reg": 0.00017558463150635362, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.058875, |
| "grad_norm": 2.848848581314087, |
| "grad_norm_var": 0.06278663740608768, |
| "learning_rate": 0.0001, |
| "loss": 1.5331, |
| "loss/crossentropy": 2.2159836292266846, |
| "loss/hidden": 1.2890625, |
| "loss/logits": 0.24231435358524323, |
| "loss/reg": 0.00017553169163875282, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 3.3035221099853516, |
| "grad_norm_var": 0.10327356833769556, |
| "learning_rate": 0.0001, |
| "loss": 1.3096, |
| "loss/crossentropy": 3.1982433795928955, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.16723856329917908, |
| "loss/reg": 0.0001754647382767871, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.059125, |
| "grad_norm": 2.1994223594665527, |
| "grad_norm_var": 0.10799121596538726, |
| "learning_rate": 0.0001, |
| "loss": 1.2996, |
| "loss/crossentropy": 2.478044033050537, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.18064665794372559, |
| "loss/reg": 0.0001753960968926549, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.05925, |
| "grad_norm": 2.7087080478668213, |
| "grad_norm_var": 0.10507261048413336, |
| "learning_rate": 0.0001, |
| "loss": 1.2304, |
| "loss/crossentropy": 2.533445119857788, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.19739562273025513, |
| "loss/reg": 0.0001753264368744567, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.059375, |
| "grad_norm": 2.4171252250671387, |
| "grad_norm_var": 0.1040673242944185, |
| "learning_rate": 0.0001, |
| "loss": 1.2977, |
| "loss/crossentropy": 2.5889570713043213, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.21002550423145294, |
| "loss/reg": 0.00017525417206343263, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 3.3070313930511475, |
| "grad_norm_var": 0.14365224039723495, |
| "learning_rate": 0.0001, |
| "loss": 1.7167, |
| "loss/crossentropy": 2.7916061878204346, |
| "loss/hidden": 1.4453125, |
| "loss/logits": 0.2696676552295685, |
| "loss/reg": 0.0001752021926222369, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.059625, |
| "grad_norm": 3.215808391571045, |
| "grad_norm_var": 0.17060835871623775, |
| "learning_rate": 0.0001, |
| "loss": 1.519, |
| "loss/crossentropy": 2.4236576557159424, |
| "loss/hidden": 1.2734375, |
| "loss/logits": 0.24385260045528412, |
| "loss/reg": 0.0001751292875269428, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.05975, |
| "grad_norm": 2.4063854217529297, |
| "grad_norm_var": 0.17146307657458593, |
| "learning_rate": 0.0001, |
| "loss": 1.0966, |
| "loss/crossentropy": 2.4138169288635254, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.14562630653381348, |
| "loss/reg": 0.00017507674056105316, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.059875, |
| "grad_norm": 2.100240707397461, |
| "grad_norm_var": 0.18538063838473515, |
| "learning_rate": 0.0001, |
| "loss": 1.2543, |
| "loss/crossentropy": 2.656923770904541, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.18223264813423157, |
| "loss/reg": 0.0001750182273099199, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 2.416749954223633, |
| "grad_norm_var": 0.17536422723811237, |
| "learning_rate": 0.0001, |
| "loss": 1.3935, |
| "loss/crossentropy": 2.5389764308929443, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.21990099549293518, |
| "loss/reg": 0.00017496509826742113, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.060125, |
| "grad_norm": 2.2483956813812256, |
| "grad_norm_var": 0.18074697157593392, |
| "learning_rate": 0.0001, |
| "loss": 1.3504, |
| "loss/crossentropy": 2.7335619926452637, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.19241070747375488, |
| "loss/reg": 0.00017491589824203402, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.06025, |
| "grad_norm": 2.1877689361572266, |
| "grad_norm_var": 0.1849354900853349, |
| "learning_rate": 0.0001, |
| "loss": 1.248, |
| "loss/crossentropy": 2.66105318069458, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.18372279405593872, |
| "loss/reg": 0.00017485868011135608, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.060375, |
| "grad_norm": 2.092740297317505, |
| "grad_norm_var": 0.18396663403457297, |
| "learning_rate": 0.0001, |
| "loss": 1.2386, |
| "loss/crossentropy": 2.7327380180358887, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.19782081246376038, |
| "loss/reg": 0.00017478555673733354, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 2.3183560371398926, |
| "grad_norm_var": 0.17906094719213855, |
| "learning_rate": 0.0001, |
| "loss": 1.2824, |
| "loss/crossentropy": 2.4746274948120117, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1869126260280609, |
| "loss/reg": 0.0001747341302689165, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.060625, |
| "grad_norm": 3.5248806476593018, |
| "grad_norm_var": 0.23368608955632125, |
| "learning_rate": 0.0001, |
| "loss": 1.1942, |
| "loss/crossentropy": 2.6376962661743164, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.1689874827861786, |
| "loss/reg": 0.00017466919962316751, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.06075, |
| "grad_norm": 2.8418338298797607, |
| "grad_norm_var": 0.23256916977224643, |
| "learning_rate": 0.0001, |
| "loss": 1.4452, |
| "loss/crossentropy": 2.3843576908111572, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.24809806048870087, |
| "loss/reg": 0.00017459361697547138, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.060875, |
| "grad_norm": 2.697395086288452, |
| "grad_norm_var": 0.22965639284835385, |
| "learning_rate": 0.0001, |
| "loss": 1.1924, |
| "loss/crossentropy": 2.4852230548858643, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.15941958129405975, |
| "loss/reg": 0.000174528788193129, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 3.3923656940460205, |
| "grad_norm_var": 0.23819745706471546, |
| "learning_rate": 0.0001, |
| "loss": 1.2613, |
| "loss/crossentropy": 2.665536880493164, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.18143823742866516, |
| "loss/reg": 0.00017445418052375317, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.061125, |
| "grad_norm": 3.820457935333252, |
| "grad_norm_var": 0.30943274234138396, |
| "learning_rate": 0.0001, |
| "loss": 1.4177, |
| "loss/crossentropy": 2.529301881790161, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.21287089586257935, |
| "loss/reg": 0.00017437619681004435, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 2.441828966140747, |
| "grad_norm_var": 0.3146780452696741, |
| "learning_rate": 0.0001, |
| "loss": 1.1788, |
| "loss/crossentropy": 2.6879920959472656, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1613830029964447, |
| "loss/reg": 0.00017432142340112478, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.061375, |
| "grad_norm": 2.8247528076171875, |
| "grad_norm_var": 0.30890959275739743, |
| "learning_rate": 0.0001, |
| "loss": 1.4539, |
| "loss/crossentropy": 2.2834181785583496, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.21779246628284454, |
| "loss/reg": 0.00017424933321308345, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 2.4493589401245117, |
| "grad_norm_var": 0.2900195920917798, |
| "learning_rate": 0.0001, |
| "loss": 1.1726, |
| "loss/crossentropy": 2.2889974117279053, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.14741873741149902, |
| "loss/reg": 0.00017419550567865372, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.061625, |
| "grad_norm": 2.667480707168579, |
| "grad_norm_var": 0.2700917314036502, |
| "learning_rate": 0.0001, |
| "loss": 1.4045, |
| "loss/crossentropy": 2.404547929763794, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.23084591329097748, |
| "loss/reg": 0.0001741291634971276, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.06175, |
| "grad_norm": 3.452826499938965, |
| "grad_norm_var": 0.3042709664857275, |
| "learning_rate": 0.0001, |
| "loss": 1.3829, |
| "loss/crossentropy": 2.9492647647857666, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.19365137815475464, |
| "loss/reg": 0.00017407909035682678, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.061875, |
| "grad_norm": 2.6484220027923584, |
| "grad_norm_var": 0.2779481152143487, |
| "learning_rate": 0.0001, |
| "loss": 1.4393, |
| "loss/crossentropy": 2.4739902019500732, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.22663193941116333, |
| "loss/reg": 0.00017401424702256918, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 2.3823630809783936, |
| "grad_norm_var": 0.2795572822758951, |
| "learning_rate": 0.0001, |
| "loss": 1.4423, |
| "loss/crossentropy": 2.1366026401519775, |
| "loss/hidden": 1.25, |
| "loss/logits": 0.1905536651611328, |
| "loss/reg": 0.00017397114424966276, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.062125, |
| "grad_norm": 3.4144351482391357, |
| "grad_norm_var": 0.28663513944516883, |
| "learning_rate": 0.0001, |
| "loss": 1.3353, |
| "loss/crossentropy": 2.4802052974700928, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.21641235053539276, |
| "loss/reg": 0.00017391364963259548, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.06225, |
| "grad_norm": 2.339780330657959, |
| "grad_norm_var": 0.2752179712740135, |
| "learning_rate": 0.0001, |
| "loss": 1.3462, |
| "loss/crossentropy": 2.416666030883789, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.20387353003025055, |
| "loss/reg": 0.00017384960665367544, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.062375, |
| "grad_norm": 2.3220226764678955, |
| "grad_norm_var": 0.2559089262690113, |
| "learning_rate": 0.0001, |
| "loss": 1.1988, |
| "loss/crossentropy": 2.668480634689331, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.1736161708831787, |
| "loss/reg": 0.00017379365453962237, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 2.35937762260437, |
| "grad_norm_var": 0.25312725190736174, |
| "learning_rate": 0.0001, |
| "loss": 1.2363, |
| "loss/crossentropy": 2.5724241733551025, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.16428104043006897, |
| "loss/reg": 0.00017374279559589922, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 8000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.2202930782208e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|