{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.375, "eval_steps": 250, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 2.4410972595214844, "learning_rate": 1.0000000000000002e-06, "loss": 1.3861, "loss/crossentropy": 2.3799004554748535, "loss/hidden": 1.1796875, "loss/logits": 0.20445775985717773, "loss/reg": 0.00019296666141599417, "step": 1 }, { "epoch": 0.00025, "grad_norm": 5.245810508728027, "learning_rate": 2.0000000000000003e-06, "loss": 1.8002, "loss/crossentropy": 3.2071006298065186, "loss/hidden": 1.5234375, "loss/logits": 0.27488040924072266, "loss/reg": 0.00019296666141599417, "step": 2 }, { "epoch": 0.000375, "grad_norm": 2.093414545059204, "learning_rate": 3e-06, "loss": 1.1295, "loss/crossentropy": 2.474642753601074, "loss/hidden": 0.98046875, "loss/logits": 0.14706741273403168, "loss/reg": 0.00019296648679301143, "step": 3 }, { "epoch": 0.0005, "grad_norm": 2.3313233852386475, "learning_rate": 4.000000000000001e-06, "loss": 1.1829, "loss/crossentropy": 2.6595563888549805, "loss/hidden": 1.0234375, "loss/logits": 0.15751230716705322, "loss/reg": 0.00019296605023555458, "step": 4 }, { "epoch": 0.000625, "grad_norm": 2.8557937145233154, "learning_rate": 5e-06, "loss": 1.2347, "loss/crossentropy": 2.4212286472320557, "loss/hidden": 1.0546875, "loss/logits": 0.1781189739704132, "loss/reg": 0.00019296558457426727, "step": 5 }, { "epoch": 0.00075, "grad_norm": 3.0438766479492188, "learning_rate": 6e-06, "loss": 1.3852, "loss/crossentropy": 2.5387792587280273, "loss/hidden": 1.1953125, "loss/logits": 0.1879885196685791, "loss/reg": 0.00019296500249765813, "step": 6 }, { "epoch": 0.000875, "grad_norm": 2.6279566287994385, "learning_rate": 7.000000000000001e-06, "loss": 1.3298, "loss/crossentropy": 2.4866859912872314, "loss/hidden": 1.1484375, "loss/logits": 0.17939803004264832, "loss/reg": 0.00019296453683637083, "step": 7 }, { "epoch": 0.001, "grad_norm": 4.1743645668029785, "learning_rate": 8.000000000000001e-06, "loss": 1.3617, "loss/crossentropy": 2.6065773963928223, "loss/hidden": 1.15625, "loss/logits": 0.20351174473762512, "loss/reg": 0.00019296388200018555, "step": 8 }, { "epoch": 0.001125, "grad_norm": 2.4077372550964355, "learning_rate": 9e-06, "loss": 1.3601, "loss/crossentropy": 2.5489792823791504, "loss/hidden": 1.15625, "loss/logits": 0.20188993215560913, "loss/reg": 0.00019296332902740687, "step": 9 }, { "epoch": 0.00125, "grad_norm": 2.785814046859741, "learning_rate": 1e-05, "loss": 1.3588, "loss/crossentropy": 2.892237424850464, "loss/hidden": 1.1640625, "loss/logits": 0.19282634556293488, "loss/reg": 0.0001929624268086627, "step": 10 }, { "epoch": 0.001375, "grad_norm": 2.223435163497925, "learning_rate": 1.1000000000000001e-05, "loss": 1.3473, "loss/crossentropy": 2.6144230365753174, "loss/hidden": 1.140625, "loss/logits": 0.20472508668899536, "loss/reg": 0.0001929616992129013, "step": 11 }, { "epoch": 0.0015, "grad_norm": 2.2602744102478027, "learning_rate": 1.2e-05, "loss": 1.4537, "loss/crossentropy": 2.445138931274414, "loss/hidden": 1.234375, "loss/logits": 0.21741002798080444, "loss/reg": 0.0001929609279613942, "step": 12 }, { "epoch": 0.001625, "grad_norm": 2.167941093444824, "learning_rate": 1.3000000000000001e-05, "loss": 1.3285, "loss/crossentropy": 2.4828319549560547, "loss/hidden": 1.140625, "loss/logits": 0.18590743839740753, "loss/reg": 0.00019296009850222617, "step": 13 }, { "epoch": 0.00175, "grad_norm": 3.135204315185547, "learning_rate": 1.4000000000000001e-05, "loss": 1.2881, "loss/crossentropy": 2.485008955001831, "loss/hidden": 1.1171875, "loss/logits": 0.16897538304328918, "loss/reg": 0.00019296004029456526, "step": 14 }, { "epoch": 0.001875, "grad_norm": 1.9516435861587524, "learning_rate": 1.5e-05, "loss": 1.3793, "loss/crossentropy": 2.6037957668304443, "loss/hidden": 1.15625, "loss/logits": 0.22108706831932068, "loss/reg": 0.0001929590798681602, "step": 15 }, { "epoch": 0.002, "grad_norm": 2.4786994457244873, "grad_norm_var": 0.7268455688420417, "learning_rate": 1.6000000000000003e-05, "loss": 1.4938, "loss/crossentropy": 2.6338419914245605, "loss/hidden": 1.2578125, "loss/logits": 0.23409520089626312, "loss/reg": 0.00019295798847451806, "step": 16 }, { "epoch": 0.002125, "grad_norm": 1.935854196548462, "grad_norm_var": 0.7645541886139611, "learning_rate": 1.7000000000000003e-05, "loss": 1.1905, "loss/crossentropy": 2.5458219051361084, "loss/hidden": 1.015625, "loss/logits": 0.17296427488327026, "loss/reg": 0.00019295624224469066, "step": 17 }, { "epoch": 0.00225, "grad_norm": 4.175808906555176, "grad_norm_var": 0.4775368463766857, "learning_rate": 1.8e-05, "loss": 1.4189, "loss/crossentropy": 2.745898485183716, "loss/hidden": 1.21875, "loss/logits": 0.1982022523880005, "loss/reg": 0.00019295603851787746, "step": 18 }, { "epoch": 0.002375, "grad_norm": 2.6384661197662354, "grad_norm_var": 0.45452375883730595, "learning_rate": 1.9e-05, "loss": 1.546, "loss/crossentropy": 2.5159096717834473, "loss/hidden": 1.3203125, "loss/logits": 0.22378988564014435, "loss/reg": 0.00019295531092211604, "step": 19 }, { "epoch": 0.0025, "grad_norm": 1.930158019065857, "grad_norm_var": 0.48428273913252134, "learning_rate": 2e-05, "loss": 1.1151, "loss/crossentropy": 2.6261353492736816, "loss/hidden": 0.9609375, "loss/logits": 0.1521872580051422, "loss/reg": 0.00019295411766506732, "step": 20 }, { "epoch": 0.002625, "grad_norm": 2.75396728515625, "grad_norm_var": 0.48247025151936307, "learning_rate": 2.1e-05, "loss": 1.4758, "loss/crossentropy": 2.448762893676758, "loss/hidden": 1.2578125, "loss/logits": 0.21609561145305634, "loss/reg": 0.00019295368110761046, "step": 21 }, { "epoch": 0.00275, "grad_norm": 2.8601644039154053, "grad_norm_var": 0.475377454219718, "learning_rate": 2.2000000000000003e-05, "loss": 1.5402, "loss/crossentropy": 2.4616007804870605, "loss/hidden": 1.328125, "loss/logits": 0.21012833714485168, "loss/reg": 0.00019295132369734347, "step": 22 }, { "epoch": 0.002875, "grad_norm": 2.119006395339966, "grad_norm_var": 0.493518604142709, "learning_rate": 2.3000000000000003e-05, "loss": 1.3082, "loss/crossentropy": 2.633119821548462, "loss/hidden": 1.125, "loss/logits": 0.18130342662334442, "loss/reg": 0.0001929480058606714, "step": 23 }, { "epoch": 0.003, "grad_norm": 2.163001537322998, "grad_norm_var": 0.3308316653990078, "learning_rate": 2.4e-05, "loss": 1.2387, "loss/crossentropy": 2.494175672531128, "loss/hidden": 1.0703125, "loss/logits": 0.16641545295715332, "loss/reg": 0.00019294496451038867, "step": 24 }, { "epoch": 0.003125, "grad_norm": 2.0063726902008057, "grad_norm_var": 0.34579458432486, "learning_rate": 2.5e-05, "loss": 1.122, "loss/crossentropy": 2.4526851177215576, "loss/hidden": 0.9765625, "loss/logits": 0.1435042917728424, "loss/reg": 0.00019294198136776686, "step": 25 }, { "epoch": 0.00325, "grad_norm": 2.8159995079040527, "grad_norm_var": 0.34710604301850645, "learning_rate": 2.6000000000000002e-05, "loss": 1.4971, "loss/crossentropy": 2.3794209957122803, "loss/hidden": 1.28125, "loss/logits": 0.2139449566602707, "loss/reg": 0.00019293944933451712, "step": 26 }, { "epoch": 0.003375, "grad_norm": 4.617008209228516, "grad_norm_var": 0.6245762786757856, "learning_rate": 2.7000000000000002e-05, "loss": 1.5303, "loss/crossentropy": 2.374497413635254, "loss/hidden": 1.3046875, "loss/logits": 0.22372832894325256, "loss/reg": 0.0001929359568748623, "step": 27 }, { "epoch": 0.0035, "grad_norm": 3.328347682952881, "grad_norm_var": 0.6438493937520643, "learning_rate": 2.8000000000000003e-05, "loss": 1.5591, "loss/crossentropy": 2.732736587524414, "loss/hidden": 1.3203125, "loss/logits": 0.23687124252319336, "loss/reg": 0.00019293361401651055, "step": 28 }, { "epoch": 0.003625, "grad_norm": 2.3474817276000977, "grad_norm_var": 0.6333103119315846, "learning_rate": 2.9e-05, "loss": 1.2983, "loss/crossentropy": 2.3827714920043945, "loss/hidden": 1.09375, "loss/logits": 0.20258934795856476, "loss/reg": 0.00019292977231089026, "step": 29 }, { "epoch": 0.00375, "grad_norm": 2.560765504837036, "grad_norm_var": 0.6208746981103727, "learning_rate": 3e-05, "loss": 1.2558, "loss/crossentropy": 2.5275678634643555, "loss/hidden": 1.0859375, "loss/logits": 0.16795387864112854, "loss/reg": 0.00019292706565465778, "step": 30 }, { "epoch": 0.003875, "grad_norm": 2.21565580368042, "grad_norm_var": 0.6000257496388314, "learning_rate": 3.1e-05, "loss": 1.1396, "loss/crossentropy": 2.991466999053955, "loss/hidden": 1.0, "loss/logits": 0.13766203820705414, "loss/reg": 0.00019292418437544256, "step": 31 }, { "epoch": 0.004, "grad_norm": 3.0804481506347656, "grad_norm_var": 0.6061713539146301, "learning_rate": 3.2000000000000005e-05, "loss": 1.4502, "loss/crossentropy": 2.5434651374816895, "loss/hidden": 1.234375, "loss/logits": 0.21392151713371277, "loss/reg": 0.00019291977514512837, "step": 32 }, { "epoch": 0.004125, "grad_norm": 3.4683053493499756, "grad_norm_var": 0.5923607081005333, "learning_rate": 3.3e-05, "loss": 1.586, "loss/crossentropy": 2.4229018688201904, "loss/hidden": 1.359375, "loss/logits": 0.2246609479188919, "loss/reg": 0.0001929169666254893, "step": 33 }, { "epoch": 0.00425, "grad_norm": 2.0332539081573486, "grad_norm_var": 0.4912531320085567, "learning_rate": 3.4000000000000007e-05, "loss": 1.1758, "loss/crossentropy": 2.4137802124023438, "loss/hidden": 1.015625, "loss/logits": 0.15825411677360535, "loss/reg": 0.000192913124919869, "step": 34 }, { "epoch": 0.004375, "grad_norm": 2.101600408554077, "grad_norm_var": 0.5125015485684291, "learning_rate": 3.5e-05, "loss": 1.2872, "loss/crossentropy": 2.5844082832336426, "loss/hidden": 1.09375, "loss/logits": 0.19150416553020477, "loss/reg": 0.0001929093268699944, "step": 35 }, { "epoch": 0.0045, "grad_norm": 3.8385353088378906, "grad_norm_var": 0.556932092742056, "learning_rate": 3.6e-05, "loss": 1.1458, "loss/crossentropy": 2.294477701187134, "loss/hidden": 0.98828125, "loss/logits": 0.15560123324394226, "loss/reg": 0.0001929069694597274, "step": 36 }, { "epoch": 0.004625, "grad_norm": 1.8931406736373901, "grad_norm_var": 0.6050138278151498, "learning_rate": 3.7e-05, "loss": 1.0789, "loss/crossentropy": 2.5690038204193115, "loss/hidden": 0.93359375, "loss/logits": 0.14334949851036072, "loss/reg": 0.00019290446653030813, "step": 37 }, { "epoch": 0.00475, "grad_norm": 1.720080018043518, "grad_norm_var": 0.6642705659226048, "learning_rate": 3.8e-05, "loss": 1.2069, "loss/crossentropy": 2.289663791656494, "loss/hidden": 1.046875, "loss/logits": 0.158083975315094, "loss/reg": 0.0001929002464748919, "step": 38 }, { "epoch": 0.004875, "grad_norm": 1.7338802814483643, "grad_norm_var": 0.7005152543709433, "learning_rate": 3.9000000000000006e-05, "loss": 1.0516, "loss/crossentropy": 2.6059696674346924, "loss/hidden": 0.921875, "loss/logits": 0.1277628391981125, "loss/reg": 0.00019289416377432644, "step": 39 }, { "epoch": 0.005, "grad_norm": 2.491640329360962, "grad_norm_var": 0.6872298635287296, "learning_rate": 4e-05, "loss": 1.4446, "loss/crossentropy": 2.4038562774658203, "loss/hidden": 1.2109375, "loss/logits": 0.2316935658454895, "loss/reg": 0.0001928891142597422, "step": 40 }, { "epoch": 0.005125, "grad_norm": 2.42004656791687, "grad_norm_var": 0.6629334231954789, "learning_rate": 4.1e-05, "loss": 1.2198, "loss/crossentropy": 2.4321351051330566, "loss/hidden": 1.0625, "loss/logits": 0.15540093183517456, "loss/reg": 0.00019288310431875288, "step": 41 }, { "epoch": 0.00525, "grad_norm": 2.5629024505615234, "grad_norm_var": 0.6618966221430126, "learning_rate": 4.2e-05, "loss": 1.5254, "loss/crossentropy": 2.23368501663208, "loss/hidden": 1.3046875, "loss/logits": 0.21881133317947388, "loss/reg": 0.0001928747951751575, "step": 42 }, { "epoch": 0.005375, "grad_norm": 2.2626430988311768, "grad_norm_var": 0.3911191161730377, "learning_rate": 4.3e-05, "loss": 1.4362, "loss/crossentropy": 2.2880733013153076, "loss/hidden": 1.2109375, "loss/logits": 0.22328704595565796, "loss/reg": 0.0001928645942825824, "step": 43 }, { "epoch": 0.0055, "grad_norm": 2.928407907485962, "grad_norm_var": 0.3571399417370721, "learning_rate": 4.4000000000000006e-05, "loss": 1.1088, "loss/crossentropy": 2.837071418762207, "loss/hidden": 0.96484375, "loss/logits": 0.1420312374830246, "loss/reg": 0.00019285624148324132, "step": 44 }, { "epoch": 0.005625, "grad_norm": 2.7014455795288086, "grad_norm_var": 0.35877893903101093, "learning_rate": 4.5e-05, "loss": 1.3821, "loss/crossentropy": 2.2957558631896973, "loss/hidden": 1.1953125, "loss/logits": 0.18486037850379944, "loss/reg": 0.0001928448909893632, "step": 45 }, { "epoch": 0.00575, "grad_norm": 1.8899732828140259, "grad_norm_var": 0.38153805228543447, "learning_rate": 4.600000000000001e-05, "loss": 1.0258, "loss/crossentropy": 2.3359339237213135, "loss/hidden": 0.90625, "loss/logits": 0.11766321957111359, "loss/reg": 0.00019283634901512414, "step": 46 }, { "epoch": 0.005875, "grad_norm": 2.2188658714294434, "grad_norm_var": 0.3814345973993899, "learning_rate": 4.7e-05, "loss": 1.0929, "loss/crossentropy": 2.6772139072418213, "loss/hidden": 0.96484375, "loss/logits": 0.12616246938705444, "loss/reg": 0.00019282741413917392, "step": 47 }, { "epoch": 0.006, "grad_norm": 2.1530601978302, "grad_norm_var": 0.35835352199148307, "learning_rate": 4.8e-05, "loss": 1.2993, "loss/crossentropy": 2.678344249725342, "loss/hidden": 1.109375, "loss/logits": 0.18795417249202728, "loss/reg": 0.00019281756249256432, "step": 48 }, { "epoch": 0.006125, "grad_norm": 2.0299501419067383, "grad_norm_var": 0.28299041785001716, "learning_rate": 4.9e-05, "loss": 1.1433, "loss/crossentropy": 2.6076881885528564, "loss/hidden": 0.9921875, "loss/logits": 0.14920485019683838, "loss/reg": 0.00019280907872598618, "step": 49 }, { "epoch": 0.00625, "grad_norm": 2.6268253326416016, "grad_norm_var": 0.28301229188597427, "learning_rate": 5e-05, "loss": 1.057, "loss/crossentropy": 2.625675678253174, "loss/hidden": 0.91796875, "loss/logits": 0.13711729645729065, "loss/reg": 0.00019279817934148014, "step": 50 }, { "epoch": 0.006375, "grad_norm": 1.8832714557647705, "grad_norm_var": 0.29317342698340093, "learning_rate": 5.1000000000000006e-05, "loss": 1.2667, "loss/crossentropy": 2.4522390365600586, "loss/hidden": 1.0859375, "loss/logits": 0.17878666520118713, "loss/reg": 0.00019278968102298677, "step": 51 }, { "epoch": 0.0065, "grad_norm": 2.7895314693450928, "grad_norm_var": 0.15160714498306468, "learning_rate": 5.2000000000000004e-05, "loss": 1.38, "loss/crossentropy": 2.143043041229248, "loss/hidden": 1.21875, "loss/logits": 0.15935076773166656, "loss/reg": 0.00019278022227808833, "step": 52 }, { "epoch": 0.006625, "grad_norm": 3.2793004512786865, "grad_norm_var": 0.20221103833160992, "learning_rate": 5.300000000000001e-05, "loss": 1.5103, "loss/crossentropy": 2.9073903560638428, "loss/hidden": 1.25, "loss/logits": 0.2583482563495636, "loss/reg": 0.00019277248065918684, "step": 53 }, { "epoch": 0.00675, "grad_norm": 2.013625144958496, "grad_norm_var": 0.18271730407283251, "learning_rate": 5.4000000000000005e-05, "loss": 1.0809, "loss/crossentropy": 2.50508975982666, "loss/hidden": 0.9453125, "loss/logits": 0.1336328089237213, "loss/reg": 0.00019276094099041075, "step": 54 }, { "epoch": 0.006875, "grad_norm": 2.584568738937378, "grad_norm_var": 0.15533136257730426, "learning_rate": 5.500000000000001e-05, "loss": 1.2027, "loss/crossentropy": 2.4901885986328125, "loss/hidden": 1.046875, "loss/logits": 0.1539323329925537, "loss/reg": 0.00019275395607110113, "step": 55 }, { "epoch": 0.007, "grad_norm": 2.0262043476104736, "grad_norm_var": 0.16487505994896826, "learning_rate": 5.6000000000000006e-05, "loss": 1.3553, "loss/crossentropy": 2.299118757247925, "loss/hidden": 1.1484375, "loss/logits": 0.20495867729187012, "loss/reg": 0.0001927466510096565, "step": 56 }, { "epoch": 0.007125, "grad_norm": 2.4767560958862305, "grad_norm_var": 0.16524151904890463, "learning_rate": 5.6999999999999996e-05, "loss": 1.2841, "loss/crossentropy": 2.5054197311401367, "loss/hidden": 1.1171875, "loss/logits": 0.1650199145078659, "loss/reg": 0.00019273380166850984, "step": 57 }, { "epoch": 0.00725, "grad_norm": 1.8864933252334595, "grad_norm_var": 0.17929933439751622, "learning_rate": 5.8e-05, "loss": 1.0831, "loss/crossentropy": 2.6685423851013184, "loss/hidden": 0.95703125, "loss/logits": 0.1241319477558136, "loss/reg": 0.00019271954079158604, "step": 58 }, { "epoch": 0.007375, "grad_norm": 1.927100658416748, "grad_norm_var": 0.19066639705673746, "learning_rate": 5.9e-05, "loss": 1.2188, "loss/crossentropy": 2.4530575275421143, "loss/hidden": 1.0546875, "loss/logits": 0.1622183918952942, "loss/reg": 0.0001927079283632338, "step": 59 }, { "epoch": 0.0075, "grad_norm": 1.9683936834335327, "grad_norm_var": 0.17275381294839126, "learning_rate": 6e-05, "loss": 1.1956, "loss/crossentropy": 2.699939250946045, "loss/hidden": 1.0390625, "loss/logits": 0.15460126101970673, "loss/reg": 0.00019269227050244808, "step": 60 }, { "epoch": 0.007625, "grad_norm": 2.430546998977661, "grad_norm_var": 0.1620622944705805, "learning_rate": 6.1e-05, "loss": 1.2826, "loss/crossentropy": 2.2341370582580566, "loss/hidden": 1.1171875, "loss/logits": 0.16349473595619202, "loss/reg": 0.0001926780241774395, "step": 61 }, { "epoch": 0.00775, "grad_norm": 2.28157377243042, "grad_norm_var": 0.15224653123686924, "learning_rate": 6.2e-05, "loss": 1.0961, "loss/crossentropy": 2.3603291511535645, "loss/hidden": 0.953125, "loss/logits": 0.14104235172271729, "loss/reg": 0.00019266277377028018, "step": 62 }, { "epoch": 0.007875, "grad_norm": 1.9362468719482422, "grad_norm_var": 0.1597685683605616, "learning_rate": 6.3e-05, "loss": 1.1251, "loss/crossentropy": 2.6113460063934326, "loss/hidden": 0.984375, "loss/logits": 0.1387864351272583, "loss/reg": 0.00019264982256572694, "step": 63 }, { "epoch": 0.008, "grad_norm": 3.4527783393859863, "grad_norm_var": 0.245370177212871, "learning_rate": 6.400000000000001e-05, "loss": 1.5699, "loss/crossentropy": 2.3186392784118652, "loss/hidden": 1.3359375, "loss/logits": 0.2319989800453186, "loss/reg": 0.00019263311696704477, "step": 64 }, { "epoch": 0.008125, "grad_norm": 2.002469062805176, "grad_norm_var": 0.24658852169075185, "learning_rate": 6.500000000000001e-05, "loss": 1.2089, "loss/crossentropy": 2.1945531368255615, "loss/hidden": 1.046875, "loss/logits": 0.1600569784641266, "loss/reg": 0.0001926190307131037, "step": 65 }, { "epoch": 0.00825, "grad_norm": 1.8942357301712036, "grad_norm_var": 0.25288209179575216, "learning_rate": 6.6e-05, "loss": 1.1301, "loss/crossentropy": 2.3987529277801514, "loss/hidden": 0.98046875, "loss/logits": 0.1477484107017517, "loss/reg": 0.0001926012773765251, "step": 66 }, { "epoch": 0.008375, "grad_norm": 2.9109678268432617, "grad_norm_var": 0.2615059196420252, "learning_rate": 6.7e-05, "loss": 1.2255, "loss/crossentropy": 2.5337891578674316, "loss/hidden": 1.0625, "loss/logits": 0.16106070578098297, "loss/reg": 0.00019258313113823533, "step": 67 }, { "epoch": 0.0085, "grad_norm": 2.1384077072143555, "grad_norm_var": 0.25126003810038394, "learning_rate": 6.800000000000001e-05, "loss": 1.3278, "loss/crossentropy": 2.577103614807129, "loss/hidden": 1.125, "loss/logits": 0.20084746181964874, "loss/reg": 0.00019256297673564404, "step": 68 }, { "epoch": 0.008625, "grad_norm": 2.1597537994384766, "grad_norm_var": 0.18723560405016518, "learning_rate": 6.9e-05, "loss": 1.1538, "loss/crossentropy": 2.6212854385375977, "loss/hidden": 1.0, "loss/logits": 0.15191976726055145, "loss/reg": 0.00019254189101047814, "step": 69 }, { "epoch": 0.00875, "grad_norm": 1.8894274234771729, "grad_norm_var": 0.19220724163081773, "learning_rate": 7e-05, "loss": 1.1419, "loss/crossentropy": 2.439669132232666, "loss/hidden": 0.96484375, "loss/logits": 0.17508690059185028, "loss/reg": 0.00019252618949394673, "step": 70 }, { "epoch": 0.008875, "grad_norm": 1.9923198223114014, "grad_norm_var": 0.1875417585129336, "learning_rate": 7.1e-05, "loss": 1.159, "loss/crossentropy": 2.593405246734619, "loss/hidden": 0.99609375, "loss/logits": 0.16099245846271515, "loss/reg": 0.00019250869809184223, "step": 71 }, { "epoch": 0.009, "grad_norm": 2.1258373260498047, "grad_norm_var": 0.18570921033151055, "learning_rate": 7.2e-05, "loss": 1.2886, "loss/crossentropy": 2.31851863861084, "loss/hidden": 1.1171875, "loss/logits": 0.16945742070674896, "loss/reg": 0.00019249116303399205, "step": 72 }, { "epoch": 0.009125, "grad_norm": 2.2625362873077393, "grad_norm_var": 0.1811603588675789, "learning_rate": 7.3e-05, "loss": 1.0782, "loss/crossentropy": 2.4330179691314697, "loss/hidden": 0.93359375, "loss/logits": 0.14271824061870575, "loss/reg": 0.0001924755924846977, "step": 73 }, { "epoch": 0.00925, "grad_norm": 2.4275050163269043, "grad_norm_var": 0.17657254479349263, "learning_rate": 7.4e-05, "loss": 1.2584, "loss/crossentropy": 2.5500473976135254, "loss/hidden": 1.0703125, "loss/logits": 0.186170294880867, "loss/reg": 0.0001924607204273343, "step": 74 }, { "epoch": 0.009375, "grad_norm": 2.333041191101074, "grad_norm_var": 0.17007094778421958, "learning_rate": 7.500000000000001e-05, "loss": 1.0767, "loss/crossentropy": 2.6195261478424072, "loss/hidden": 0.94921875, "loss/logits": 0.12556512653827667, "loss/reg": 0.0001924492244143039, "step": 75 }, { "epoch": 0.0095, "grad_norm": 2.6031246185302734, "grad_norm_var": 0.1703287548027977, "learning_rate": 7.6e-05, "loss": 1.2179, "loss/crossentropy": 2.488292932510376, "loss/hidden": 1.0234375, "loss/logits": 0.1925877332687378, "loss/reg": 0.000192438907106407, "step": 76 }, { "epoch": 0.009625, "grad_norm": 2.139622211456299, "grad_norm_var": 0.17065351345722057, "learning_rate": 7.7e-05, "loss": 1.0607, "loss/crossentropy": 2.934234619140625, "loss/hidden": 0.921875, "loss/logits": 0.1368916928768158, "loss/reg": 0.0001924296229844913, "step": 77 }, { "epoch": 0.00975, "grad_norm": 1.9406142234802246, "grad_norm_var": 0.17804626450119793, "learning_rate": 7.800000000000001e-05, "loss": 1.2752, "loss/crossentropy": 2.2727177143096924, "loss/hidden": 1.1015625, "loss/logits": 0.17168399691581726, "loss/reg": 0.0001924206007970497, "step": 78 }, { "epoch": 0.009875, "grad_norm": 3.055265426635742, "grad_norm_var": 0.20754827159903322, "learning_rate": 7.900000000000001e-05, "loss": 1.1607, "loss/crossentropy": 2.6755881309509277, "loss/hidden": 1.0, "loss/logits": 0.15880770981311798, "loss/reg": 0.00019241031259298325, "step": 79 }, { "epoch": 0.01, "grad_norm": 2.129188060760498, "grad_norm_var": 0.11942340663251176, "learning_rate": 8e-05, "loss": 1.2684, "loss/crossentropy": 2.5891458988189697, "loss/hidden": 1.078125, "loss/logits": 0.18836861848831177, "loss/reg": 0.00019238927052356303, "step": 80 }, { "epoch": 0.010125, "grad_norm": 3.2390594482421875, "grad_norm_var": 0.17413858607061497, "learning_rate": 8.1e-05, "loss": 2.3912, "loss/crossentropy": 2.566899061203003, "loss/hidden": 1.7265625, "loss/logits": 0.6627247333526611, "loss/reg": 0.00019236840307712555, "step": 81 }, { "epoch": 0.01025, "grad_norm": 2.6307613849639893, "grad_norm_var": 0.16548936874203926, "learning_rate": 8.2e-05, "loss": 1.2744, "loss/crossentropy": 1.8344066143035889, "loss/hidden": 1.140625, "loss/logits": 0.1318967193365097, "loss/reg": 0.00019235462241340429, "step": 82 }, { "epoch": 0.010375, "grad_norm": 3.545522928237915, "grad_norm_var": 0.23612178547081322, "learning_rate": 8.3e-05, "loss": 1.2902, "loss/crossentropy": 2.7840187549591064, "loss/hidden": 1.109375, "loss/logits": 0.1788717359304428, "loss/reg": 0.00019232937484048307, "step": 83 }, { "epoch": 0.0105, "grad_norm": 2.2964553833007812, "grad_norm_var": 0.23189123641267292, "learning_rate": 8.4e-05, "loss": 1.4269, "loss/crossentropy": 2.6213274002075195, "loss/hidden": 1.2265625, "loss/logits": 0.19838152825832367, "loss/reg": 0.00019230577163398266, "step": 84 }, { "epoch": 0.010625, "grad_norm": 2.2252533435821533, "grad_norm_var": 0.229859261969087, "learning_rate": 8.5e-05, "loss": 1.1939, "loss/crossentropy": 2.493159055709839, "loss/hidden": 1.0234375, "loss/logits": 0.16855394840240479, "loss/reg": 0.00019228595192544162, "step": 85 }, { "epoch": 0.01075, "grad_norm": 2.2857768535614014, "grad_norm_var": 0.21125701567141184, "learning_rate": 8.6e-05, "loss": 1.2406, "loss/crossentropy": 2.486485719680786, "loss/hidden": 1.046875, "loss/logits": 0.1917887181043625, "loss/reg": 0.00019227097800467163, "step": 86 }, { "epoch": 0.010875, "grad_norm": 4.509547710418701, "grad_norm_var": 0.4530040889277906, "learning_rate": 8.7e-05, "loss": 1.1073, "loss/crossentropy": 2.630571126937866, "loss/hidden": 0.96484375, "loss/logits": 0.14057384431362152, "loss/reg": 0.00019225555297452956, "step": 87 }, { "epoch": 0.011, "grad_norm": 2.0168137550354004, "grad_norm_var": 0.4607750991685954, "learning_rate": 8.800000000000001e-05, "loss": 1.053, "loss/crossentropy": 2.3603971004486084, "loss/hidden": 0.9140625, "loss/logits": 0.13705039024353027, "loss/reg": 0.0001922310038935393, "step": 88 }, { "epoch": 0.011125, "grad_norm": 2.73807430267334, "grad_norm_var": 0.45335285375272844, "learning_rate": 8.900000000000001e-05, "loss": 1.3178, "loss/crossentropy": 2.93936824798584, "loss/hidden": 1.1171875, "loss/logits": 0.19865703582763672, "loss/reg": 0.00019220533431507647, "step": 89 }, { "epoch": 0.01125, "grad_norm": 2.305682420730591, "grad_norm_var": 0.4576056958578417, "learning_rate": 9e-05, "loss": 1.0605, "loss/crossentropy": 2.4470906257629395, "loss/hidden": 0.921875, "loss/logits": 0.13670536875724792, "loss/reg": 0.00019217943190596998, "step": 90 }, { "epoch": 0.011375, "grad_norm": 2.333709239959717, "grad_norm_var": 0.45757975254874145, "learning_rate": 9.1e-05, "loss": 1.1485, "loss/crossentropy": 2.3914265632629395, "loss/hidden": 1.0, "loss/logits": 0.14661094546318054, "loss/reg": 0.00019215639622416347, "step": 91 }, { "epoch": 0.0115, "grad_norm": 3.1051077842712402, "grad_norm_var": 0.4718879306887779, "learning_rate": 9.200000000000001e-05, "loss": 1.1946, "loss/crossentropy": 2.3768973350524902, "loss/hidden": 1.046875, "loss/logits": 0.1458442062139511, "loss/reg": 0.00019213555788155645, "step": 92 }, { "epoch": 0.011625, "grad_norm": 1.9250227212905884, "grad_norm_var": 0.48954230695473144, "learning_rate": 9.300000000000001e-05, "loss": 1.0554, "loss/crossentropy": 2.565377712249756, "loss/hidden": 0.9140625, "loss/logits": 0.13942840695381165, "loss/reg": 0.00019211515609640628, "step": 93 }, { "epoch": 0.01175, "grad_norm": 2.176602840423584, "grad_norm_var": 0.4709343827101543, "learning_rate": 9.4e-05, "loss": 1.1256, "loss/crossentropy": 2.4452078342437744, "loss/hidden": 0.9765625, "loss/logits": 0.1470910608768463, "loss/reg": 0.00019209457968827337, "step": 94 }, { "epoch": 0.011875, "grad_norm": 3.14528489112854, "grad_norm_var": 0.4762166867826877, "learning_rate": 9.5e-05, "loss": 1.6721, "loss/crossentropy": 2.0026628971099854, "loss/hidden": 1.421875, "loss/logits": 0.24830523133277893, "loss/reg": 0.0001920641807373613, "step": 95 }, { "epoch": 0.012, "grad_norm": 2.5958688259124756, "grad_norm_var": 0.4566131842781544, "learning_rate": 9.6e-05, "loss": 1.2013, "loss/crossentropy": 2.620640993118286, "loss/hidden": 1.0234375, "loss/logits": 0.1759262979030609, "loss/reg": 0.00019204463751520962, "step": 96 }, { "epoch": 0.012125, "grad_norm": 2.195146322250366, "grad_norm_var": 0.4486006387079304, "learning_rate": 9.7e-05, "loss": 1.1398, "loss/crossentropy": 2.547090768814087, "loss/hidden": 0.9765625, "loss/logits": 0.16133451461791992, "loss/reg": 0.0001920342183439061, "step": 97 }, { "epoch": 0.01225, "grad_norm": 2.252614974975586, "grad_norm_var": 0.4573438457489293, "learning_rate": 9.8e-05, "loss": 1.1583, "loss/crossentropy": 2.4780287742614746, "loss/hidden": 1.0, "loss/logits": 0.15635529160499573, "loss/reg": 0.00019200837414246053, "step": 98 }, { "epoch": 0.012375, "grad_norm": 2.421096086502075, "grad_norm_var": 0.3951004366779675, "learning_rate": 9.900000000000001e-05, "loss": 1.1038, "loss/crossentropy": 2.2537543773651123, "loss/hidden": 0.96484375, "loss/logits": 0.13705003261566162, "loss/reg": 0.0001919834321597591, "step": 99 }, { "epoch": 0.0125, "grad_norm": 2.174814462661743, "grad_norm_var": 0.3998617443443601, "learning_rate": 0.0001, "loss": 1.1691, "loss/crossentropy": 2.458591938018799, "loss/hidden": 0.99609375, "loss/logits": 0.17106780409812927, "loss/reg": 0.0001919578353408724, "step": 100 }, { "epoch": 0.012625, "grad_norm": 2.622562885284424, "grad_norm_var": 0.39382746835866644, "learning_rate": 0.0001, "loss": 1.1769, "loss/crossentropy": 2.5160534381866455, "loss/hidden": 1.0078125, "loss/logits": 0.16712725162506104, "loss/reg": 0.00019193820480722934, "step": 101 }, { "epoch": 0.01275, "grad_norm": 2.24092960357666, "grad_norm_var": 0.3955345231673798, "learning_rate": 0.0001, "loss": 1.2354, "loss/crossentropy": 2.1451447010040283, "loss/hidden": 1.078125, "loss/logits": 0.1553465873003006, "loss/reg": 0.00019191819592379034, "step": 102 }, { "epoch": 0.012875, "grad_norm": 2.344604730606079, "grad_norm_var": 0.12208757192350952, "learning_rate": 0.0001, "loss": 1.4627, "loss/crossentropy": 2.641226053237915, "loss/hidden": 1.203125, "loss/logits": 0.2576836943626404, "loss/reg": 0.00019189789600204676, "step": 103 }, { "epoch": 0.013, "grad_norm": 2.6583104133605957, "grad_norm_var": 0.1139956751841869, "learning_rate": 0.0001, "loss": 1.2235, "loss/crossentropy": 2.2702269554138184, "loss/hidden": 1.046875, "loss/logits": 0.1746675968170166, "loss/reg": 0.00019186925783287734, "step": 104 }, { "epoch": 0.013125, "grad_norm": 2.452503204345703, "grad_norm_var": 0.10820816494096158, "learning_rate": 0.0001, "loss": 1.2496, "loss/crossentropy": 2.344975709915161, "loss/hidden": 1.0859375, "loss/logits": 0.1617823839187622, "loss/reg": 0.000191839542821981, "step": 105 }, { "epoch": 0.01325, "grad_norm": 2.3220486640930176, "grad_norm_var": 0.1079440961702573, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.382570505142212, "loss/hidden": 1.140625, "loss/logits": 0.1758350431919098, "loss/reg": 0.00019180966774001718, "step": 106 }, { "epoch": 0.013375, "grad_norm": 2.275015115737915, "grad_norm_var": 0.1089551443983727, "learning_rate": 0.0001, "loss": 1.1398, "loss/crossentropy": 2.288180112838745, "loss/hidden": 0.9765625, "loss/logits": 0.1613542139530182, "loss/reg": 0.00019177970534656197, "step": 107 }, { "epoch": 0.0135, "grad_norm": 2.281081199645996, "grad_norm_var": 0.0774087174098435, "learning_rate": 0.0001, "loss": 1.1199, "loss/crossentropy": 2.566871404647827, "loss/hidden": 0.96875, "loss/logits": 0.14925265312194824, "loss/reg": 0.00019174529006704688, "step": 108 }, { "epoch": 0.013625, "grad_norm": 2.281104564666748, "grad_norm_var": 0.06372173379328742, "learning_rate": 0.0001, "loss": 1.1916, "loss/crossentropy": 2.335575580596924, "loss/hidden": 1.015625, "loss/logits": 0.1740630865097046, "loss/reg": 0.00019171558960806578, "step": 109 }, { "epoch": 0.01375, "grad_norm": 3.2676873207092285, "grad_norm_var": 0.10526650532868657, "learning_rate": 0.0001, "loss": 1.4065, "loss/crossentropy": 2.33091402053833, "loss/hidden": 1.1796875, "loss/logits": 0.2248753160238266, "loss/reg": 0.00019168361905030906, "step": 110 }, { "epoch": 0.013875, "grad_norm": 2.448396921157837, "grad_norm_var": 0.07293540299188876, "learning_rate": 0.0001, "loss": 1.1487, "loss/crossentropy": 2.759523391723633, "loss/hidden": 0.98828125, "loss/logits": 0.1585177183151245, "loss/reg": 0.0001916515757329762, "step": 111 }, { "epoch": 0.014, "grad_norm": 2.745311975479126, "grad_norm_var": 0.07769384665264147, "learning_rate": 0.0001, "loss": 1.3054, "loss/crossentropy": 2.6752800941467285, "loss/hidden": 1.1328125, "loss/logits": 0.17071780562400818, "loss/reg": 0.00019161647651344538, "step": 112 }, { "epoch": 0.014125, "grad_norm": 6.7866058349609375, "grad_norm_var": 1.2475617279497555, "learning_rate": 0.0001, "loss": 1.527, "loss/crossentropy": 2.536259174346924, "loss/hidden": 1.328125, "loss/logits": 0.19696056842803955, "loss/reg": 0.0001915794564411044, "step": 113 }, { "epoch": 0.01425, "grad_norm": 2.572187900543213, "grad_norm_var": 1.2338838698080574, "learning_rate": 0.0001, "loss": 1.0705, "loss/crossentropy": 3.121675968170166, "loss/hidden": 0.92578125, "loss/logits": 0.1427587866783142, "loss/reg": 0.0001915483589982614, "step": 114 }, { "epoch": 0.014375, "grad_norm": 2.3879568576812744, "grad_norm_var": 1.2353765898385585, "learning_rate": 0.0001, "loss": 1.1787, "loss/crossentropy": 2.59924578666687, "loss/hidden": 1.015625, "loss/logits": 0.16120225191116333, "loss/reg": 0.0001915154862217605, "step": 115 }, { "epoch": 0.0145, "grad_norm": 2.207024097442627, "grad_norm_var": 1.2330085058190583, "learning_rate": 0.0001, "loss": 1.1257, "loss/crossentropy": 2.3122026920318604, "loss/hidden": 0.96484375, "loss/logits": 0.15894125401973724, "loss/reg": 0.0001914859312819317, "step": 116 }, { "epoch": 0.014625, "grad_norm": 2.5943527221679688, "grad_norm_var": 1.233512504208524, "learning_rate": 0.0001, "loss": 1.0578, "loss/crossentropy": 2.4962315559387207, "loss/hidden": 0.9296875, "loss/logits": 0.12624415755271912, "loss/reg": 0.0001914564927574247, "step": 117 }, { "epoch": 0.01475, "grad_norm": 2.5308315753936768, "grad_norm_var": 1.2194136468208996, "learning_rate": 0.0001, "loss": 1.3378, "loss/crossentropy": 2.2406933307647705, "loss/hidden": 1.15625, "loss/logits": 0.17962361872196198, "loss/reg": 0.00019143095414619893, "step": 118 }, { "epoch": 0.014875, "grad_norm": 3.8317177295684814, "grad_norm_var": 1.2753290966219148, "learning_rate": 0.0001, "loss": 1.5298, "loss/crossentropy": 2.7381229400634766, "loss/hidden": 1.296875, "loss/logits": 0.2309999018907547, "loss/reg": 0.00019139735377393663, "step": 119 }, { "epoch": 0.015, "grad_norm": 2.552171468734741, "grad_norm_var": 1.2787832219082094, "learning_rate": 0.0001, "loss": 1.1698, "loss/crossentropy": 2.678725481033325, "loss/hidden": 1.0078125, "loss/logits": 0.16004806756973267, "loss/reg": 0.0001913599990075454, "step": 120 }, { "epoch": 0.015125, "grad_norm": 2.930230140686035, "grad_norm_var": 1.2679826365318356, "learning_rate": 0.0001, "loss": 1.3195, "loss/crossentropy": 2.5161564350128174, "loss/hidden": 1.1015625, "loss/logits": 0.21604904532432556, "loss/reg": 0.00019133626483380795, "step": 121 }, { "epoch": 0.01525, "grad_norm": 2.184554100036621, "grad_norm_var": 1.2793169490082976, "learning_rate": 0.0001, "loss": 1.3258, "loss/crossentropy": 2.1544501781463623, "loss/hidden": 1.140625, "loss/logits": 0.18321675062179565, "loss/reg": 0.0001913021260406822, "step": 122 }, { "epoch": 0.015375, "grad_norm": 2.3296663761138916, "grad_norm_var": 1.2751879992777064, "learning_rate": 0.0001, "loss": 1.2465, "loss/crossentropy": 2.3694839477539062, "loss/hidden": 1.078125, "loss/logits": 0.16649125516414642, "loss/reg": 0.00019128025451209396, "step": 123 }, { "epoch": 0.0155, "grad_norm": 2.683415174484253, "grad_norm_var": 1.2536762853317933, "learning_rate": 0.0001, "loss": 1.3152, "loss/crossentropy": 2.480109930038452, "loss/hidden": 1.1171875, "loss/logits": 0.1961439996957779, "loss/reg": 0.00019124921527691185, "step": 124 }, { "epoch": 0.015625, "grad_norm": 2.2008211612701416, "grad_norm_var": 1.2606593807518112, "learning_rate": 0.0001, "loss": 1.2683, "loss/crossentropy": 2.3611254692077637, "loss/hidden": 1.09375, "loss/logits": 0.17260757088661194, "loss/reg": 0.00019121899094898254, "step": 125 }, { "epoch": 0.01575, "grad_norm": 2.5672714710235596, "grad_norm_var": 1.2561244980458308, "learning_rate": 0.0001, "loss": 1.1256, "loss/crossentropy": 2.4654054641723633, "loss/hidden": 0.97265625, "loss/logits": 0.1510714888572693, "loss/reg": 0.00019119179341942072, "step": 126 }, { "epoch": 0.015875, "grad_norm": 5.104769706726074, "grad_norm_var": 1.5559544106053274, "learning_rate": 0.0001, "loss": 1.7662, "loss/crossentropy": 2.8167941570281982, "loss/hidden": 1.46875, "loss/logits": 0.2955778241157532, "loss/reg": 0.00019115611212328076, "step": 127 }, { "epoch": 0.016, "grad_norm": 2.780869483947754, "grad_norm_var": 1.5547640591921663, "learning_rate": 0.0001, "loss": 1.3813, "loss/crossentropy": 2.4478983879089355, "loss/hidden": 1.1875, "loss/logits": 0.19183939695358276, "loss/reg": 0.00019112625159323215, "step": 128 }, { "epoch": 0.016125, "grad_norm": 2.753885269165039, "grad_norm_var": 0.5433630068432194, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.6112186908721924, "loss/hidden": 1.15625, "loss/logits": 0.2141554355621338, "loss/reg": 0.0001910965656861663, "step": 129 }, { "epoch": 0.01625, "grad_norm": 2.9186818599700928, "grad_norm_var": 0.5420405140966994, "learning_rate": 0.0001, "loss": 1.4122, "loss/crossentropy": 2.4329094886779785, "loss/hidden": 1.2109375, "loss/logits": 0.199338898062706, "loss/reg": 0.00019106207764707506, "step": 130 }, { "epoch": 0.016375, "grad_norm": 3.3282015323638916, "grad_norm_var": 0.5475325270303432, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.808492660522461, "loss/hidden": 1.1171875, "loss/logits": 0.20454144477844238, "loss/reg": 0.00019102977239526808, "step": 131 }, { "epoch": 0.0165, "grad_norm": 2.5251357555389404, "grad_norm_var": 0.5268546307130834, "learning_rate": 0.0001, "loss": 1.0806, "loss/crossentropy": 2.606909990310669, "loss/hidden": 0.94140625, "loss/logits": 0.1372692584991455, "loss/reg": 0.00019099873316008598, "step": 132 }, { "epoch": 0.016625, "grad_norm": 2.695373773574829, "grad_norm_var": 0.5238667023797914, "learning_rate": 0.0001, "loss": 1.1349, "loss/crossentropy": 2.5870134830474854, "loss/hidden": 0.97265625, "loss/logits": 0.16037797927856445, "loss/reg": 0.00019096150936093181, "step": 133 }, { "epoch": 0.01675, "grad_norm": 2.967905282974243, "grad_norm_var": 0.5160494986535296, "learning_rate": 0.0001, "loss": 1.3901, "loss/crossentropy": 2.3568673133850098, "loss/hidden": 1.1875, "loss/logits": 0.20067735016345978, "loss/reg": 0.0001909265120048076, "step": 134 }, { "epoch": 0.016875, "grad_norm": 2.248267412185669, "grad_norm_var": 0.4754480378524304, "learning_rate": 0.0001, "loss": 1.2149, "loss/crossentropy": 2.513349771499634, "loss/hidden": 1.046875, "loss/logits": 0.1660783886909485, "loss/reg": 0.00019089688430540264, "step": 135 }, { "epoch": 0.017, "grad_norm": 2.6341240406036377, "grad_norm_var": 0.4731794320985157, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.307194709777832, "loss/hidden": 1.2421875, "loss/logits": 0.23469889163970947, "loss/reg": 0.00019086015527136624, "step": 136 }, { "epoch": 0.017125, "grad_norm": 26.939998626708984, "grad_norm_var": 36.90875808790156, "learning_rate": 0.0001, "loss": 1.9603, "loss/crossentropy": 2.4554264545440674, "loss/hidden": 1.7265625, "loss/logits": 0.23182228207588196, "loss/reg": 0.00019083071674685925, "step": 137 }, { "epoch": 0.01725, "grad_norm": 2.5422139167785645, "grad_norm_var": 36.81568419391185, "learning_rate": 0.0001, "loss": 1.0683, "loss/crossentropy": 2.4824135303497314, "loss/hidden": 0.921875, "loss/logits": 0.14447075128555298, "loss/reg": 0.00019080075435340405, "step": 138 }, { "epoch": 0.017375, "grad_norm": 8.87843132019043, "grad_norm_var": 37.75269230148678, "learning_rate": 0.0001, "loss": 2.5968, "loss/crossentropy": 3.632392406463623, "loss/hidden": 2.0, "loss/logits": 0.594857931137085, "loss/reg": 0.0001907727710204199, "step": 139 }, { "epoch": 0.0175, "grad_norm": 2.6119391918182373, "grad_norm_var": 37.77256905325698, "learning_rate": 0.0001, "loss": 1.2831, "loss/crossentropy": 2.694307804107666, "loss/hidden": 1.0859375, "loss/logits": 0.19521979987621307, "loss/reg": 0.00019074990996159613, "step": 140 }, { "epoch": 0.017625, "grad_norm": 2.630040407180786, "grad_norm_var": 37.63927642256101, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.7011289596557617, "loss/hidden": 1.109375, "loss/logits": 0.17411190271377563, "loss/reg": 0.0001907304977066815, "step": 141 }, { "epoch": 0.01775, "grad_norm": 2.6480321884155273, "grad_norm_var": 37.616094691169124, "learning_rate": 0.0001, "loss": 1.5096, "loss/crossentropy": 2.260585069656372, "loss/hidden": 1.2890625, "loss/logits": 0.218610018491745, "loss/reg": 0.00019069462723564357, "step": 142 }, { "epoch": 0.017875, "grad_norm": 6.334838390350342, "grad_norm_var": 37.766716198289636, "learning_rate": 0.0001, "loss": 1.4905, "loss/crossentropy": 2.702542543411255, "loss/hidden": 1.265625, "loss/logits": 0.22292816638946533, "loss/reg": 0.00019065497326664627, "step": 143 }, { "epoch": 0.018, "grad_norm": 2.5124268531799316, "grad_norm_var": 37.84491654864667, "learning_rate": 0.0001, "loss": 1.4406, "loss/crossentropy": 2.3640494346618652, "loss/hidden": 1.234375, "loss/logits": 0.20432478189468384, "loss/reg": 0.00019061053171753883, "step": 144 }, { "epoch": 0.018125, "grad_norm": 2.5983355045318604, "grad_norm_var": 37.889344095265606, "learning_rate": 0.0001, "loss": 1.2585, "loss/crossentropy": 2.386502265930176, "loss/hidden": 1.109375, "loss/logits": 0.1472403109073639, "loss/reg": 0.0001905607496155426, "step": 145 }, { "epoch": 0.01825, "grad_norm": 2.3283042907714844, "grad_norm_var": 38.06027251189639, "learning_rate": 0.0001, "loss": 1.0838, "loss/crossentropy": 2.589846134185791, "loss/hidden": 0.94140625, "loss/logits": 0.14048275351524353, "loss/reg": 0.00019053251889999956, "step": 146 }, { "epoch": 0.018375, "grad_norm": 3.144895315170288, "grad_norm_var": 38.09776954094634, "learning_rate": 0.0001, "loss": 1.44, "loss/crossentropy": 2.7745416164398193, "loss/hidden": 1.234375, "loss/logits": 0.2037464678287506, "loss/reg": 0.00019048065587412566, "step": 147 }, { "epoch": 0.0185, "grad_norm": 26.648195266723633, "grad_norm_var": 67.26352470044017, "learning_rate": 0.0001, "loss": 1.5186, "loss/crossentropy": 2.64324951171875, "loss/hidden": 1.3125, "loss/logits": 0.20418909192085266, "loss/reg": 0.00019045177032239735, "step": 148 }, { "epoch": 0.018625, "grad_norm": 3.2434163093566895, "grad_norm_var": 67.02089246655065, "learning_rate": 0.0001, "loss": 1.4708, "loss/crossentropy": 2.3727641105651855, "loss/hidden": 1.25, "loss/logits": 0.2188955843448639, "loss/reg": 0.0001904223026940599, "step": 149 }, { "epoch": 0.01875, "grad_norm": 2.80842661857605, "grad_norm_var": 67.09348312744588, "learning_rate": 0.0001, "loss": 1.4085, "loss/crossentropy": 2.262383460998535, "loss/hidden": 1.21875, "loss/logits": 0.18781118094921112, "loss/reg": 0.0001903773081721738, "step": 150 }, { "epoch": 0.018875, "grad_norm": 2.683870553970337, "grad_norm_var": 66.87019083886788, "learning_rate": 0.0001, "loss": 1.2521, "loss/crossentropy": 2.5074141025543213, "loss/hidden": 1.0859375, "loss/logits": 0.16421037912368774, "loss/reg": 0.00019033256103284657, "step": 151 }, { "epoch": 0.019, "grad_norm": 2.229814291000366, "grad_norm_var": 67.0793329518605, "learning_rate": 0.0001, "loss": 1.1402, "loss/crossentropy": 2.541126012802124, "loss/hidden": 1.0, "loss/logits": 0.1382928341627121, "loss/reg": 0.00019028309907298535, "step": 152 }, { "epoch": 0.019125, "grad_norm": 3.54891300201416, "grad_norm_var": 36.90022117788916, "learning_rate": 0.0001, "loss": 1.6011, "loss/crossentropy": 2.307600975036621, "loss/hidden": 1.3671875, "loss/logits": 0.23196424543857574, "loss/reg": 0.000190236751222983, "step": 153 }, { "epoch": 0.01925, "grad_norm": 2.360466718673706, "grad_norm_var": 36.95789528091398, "learning_rate": 0.0001, "loss": 1.1651, "loss/crossentropy": 2.8502321243286133, "loss/hidden": 0.99609375, "loss/logits": 0.16714423894882202, "loss/reg": 0.00019020496984012425, "step": 154 }, { "epoch": 0.019375, "grad_norm": 3.1953799724578857, "grad_norm_var": 35.90550452702474, "learning_rate": 0.0001, "loss": 1.0914, "loss/crossentropy": 2.809048652648926, "loss/hidden": 0.9453125, "loss/logits": 0.14417339861392975, "loss/reg": 0.00019016550504602492, "step": 155 }, { "epoch": 0.0195, "grad_norm": 2.6717631816864014, "grad_norm_var": 35.890903690685896, "learning_rate": 0.0001, "loss": 1.2632, "loss/crossentropy": 2.7067086696624756, "loss/hidden": 1.046875, "loss/logits": 0.21447248756885529, "loss/reg": 0.00019013263226952404, "step": 156 }, { "epoch": 0.019625, "grad_norm": 2.3536930084228516, "grad_norm_var": 35.96362699082104, "learning_rate": 0.0001, "loss": 1.1418, "loss/crossentropy": 2.5986528396606445, "loss/hidden": 0.98046875, "loss/logits": 0.15946394205093384, "loss/reg": 0.00019009722745977342, "step": 157 }, { "epoch": 0.01975, "grad_norm": 2.0697736740112305, "grad_norm_var": 36.1239934744858, "learning_rate": 0.0001, "loss": 1.1946, "loss/crossentropy": 2.4766931533813477, "loss/hidden": 1.03125, "loss/logits": 0.16143161058425903, "loss/reg": 0.0001900633069453761, "step": 158 }, { "epoch": 0.019875, "grad_norm": 2.4079439640045166, "grad_norm_var": 36.08560176253508, "learning_rate": 0.0001, "loss": 1.2239, "loss/crossentropy": 2.5432755947113037, "loss/hidden": 1.0390625, "loss/logits": 0.18291215598583221, "loss/reg": 0.00019003944180440158, "step": 159 }, { "epoch": 0.02, "grad_norm": 1.9956291913986206, "grad_norm_var": 36.21688030379835, "learning_rate": 0.0001, "loss": 1.2381, "loss/crossentropy": 2.6434080600738525, "loss/hidden": 1.0625, "loss/logits": 0.1736569106578827, "loss/reg": 0.0001900054921861738, "step": 160 }, { "epoch": 0.020125, "grad_norm": 2.3660035133361816, "grad_norm_var": 36.268105524765524, "learning_rate": 0.0001, "loss": 1.3402, "loss/crossentropy": 2.7457640171051025, "loss/hidden": 1.140625, "loss/logits": 0.19771495461463928, "loss/reg": 0.00018996208382304758, "step": 161 }, { "epoch": 0.02025, "grad_norm": 2.2741127014160156, "grad_norm_var": 36.28129668661176, "learning_rate": 0.0001, "loss": 1.1122, "loss/crossentropy": 2.694679021835327, "loss/hidden": 0.9453125, "loss/logits": 0.16500751674175262, "loss/reg": 0.00018992851255461574, "step": 162 }, { "epoch": 0.020375, "grad_norm": 4.299161911010742, "grad_norm_var": 36.213705020452686, "learning_rate": 0.0001, "loss": 1.5997, "loss/crossentropy": 2.41153621673584, "loss/hidden": 1.3515625, "loss/logits": 0.24626833200454712, "loss/reg": 0.00018988759256899357, "step": 163 }, { "epoch": 0.0205, "grad_norm": 2.6989567279815674, "grad_norm_var": 0.37062173740110177, "learning_rate": 0.0001, "loss": 1.4101, "loss/crossentropy": 2.6252450942993164, "loss/hidden": 1.1796875, "loss/logits": 0.2285255789756775, "loss/reg": 0.00018985987117048353, "step": 164 }, { "epoch": 0.020625, "grad_norm": 3.6266448497772217, "grad_norm_var": 0.40754436908909636, "learning_rate": 0.0001, "loss": 1.3041, "loss/crossentropy": 2.768099308013916, "loss/hidden": 1.1171875, "loss/logits": 0.18500390648841858, "loss/reg": 0.00018983366317115724, "step": 165 }, { "epoch": 0.02075, "grad_norm": 3.0785396099090576, "grad_norm_var": 0.4151303111429129, "learning_rate": 0.0001, "loss": 1.4192, "loss/crossentropy": 2.621722459793091, "loss/hidden": 1.203125, "loss/logits": 0.21420907974243164, "loss/reg": 0.00018979469314217567, "step": 166 }, { "epoch": 0.020875, "grad_norm": 2.837559938430786, "grad_norm_var": 0.4154299188334017, "learning_rate": 0.0001, "loss": 1.3458, "loss/crossentropy": 2.430216073989868, "loss/hidden": 1.1640625, "loss/logits": 0.17987295985221863, "loss/reg": 0.00018976339197251946, "step": 167 }, { "epoch": 0.021, "grad_norm": 2.8974390029907227, "grad_norm_var": 0.396902541608811, "learning_rate": 0.0001, "loss": 1.2952, "loss/crossentropy": 2.8876166343688965, "loss/hidden": 1.09375, "loss/logits": 0.19960089027881622, "loss/reg": 0.00018972392717842013, "step": 168 }, { "epoch": 0.021125, "grad_norm": 2.533341646194458, "grad_norm_var": 0.3589553633283116, "learning_rate": 0.0001, "loss": 1.2505, "loss/crossentropy": 2.7849295139312744, "loss/hidden": 1.078125, "loss/logits": 0.17050783336162567, "loss/reg": 0.0001896892354125157, "step": 169 }, { "epoch": 0.02125, "grad_norm": 2.5643837451934814, "grad_norm_var": 0.35153012514085774, "learning_rate": 0.0001, "loss": 1.4785, "loss/crossentropy": 2.497753381729126, "loss/hidden": 1.234375, "loss/logits": 0.2422376275062561, "loss/reg": 0.00018963789625559002, "step": 170 }, { "epoch": 0.021375, "grad_norm": 2.6845455169677734, "grad_norm_var": 0.3369522102595737, "learning_rate": 0.0001, "loss": 1.2292, "loss/crossentropy": 2.5292482376098633, "loss/hidden": 1.046875, "loss/logits": 0.18040287494659424, "loss/reg": 0.00018960374291054904, "step": 171 }, { "epoch": 0.0215, "grad_norm": 2.3379464149475098, "grad_norm_var": 0.3456172785279794, "learning_rate": 0.0001, "loss": 1.3626, "loss/crossentropy": 2.5603489875793457, "loss/hidden": 1.15625, "loss/logits": 0.20441675186157227, "loss/reg": 0.00018956881831400096, "step": 172 }, { "epoch": 0.021625, "grad_norm": 2.594144582748413, "grad_norm_var": 0.33847746883165203, "learning_rate": 0.0001, "loss": 1.2032, "loss/crossentropy": 2.6486992835998535, "loss/hidden": 1.0234375, "loss/logits": 0.17790886759757996, "loss/reg": 0.00018953454855363816, "step": 173 }, { "epoch": 0.02175, "grad_norm": 2.9685492515563965, "grad_norm_var": 0.3129452666235446, "learning_rate": 0.0001, "loss": 1.4619, "loss/crossentropy": 2.455124855041504, "loss/hidden": 1.25, "loss/logits": 0.21000754833221436, "loss/reg": 0.0001895053283078596, "step": 174 }, { "epoch": 0.021875, "grad_norm": 2.533724308013916, "grad_norm_var": 0.3080246907592235, "learning_rate": 0.0001, "loss": 1.3506, "loss/crossentropy": 2.4606072902679443, "loss/hidden": 1.171875, "loss/logits": 0.1768466681241989, "loss/reg": 0.0001894600281957537, "step": 175 }, { "epoch": 0.022, "grad_norm": 2.629793643951416, "grad_norm_var": 0.2678377821192363, "learning_rate": 0.0001, "loss": 1.3554, "loss/crossentropy": 2.50494647026062, "loss/hidden": 1.140625, "loss/logits": 0.2129271924495697, "loss/reg": 0.0001894147862913087, "step": 176 }, { "epoch": 0.022125, "grad_norm": 2.426023006439209, "grad_norm_var": 0.2645273844934496, "learning_rate": 0.0001, "loss": 1.2205, "loss/crossentropy": 2.4932215213775635, "loss/hidden": 1.046875, "loss/logits": 0.1717696189880371, "loss/reg": 0.0001893793960334733, "step": 177 }, { "epoch": 0.02225, "grad_norm": 12.438695907592773, "grad_norm_var": 5.993566887433531, "learning_rate": 0.0001, "loss": 1.4933, "loss/crossentropy": 2.6111667156219482, "loss/hidden": 1.3125, "loss/logits": 0.17892810702323914, "loss/reg": 0.00018933985847979784, "step": 178 }, { "epoch": 0.022375, "grad_norm": 2.2316975593566895, "grad_norm_var": 6.025764924701703, "learning_rate": 0.0001, "loss": 1.2337, "loss/crossentropy": 2.452789306640625, "loss/hidden": 1.0390625, "loss/logits": 0.19278019666671753, "loss/reg": 0.0001893048029160127, "step": 179 }, { "epoch": 0.0225, "grad_norm": 2.285240650177002, "grad_norm_var": 6.070589505634923, "learning_rate": 0.0001, "loss": 1.3731, "loss/crossentropy": 2.4663660526275635, "loss/hidden": 1.1875, "loss/logits": 0.18373644351959229, "loss/reg": 0.0001892724831122905, "step": 180 }, { "epoch": 0.022625, "grad_norm": 2.4717960357666016, "grad_norm_var": 6.102379780965068, "learning_rate": 0.0001, "loss": 1.1602, "loss/crossentropy": 2.7595460414886475, "loss/hidden": 0.9921875, "loss/logits": 0.16613003611564636, "loss/reg": 0.00018924209871329367, "step": 181 }, { "epoch": 0.02275, "grad_norm": 2.4991321563720703, "grad_norm_var": 6.134258503662537, "learning_rate": 0.0001, "loss": 1.1269, "loss/crossentropy": 2.270359516143799, "loss/hidden": 0.9765625, "loss/logits": 0.14847519993782043, "loss/reg": 0.00018921452283393592, "step": 182 }, { "epoch": 0.022875, "grad_norm": 2.3847415447235107, "grad_norm_var": 6.167952691299849, "learning_rate": 0.0001, "loss": 1.2154, "loss/crossentropy": 2.49910831451416, "loss/hidden": 1.0546875, "loss/logits": 0.158855140209198, "loss/reg": 0.00018917533452622592, "step": 183 }, { "epoch": 0.023, "grad_norm": 2.5306761264801025, "grad_norm_var": 6.188958706490437, "learning_rate": 0.0001, "loss": 1.4351, "loss/crossentropy": 2.3558568954467773, "loss/hidden": 1.2265625, "loss/logits": 0.2066374272108078, "loss/reg": 0.00018914024985861033, "step": 184 }, { "epoch": 0.023125, "grad_norm": 2.2277228832244873, "grad_norm_var": 6.219197407448101, "learning_rate": 0.0001, "loss": 1.0945, "loss/crossentropy": 2.6959574222564697, "loss/hidden": 0.9375, "loss/logits": 0.1550855189561844, "loss/reg": 0.00018909583741333336, "step": 185 }, { "epoch": 0.02325, "grad_norm": 3.3319711685180664, "grad_norm_var": 6.199868483198406, "learning_rate": 0.0001, "loss": 1.2316, "loss/crossentropy": 2.3768630027770996, "loss/hidden": 1.078125, "loss/logits": 0.15154394507408142, "loss/reg": 0.00018906217883341014, "step": 186 }, { "epoch": 0.023375, "grad_norm": 2.5634615421295166, "grad_norm_var": 6.208477354320166, "learning_rate": 0.0001, "loss": 1.2211, "loss/crossentropy": 2.679447650909424, "loss/hidden": 1.03125, "loss/logits": 0.18797855079174042, "loss/reg": 0.00018902822921518236, "step": 187 }, { "epoch": 0.0235, "grad_norm": 2.460848093032837, "grad_norm_var": 6.196057718240725, "learning_rate": 0.0001, "loss": 1.2703, "loss/crossentropy": 2.6185357570648193, "loss/hidden": 1.0859375, "loss/logits": 0.18242943286895752, "loss/reg": 0.000188985766726546, "step": 188 }, { "epoch": 0.023625, "grad_norm": 2.464927911758423, "grad_norm_var": 6.206869955671459, "learning_rate": 0.0001, "loss": 1.1848, "loss/crossentropy": 2.4472947120666504, "loss/hidden": 1.015625, "loss/logits": 0.16724231839179993, "loss/reg": 0.00018895140965469182, "step": 189 }, { "epoch": 0.02375, "grad_norm": 2.6658761501312256, "grad_norm_var": 6.220041941034168, "learning_rate": 0.0001, "loss": 1.2639, "loss/crossentropy": 2.3520965576171875, "loss/hidden": 1.09375, "loss/logits": 0.1682794690132141, "loss/reg": 0.0001889086706796661, "step": 190 }, { "epoch": 0.023875, "grad_norm": 2.450937509536743, "grad_norm_var": 6.227097887980031, "learning_rate": 0.0001, "loss": 1.3416, "loss/crossentropy": 2.4653656482696533, "loss/hidden": 1.1484375, "loss/logits": 0.19132134318351746, "loss/reg": 0.00018886124598793685, "step": 191 }, { "epoch": 0.024, "grad_norm": 2.5486769676208496, "grad_norm_var": 6.232908017729262, "learning_rate": 0.0001, "loss": 1.408, "loss/crossentropy": 2.4828147888183594, "loss/hidden": 1.1796875, "loss/logits": 0.22640305757522583, "loss/reg": 0.00018882614676840603, "step": 192 }, { "epoch": 0.024125, "grad_norm": 2.733919858932495, "grad_norm_var": 6.210183098557901, "learning_rate": 0.0001, "loss": 1.3418, "loss/crossentropy": 2.7277894020080566, "loss/hidden": 1.1640625, "loss/logits": 0.17583458125591278, "loss/reg": 0.00018878061382565647, "step": 193 }, { "epoch": 0.02425, "grad_norm": 4.0347490310668945, "grad_norm_var": 0.20841963510717557, "learning_rate": 0.0001, "loss": 1.2188, "loss/crossentropy": 2.671088933944702, "loss/hidden": 1.0546875, "loss/logits": 0.16220757365226746, "loss/reg": 0.00018873742374125868, "step": 194 }, { "epoch": 0.024375, "grad_norm": 2.1080739498138428, "grad_norm_var": 0.21574061631260597, "learning_rate": 0.0001, "loss": 1.0983, "loss/crossentropy": 2.465228319168091, "loss/hidden": 0.953125, "loss/logits": 0.1432487666606903, "loss/reg": 0.00018869389896281064, "step": 195 }, { "epoch": 0.0245, "grad_norm": 3.3122878074645996, "grad_norm_var": 0.23717126048259918, "learning_rate": 0.0001, "loss": 1.4535, "loss/crossentropy": 2.543299913406372, "loss/hidden": 1.2421875, "loss/logits": 0.2094225287437439, "loss/reg": 0.00018865260062739253, "step": 196 }, { "epoch": 0.024625, "grad_norm": 3.8393805027008057, "grad_norm_var": 0.3171273295177985, "learning_rate": 0.0001, "loss": 1.4855, "loss/crossentropy": 2.7230522632598877, "loss/hidden": 1.2265625, "loss/logits": 0.25702908635139465, "loss/reg": 0.00018861188436858356, "step": 197 }, { "epoch": 0.02475, "grad_norm": 2.3128864765167236, "grad_norm_var": 0.32576930180668173, "learning_rate": 0.0001, "loss": 1.2876, "loss/crossentropy": 2.470360517501831, "loss/hidden": 1.09375, "loss/logits": 0.19192585349082947, "loss/reg": 0.0001885706151369959, "step": 198 }, { "epoch": 0.024875, "grad_norm": 2.222193956375122, "grad_norm_var": 0.33529781396605357, "learning_rate": 0.0001, "loss": 1.2726, "loss/crossentropy": 2.4833052158355713, "loss/hidden": 1.078125, "loss/logits": 0.1925581395626068, "loss/reg": 0.00018852519860956818, "step": 199 }, { "epoch": 0.025, "grad_norm": 2.148888111114502, "grad_norm_var": 0.35496365745480013, "learning_rate": 0.0001, "loss": 1.1089, "loss/crossentropy": 2.445056915283203, "loss/hidden": 0.95703125, "loss/logits": 0.14994922280311584, "loss/reg": 0.00018848305626306683, "step": 200 }, { "epoch": 0.025125, "grad_norm": 3.009246587753296, "grad_norm_var": 0.34244750319667006, "learning_rate": 0.0001, "loss": 1.2831, "loss/crossentropy": 2.5913355350494385, "loss/hidden": 1.09375, "loss/logits": 0.18746912479400635, "loss/reg": 0.00018844057922251523, "step": 201 }, { "epoch": 0.02525, "grad_norm": 2.6826765537261963, "grad_norm_var": 0.31954091153954683, "learning_rate": 0.0001, "loss": 1.2207, "loss/crossentropy": 2.2942862510681152, "loss/hidden": 1.0703125, "loss/logits": 0.14849132299423218, "loss/reg": 0.00018840315169654787, "step": 202 }, { "epoch": 0.025375, "grad_norm": 2.089634418487549, "grad_norm_var": 0.34361665903956173, "learning_rate": 0.0001, "loss": 1.2152, "loss/crossentropy": 2.565154790878296, "loss/hidden": 1.0390625, "loss/logits": 0.17425358295440674, "loss/reg": 0.00018837135576177388, "step": 203 }, { "epoch": 0.0255, "grad_norm": 2.1764748096466064, "grad_norm_var": 0.35746666647812214, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.24812912940979, "loss/hidden": 1.140625, "loss/logits": 0.21368308365345, "loss/reg": 0.00018833015928976238, "step": 204 }, { "epoch": 0.025625, "grad_norm": 2.8530831336975098, "grad_norm_var": 0.35600843248713165, "learning_rate": 0.0001, "loss": 1.475, "loss/crossentropy": 2.0817508697509766, "loss/hidden": 1.2421875, "loss/logits": 0.23092345893383026, "loss/reg": 0.00018830793851520866, "step": 205 }, { "epoch": 0.02575, "grad_norm": 2.663485050201416, "grad_norm_var": 0.3560194494934476, "learning_rate": 0.0001, "loss": 1.1357, "loss/crossentropy": 2.7369191646575928, "loss/hidden": 0.97265625, "loss/logits": 0.1611793339252472, "loss/reg": 0.0001882883079815656, "step": 206 }, { "epoch": 0.025875, "grad_norm": 2.7267813682556152, "grad_norm_var": 0.35164556437612193, "learning_rate": 0.0001, "loss": 1.4018, "loss/crossentropy": 2.521235704421997, "loss/hidden": 1.1875, "loss/logits": 0.21241608262062073, "loss/reg": 0.0001882474316516891, "step": 207 }, { "epoch": 0.026, "grad_norm": 3.3730874061584473, "grad_norm_var": 0.37568723584622227, "learning_rate": 0.0001, "loss": 1.6168, "loss/crossentropy": 2.67789626121521, "loss/hidden": 1.375, "loss/logits": 0.2398722767829895, "loss/reg": 0.0001882062351796776, "step": 208 }, { "epoch": 0.026125, "grad_norm": 2.58087158203125, "grad_norm_var": 0.37784520807643934, "learning_rate": 0.0001, "loss": 1.4245, "loss/crossentropy": 2.64697003364563, "loss/hidden": 1.21875, "loss/logits": 0.2038642168045044, "loss/reg": 0.0001881623174995184, "step": 209 }, { "epoch": 0.02625, "grad_norm": 2.222271680831909, "grad_norm_var": 0.27470612970490343, "learning_rate": 0.0001, "loss": 1.3277, "loss/crossentropy": 2.5434417724609375, "loss/hidden": 1.140625, "loss/logits": 0.18515345454216003, "loss/reg": 0.00018812257621902972, "step": 210 }, { "epoch": 0.026375, "grad_norm": 1.9148452281951904, "grad_norm_var": 0.29087511560338725, "learning_rate": 0.0001, "loss": 1.097, "loss/crossentropy": 2.769158363342285, "loss/hidden": 0.94921875, "loss/logits": 0.14593489468097687, "loss/reg": 0.0001880890631582588, "step": 211 }, { "epoch": 0.0265, "grad_norm": 2.2211508750915527, "grad_norm_var": 0.26646107901443655, "learning_rate": 0.0001, "loss": 1.3446, "loss/crossentropy": 2.4616775512695312, "loss/hidden": 1.1484375, "loss/logits": 0.19431401789188385, "loss/reg": 0.00018806415027938783, "step": 212 }, { "epoch": 0.026625, "grad_norm": 2.763000965118408, "grad_norm_var": 0.15595023444909822, "learning_rate": 0.0001, "loss": 1.3695, "loss/crossentropy": 2.598646640777588, "loss/hidden": 1.140625, "loss/logits": 0.22699865698814392, "loss/reg": 0.00018803446437232196, "step": 213 }, { "epoch": 0.02675, "grad_norm": 2.321552276611328, "grad_norm_var": 0.15574157634795635, "learning_rate": 0.0001, "loss": 1.3229, "loss/crossentropy": 2.574349880218506, "loss/hidden": 1.140625, "loss/logits": 0.18040841817855835, "loss/reg": 0.00018799355893861502, "step": 214 }, { "epoch": 0.026875, "grad_norm": 2.252286195755005, "grad_norm_var": 0.15469124462205552, "learning_rate": 0.0001, "loss": 1.2042, "loss/crossentropy": 2.391897439956665, "loss/hidden": 1.03125, "loss/logits": 0.17103314399719238, "loss/reg": 0.00018794478091876954, "step": 215 }, { "epoch": 0.027, "grad_norm": 2.9572079181671143, "grad_norm_var": 0.15769059669121527, "learning_rate": 0.0001, "loss": 1.4114, "loss/crossentropy": 2.4063875675201416, "loss/hidden": 1.2109375, "loss/logits": 0.1986089050769806, "loss/reg": 0.00018789219029713422, "step": 216 }, { "epoch": 0.027125, "grad_norm": 2.3973214626312256, "grad_norm_var": 0.14366297343363474, "learning_rate": 0.0001, "loss": 1.2443, "loss/crossentropy": 2.5074877738952637, "loss/hidden": 1.0625, "loss/logits": 0.1798933446407318, "loss/reg": 0.00018785115389619023, "step": 217 }, { "epoch": 0.02725, "grad_norm": 3.122760534286499, "grad_norm_var": 0.16576884575759981, "learning_rate": 0.0001, "loss": 1.3388, "loss/crossentropy": 2.428725242614746, "loss/hidden": 1.140625, "loss/logits": 0.19626876711845398, "loss/reg": 0.00018780773098114878, "step": 218 }, { "epoch": 0.027375, "grad_norm": 2.321108341217041, "grad_norm_var": 0.15522596127473184, "learning_rate": 0.0001, "loss": 1.2505, "loss/crossentropy": 2.409878969192505, "loss/hidden": 1.046875, "loss/logits": 0.2017081379890442, "loss/reg": 0.00018775733769871294, "step": 219 }, { "epoch": 0.0275, "grad_norm": 2.365797281265259, "grad_norm_var": 0.14793109297261797, "learning_rate": 0.0001, "loss": 1.1702, "loss/crossentropy": 2.588578224182129, "loss/hidden": 1.015625, "loss/logits": 0.15268871188163757, "loss/reg": 0.00018771766917780042, "step": 220 }, { "epoch": 0.027625, "grad_norm": 2.1108896732330322, "grad_norm_var": 0.153953573032076, "learning_rate": 0.0001, "loss": 1.255, "loss/crossentropy": 2.527043581008911, "loss/hidden": 1.0859375, "loss/logits": 0.16717128455638885, "loss/reg": 0.00018766772700473666, "step": 221 }, { "epoch": 0.02775, "grad_norm": 2.5774085521698, "grad_norm_var": 0.15276588289228218, "learning_rate": 0.0001, "loss": 1.2479, "loss/crossentropy": 2.713797092437744, "loss/hidden": 1.0625, "loss/logits": 0.1835174411535263, "loss/reg": 0.00018762832041829824, "step": 222 }, { "epoch": 0.027875, "grad_norm": 2.253450393676758, "grad_norm_var": 0.1533568435494087, "learning_rate": 0.0001, "loss": 1.4177, "loss/crossentropy": 2.6281967163085938, "loss/hidden": 1.1953125, "loss/logits": 0.22053499519824982, "loss/reg": 0.0001875831076176837, "step": 223 }, { "epoch": 0.028, "grad_norm": 2.454873561859131, "grad_norm_var": 0.09728623528139314, "learning_rate": 0.0001, "loss": 1.4309, "loss/crossentropy": 2.572938919067383, "loss/hidden": 1.21875, "loss/logits": 0.21031853556632996, "loss/reg": 0.00018754607299342752, "step": 224 }, { "epoch": 0.028125, "grad_norm": 2.402453899383545, "grad_norm_var": 0.0956224663481746, "learning_rate": 0.0001, "loss": 1.2624, "loss/crossentropy": 2.50075101852417, "loss/hidden": 1.0859375, "loss/logits": 0.17460693418979645, "loss/reg": 0.0001875244197435677, "step": 225 }, { "epoch": 0.02825, "grad_norm": 1.8176156282424927, "grad_norm_var": 0.1163170905904891, "learning_rate": 0.0001, "loss": 1.203, "loss/crossentropy": 2.2213640213012695, "loss/hidden": 1.03125, "loss/logits": 0.16985741257667542, "loss/reg": 0.00018748667207546532, "step": 226 }, { "epoch": 0.028375, "grad_norm": 2.240485906600952, "grad_norm_var": 0.10227683752628369, "learning_rate": 0.0001, "loss": 1.2129, "loss/crossentropy": 2.7936031818389893, "loss/hidden": 1.03125, "loss/logits": 0.17974941432476044, "loss/reg": 0.00018747476860880852, "step": 227 }, { "epoch": 0.0285, "grad_norm": 2.478255033493042, "grad_norm_var": 0.09989290718763275, "learning_rate": 0.0001, "loss": 1.3474, "loss/crossentropy": 2.237700939178467, "loss/hidden": 1.15625, "loss/logits": 0.18926523625850677, "loss/reg": 0.00018743629334494472, "step": 228 }, { "epoch": 0.028625, "grad_norm": 2.2553858757019043, "grad_norm_var": 0.09327515190840353, "learning_rate": 0.0001, "loss": 1.302, "loss/crossentropy": 2.5399932861328125, "loss/hidden": 1.1171875, "loss/logits": 0.1829543262720108, "loss/reg": 0.0001873974542832002, "step": 229 }, { "epoch": 0.02875, "grad_norm": 2.5666871070861816, "grad_norm_var": 0.09461214816090115, "learning_rate": 0.0001, "loss": 1.1668, "loss/crossentropy": 2.5342864990234375, "loss/hidden": 1.0, "loss/logits": 0.16493582725524902, "loss/reg": 0.0001873665169114247, "step": 230 }, { "epoch": 0.028875, "grad_norm": 2.3809237480163574, "grad_norm_var": 0.09292632453379727, "learning_rate": 0.0001, "loss": 1.2757, "loss/crossentropy": 2.565258026123047, "loss/hidden": 1.0703125, "loss/logits": 0.2034991830587387, "loss/reg": 0.00018733744218479842, "step": 231 }, { "epoch": 0.029, "grad_norm": 11.484387397766113, "grad_norm_var": 5.249492807099011, "learning_rate": 0.0001, "loss": 1.2536, "loss/crossentropy": 2.610589027404785, "loss/hidden": 1.09375, "loss/logits": 0.15799307823181152, "loss/reg": 0.00018729745352175087, "step": 232 }, { "epoch": 0.029125, "grad_norm": 2.276120901107788, "grad_norm_var": 5.259372334728978, "learning_rate": 0.0001, "loss": 1.2413, "loss/crossentropy": 2.6497323513031006, "loss/hidden": 1.078125, "loss/logits": 0.16128413379192352, "loss/reg": 0.0001872599241323769, "step": 233 }, { "epoch": 0.02925, "grad_norm": 2.5592195987701416, "grad_norm_var": 5.265810753770302, "learning_rate": 0.0001, "loss": 1.3953, "loss/crossentropy": 2.6213786602020264, "loss/hidden": 1.171875, "loss/logits": 0.22152823209762573, "loss/reg": 0.0001872186257969588, "step": 234 }, { "epoch": 0.029375, "grad_norm": 2.91487455368042, "grad_norm_var": 5.241297695369635, "learning_rate": 0.0001, "loss": 1.4322, "loss/crossentropy": 2.8353567123413086, "loss/hidden": 1.2265625, "loss/logits": 0.2038136124610901, "loss/reg": 0.0001871798885986209, "step": 235 }, { "epoch": 0.0295, "grad_norm": 2.8779571056365967, "grad_norm_var": 5.218058981409527, "learning_rate": 0.0001, "loss": 1.1964, "loss/crossentropy": 2.818354606628418, "loss/hidden": 1.03125, "loss/logits": 0.16329507529735565, "loss/reg": 0.0001871388085419312, "step": 236 }, { "epoch": 0.029625, "grad_norm": 2.4912526607513428, "grad_norm_var": 5.183116200958803, "learning_rate": 0.0001, "loss": 1.1612, "loss/crossentropy": 2.485246419906616, "loss/hidden": 1.0, "loss/logits": 0.15928974747657776, "loss/reg": 0.0001870945852715522, "step": 237 }, { "epoch": 0.02975, "grad_norm": 2.1257100105285645, "grad_norm_var": 5.221437379820037, "learning_rate": 0.0001, "loss": 1.1011, "loss/crossentropy": 2.5041284561157227, "loss/hidden": 0.96484375, "loss/logits": 0.1343885362148285, "loss/reg": 0.00018704126705415547, "step": 238 }, { "epoch": 0.029875, "grad_norm": 2.609513998031616, "grad_norm_var": 5.195165909077186, "learning_rate": 0.0001, "loss": 1.2685, "loss/crossentropy": 2.5717175006866455, "loss/hidden": 1.09375, "loss/logits": 0.1728888750076294, "loss/reg": 0.00018699871725402772, "step": 239 }, { "epoch": 0.03, "grad_norm": 3.054771900177002, "grad_norm_var": 5.174376919782516, "learning_rate": 0.0001, "loss": 1.3059, "loss/crossentropy": 2.3587000370025635, "loss/hidden": 1.109375, "loss/logits": 0.19463828206062317, "loss/reg": 0.00018695260223466903, "step": 240 }, { "epoch": 0.030125, "grad_norm": 4.069124221801758, "grad_norm_var": 5.207761360833089, "learning_rate": 0.0001, "loss": 1.5327, "loss/crossentropy": 2.519747257232666, "loss/hidden": 1.3046875, "loss/logits": 0.22609561681747437, "loss/reg": 0.00018690834986045957, "step": 241 }, { "epoch": 0.03025, "grad_norm": 2.6919775009155273, "grad_norm_var": 5.101652290115754, "learning_rate": 0.0001, "loss": 1.4243, "loss/crossentropy": 2.5264573097229004, "loss/hidden": 1.203125, "loss/logits": 0.21935243904590607, "loss/reg": 0.00018685661780182272, "step": 242 }, { "epoch": 0.030375, "grad_norm": 2.9397759437561035, "grad_norm_var": 5.043470206735567, "learning_rate": 0.0001, "loss": 1.1593, "loss/crossentropy": 2.649984836578369, "loss/hidden": 0.9921875, "loss/logits": 0.16523152589797974, "loss/reg": 0.00018679779896046966, "step": 243 }, { "epoch": 0.0305, "grad_norm": 2.597238063812256, "grad_norm_var": 5.032333906433272, "learning_rate": 0.0001, "loss": 1.3144, "loss/crossentropy": 2.7007710933685303, "loss/hidden": 1.109375, "loss/logits": 0.20317299664020538, "loss/reg": 0.00018673852900974452, "step": 244 }, { "epoch": 0.030625, "grad_norm": 4.779845237731934, "grad_norm_var": 5.0980686958710555, "learning_rate": 0.0001, "loss": 1.9743, "loss/crossentropy": 2.7656006813049316, "loss/hidden": 1.5625, "loss/logits": 0.4099583327770233, "loss/reg": 0.0001866959355538711, "step": 245 }, { "epoch": 0.03075, "grad_norm": 3.3142945766448975, "grad_norm_var": 5.049814806516395, "learning_rate": 0.0001, "loss": 1.3097, "loss/crossentropy": 2.7472550868988037, "loss/hidden": 1.125, "loss/logits": 0.1828756332397461, "loss/reg": 0.00018664947128854692, "step": 246 }, { "epoch": 0.030875, "grad_norm": 2.6288223266601562, "grad_norm_var": 5.01838753008573, "learning_rate": 0.0001, "loss": 1.2715, "loss/crossentropy": 2.5212960243225098, "loss/hidden": 1.1015625, "loss/logits": 0.16810457408428192, "loss/reg": 0.00018659804482012987, "step": 247 }, { "epoch": 0.031, "grad_norm": 2.1690521240234375, "grad_norm_var": 0.4794672993561553, "learning_rate": 0.0001, "loss": 1.081, "loss/crossentropy": 2.517190933227539, "loss/hidden": 0.94140625, "loss/logits": 0.13774770498275757, "loss/reg": 0.0001865396770881489, "step": 248 }, { "epoch": 0.031125, "grad_norm": 2.642812490463257, "grad_norm_var": 0.45828649220525125, "learning_rate": 0.0001, "loss": 1.2944, "loss/crossentropy": 2.411734104156494, "loss/hidden": 1.125, "loss/logits": 0.16756606101989746, "loss/reg": 0.00018649944104254246, "step": 249 }, { "epoch": 0.03125, "grad_norm": 2.553938627243042, "grad_norm_var": 0.4585311039907272, "learning_rate": 0.0001, "loss": 1.1273, "loss/crossentropy": 2.9807546138763428, "loss/hidden": 0.9765625, "loss/logits": 0.1488645374774933, "loss/reg": 0.00018645105592440814, "step": 250 }, { "epoch": 0.031375, "grad_norm": 2.010117292404175, "grad_norm_var": 0.5083579557676783, "learning_rate": 0.0001, "loss": 1.2306, "loss/crossentropy": 2.562990427017212, "loss/hidden": 1.046875, "loss/logits": 0.18183979392051697, "loss/reg": 0.00018639408517628908, "step": 251 }, { "epoch": 0.0315, "grad_norm": 2.712961196899414, "grad_norm_var": 0.5093841749170441, "learning_rate": 0.0001, "loss": 1.4763, "loss/crossentropy": 2.573420524597168, "loss/hidden": 1.2578125, "loss/logits": 0.21662938594818115, "loss/reg": 0.0001863273064373061, "step": 252 }, { "epoch": 0.031625, "grad_norm": 2.6070237159729004, "grad_norm_var": 0.504885617842933, "learning_rate": 0.0001, "loss": 1.1148, "loss/crossentropy": 2.421952486038208, "loss/hidden": 0.98046875, "loss/logits": 0.13251450657844543, "loss/reg": 0.00018625622033141553, "step": 253 }, { "epoch": 0.03175, "grad_norm": 2.6277847290039062, "grad_norm_var": 0.4725433925882648, "learning_rate": 0.0001, "loss": 1.3581, "loss/crossentropy": 2.6136345863342285, "loss/hidden": 1.15625, "loss/logits": 0.1999516785144806, "loss/reg": 0.00018621186609379947, "step": 254 }, { "epoch": 0.031875, "grad_norm": 4.751771926879883, "grad_norm_var": 0.683379142444816, "learning_rate": 0.0001, "loss": 1.5569, "loss/crossentropy": 2.437589645385742, "loss/hidden": 1.3125, "loss/logits": 0.2425535023212433, "loss/reg": 0.00018616624583955854, "step": 255 }, { "epoch": 0.032, "grad_norm": 2.601804494857788, "grad_norm_var": 0.693466035829209, "learning_rate": 0.0001, "loss": 1.477, "loss/crossentropy": 2.3009605407714844, "loss/hidden": 1.25, "loss/logits": 0.22516345977783203, "loss/reg": 0.00018612167332321405, "step": 256 }, { "epoch": 0.032125, "grad_norm": 2.594728708267212, "grad_norm_var": 0.6154499118248385, "learning_rate": 0.0001, "loss": 1.3481, "loss/crossentropy": 2.5490729808807373, "loss/hidden": 1.15625, "loss/logits": 0.1899745911359787, "loss/reg": 0.0001860785996541381, "step": 257 }, { "epoch": 0.03225, "grad_norm": 1.9762139320373535, "grad_norm_var": 0.666272320547633, "learning_rate": 0.0001, "loss": 1.1268, "loss/crossentropy": 2.6095380783081055, "loss/hidden": 0.96875, "loss/logits": 0.15619653463363647, "loss/reg": 0.00018601951887831092, "step": 258 }, { "epoch": 0.032375, "grad_norm": 2.5357439517974854, "grad_norm_var": 0.6713294887447486, "learning_rate": 0.0001, "loss": 1.1937, "loss/crossentropy": 2.7029552459716797, "loss/hidden": 1.03125, "loss/logits": 0.1605946123600006, "loss/reg": 0.00018597528105601668, "step": 259 }, { "epoch": 0.0325, "grad_norm": 2.8872361183166504, "grad_norm_var": 0.6680105601783904, "learning_rate": 0.0001, "loss": 1.1618, "loss/crossentropy": 2.5793302059173584, "loss/hidden": 1.0, "loss/logits": 0.15989942848682404, "loss/reg": 0.00018592287960927933, "step": 260 }, { "epoch": 0.032625, "grad_norm": 2.6888515949249268, "grad_norm_var": 0.39965034448394665, "learning_rate": 0.0001, "loss": 1.1163, "loss/crossentropy": 2.501629114151001, "loss/hidden": 0.9609375, "loss/logits": 0.15351587533950806, "loss/reg": 0.00018586948863230646, "step": 261 }, { "epoch": 0.03275, "grad_norm": 2.794921398162842, "grad_norm_var": 0.3744163537172878, "learning_rate": 0.0001, "loss": 1.3868, "loss/crossentropy": 2.473355293273926, "loss/hidden": 1.1875, "loss/logits": 0.19742509722709656, "loss/reg": 0.00018582609482109547, "step": 262 }, { "epoch": 0.032875, "grad_norm": 11.51866340637207, "grad_norm_var": 5.260212315476766, "learning_rate": 0.0001, "loss": 1.9065, "loss/crossentropy": 2.5873212814331055, "loss/hidden": 1.4765625, "loss/logits": 0.4280346632003784, "loss/reg": 0.00018578370509203523, "step": 263 }, { "epoch": 0.033, "grad_norm": 3.3071396350860596, "grad_norm_var": 5.180231931586934, "learning_rate": 0.0001, "loss": 1.237, "loss/crossentropy": 2.6701910495758057, "loss/hidden": 1.0625, "loss/logits": 0.17262707650661469, "loss/reg": 0.0001857366441981867, "step": 264 }, { "epoch": 0.033125, "grad_norm": 2.6719601154327393, "grad_norm_var": 5.177728124810292, "learning_rate": 0.0001, "loss": 1.4876, "loss/crossentropy": 2.741184711456299, "loss/hidden": 1.265625, "loss/logits": 0.22007906436920166, "loss/reg": 0.00018568705127108842, "step": 265 }, { "epoch": 0.03325, "grad_norm": 4.4371514320373535, "grad_norm_var": 5.211410221157409, "learning_rate": 0.0001, "loss": 1.8731, "loss/crossentropy": 2.4437432289123535, "loss/hidden": 1.6015625, "loss/logits": 0.269656777381897, "loss/reg": 0.00018562644254416227, "step": 266 }, { "epoch": 0.033375, "grad_norm": 3.256206750869751, "grad_norm_var": 5.074168773112569, "learning_rate": 0.0001, "loss": 1.3223, "loss/crossentropy": 2.6366071701049805, "loss/hidden": 1.1328125, "loss/logits": 0.1875854730606079, "loss/reg": 0.00018558187002781779, "step": 267 }, { "epoch": 0.0335, "grad_norm": 2.422538995742798, "grad_norm_var": 5.109844600456275, "learning_rate": 0.0001, "loss": 1.3075, "loss/crossentropy": 2.5296108722686768, "loss/hidden": 1.1171875, "loss/logits": 0.1884431093931198, "loss/reg": 0.00018553413974586874, "step": 268 }, { "epoch": 0.033625, "grad_norm": 4.04595422744751, "grad_norm_var": 5.071768309380559, "learning_rate": 0.0001, "loss": 1.8971, "loss/crossentropy": 2.3316025733947754, "loss/hidden": 1.625, "loss/logits": 0.2702447175979614, "loss/reg": 0.00018548894149716944, "step": 269 }, { "epoch": 0.03375, "grad_norm": 3.397867441177368, "grad_norm_var": 5.012096554664663, "learning_rate": 0.0001, "loss": 1.43, "loss/crossentropy": 2.2132034301757812, "loss/hidden": 1.234375, "loss/logits": 0.19377078115940094, "loss/reg": 0.00018543725309427828, "step": 270 }, { "epoch": 0.033875, "grad_norm": 2.360687494277954, "grad_norm_var": 5.007982625032073, "learning_rate": 0.0001, "loss": 1.2015, "loss/crossentropy": 2.6397757530212402, "loss/hidden": 1.03125, "loss/logits": 0.1683761179447174, "loss/reg": 0.00018539318989496678, "step": 271 }, { "epoch": 0.034, "grad_norm": 3.795145273208618, "grad_norm_var": 4.95906816389108, "learning_rate": 0.0001, "loss": 1.7341, "loss/crossentropy": 2.2904622554779053, "loss/hidden": 1.5, "loss/logits": 0.23220419883728027, "loss/reg": 0.000185342927579768, "step": 272 }, { "epoch": 0.034125, "grad_norm": 3.19511079788208, "grad_norm_var": 4.905671754488361, "learning_rate": 0.0001, "loss": 1.4552, "loss/crossentropy": 2.4893336296081543, "loss/hidden": 1.25, "loss/logits": 0.20336630940437317, "loss/reg": 0.00018529384396970272, "step": 273 }, { "epoch": 0.03425, "grad_norm": 2.818743944168091, "grad_norm_var": 4.769792764968277, "learning_rate": 0.0001, "loss": 1.1492, "loss/crossentropy": 2.5468082427978516, "loss/hidden": 0.9921875, "loss/logits": 0.1551104187965393, "loss/reg": 0.00018524785991758108, "step": 274 }, { "epoch": 0.034375, "grad_norm": 2.7294118404388428, "grad_norm_var": 4.743793641432837, "learning_rate": 0.0001, "loss": 1.2939, "loss/crossentropy": 2.7086288928985596, "loss/hidden": 1.109375, "loss/logits": 0.18263299763202667, "loss/reg": 0.00018520389858167619, "step": 275 }, { "epoch": 0.0345, "grad_norm": 4.2614545822143555, "grad_norm_var": 4.722892075276445, "learning_rate": 0.0001, "loss": 2.152, "loss/crossentropy": 2.3208584785461426, "loss/hidden": 1.7578125, "loss/logits": 0.3923119604587555, "loss/reg": 0.0001851657871156931, "step": 276 }, { "epoch": 0.034625, "grad_norm": 3.1579630374908447, "grad_norm_var": 4.671438964356158, "learning_rate": 0.0001, "loss": 1.5186, "loss/crossentropy": 2.52303409576416, "loss/hidden": 1.28125, "loss/logits": 0.23553629219532013, "loss/reg": 0.00018512809765525162, "step": 277 }, { "epoch": 0.03475, "grad_norm": 2.9455530643463135, "grad_norm_var": 4.653460522047108, "learning_rate": 0.0001, "loss": 1.2198, "loss/crossentropy": 2.64249324798584, "loss/hidden": 1.0625, "loss/logits": 0.1554383635520935, "loss/reg": 0.00018509359506424516, "step": 278 }, { "epoch": 0.034875, "grad_norm": 2.588207721710205, "grad_norm_var": 0.4115949243025471, "learning_rate": 0.0001, "loss": 1.3877, "loss/crossentropy": 2.829423427581787, "loss/hidden": 1.171875, "loss/logits": 0.2140059471130371, "loss/reg": 0.00018506502965465188, "step": 279 }, { "epoch": 0.035, "grad_norm": 3.4081485271453857, "grad_norm_var": 0.41351468625660626, "learning_rate": 0.0001, "loss": 1.436, "loss/crossentropy": 2.624629259109497, "loss/hidden": 1.2265625, "loss/logits": 0.2076188027858734, "loss/reg": 0.00018502252351026982, "step": 280 }, { "epoch": 0.035125, "grad_norm": 3.1615681648254395, "grad_norm_var": 0.39283411950296265, "learning_rate": 0.0001, "loss": 1.3441, "loss/crossentropy": 2.465009927749634, "loss/hidden": 1.1484375, "loss/logits": 0.19385108351707458, "loss/reg": 0.00018499059660825878, "step": 281 }, { "epoch": 0.03525, "grad_norm": 3.2208797931671143, "grad_norm_var": 0.2925862508398, "learning_rate": 0.0001, "loss": 1.9994, "loss/crossentropy": 2.230750322341919, "loss/hidden": 1.578125, "loss/logits": 0.41937941312789917, "loss/reg": 0.00018494235700927675, "step": 282 }, { "epoch": 0.035375, "grad_norm": 3.2623984813690186, "grad_norm_var": 0.292657471443624, "learning_rate": 0.0001, "loss": 1.5126, "loss/crossentropy": 2.5379672050476074, "loss/hidden": 1.2734375, "loss/logits": 0.23736032843589783, "loss/reg": 0.00018488496425561607, "step": 283 }, { "epoch": 0.0355, "grad_norm": 2.368196487426758, "grad_norm_var": 0.2982812772165668, "learning_rate": 0.0001, "loss": 1.3005, "loss/crossentropy": 2.516019344329834, "loss/hidden": 1.109375, "loss/logits": 0.18929743766784668, "loss/reg": 0.0001848430110840127, "step": 284 }, { "epoch": 0.035625, "grad_norm": 3.191399335861206, "grad_norm_var": 0.24409669271128276, "learning_rate": 0.0001, "loss": 1.2823, "loss/crossentropy": 2.751084089279175, "loss/hidden": 1.109375, "loss/logits": 0.17108574509620667, "loss/reg": 0.00018479586287867278, "step": 285 }, { "epoch": 0.03575, "grad_norm": 2.592566967010498, "grad_norm_var": 0.2544086356402346, "learning_rate": 0.0001, "loss": 1.3514, "loss/crossentropy": 2.5993878841400146, "loss/hidden": 1.1171875, "loss/logits": 0.23240423202514648, "loss/reg": 0.0001847518578870222, "step": 286 }, { "epoch": 0.035875, "grad_norm": 3.8178586959838867, "grad_norm_var": 0.2500656389811674, "learning_rate": 0.0001, "loss": 1.3704, "loss/crossentropy": 2.688004970550537, "loss/hidden": 1.1875, "loss/logits": 0.18104243278503418, "loss/reg": 0.00018470516079105437, "step": 287 }, { "epoch": 0.036, "grad_norm": 2.0925023555755615, "grad_norm_var": 0.2864185440912022, "learning_rate": 0.0001, "loss": 1.2565, "loss/crossentropy": 2.4902563095092773, "loss/hidden": 1.09375, "loss/logits": 0.16092851758003235, "loss/reg": 0.0001846601371653378, "step": 288 }, { "epoch": 0.036125, "grad_norm": 2.821599245071411, "grad_norm_var": 0.28794847130561624, "learning_rate": 0.0001, "loss": 1.5127, "loss/crossentropy": 2.331846237182617, "loss/hidden": 1.2578125, "loss/logits": 0.25302040576934814, "loss/reg": 0.00018461896979715675, "step": 289 }, { "epoch": 0.03625, "grad_norm": 2.387387275695801, "grad_norm_var": 0.31157863588131224, "learning_rate": 0.0001, "loss": 1.2607, "loss/crossentropy": 2.76906681060791, "loss/hidden": 1.0703125, "loss/logits": 0.18852418661117554, "loss/reg": 0.00018456355610396713, "step": 290 }, { "epoch": 0.036375, "grad_norm": 3.449770450592041, "grad_norm_var": 0.3179789227700856, "learning_rate": 0.0001, "loss": 1.7591, "loss/crossentropy": 1.8213207721710205, "loss/hidden": 1.5390625, "loss/logits": 0.21822357177734375, "loss/reg": 0.00018452556105330586, "step": 291 }, { "epoch": 0.0365, "grad_norm": 2.499556303024292, "grad_norm_var": 0.2263369840310491, "learning_rate": 0.0001, "loss": 1.2925, "loss/crossentropy": 2.6276936531066895, "loss/hidden": 1.109375, "loss/logits": 0.18123921751976013, "loss/reg": 0.0001844725920818746, "step": 292 }, { "epoch": 0.036625, "grad_norm": 2.115734815597534, "grad_norm_var": 0.2632914348592083, "learning_rate": 0.0001, "loss": 1.2089, "loss/crossentropy": 2.6672511100769043, "loss/hidden": 1.03125, "loss/logits": 0.1757870316505432, "loss/reg": 0.0001844176003942266, "step": 293 }, { "epoch": 0.03675, "grad_norm": 2.684098958969116, "grad_norm_var": 0.2649372545619843, "learning_rate": 0.0001, "loss": 1.2995, "loss/crossentropy": 2.5663037300109863, "loss/hidden": 1.125, "loss/logits": 0.172622948884964, "loss/reg": 0.00018438571714796126, "step": 294 }, { "epoch": 0.036875, "grad_norm": 7.823673248291016, "grad_norm_var": 1.792621724898212, "learning_rate": 0.0001, "loss": 1.3821, "loss/crossentropy": 2.6714284420013428, "loss/hidden": 1.15625, "loss/logits": 0.22403180599212646, "loss/reg": 0.00018433824880048633, "step": 295 }, { "epoch": 0.037, "grad_norm": 2.6135120391845703, "grad_norm_var": 1.808029304785425, "learning_rate": 0.0001, "loss": 1.2104, "loss/crossentropy": 2.68971848487854, "loss/hidden": 1.0390625, "loss/logits": 0.16948801279067993, "loss/reg": 0.00018430246564093977, "step": 296 }, { "epoch": 0.037125, "grad_norm": 8.026154518127441, "grad_norm_var": 3.3065969805558173, "learning_rate": 0.0001, "loss": 2.1847, "loss/crossentropy": 2.3800106048583984, "loss/hidden": 1.9453125, "loss/logits": 0.2375316321849823, "loss/reg": 0.0001842674391809851, "step": 297 }, { "epoch": 0.03725, "grad_norm": 2.896569013595581, "grad_norm_var": 3.3224491377570446, "learning_rate": 0.0001, "loss": 1.3748, "loss/crossentropy": 2.3903732299804688, "loss/hidden": 1.1953125, "loss/logits": 0.17761950194835663, "loss/reg": 0.00018421628919895738, "step": 298 }, { "epoch": 0.037375, "grad_norm": 3.096365451812744, "grad_norm_var": 3.327554446166753, "learning_rate": 0.0001, "loss": 1.4918, "loss/crossentropy": 2.355905532836914, "loss/hidden": 1.2734375, "loss/logits": 0.2165393829345703, "loss/reg": 0.00018417388491798192, "step": 299 }, { "epoch": 0.0375, "grad_norm": 5.26882266998291, "grad_norm_var": 3.452496381081827, "learning_rate": 0.0001, "loss": 1.9899, "loss/crossentropy": 2.9516849517822266, "loss/hidden": 1.6875, "loss/logits": 0.3005185127258301, "loss/reg": 0.0001841239572968334, "step": 300 }, { "epoch": 0.037625, "grad_norm": 3.068439483642578, "grad_norm_var": 3.4599122750924143, "learning_rate": 0.0001, "loss": 1.3886, "loss/crossentropy": 2.6546504497528076, "loss/hidden": 1.1796875, "loss/logits": 0.20704253017902374, "loss/reg": 0.00018408475443720818, "step": 301 }, { "epoch": 0.03775, "grad_norm": 2.2956972122192383, "grad_norm_var": 3.504442894615314, "learning_rate": 0.0001, "loss": 1.1094, "loss/crossentropy": 2.4977052211761475, "loss/hidden": 0.96875, "loss/logits": 0.13884802162647247, "loss/reg": 0.00018404283036943525, "step": 302 }, { "epoch": 0.037875, "grad_norm": 2.5937392711639404, "grad_norm_var": 3.5559874858295015, "learning_rate": 0.0001, "loss": 1.3949, "loss/crossentropy": 2.379765033721924, "loss/hidden": 1.21875, "loss/logits": 0.17428620159626007, "loss/reg": 0.00018399336840957403, "step": 303 }, { "epoch": 0.038, "grad_norm": 3.512799024581909, "grad_norm_var": 3.4186760491291084, "learning_rate": 0.0001, "loss": 1.381, "loss/crossentropy": 2.4689581394195557, "loss/hidden": 1.1640625, "loss/logits": 0.2151029407978058, "loss/reg": 0.00018394750077277422, "step": 304 }, { "epoch": 0.038125, "grad_norm": 2.610177755355835, "grad_norm_var": 3.442626566538619, "learning_rate": 0.0001, "loss": 1.2278, "loss/crossentropy": 2.4508001804351807, "loss/hidden": 1.0625, "loss/logits": 0.1634678691625595, "loss/reg": 0.00018390185141470283, "step": 305 }, { "epoch": 0.03825, "grad_norm": 2.4446496963500977, "grad_norm_var": 3.433886969311868, "learning_rate": 0.0001, "loss": 1.5023, "loss/crossentropy": 2.110161066055298, "loss/hidden": 1.3125, "loss/logits": 0.18798431754112244, "loss/reg": 0.00018385711882729083, "step": 306 }, { "epoch": 0.038375, "grad_norm": 2.9203567504882812, "grad_norm_var": 3.4593607482629065, "learning_rate": 0.0001, "loss": 1.2248, "loss/crossentropy": 2.6589579582214355, "loss/hidden": 1.0625, "loss/logits": 0.16043730080127716, "loss/reg": 0.00018381779955234379, "step": 307 }, { "epoch": 0.0385, "grad_norm": 3.625650644302368, "grad_norm_var": 3.3839899608280857, "learning_rate": 0.0001, "loss": 1.3888, "loss/crossentropy": 2.7281792163848877, "loss/hidden": 1.1171875, "loss/logits": 0.269817590713501, "loss/reg": 0.00018376816296949983, "step": 308 }, { "epoch": 0.038625, "grad_norm": 2.889061689376831, "grad_norm_var": 3.268347098659014, "learning_rate": 0.0001, "loss": 1.1888, "loss/crossentropy": 2.523449420928955, "loss/hidden": 1.0078125, "loss/logits": 0.17915961146354675, "loss/reg": 0.00018371775513514876, "step": 309 }, { "epoch": 0.03875, "grad_norm": 2.768836736679077, "grad_norm_var": 3.257904120325863, "learning_rate": 0.0001, "loss": 1.3947, "loss/crossentropy": 2.3002350330352783, "loss/hidden": 1.1875, "loss/logits": 0.20538243651390076, "loss/reg": 0.00018366229778621346, "step": 310 }, { "epoch": 0.038875, "grad_norm": 2.4714229106903076, "grad_norm_var": 2.0722741056598655, "learning_rate": 0.0001, "loss": 1.3588, "loss/crossentropy": 2.5036730766296387, "loss/hidden": 1.1484375, "loss/logits": 0.2085750699043274, "loss/reg": 0.00018361561524216086, "step": 311 }, { "epoch": 0.039, "grad_norm": 2.3803367614746094, "grad_norm_var": 2.0976025308533433, "learning_rate": 0.0001, "loss": 1.3446, "loss/crossentropy": 2.5774199962615967, "loss/hidden": 1.15625, "loss/logits": 0.18654365837574005, "loss/reg": 0.00018356091459281743, "step": 312 }, { "epoch": 0.039125, "grad_norm": 2.1344940662384033, "grad_norm_var": 0.557820051408968, "learning_rate": 0.0001, "loss": 1.1236, "loss/crossentropy": 2.492638349533081, "loss/hidden": 0.96875, "loss/logits": 0.15305551886558533, "loss/reg": 0.00018350353639107198, "step": 313 }, { "epoch": 0.03925, "grad_norm": 2.3380954265594482, "grad_norm_var": 0.5802561079704179, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.538328170776367, "loss/hidden": 1.0859375, "loss/logits": 0.1786879152059555, "loss/reg": 0.00018345742137171328, "step": 314 }, { "epoch": 0.039375, "grad_norm": 2.4094135761260986, "grad_norm_var": 0.5918726782285414, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.4620587825775146, "loss/hidden": 1.1328125, "loss/logits": 0.1925506591796875, "loss/reg": 0.00018340005772188306, "step": 315 }, { "epoch": 0.0395, "grad_norm": 2.4094653129577637, "grad_norm_var": 0.18384264866255365, "learning_rate": 0.0001, "loss": 1.6038, "loss/crossentropy": 2.343863010406494, "loss/hidden": 1.3125, "loss/logits": 0.28949666023254395, "loss/reg": 0.00018334249034523964, "step": 316 }, { "epoch": 0.039625, "grad_norm": 2.327509880065918, "grad_norm_var": 0.17973404957112868, "learning_rate": 0.0001, "loss": 1.2969, "loss/crossentropy": 2.6910533905029297, "loss/hidden": 1.1171875, "loss/logits": 0.17789438366889954, "loss/reg": 0.0001832786510931328, "step": 317 }, { "epoch": 0.03975, "grad_norm": 1.9041651487350464, "grad_norm_var": 0.2069358760498758, "learning_rate": 0.0001, "loss": 1.1054, "loss/crossentropy": 2.636301279067993, "loss/hidden": 0.96875, "loss/logits": 0.13483907282352448, "loss/reg": 0.00018322949472349137, "step": 318 }, { "epoch": 0.039875, "grad_norm": 2.493028163909912, "grad_norm_var": 0.2077715093556349, "learning_rate": 0.0001, "loss": 1.3427, "loss/crossentropy": 2.3860602378845215, "loss/hidden": 1.15625, "loss/logits": 0.18460425734519958, "loss/reg": 0.00018318284128326923, "step": 319 }, { "epoch": 0.04, "grad_norm": 2.566598892211914, "grad_norm_var": 0.14887985654726912, "learning_rate": 0.0001, "loss": 1.1799, "loss/crossentropy": 2.562800884246826, "loss/hidden": 1.0234375, "loss/logits": 0.15464608371257782, "loss/reg": 0.00018314005865249783, "step": 320 }, { "epoch": 0.040125, "grad_norm": 2.3247175216674805, "grad_norm_var": 0.15142847186754452, "learning_rate": 0.0001, "loss": 1.138, "loss/crossentropy": 2.6008517742156982, "loss/hidden": 0.97265625, "loss/logits": 0.16352114081382751, "loss/reg": 0.00018309340521227568, "step": 321 }, { "epoch": 0.04025, "grad_norm": 2.961496114730835, "grad_norm_var": 0.1625533330376787, "learning_rate": 0.0001, "loss": 1.4689, "loss/crossentropy": 1.9118317365646362, "loss/hidden": 1.296875, "loss/logits": 0.17024032771587372, "loss/reg": 0.00018305043340660632, "step": 322 }, { "epoch": 0.040375, "grad_norm": 3.0667834281921387, "grad_norm_var": 0.17097196220394337, "learning_rate": 0.0001, "loss": 1.7235, "loss/crossentropy": 2.053375482559204, "loss/hidden": 1.421875, "loss/logits": 0.2998199462890625, "loss/reg": 0.00018301077943760902, "step": 323 }, { "epoch": 0.0405, "grad_norm": 2.930929183959961, "grad_norm_var": 0.10306917410381727, "learning_rate": 0.0001, "loss": 1.2964, "loss/crossentropy": 2.5874621868133545, "loss/hidden": 1.1015625, "loss/logits": 0.19297093152999878, "loss/reg": 0.00018295719928573817, "step": 324 }, { "epoch": 0.040625, "grad_norm": 2.871757745742798, "grad_norm_var": 0.1022445182394837, "learning_rate": 0.0001, "loss": 1.2449, "loss/crossentropy": 2.6085433959960938, "loss/hidden": 1.078125, "loss/logits": 0.16499356925487518, "loss/reg": 0.0001828953973017633, "step": 325 }, { "epoch": 0.04075, "grad_norm": 2.489987850189209, "grad_norm_var": 0.09794334325425678, "learning_rate": 0.0001, "loss": 1.3676, "loss/crossentropy": 2.4801414012908936, "loss/hidden": 1.171875, "loss/logits": 0.1939350813627243, "loss/reg": 0.00018283820827491581, "step": 326 }, { "epoch": 0.040875, "grad_norm": 2.536003828048706, "grad_norm_var": 0.09791477775173399, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.7262425422668457, "loss/hidden": 1.109375, "loss/logits": 0.21148526668548584, "loss/reg": 0.0001827785454224795, "step": 327 }, { "epoch": 0.041, "grad_norm": 2.0945851802825928, "grad_norm_var": 0.10792211144253508, "learning_rate": 0.0001, "loss": 1.1186, "loss/crossentropy": 2.4119601249694824, "loss/hidden": 0.97265625, "loss/logits": 0.1441338062286377, "loss/reg": 0.0001827288360800594, "step": 328 }, { "epoch": 0.041125, "grad_norm": 2.836939811706543, "grad_norm_var": 0.10535360002502277, "learning_rate": 0.0001, "loss": 1.4351, "loss/crossentropy": 2.3377010822296143, "loss/hidden": 1.21875, "loss/logits": 0.21447591483592987, "loss/reg": 0.00018267397535964847, "step": 329 }, { "epoch": 0.04125, "grad_norm": 2.326241970062256, "grad_norm_var": 0.10567372742739559, "learning_rate": 0.0001, "loss": 1.1899, "loss/crossentropy": 2.7874836921691895, "loss/hidden": 1.015625, "loss/logits": 0.17246156930923462, "loss/reg": 0.00018262714729644358, "step": 330 }, { "epoch": 0.041375, "grad_norm": 2.6195242404937744, "grad_norm_var": 0.10493277845914116, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.6836705207824707, "loss/hidden": 1.25, "loss/logits": 0.20644301176071167, "loss/reg": 0.0001825743674999103, "step": 331 }, { "epoch": 0.0415, "grad_norm": 2.837834119796753, "grad_norm_var": 0.10851849947722905, "learning_rate": 0.0001, "loss": 1.3683, "loss/crossentropy": 2.336994171142578, "loss/hidden": 1.15625, "loss/logits": 0.21025338768959045, "loss/reg": 0.0001825322542572394, "step": 332 }, { "epoch": 0.041625, "grad_norm": 2.7226598262786865, "grad_norm_var": 0.10527721486086339, "learning_rate": 0.0001, "loss": 1.1225, "loss/crossentropy": 2.621798276901245, "loss/hidden": 0.97265625, "loss/logits": 0.1480618715286255, "loss/reg": 0.00018248235573992133, "step": 333 }, { "epoch": 0.04175, "grad_norm": 1.9014396667480469, "grad_norm_var": 0.10553016347722419, "learning_rate": 0.0001, "loss": 1.121, "loss/crossentropy": 2.642059087753296, "loss/hidden": 0.96875, "loss/logits": 0.15041938424110413, "loss/reg": 0.00018242868827655911, "step": 334 }, { "epoch": 0.041875, "grad_norm": 2.2809882164001465, "grad_norm_var": 0.11133012136604989, "learning_rate": 0.0001, "loss": 1.3509, "loss/crossentropy": 2.59220027923584, "loss/hidden": 1.140625, "loss/logits": 0.20842507481575012, "loss/reg": 0.00018237272161059082, "step": 335 }, { "epoch": 0.042, "grad_norm": 2.722670793533325, "grad_norm_var": 0.11245856535336524, "learning_rate": 0.0001, "loss": 1.3534, "loss/crossentropy": 2.7409770488739014, "loss/hidden": 1.15625, "loss/logits": 0.195334792137146, "loss/reg": 0.00018231497961096466, "step": 336 }, { "epoch": 0.042125, "grad_norm": 2.7385120391845703, "grad_norm_var": 0.1082322741184413, "learning_rate": 0.0001, "loss": 1.2894, "loss/crossentropy": 2.5753767490386963, "loss/hidden": 1.1171875, "loss/logits": 0.17039254307746887, "loss/reg": 0.00018226687097921968, "step": 337 }, { "epoch": 0.04225, "grad_norm": 2.278662919998169, "grad_norm_var": 0.106386719047498, "learning_rate": 0.0001, "loss": 1.4188, "loss/crossentropy": 2.519120693206787, "loss/hidden": 1.203125, "loss/logits": 0.21385297179222107, "loss/reg": 0.00018221234495285898, "step": 338 }, { "epoch": 0.042375, "grad_norm": 2.7543537616729736, "grad_norm_var": 0.09214567617970199, "learning_rate": 0.0001, "loss": 1.2684, "loss/crossentropy": 2.615980625152588, "loss/hidden": 1.09375, "loss/logits": 0.17285585403442383, "loss/reg": 0.00018216087482869625, "step": 339 }, { "epoch": 0.0425, "grad_norm": 2.740844249725342, "grad_norm_var": 0.08497608623963113, "learning_rate": 0.0001, "loss": 1.3717, "loss/crossentropy": 2.3156495094299316, "loss/hidden": 1.1875, "loss/logits": 0.18235422670841217, "loss/reg": 0.000182102681719698, "step": 340 }, { "epoch": 0.042625, "grad_norm": 2.183875799179077, "grad_norm_var": 0.08476970381204461, "learning_rate": 0.0001, "loss": 1.2376, "loss/crossentropy": 2.5132007598876953, "loss/hidden": 1.0625, "loss/logits": 0.1733236014842987, "loss/reg": 0.00018205813830718398, "step": 341 }, { "epoch": 0.04275, "grad_norm": 2.5386435985565186, "grad_norm_var": 0.0848263064399248, "learning_rate": 0.0001, "loss": 1.3462, "loss/crossentropy": 2.4127066135406494, "loss/hidden": 1.1796875, "loss/logits": 0.16472230851650238, "loss/reg": 0.00018201676721218973, "step": 342 }, { "epoch": 0.042875, "grad_norm": 2.3057498931884766, "grad_norm_var": 0.08725284383438421, "learning_rate": 0.0001, "loss": 1.2497, "loss/crossentropy": 2.184678316116333, "loss/hidden": 1.0625, "loss/logits": 0.18537552654743195, "loss/reg": 0.0001819680182961747, "step": 343 }, { "epoch": 0.043, "grad_norm": 2.218367099761963, "grad_norm_var": 0.0816395413206275, "learning_rate": 0.0001, "loss": 1.1513, "loss/crossentropy": 2.449817419052124, "loss/hidden": 0.99609375, "loss/logits": 0.15338429808616638, "loss/reg": 0.00018192335846833885, "step": 344 }, { "epoch": 0.043125, "grad_norm": 2.2711985111236572, "grad_norm_var": 0.07626184388881233, "learning_rate": 0.0001, "loss": 1.0171, "loss/crossentropy": 2.509692907333374, "loss/hidden": 0.89453125, "loss/logits": 0.12079230695962906, "loss/reg": 0.00018188220565207303, "step": 345 }, { "epoch": 0.04325, "grad_norm": 2.787581443786621, "grad_norm_var": 0.08102267837075464, "learning_rate": 0.0001, "loss": 1.2428, "loss/crossentropy": 2.3301100730895996, "loss/hidden": 1.0546875, "loss/logits": 0.18630483746528625, "loss/reg": 0.00018185042426921427, "step": 346 }, { "epoch": 0.043375, "grad_norm": 2.0392682552337646, "grad_norm_var": 0.09234946001928937, "learning_rate": 0.0001, "loss": 1.0241, "loss/crossentropy": 2.588428020477295, "loss/hidden": 0.89453125, "loss/logits": 0.1277570128440857, "loss/reg": 0.00018179781909566373, "step": 347 }, { "epoch": 0.0435, "grad_norm": 2.7076616287231445, "grad_norm_var": 0.08681018440338353, "learning_rate": 0.0001, "loss": 1.4478, "loss/crossentropy": 2.6002542972564697, "loss/hidden": 1.2265625, "loss/logits": 0.2193761169910431, "loss/reg": 0.0001817511219996959, "step": 348 }, { "epoch": 0.043625, "grad_norm": 2.353895425796509, "grad_norm_var": 0.08187996873415007, "learning_rate": 0.0001, "loss": 1.2846, "loss/crossentropy": 2.441397190093994, "loss/hidden": 1.09375, "loss/logits": 0.18899187445640564, "loss/reg": 0.0001816989533836022, "step": 349 }, { "epoch": 0.04375, "grad_norm": 2.078312397003174, "grad_norm_var": 0.07145312501918302, "learning_rate": 0.0001, "loss": 1.2337, "loss/crossentropy": 2.320770263671875, "loss/hidden": 1.0625, "loss/logits": 0.1694108545780182, "loss/reg": 0.00018164912762586027, "step": 350 }, { "epoch": 0.043875, "grad_norm": 2.455012321472168, "grad_norm_var": 0.06971347306554586, "learning_rate": 0.0001, "loss": 1.3011, "loss/crossentropy": 2.6782124042510986, "loss/hidden": 1.109375, "loss/logits": 0.1899489462375641, "loss/reg": 0.00018160381296183914, "step": 351 }, { "epoch": 0.044, "grad_norm": 3.1685619354248047, "grad_norm_var": 0.09844486312005239, "learning_rate": 0.0001, "loss": 1.3515, "loss/crossentropy": 2.8243625164031982, "loss/hidden": 1.140625, "loss/logits": 0.20903226733207703, "loss/reg": 0.00018155867292080075, "step": 352 }, { "epoch": 0.044125, "grad_norm": 3.2489728927612305, "grad_norm_var": 0.13257830736299236, "learning_rate": 0.0001, "loss": 1.5585, "loss/crossentropy": 2.8111252784729004, "loss/hidden": 1.3125, "loss/logits": 0.24414658546447754, "loss/reg": 0.0001815159630496055, "step": 353 }, { "epoch": 0.04425, "grad_norm": 3.3230886459350586, "grad_norm_var": 0.16879235535394155, "learning_rate": 0.0001, "loss": 1.4469, "loss/crossentropy": 2.5517845153808594, "loss/hidden": 1.2109375, "loss/logits": 0.23417417705059052, "loss/reg": 0.00018148007802665234, "step": 354 }, { "epoch": 0.044375, "grad_norm": 2.2562191486358643, "grad_norm_var": 0.17228650926600247, "learning_rate": 0.0001, "loss": 1.1829, "loss/crossentropy": 2.422496795654297, "loss/hidden": 1.015625, "loss/logits": 0.16546528041362762, "loss/reg": 0.00018143345369026065, "step": 355 }, { "epoch": 0.0445, "grad_norm": 2.7640106678009033, "grad_norm_var": 0.17293323899421179, "learning_rate": 0.0001, "loss": 1.167, "loss/crossentropy": 2.828662157058716, "loss/hidden": 1.0, "loss/logits": 0.16519883275032043, "loss/reg": 0.00018138332234229892, "step": 356 }, { "epoch": 0.044625, "grad_norm": 2.3983893394470215, "grad_norm_var": 0.16551544063244136, "learning_rate": 0.0001, "loss": 1.1707, "loss/crossentropy": 2.605727434158325, "loss/hidden": 1.0078125, "loss/logits": 0.16108599305152893, "loss/reg": 0.0001813305716495961, "step": 357 }, { "epoch": 0.04475, "grad_norm": 3.6282835006713867, "grad_norm_var": 0.2370290852634845, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.6275181770324707, "loss/hidden": 1.15625, "loss/logits": 0.21812118589878082, "loss/reg": 0.00018128403462469578, "step": 358 }, { "epoch": 0.044875, "grad_norm": 3.763258934020996, "grad_norm_var": 0.30770300622119723, "learning_rate": 0.0001, "loss": 1.5858, "loss/crossentropy": 2.6596243381500244, "loss/hidden": 1.2890625, "loss/logits": 0.29489603638648987, "loss/reg": 0.00018124622874893248, "step": 359 }, { "epoch": 0.045, "grad_norm": 2.6321170330047607, "grad_norm_var": 0.29092860453977115, "learning_rate": 0.0001, "loss": 1.1651, "loss/crossentropy": 2.679645299911499, "loss/hidden": 1.015625, "loss/logits": 0.1476357877254486, "loss/reg": 0.00018119592277798802, "step": 360 }, { "epoch": 0.045125, "grad_norm": 2.794433832168579, "grad_norm_var": 0.27517751652292094, "learning_rate": 0.0001, "loss": 1.291, "loss/crossentropy": 2.46774959564209, "loss/hidden": 1.09375, "loss/logits": 0.1954488754272461, "loss/reg": 0.00018115356215275824, "step": 361 }, { "epoch": 0.04525, "grad_norm": 3.043492078781128, "grad_norm_var": 0.2797019428924839, "learning_rate": 0.0001, "loss": 1.5628, "loss/crossentropy": 2.5124197006225586, "loss/hidden": 1.328125, "loss/logits": 0.23281781375408173, "loss/reg": 0.00018111053213942796, "step": 362 }, { "epoch": 0.045375, "grad_norm": 2.155243158340454, "grad_norm_var": 0.2689192978759799, "learning_rate": 0.0001, "loss": 1.2775, "loss/crossentropy": 2.5151214599609375, "loss/hidden": 1.0859375, "loss/logits": 0.1897062063217163, "loss/reg": 0.00018105284834746271, "step": 363 }, { "epoch": 0.0455, "grad_norm": 2.609884023666382, "grad_norm_var": 0.2706969753359004, "learning_rate": 0.0001, "loss": 1.3514, "loss/crossentropy": 2.506584882736206, "loss/hidden": 1.1484375, "loss/logits": 0.20117563009262085, "loss/reg": 0.00018099797307513654, "step": 364 }, { "epoch": 0.045625, "grad_norm": 2.342069387435913, "grad_norm_var": 0.2713966376478879, "learning_rate": 0.0001, "loss": 1.3665, "loss/crossentropy": 2.7815566062927246, "loss/hidden": 1.15625, "loss/logits": 0.20847788453102112, "loss/reg": 0.00018094982078764588, "step": 365 }, { "epoch": 0.04575, "grad_norm": 3.0326218605041504, "grad_norm_var": 0.23758998657840114, "learning_rate": 0.0001, "loss": 1.3364, "loss/crossentropy": 2.523134469985962, "loss/hidden": 1.1328125, "loss/logits": 0.20178905129432678, "loss/reg": 0.0001809141831472516, "step": 366 }, { "epoch": 0.045875, "grad_norm": 10.035235404968262, "grad_norm_var": 3.4286245913836826, "learning_rate": 0.0001, "loss": 1.7972, "loss/crossentropy": 2.548880100250244, "loss/hidden": 1.453125, "loss/logits": 0.34224259853363037, "loss/reg": 0.0001808816014090553, "step": 367 }, { "epoch": 0.046, "grad_norm": 2.35805606842041, "grad_norm_var": 3.4865601240502655, "learning_rate": 0.0001, "loss": 1.3183, "loss/crossentropy": 2.6983084678649902, "loss/hidden": 1.1328125, "loss/logits": 0.1837082803249359, "loss/reg": 0.00018084091425407678, "step": 368 }, { "epoch": 0.046125, "grad_norm": 2.228501319885254, "grad_norm_var": 3.555062224897286, "learning_rate": 0.0001, "loss": 1.182, "loss/crossentropy": 2.2725465297698975, "loss/hidden": 1.015625, "loss/logits": 0.16455045342445374, "loss/reg": 0.0001807970111258328, "step": 369 }, { "epoch": 0.04625, "grad_norm": 3.036968469619751, "grad_norm_var": 3.555876206735041, "learning_rate": 0.0001, "loss": 1.3473, "loss/crossentropy": 2.462448835372925, "loss/hidden": 1.171875, "loss/logits": 0.1736328899860382, "loss/reg": 0.00018076066044159234, "step": 370 }, { "epoch": 0.046375, "grad_norm": 2.4165427684783936, "grad_norm_var": 3.5374699186157517, "learning_rate": 0.0001, "loss": 1.3649, "loss/crossentropy": 2.7596888542175293, "loss/hidden": 1.15625, "loss/logits": 0.20686297118663788, "loss/reg": 0.00018071448721457273, "step": 371 }, { "epoch": 0.0465, "grad_norm": 3.086303949356079, "grad_norm_var": 3.525121419257785, "learning_rate": 0.0001, "loss": 1.5493, "loss/crossentropy": 2.500018835067749, "loss/hidden": 1.28125, "loss/logits": 0.26621347665786743, "loss/reg": 0.00018066992925014347, "step": 372 }, { "epoch": 0.046625, "grad_norm": 2.4726622104644775, "grad_norm_var": 3.5173041221135075, "learning_rate": 0.0001, "loss": 1.136, "loss/crossentropy": 2.5688652992248535, "loss/hidden": 0.9921875, "loss/logits": 0.1419672667980194, "loss/reg": 0.00018063545576296747, "step": 373 }, { "epoch": 0.04675, "grad_norm": 2.6667373180389404, "grad_norm_var": 3.523672237020999, "learning_rate": 0.0001, "loss": 1.361, "loss/crossentropy": 2.5163731575012207, "loss/hidden": 1.140625, "loss/logits": 0.21853341162204742, "loss/reg": 0.0001806024374673143, "step": 374 }, { "epoch": 0.046875, "grad_norm": 2.3773033618927, "grad_norm_var": 3.5335662465775757, "learning_rate": 0.0001, "loss": 1.1721, "loss/crossentropy": 2.8756470680236816, "loss/hidden": 1.0078125, "loss/logits": 0.16248445212841034, "loss/reg": 0.00018057989655062556, "step": 375 }, { "epoch": 0.047, "grad_norm": 2.7833797931671143, "grad_norm_var": 3.5259529031790064, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.7149651050567627, "loss/hidden": 1.140625, "loss/logits": 0.20630021393299103, "loss/reg": 0.0001805323117878288, "step": 376 }, { "epoch": 0.047125, "grad_norm": 2.4678053855895996, "grad_norm_var": 3.545491291634365, "learning_rate": 0.0001, "loss": 1.4444, "loss/crossentropy": 2.2349612712860107, "loss/hidden": 1.2265625, "loss/logits": 0.21606594324111938, "loss/reg": 0.00018048338824883103, "step": 377 }, { "epoch": 0.04725, "grad_norm": 2.368433952331543, "grad_norm_var": 3.5763182105236933, "learning_rate": 0.0001, "loss": 1.2423, "loss/crossentropy": 2.3095428943634033, "loss/hidden": 1.078125, "loss/logits": 0.1624036729335785, "loss/reg": 0.00018044162425212562, "step": 378 }, { "epoch": 0.047375, "grad_norm": 4.887925148010254, "grad_norm_var": 3.7252780043467792, "learning_rate": 0.0001, "loss": 1.357, "loss/crossentropy": 2.2361862659454346, "loss/hidden": 1.15625, "loss/logits": 0.1989278942346573, "loss/reg": 0.0001804078638087958, "step": 379 }, { "epoch": 0.0475, "grad_norm": 2.6829822063446045, "grad_norm_var": 3.7198784549945154, "learning_rate": 0.0001, "loss": 1.3576, "loss/crossentropy": 2.5164873600006104, "loss/hidden": 1.1484375, "loss/logits": 0.20736883580684662, "loss/reg": 0.00018035774701274931, "step": 380 }, { "epoch": 0.047625, "grad_norm": 2.9376659393310547, "grad_norm_var": 3.673702627279779, "learning_rate": 0.0001, "loss": 1.2767, "loss/crossentropy": 2.5471227169036865, "loss/hidden": 1.09375, "loss/logits": 0.18117238581180573, "loss/reg": 0.000180321978405118, "step": 381 }, { "epoch": 0.04775, "grad_norm": 3.1138343811035156, "grad_norm_var": 3.671869876252352, "learning_rate": 0.0001, "loss": 1.5706, "loss/crossentropy": 2.5659494400024414, "loss/hidden": 1.28125, "loss/logits": 0.2875041365623474, "loss/reg": 0.00018027149781119078, "step": 382 }, { "epoch": 0.047875, "grad_norm": 2.51397705078125, "grad_norm_var": 0.3979920239186678, "learning_rate": 0.0001, "loss": 1.3404, "loss/crossentropy": 2.6721858978271484, "loss/hidden": 1.140625, "loss/logits": 0.19799375534057617, "loss/reg": 0.0001802304177545011, "step": 383 }, { "epoch": 0.048, "grad_norm": 2.6631741523742676, "grad_norm_var": 0.3868506457320857, "learning_rate": 0.0001, "loss": 1.1959, "loss/crossentropy": 2.8570642471313477, "loss/hidden": 1.046875, "loss/logits": 0.1472700834274292, "loss/reg": 0.00018018792616203427, "step": 384 }, { "epoch": 0.048125, "grad_norm": 2.225167989730835, "grad_norm_var": 0.38710267816574984, "learning_rate": 0.0001, "loss": 1.3387, "loss/crossentropy": 2.449850082397461, "loss/hidden": 1.140625, "loss/logits": 0.19622622430324554, "loss/reg": 0.00018013913359027356, "step": 385 }, { "epoch": 0.04825, "grad_norm": 3.642057418823242, "grad_norm_var": 0.429604118678219, "learning_rate": 0.0001, "loss": 1.5255, "loss/crossentropy": 3.060318946838379, "loss/hidden": 1.296875, "loss/logits": 0.2267942726612091, "loss/reg": 0.00018009122868534178, "step": 386 }, { "epoch": 0.048375, "grad_norm": 2.4205105304718018, "grad_norm_var": 0.4293855111770417, "learning_rate": 0.0001, "loss": 1.4919, "loss/crossentropy": 2.5439326763153076, "loss/hidden": 1.2578125, "loss/logits": 0.23233534395694733, "loss/reg": 0.0001800364989321679, "step": 387 }, { "epoch": 0.0485, "grad_norm": 3.090855598449707, "grad_norm_var": 0.429541218532165, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.3761212825775146, "loss/hidden": 1.1875, "loss/logits": 0.20534038543701172, "loss/reg": 0.0001799749006750062, "step": 388 }, { "epoch": 0.048625, "grad_norm": 2.829037666320801, "grad_norm_var": 0.4203970366893562, "learning_rate": 0.0001, "loss": 1.3953, "loss/crossentropy": 2.8099021911621094, "loss/hidden": 1.203125, "loss/logits": 0.19039469957351685, "loss/reg": 0.0001799268211470917, "step": 389 }, { "epoch": 0.04875, "grad_norm": 2.4180996417999268, "grad_norm_var": 0.43048309318027406, "learning_rate": 0.0001, "loss": 1.2337, "loss/crossentropy": 2.5344197750091553, "loss/hidden": 1.046875, "loss/logits": 0.18504786491394043, "loss/reg": 0.0001798665034584701, "step": 390 }, { "epoch": 0.048875, "grad_norm": 6.686630725860596, "grad_norm_var": 1.3259110009600303, "learning_rate": 0.0001, "loss": 1.4048, "loss/crossentropy": 1.59429132938385, "loss/hidden": 1.234375, "loss/logits": 0.16859778761863708, "loss/reg": 0.00017980851407628506, "step": 391 }, { "epoch": 0.049, "grad_norm": 2.47430682182312, "grad_norm_var": 1.3452680046498242, "learning_rate": 0.0001, "loss": 1.2505, "loss/crossentropy": 2.4976658821105957, "loss/hidden": 1.0703125, "loss/logits": 0.17843782901763916, "loss/reg": 0.00017974060028791428, "step": 392 }, { "epoch": 0.049125, "grad_norm": 2.3827688694000244, "grad_norm_var": 1.3527620972999594, "learning_rate": 0.0001, "loss": 1.2697, "loss/crossentropy": 2.6137402057647705, "loss/hidden": 1.09375, "loss/logits": 0.17413964867591858, "loss/reg": 0.00017966754967346787, "step": 393 }, { "epoch": 0.04925, "grad_norm": 3.5299346446990967, "grad_norm_var": 1.3263260544293964, "learning_rate": 0.0001, "loss": 1.6096, "loss/crossentropy": 2.1338679790496826, "loss/hidden": 1.359375, "loss/logits": 0.24847310781478882, "loss/reg": 0.00017959915567189455, "step": 394 }, { "epoch": 0.049375, "grad_norm": 2.1859142780303955, "grad_norm_var": 1.1587385123986338, "learning_rate": 0.0001, "loss": 1.1247, "loss/crossentropy": 2.686222791671753, "loss/hidden": 0.96875, "loss/logits": 0.15412843227386475, "loss/reg": 0.00017955088696908206, "step": 395 }, { "epoch": 0.0495, "grad_norm": 2.372267484664917, "grad_norm_var": 1.1773802642484006, "learning_rate": 0.0001, "loss": 1.1605, "loss/crossentropy": 2.648873805999756, "loss/hidden": 0.9921875, "loss/logits": 0.16648846864700317, "loss/reg": 0.00017949036555364728, "step": 396 }, { "epoch": 0.049625, "grad_norm": 2.343231439590454, "grad_norm_var": 1.2018601019134032, "learning_rate": 0.0001, "loss": 1.1771, "loss/crossentropy": 2.3791165351867676, "loss/hidden": 1.0234375, "loss/logits": 0.15182971954345703, "loss/reg": 0.00017943643615581095, "step": 397 }, { "epoch": 0.04975, "grad_norm": 2.7274091243743896, "grad_norm_var": 1.2017590131362377, "learning_rate": 0.0001, "loss": 1.324, "loss/crossentropy": 2.57863712310791, "loss/hidden": 1.140625, "loss/logits": 0.18160942196846008, "loss/reg": 0.00017938419478014112, "step": 398 }, { "epoch": 0.049875, "grad_norm": 4.155604362487793, "grad_norm_var": 1.2842575464972394, "learning_rate": 0.0001, "loss": 1.5653, "loss/crossentropy": 2.892331123352051, "loss/hidden": 1.34375, "loss/logits": 0.2197520136833191, "loss/reg": 0.0001793357077986002, "step": 399 }, { "epoch": 0.05, "grad_norm": 2.741466522216797, "grad_norm_var": 1.2810286441991583, "learning_rate": 0.0001, "loss": 1.3348, "loss/crossentropy": 2.546194314956665, "loss/hidden": 1.140625, "loss/logits": 0.19236674904823303, "loss/reg": 0.00017928793386090547, "step": 400 }, { "epoch": 0.050125, "grad_norm": 3.055189847946167, "grad_norm_var": 1.236778717086356, "learning_rate": 0.0001, "loss": 1.3152, "loss/crossentropy": 2.6719818115234375, "loss/hidden": 1.078125, "loss/logits": 0.23531492054462433, "loss/reg": 0.00017924165877047926, "step": 401 }, { "epoch": 0.05025, "grad_norm": 3.778313636779785, "grad_norm_var": 1.2484054094760106, "learning_rate": 0.0001, "loss": 1.5425, "loss/crossentropy": 2.7033426761627197, "loss/hidden": 1.28125, "loss/logits": 0.25942352414131165, "loss/reg": 0.00017917431250680238, "step": 402 }, { "epoch": 0.050375, "grad_norm": 3.0340867042541504, "grad_norm_var": 1.2184345071185567, "learning_rate": 0.0001, "loss": 1.455, "loss/crossentropy": 2.5875117778778076, "loss/hidden": 1.2265625, "loss/logits": 0.22662264108657837, "loss/reg": 0.00017911198665387928, "step": 403 }, { "epoch": 0.0505, "grad_norm": 2.5151872634887695, "grad_norm_var": 1.2408325162170568, "learning_rate": 0.0001, "loss": 1.253, "loss/crossentropy": 2.6171019077301025, "loss/hidden": 1.078125, "loss/logits": 0.17306920886039734, "loss/reg": 0.0001790501846699044, "step": 404 }, { "epoch": 0.050625, "grad_norm": 2.4294838905334473, "grad_norm_var": 1.2640116286061014, "learning_rate": 0.0001, "loss": 1.1984, "loss/crossentropy": 2.4346368312835693, "loss/hidden": 1.015625, "loss/logits": 0.18100124597549438, "loss/reg": 0.00017898838268592954, "step": 405 }, { "epoch": 0.05075, "grad_norm": 2.7509233951568604, "grad_norm_var": 1.242810414819528, "learning_rate": 0.0001, "loss": 1.62, "loss/crossentropy": 2.047776937484741, "loss/hidden": 1.421875, "loss/logits": 0.19628939032554626, "loss/reg": 0.00017893253243528306, "step": 406 }, { "epoch": 0.050875, "grad_norm": 2.4834845066070557, "grad_norm_var": 0.3216287157750268, "learning_rate": 0.0001, "loss": 1.3514, "loss/crossentropy": 2.6643404960632324, "loss/hidden": 1.15625, "loss/logits": 0.1933945268392563, "loss/reg": 0.00017888467118609697, "step": 407 }, { "epoch": 0.051, "grad_norm": 2.417241334915161, "grad_norm_var": 0.32438624126894616, "learning_rate": 0.0001, "loss": 1.3021, "loss/crossentropy": 2.600691556930542, "loss/hidden": 1.109375, "loss/logits": 0.19094930589199066, "loss/reg": 0.00017882059910334647, "step": 408 }, { "epoch": 0.051125, "grad_norm": 2.3977980613708496, "grad_norm_var": 0.32355143397302466, "learning_rate": 0.0001, "loss": 1.4318, "loss/crossentropy": 2.6039021015167236, "loss/hidden": 1.1875, "loss/logits": 0.24255430698394775, "loss/reg": 0.00017876985657494515, "step": 409 }, { "epoch": 0.05125, "grad_norm": 2.205808639526367, "grad_norm_var": 0.3055601722416668, "learning_rate": 0.0001, "loss": 1.1411, "loss/crossentropy": 2.5921945571899414, "loss/hidden": 0.98828125, "loss/logits": 0.1510002166032791, "loss/reg": 0.0001787317160051316, "step": 410 }, { "epoch": 0.051375, "grad_norm": 3.166574478149414, "grad_norm_var": 0.2952319363017691, "learning_rate": 0.0001, "loss": 1.3997, "loss/crossentropy": 2.4906835556030273, "loss/hidden": 1.1796875, "loss/logits": 0.21822378039360046, "loss/reg": 0.00017868747818283737, "step": 411 }, { "epoch": 0.0515, "grad_norm": 2.248802661895752, "grad_norm_var": 0.30299352883237796, "learning_rate": 0.0001, "loss": 1.0924, "loss/crossentropy": 2.599449396133423, "loss/hidden": 0.9453125, "loss/logits": 0.14533644914627075, "loss/reg": 0.0001786567154340446, "step": 412 }, { "epoch": 0.051625, "grad_norm": 3.7585434913635254, "grad_norm_var": 0.3461126328200045, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.7979090213775635, "loss/hidden": 1.1875, "loss/logits": 0.23058170080184937, "loss/reg": 0.00017861124069895595, "step": 413 }, { "epoch": 0.05175, "grad_norm": 2.615172863006592, "grad_norm_var": 0.34898320978636455, "learning_rate": 0.0001, "loss": 1.3951, "loss/crossentropy": 1.9655026197433472, "loss/hidden": 1.2109375, "loss/logits": 0.18238556385040283, "loss/reg": 0.00017857542843557894, "step": 414 }, { "epoch": 0.051875, "grad_norm": 2.38726806640625, "grad_norm_var": 0.2388532010949414, "learning_rate": 0.0001, "loss": 1.0998, "loss/crossentropy": 2.581796646118164, "loss/hidden": 0.94921875, "loss/logits": 0.1488036811351776, "loss/reg": 0.00017854152247309685, "step": 415 }, { "epoch": 0.052, "grad_norm": 2.1067628860473633, "grad_norm_var": 0.264675897864124, "learning_rate": 0.0001, "loss": 1.3556, "loss/crossentropy": 2.202812910079956, "loss/hidden": 1.171875, "loss/logits": 0.18191415071487427, "loss/reg": 0.00017848903371486813, "step": 416 }, { "epoch": 0.052125, "grad_norm": 2.8035178184509277, "grad_norm_var": 0.25703166277479733, "learning_rate": 0.0001, "loss": 1.367, "loss/crossentropy": 2.37416934967041, "loss/hidden": 1.125, "loss/logits": 0.24018406867980957, "loss/reg": 0.00017843768000602722, "step": 417 }, { "epoch": 0.05225, "grad_norm": 2.9603078365325928, "grad_norm_var": 0.1805549031603429, "learning_rate": 0.0001, "loss": 1.6557, "loss/crossentropy": 2.2720189094543457, "loss/hidden": 1.390625, "loss/logits": 0.2632881999015808, "loss/reg": 0.0001783809857442975, "step": 418 }, { "epoch": 0.052375, "grad_norm": 2.5229651927948, "grad_norm_var": 0.17020038194862058, "learning_rate": 0.0001, "loss": 1.4756, "loss/crossentropy": 2.63793683052063, "loss/hidden": 1.2265625, "loss/logits": 0.24721495807170868, "loss/reg": 0.00017833360470831394, "step": 419 }, { "epoch": 0.0525, "grad_norm": 2.355572462081909, "grad_norm_var": 0.17382358098597403, "learning_rate": 0.0001, "loss": 1.2231, "loss/crossentropy": 2.549771308898926, "loss/hidden": 1.0390625, "loss/logits": 0.1823006570339203, "loss/reg": 0.0001782874605851248, "step": 420 }, { "epoch": 0.052625, "grad_norm": 2.9690279960632324, "grad_norm_var": 0.17970504092186654, "learning_rate": 0.0001, "loss": 1.3476, "loss/crossentropy": 3.0549476146698, "loss/hidden": 1.1484375, "loss/logits": 0.19741559028625488, "loss/reg": 0.00017824411042965949, "step": 421 }, { "epoch": 0.05275, "grad_norm": 2.880521059036255, "grad_norm_var": 0.18276892961250724, "learning_rate": 0.0001, "loss": 1.1749, "loss/crossentropy": 2.4630181789398193, "loss/hidden": 1.0078125, "loss/logits": 0.16526931524276733, "loss/reg": 0.00017819386266637594, "step": 422 }, { "epoch": 0.052875, "grad_norm": 2.373995542526245, "grad_norm_var": 0.18583898600397694, "learning_rate": 0.0001, "loss": 1.1957, "loss/crossentropy": 2.5825746059417725, "loss/hidden": 1.015625, "loss/logits": 0.17832687497138977, "loss/reg": 0.00017814231978263706, "step": 423 }, { "epoch": 0.053, "grad_norm": 2.8790440559387207, "grad_norm_var": 0.18572161644163973, "learning_rate": 0.0001, "loss": 1.3801, "loss/crossentropy": 2.7846927642822266, "loss/hidden": 1.15625, "loss/logits": 0.22202245891094208, "loss/reg": 0.00017809156270232052, "step": 424 }, { "epoch": 0.053125, "grad_norm": 2.737457275390625, "grad_norm_var": 0.18085466780064482, "learning_rate": 0.0001, "loss": 1.0736, "loss/crossentropy": 2.3963334560394287, "loss/hidden": 0.9375, "loss/logits": 0.1343117207288742, "loss/reg": 0.00017803694936446846, "step": 425 }, { "epoch": 0.05325, "grad_norm": 2.428834915161133, "grad_norm_var": 0.16969274721350278, "learning_rate": 0.0001, "loss": 1.157, "loss/crossentropy": 2.741081476211548, "loss/hidden": 0.99609375, "loss/logits": 0.15910392999649048, "loss/reg": 0.00017797992040868849, "step": 426 }, { "epoch": 0.053375, "grad_norm": 2.404265880584717, "grad_norm_var": 0.1585534584039614, "learning_rate": 0.0001, "loss": 1.2847, "loss/crossentropy": 2.89072322845459, "loss/hidden": 1.09375, "loss/logits": 0.1891355812549591, "loss/reg": 0.00017792356084100902, "step": 427 }, { "epoch": 0.0535, "grad_norm": 4.136691093444824, "grad_norm_var": 0.27981797299989897, "learning_rate": 0.0001, "loss": 1.4735, "loss/crossentropy": 2.6474928855895996, "loss/hidden": 1.21875, "loss/logits": 0.2529890537261963, "loss/reg": 0.0001778633304638788, "step": 428 }, { "epoch": 0.053625, "grad_norm": 2.81740140914917, "grad_norm_var": 0.21112886078796553, "learning_rate": 0.0001, "loss": 1.4668, "loss/crossentropy": 2.431791067123413, "loss/hidden": 1.21875, "loss/logits": 0.2462722361087799, "loss/reg": 0.00017780056805349886, "step": 429 }, { "epoch": 0.05375, "grad_norm": 3.5630476474761963, "grad_norm_var": 0.2551499062270144, "learning_rate": 0.0001, "loss": 1.4714, "loss/crossentropy": 2.573620557785034, "loss/hidden": 1.2421875, "loss/logits": 0.22747303545475006, "loss/reg": 0.00017774660955183208, "step": 430 }, { "epoch": 0.053875, "grad_norm": 27.927080154418945, "grad_norm_var": 39.71803281932496, "learning_rate": 0.0001, "loss": 1.2788, "loss/crossentropy": 2.723278522491455, "loss/hidden": 1.0859375, "loss/logits": 0.191043421626091, "loss/reg": 0.00017769775877241045, "step": 431 }, { "epoch": 0.054, "grad_norm": 2.35017991065979, "grad_norm_var": 39.64838987108033, "learning_rate": 0.0001, "loss": 1.131, "loss/crossentropy": 2.7018210887908936, "loss/hidden": 0.98828125, "loss/logits": 0.14092248678207397, "loss/reg": 0.00017764477524906397, "step": 432 }, { "epoch": 0.054125, "grad_norm": 4.339812278747559, "grad_norm_var": 39.472594042649305, "learning_rate": 0.0001, "loss": 1.348, "loss/crossentropy": 2.808999538421631, "loss/hidden": 1.171875, "loss/logits": 0.17429979145526886, "loss/reg": 0.0001775865093804896, "step": 433 }, { "epoch": 0.05425, "grad_norm": 2.577270030975342, "grad_norm_var": 39.559269314754324, "learning_rate": 0.0001, "loss": 1.3842, "loss/crossentropy": 2.3989908695220947, "loss/hidden": 1.1875, "loss/logits": 0.19493895769119263, "loss/reg": 0.00017753323481883854, "step": 434 }, { "epoch": 0.054375, "grad_norm": 3.111717462539673, "grad_norm_var": 39.42935091258095, "learning_rate": 0.0001, "loss": 1.4152, "loss/crossentropy": 2.464742660522461, "loss/hidden": 1.203125, "loss/logits": 0.2103499472141266, "loss/reg": 0.00017748030950315297, "step": 435 }, { "epoch": 0.0545, "grad_norm": 2.0744073390960693, "grad_norm_var": 39.51433658135766, "learning_rate": 0.0001, "loss": 1.2571, "loss/crossentropy": 2.4160804748535156, "loss/hidden": 1.0703125, "loss/logits": 0.18496504426002502, "loss/reg": 0.00017741357441991568, "step": 436 }, { "epoch": 0.054625, "grad_norm": 2.2014148235321045, "grad_norm_var": 39.705110235168064, "learning_rate": 0.0001, "loss": 1.2757, "loss/crossentropy": 2.558213710784912, "loss/hidden": 1.0703125, "loss/logits": 0.20358391106128693, "loss/reg": 0.00017734503489919007, "step": 437 }, { "epoch": 0.05475, "grad_norm": 2.42464017868042, "grad_norm_var": 39.81199116769601, "learning_rate": 0.0001, "loss": 1.2475, "loss/crossentropy": 2.4901251792907715, "loss/hidden": 1.078125, "loss/logits": 0.16764749586582184, "loss/reg": 0.00017729295359458774, "step": 438 }, { "epoch": 0.054875, "grad_norm": 2.230916976928711, "grad_norm_var": 39.85185812679954, "learning_rate": 0.0001, "loss": 1.0936, "loss/crossentropy": 2.447075366973877, "loss/hidden": 0.9453125, "loss/logits": 0.1465335488319397, "loss/reg": 0.0001772290706867352, "step": 439 }, { "epoch": 0.055, "grad_norm": 2.749454975128174, "grad_norm_var": 39.878976148047535, "learning_rate": 0.0001, "loss": 1.4814, "loss/crossentropy": 2.2224342823028564, "loss/hidden": 1.2734375, "loss/logits": 0.20615623891353607, "loss/reg": 0.00017717515584081411, "step": 440 }, { "epoch": 0.055125, "grad_norm": 3.0442757606506348, "grad_norm_var": 39.81767857726663, "learning_rate": 0.0001, "loss": 1.8732, "loss/crossentropy": 2.110996723175049, "loss/hidden": 1.515625, "loss/logits": 0.35577309131622314, "loss/reg": 0.0001771111856214702, "step": 441 }, { "epoch": 0.05525, "grad_norm": 4.601929187774658, "grad_norm_var": 39.54202437298279, "learning_rate": 0.0001, "loss": 1.6329, "loss/crossentropy": 2.708014965057373, "loss/hidden": 1.3203125, "loss/logits": 0.3108658790588379, "loss/reg": 0.00017704560013953596, "step": 442 }, { "epoch": 0.055375, "grad_norm": 3.026305913925171, "grad_norm_var": 39.3895159629985, "learning_rate": 0.0001, "loss": 1.4032, "loss/crossentropy": 2.738539934158325, "loss/hidden": 1.1875, "loss/logits": 0.21392673254013062, "loss/reg": 0.00017697943258099258, "step": 443 }, { "epoch": 0.0555, "grad_norm": 3.2805826663970947, "grad_norm_var": 39.48518822606245, "learning_rate": 0.0001, "loss": 1.4055, "loss/crossentropy": 2.6368248462677, "loss/hidden": 1.203125, "loss/logits": 0.2005615532398224, "loss/reg": 0.0001769265509210527, "step": 444 }, { "epoch": 0.055625, "grad_norm": 2.9948861598968506, "grad_norm_var": 39.44686501090276, "learning_rate": 0.0001, "loss": 1.1039, "loss/crossentropy": 2.6170451641082764, "loss/hidden": 0.96484375, "loss/logits": 0.13730549812316895, "loss/reg": 0.00017686416686046869, "step": 445 }, { "epoch": 0.05575, "grad_norm": 2.2163240909576416, "grad_norm_var": 39.734049160677394, "learning_rate": 0.0001, "loss": 1.2018, "loss/crossentropy": 2.454167127609253, "loss/hidden": 1.0390625, "loss/logits": 0.16098245978355408, "loss/reg": 0.00017681345343589783, "step": 446 }, { "epoch": 0.055875, "grad_norm": 2.6603283882141113, "grad_norm_var": 0.5323792649670357, "learning_rate": 0.0001, "loss": 1.3646, "loss/crossentropy": 2.499635696411133, "loss/hidden": 1.1328125, "loss/logits": 0.22998638451099396, "loss/reg": 0.00017675201524980366, "step": 447 }, { "epoch": 0.056, "grad_norm": 2.228543758392334, "grad_norm_var": 0.5416984580164314, "learning_rate": 0.0001, "loss": 1.1798, "loss/crossentropy": 2.4845035076141357, "loss/hidden": 1.015625, "loss/logits": 0.16240708529949188, "loss/reg": 0.00017670769011601806, "step": 448 }, { "epoch": 0.056125, "grad_norm": 2.2279977798461914, "grad_norm_var": 0.4038044026013947, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.3559677600860596, "loss/hidden": 1.140625, "loss/logits": 0.18264812231063843, "loss/reg": 0.00017664962797425687, "step": 449 }, { "epoch": 0.05625, "grad_norm": 2.7545721530914307, "grad_norm_var": 0.4022014302476805, "learning_rate": 0.0001, "loss": 1.4368, "loss/crossentropy": 2.374678373336792, "loss/hidden": 1.234375, "loss/logits": 0.20067429542541504, "loss/reg": 0.0001765888009686023, "step": 450 }, { "epoch": 0.056375, "grad_norm": 2.0431902408599854, "grad_norm_var": 0.4204979320743064, "learning_rate": 0.0001, "loss": 1.0879, "loss/crossentropy": 2.464282751083374, "loss/hidden": 0.94140625, "loss/logits": 0.1447741687297821, "loss/reg": 0.00017654395196586847, "step": 451 }, { "epoch": 0.0565, "grad_norm": 2.6901564598083496, "grad_norm_var": 0.39509245912944924, "learning_rate": 0.0001, "loss": 1.2069, "loss/crossentropy": 2.819383382797241, "loss/hidden": 1.0390625, "loss/logits": 0.16609331965446472, "loss/reg": 0.00017649627989158034, "step": 452 }, { "epoch": 0.056625, "grad_norm": 2.1076624393463135, "grad_norm_var": 0.402011404785177, "learning_rate": 0.0001, "loss": 1.2556, "loss/crossentropy": 2.4384539127349854, "loss/hidden": 1.09375, "loss/logits": 0.16008606553077698, "loss/reg": 0.0001764398330124095, "step": 453 }, { "epoch": 0.05675, "grad_norm": 5.258527755737305, "grad_norm_var": 0.7979676690528414, "learning_rate": 0.0001, "loss": 1.6846, "loss/crossentropy": 2.5452864170074463, "loss/hidden": 1.4140625, "loss/logits": 0.26873037219047546, "loss/reg": 0.00017638294957578182, "step": 454 }, { "epoch": 0.056875, "grad_norm": 2.660983085632324, "grad_norm_var": 0.7721798756654529, "learning_rate": 0.0001, "loss": 1.4255, "loss/crossentropy": 2.585622549057007, "loss/hidden": 1.1875, "loss/logits": 0.23628488183021545, "loss/reg": 0.00017633216339163482, "step": 455 }, { "epoch": 0.057, "grad_norm": 2.7722721099853516, "grad_norm_var": 0.7717267059376826, "learning_rate": 0.0001, "loss": 1.2022, "loss/crossentropy": 2.800078868865967, "loss/hidden": 1.046875, "loss/logits": 0.15360021591186523, "loss/reg": 0.0001762784959282726, "step": 456 }, { "epoch": 0.057125, "grad_norm": 2.398487091064453, "grad_norm_var": 0.7862760060851559, "learning_rate": 0.0001, "loss": 1.6049, "loss/crossentropy": 2.1760308742523193, "loss/hidden": 1.328125, "loss/logits": 0.2749764025211334, "loss/reg": 0.00017622820450924337, "step": 457 }, { "epoch": 0.05725, "grad_norm": 2.1820383071899414, "grad_norm_var": 0.5935128198963845, "learning_rate": 0.0001, "loss": 1.2135, "loss/crossentropy": 2.6835989952087402, "loss/hidden": 1.046875, "loss/logits": 0.16489288210868835, "loss/reg": 0.00017618124547880143, "step": 458 }, { "epoch": 0.057375, "grad_norm": 2.3541500568389893, "grad_norm_var": 0.5942025229741127, "learning_rate": 0.0001, "loss": 1.3202, "loss/crossentropy": 2.6355140209198, "loss/hidden": 1.1328125, "loss/logits": 0.1856687068939209, "loss/reg": 0.00017612463852856308, "step": 459 }, { "epoch": 0.0575, "grad_norm": 2.4668965339660645, "grad_norm_var": 0.5700904660282996, "learning_rate": 0.0001, "loss": 1.3106, "loss/crossentropy": 2.656648874282837, "loss/hidden": 1.125, "loss/logits": 0.18380066752433777, "loss/reg": 0.00017607423069421202, "step": 460 }, { "epoch": 0.057625, "grad_norm": 2.6230361461639404, "grad_norm_var": 0.5604462661929044, "learning_rate": 0.0001, "loss": 1.2208, "loss/crossentropy": 2.62496280670166, "loss/hidden": 1.0546875, "loss/logits": 0.16433754563331604, "loss/reg": 0.0001760145096341148, "step": 461 }, { "epoch": 0.05775, "grad_norm": 2.7273848056793213, "grad_norm_var": 0.5504336260767483, "learning_rate": 0.0001, "loss": 1.4464, "loss/crossentropy": 2.558682441711426, "loss/hidden": 1.25, "loss/logits": 0.19467273354530334, "loss/reg": 0.00017597198893781751, "step": 462 }, { "epoch": 0.057875, "grad_norm": 2.6174404621124268, "grad_norm_var": 0.5504024009310662, "learning_rate": 0.0001, "loss": 1.4354, "loss/crossentropy": 2.1391024589538574, "loss/hidden": 1.234375, "loss/logits": 0.19923442602157593, "loss/reg": 0.00017592695076018572, "step": 463 }, { "epoch": 0.058, "grad_norm": 2.1421875953674316, "grad_norm_var": 0.5555149090661641, "learning_rate": 0.0001, "loss": 1.2089, "loss/crossentropy": 2.5548229217529297, "loss/hidden": 1.03125, "loss/logits": 0.17589880526065826, "loss/reg": 0.00017588076298125088, "step": 464 }, { "epoch": 0.058125, "grad_norm": 2.4386532306671143, "grad_norm_var": 0.5470902662726682, "learning_rate": 0.0001, "loss": 1.0766, "loss/crossentropy": 2.741363286972046, "loss/hidden": 0.94140625, "loss/logits": 0.13345757126808167, "loss/reg": 0.00017582789587322623, "step": 465 }, { "epoch": 0.05825, "grad_norm": 2.7671115398406982, "grad_norm_var": 0.5472918955756455, "learning_rate": 0.0001, "loss": 1.2597, "loss/crossentropy": 2.585339307785034, "loss/hidden": 1.09375, "loss/logits": 0.1642206907272339, "loss/reg": 0.00017577498510945588, "step": 466 }, { "epoch": 0.058375, "grad_norm": 2.075833797454834, "grad_norm_var": 0.5447581279211853, "learning_rate": 0.0001, "loss": 1.1296, "loss/crossentropy": 2.4353983402252197, "loss/hidden": 0.9765625, "loss/logits": 0.1513109803199768, "loss/reg": 0.00017573049990460277, "step": 467 }, { "epoch": 0.0585, "grad_norm": 2.181549072265625, "grad_norm_var": 0.557705888030068, "learning_rate": 0.0001, "loss": 1.234, "loss/crossentropy": 2.389277696609497, "loss/hidden": 1.0625, "loss/logits": 0.16970399022102356, "loss/reg": 0.0001756956335157156, "step": 468 }, { "epoch": 0.058625, "grad_norm": 2.239053249359131, "grad_norm_var": 0.5499689577837131, "learning_rate": 0.0001, "loss": 1.3819, "loss/crossentropy": 2.655245780944824, "loss/hidden": 1.15625, "loss/logits": 0.2238529622554779, "loss/reg": 0.00017564196605235338, "step": 469 }, { "epoch": 0.05875, "grad_norm": 2.3605029582977295, "grad_norm_var": 0.05499469594172955, "learning_rate": 0.0001, "loss": 1.3717, "loss/crossentropy": 2.3876919746398926, "loss/hidden": 1.171875, "loss/logits": 0.19806072115898132, "loss/reg": 0.00017558463150635362, "step": 470 }, { "epoch": 0.058875, "grad_norm": 2.848848581314087, "grad_norm_var": 0.06278663740608768, "learning_rate": 0.0001, "loss": 1.5331, "loss/crossentropy": 2.2159836292266846, "loss/hidden": 1.2890625, "loss/logits": 0.24231435358524323, "loss/reg": 0.00017553169163875282, "step": 471 }, { "epoch": 0.059, "grad_norm": 3.3035221099853516, "grad_norm_var": 0.10327356833769556, "learning_rate": 0.0001, "loss": 1.3096, "loss/crossentropy": 3.1982433795928955, "loss/hidden": 1.140625, "loss/logits": 0.16723856329917908, "loss/reg": 0.0001754647382767871, "step": 472 }, { "epoch": 0.059125, "grad_norm": 2.1994223594665527, "grad_norm_var": 0.10799121596538726, "learning_rate": 0.0001, "loss": 1.2996, "loss/crossentropy": 2.478044033050537, "loss/hidden": 1.1171875, "loss/logits": 0.18064665794372559, "loss/reg": 0.0001753960968926549, "step": 473 }, { "epoch": 0.05925, "grad_norm": 2.7087080478668213, "grad_norm_var": 0.10507261048413336, "learning_rate": 0.0001, "loss": 1.2304, "loss/crossentropy": 2.533445119857788, "loss/hidden": 1.03125, "loss/logits": 0.19739562273025513, "loss/reg": 0.0001753264368744567, "step": 474 }, { "epoch": 0.059375, "grad_norm": 2.4171252250671387, "grad_norm_var": 0.1040673242944185, "learning_rate": 0.0001, "loss": 1.2977, "loss/crossentropy": 2.5889570713043213, "loss/hidden": 1.0859375, "loss/logits": 0.21002550423145294, "loss/reg": 0.00017525417206343263, "step": 475 }, { "epoch": 0.0595, "grad_norm": 3.3070313930511475, "grad_norm_var": 0.14365224039723495, "learning_rate": 0.0001, "loss": 1.7167, "loss/crossentropy": 2.7916061878204346, "loss/hidden": 1.4453125, "loss/logits": 0.2696676552295685, "loss/reg": 0.0001752021926222369, "step": 476 }, { "epoch": 0.059625, "grad_norm": 3.215808391571045, "grad_norm_var": 0.17060835871623775, "learning_rate": 0.0001, "loss": 1.519, "loss/crossentropy": 2.4236576557159424, "loss/hidden": 1.2734375, "loss/logits": 0.24385260045528412, "loss/reg": 0.0001751292875269428, "step": 477 }, { "epoch": 0.05975, "grad_norm": 2.4063854217529297, "grad_norm_var": 0.17146307657458593, "learning_rate": 0.0001, "loss": 1.0966, "loss/crossentropy": 2.4138169288635254, "loss/hidden": 0.94921875, "loss/logits": 0.14562630653381348, "loss/reg": 0.00017507674056105316, "step": 478 }, { "epoch": 0.059875, "grad_norm": 2.100240707397461, "grad_norm_var": 0.18538063838473515, "learning_rate": 0.0001, "loss": 1.2543, "loss/crossentropy": 2.656923770904541, "loss/hidden": 1.0703125, "loss/logits": 0.18223264813423157, "loss/reg": 0.0001750182273099199, "step": 479 }, { "epoch": 0.06, "grad_norm": 2.416749954223633, "grad_norm_var": 0.17536422723811237, "learning_rate": 0.0001, "loss": 1.3935, "loss/crossentropy": 2.5389764308929443, "loss/hidden": 1.171875, "loss/logits": 0.21990099549293518, "loss/reg": 0.00017496509826742113, "step": 480 }, { "epoch": 0.060125, "grad_norm": 2.2483956813812256, "grad_norm_var": 0.18074697157593392, "learning_rate": 0.0001, "loss": 1.3504, "loss/crossentropy": 2.7335619926452637, "loss/hidden": 1.15625, "loss/logits": 0.19241070747375488, "loss/reg": 0.00017491589824203402, "step": 481 }, { "epoch": 0.06025, "grad_norm": 2.1877689361572266, "grad_norm_var": 0.1849354900853349, "learning_rate": 0.0001, "loss": 1.248, "loss/crossentropy": 2.66105318069458, "loss/hidden": 1.0625, "loss/logits": 0.18372279405593872, "loss/reg": 0.00017485868011135608, "step": 482 }, { "epoch": 0.060375, "grad_norm": 2.092740297317505, "grad_norm_var": 0.18396663403457297, "learning_rate": 0.0001, "loss": 1.2386, "loss/crossentropy": 2.7327380180358887, "loss/hidden": 1.0390625, "loss/logits": 0.19782081246376038, "loss/reg": 0.00017478555673733354, "step": 483 }, { "epoch": 0.0605, "grad_norm": 2.3183560371398926, "grad_norm_var": 0.17906094719213855, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.4746274948120117, "loss/hidden": 1.09375, "loss/logits": 0.1869126260280609, "loss/reg": 0.0001747341302689165, "step": 484 }, { "epoch": 0.060625, "grad_norm": 3.5248806476593018, "grad_norm_var": 0.23368608955632125, "learning_rate": 0.0001, "loss": 1.1942, "loss/crossentropy": 2.6376962661743164, "loss/hidden": 1.0234375, "loss/logits": 0.1689874827861786, "loss/reg": 0.00017466919962316751, "step": 485 }, { "epoch": 0.06075, "grad_norm": 2.8418338298797607, "grad_norm_var": 0.23256916977224643, "learning_rate": 0.0001, "loss": 1.4452, "loss/crossentropy": 2.3843576908111572, "loss/hidden": 1.1953125, "loss/logits": 0.24809806048870087, "loss/reg": 0.00017459361697547138, "step": 486 }, { "epoch": 0.060875, "grad_norm": 2.697395086288452, "grad_norm_var": 0.22965639284835385, "learning_rate": 0.0001, "loss": 1.1924, "loss/crossentropy": 2.4852230548858643, "loss/hidden": 1.03125, "loss/logits": 0.15941958129405975, "loss/reg": 0.000174528788193129, "step": 487 }, { "epoch": 0.061, "grad_norm": 3.3923656940460205, "grad_norm_var": 0.23819745706471546, "learning_rate": 0.0001, "loss": 1.2613, "loss/crossentropy": 2.665536880493164, "loss/hidden": 1.078125, "loss/logits": 0.18143823742866516, "loss/reg": 0.00017445418052375317, "step": 488 }, { "epoch": 0.061125, "grad_norm": 3.820457935333252, "grad_norm_var": 0.30943274234138396, "learning_rate": 0.0001, "loss": 1.4177, "loss/crossentropy": 2.529301881790161, "loss/hidden": 1.203125, "loss/logits": 0.21287089586257935, "loss/reg": 0.00017437619681004435, "step": 489 }, { "epoch": 0.06125, "grad_norm": 2.441828966140747, "grad_norm_var": 0.3146780452696741, "learning_rate": 0.0001, "loss": 1.1788, "loss/crossentropy": 2.6879920959472656, "loss/hidden": 1.015625, "loss/logits": 0.1613830029964447, "loss/reg": 0.00017432142340112478, "step": 490 }, { "epoch": 0.061375, "grad_norm": 2.8247528076171875, "grad_norm_var": 0.30890959275739743, "learning_rate": 0.0001, "loss": 1.4539, "loss/crossentropy": 2.2834181785583496, "loss/hidden": 1.234375, "loss/logits": 0.21779246628284454, "loss/reg": 0.00017424933321308345, "step": 491 }, { "epoch": 0.0615, "grad_norm": 2.4493589401245117, "grad_norm_var": 0.2900195920917798, "learning_rate": 0.0001, "loss": 1.1726, "loss/crossentropy": 2.2889974117279053, "loss/hidden": 1.0234375, "loss/logits": 0.14741873741149902, "loss/reg": 0.00017419550567865372, "step": 492 }, { "epoch": 0.061625, "grad_norm": 2.667480707168579, "grad_norm_var": 0.2700917314036502, "learning_rate": 0.0001, "loss": 1.4045, "loss/crossentropy": 2.404547929763794, "loss/hidden": 1.171875, "loss/logits": 0.23084591329097748, "loss/reg": 0.0001741291634971276, "step": 493 }, { "epoch": 0.06175, "grad_norm": 3.452826499938965, "grad_norm_var": 0.3042709664857275, "learning_rate": 0.0001, "loss": 1.3829, "loss/crossentropy": 2.9492647647857666, "loss/hidden": 1.1875, "loss/logits": 0.19365137815475464, "loss/reg": 0.00017407909035682678, "step": 494 }, { "epoch": 0.061875, "grad_norm": 2.6484220027923584, "grad_norm_var": 0.2779481152143487, "learning_rate": 0.0001, "loss": 1.4393, "loss/crossentropy": 2.4739902019500732, "loss/hidden": 1.2109375, "loss/logits": 0.22663193941116333, "loss/reg": 0.00017401424702256918, "step": 495 }, { "epoch": 0.062, "grad_norm": 2.3823630809783936, "grad_norm_var": 0.2795572822758951, "learning_rate": 0.0001, "loss": 1.4423, "loss/crossentropy": 2.1366026401519775, "loss/hidden": 1.25, "loss/logits": 0.1905536651611328, "loss/reg": 0.00017397114424966276, "step": 496 }, { "epoch": 0.062125, "grad_norm": 3.4144351482391357, "grad_norm_var": 0.28663513944516883, "learning_rate": 0.0001, "loss": 1.3353, "loss/crossentropy": 2.4802052974700928, "loss/hidden": 1.1171875, "loss/logits": 0.21641235053539276, "loss/reg": 0.00017391364963259548, "step": 497 }, { "epoch": 0.06225, "grad_norm": 2.339780330657959, "grad_norm_var": 0.2752179712740135, "learning_rate": 0.0001, "loss": 1.3462, "loss/crossentropy": 2.416666030883789, "loss/hidden": 1.140625, "loss/logits": 0.20387353003025055, "loss/reg": 0.00017384960665367544, "step": 498 }, { "epoch": 0.062375, "grad_norm": 2.3220226764678955, "grad_norm_var": 0.2559089262690113, "learning_rate": 0.0001, "loss": 1.1988, "loss/crossentropy": 2.668480634689331, "loss/hidden": 1.0234375, "loss/logits": 0.1736161708831787, "loss/reg": 0.00017379365453962237, "step": 499 }, { "epoch": 0.0625, "grad_norm": 2.35937762260437, "grad_norm_var": 0.25312725190736174, "learning_rate": 0.0001, "loss": 1.2363, "loss/crossentropy": 2.5724241733551025, "loss/hidden": 1.0703125, "loss/logits": 0.16428104043006897, "loss/reg": 0.00017374279559589922, "step": 500 }, { "epoch": 0.062625, "grad_norm": 3.2033684253692627, "grad_norm_var": 0.2306021947812873, "learning_rate": 0.0001, "loss": 1.2501, "loss/crossentropy": 2.9498679637908936, "loss/hidden": 1.09375, "loss/logits": 0.15456530451774597, "loss/reg": 0.00017368397675454617, "step": 501 }, { "epoch": 0.06275, "grad_norm": 7.13689661026001, "grad_norm_var": 1.3911368332536909, "learning_rate": 0.0001, "loss": 1.4688, "loss/crossentropy": 2.5023751258850098, "loss/hidden": 1.25, "loss/logits": 0.21706295013427734, "loss/reg": 0.00017362662765663117, "step": 502 }, { "epoch": 0.062875, "grad_norm": 2.780501365661621, "grad_norm_var": 1.3871397577560267, "learning_rate": 0.0001, "loss": 1.245, "loss/crossentropy": 2.501007318496704, "loss/hidden": 1.0703125, "loss/logits": 0.1729978322982788, "loss/reg": 0.00017356427269987762, "step": 503 }, { "epoch": 0.063, "grad_norm": 2.470470428466797, "grad_norm_var": 1.4045989344993046, "learning_rate": 0.0001, "loss": 1.1976, "loss/crossentropy": 2.6776809692382812, "loss/hidden": 1.0234375, "loss/logits": 0.17240548133850098, "loss/reg": 0.0001735100959194824, "step": 504 }, { "epoch": 0.063125, "grad_norm": 2.7640480995178223, "grad_norm_var": 1.3650723952075083, "learning_rate": 0.0001, "loss": 1.1676, "loss/crossentropy": 2.8096463680267334, "loss/hidden": 0.99609375, "loss/logits": 0.16976726055145264, "loss/reg": 0.0001734576653689146, "step": 505 }, { "epoch": 0.06325, "grad_norm": 4.206851959228516, "grad_norm_var": 1.4334523599411717, "learning_rate": 0.0001, "loss": 1.3828, "loss/crossentropy": 2.5923476219177246, "loss/hidden": 1.1796875, "loss/logits": 0.20132878422737122, "loss/reg": 0.0001734116958687082, "step": 506 }, { "epoch": 0.063375, "grad_norm": 2.804028272628784, "grad_norm_var": 1.434209210597612, "learning_rate": 0.0001, "loss": 1.4384, "loss/crossentropy": 2.673945426940918, "loss/hidden": 1.21875, "loss/logits": 0.21792729198932648, "loss/reg": 0.00017335994925815612, "step": 507 }, { "epoch": 0.0635, "grad_norm": 2.658412456512451, "grad_norm_var": 1.4191493650313673, "learning_rate": 0.0001, "loss": 1.3595, "loss/crossentropy": 2.3202786445617676, "loss/hidden": 1.140625, "loss/logits": 0.21719013154506683, "loss/reg": 0.00017331514391116798, "step": 508 }, { "epoch": 0.063625, "grad_norm": 3.360714912414551, "grad_norm_var": 1.4091417330272291, "learning_rate": 0.0001, "loss": 1.3161, "loss/crossentropy": 2.6554245948791504, "loss/hidden": 1.140625, "loss/logits": 0.17373251914978027, "loss/reg": 0.00017326146189589053, "step": 509 }, { "epoch": 0.06375, "grad_norm": 2.447488784790039, "grad_norm_var": 1.4309184266339967, "learning_rate": 0.0001, "loss": 1.2621, "loss/crossentropy": 2.881753921508789, "loss/hidden": 1.0546875, "loss/logits": 0.20565897226333618, "loss/reg": 0.0001732077362248674, "step": 510 }, { "epoch": 0.063875, "grad_norm": 1.941440463066101, "grad_norm_var": 1.5029527266984886, "learning_rate": 0.0001, "loss": 1.0587, "loss/crossentropy": 2.361740827560425, "loss/hidden": 0.92578125, "loss/logits": 0.13118453323841095, "loss/reg": 0.00017315799777861685, "step": 511 }, { "epoch": 0.064, "grad_norm": 2.2433526515960693, "grad_norm_var": 1.5162942173979714, "learning_rate": 0.0001, "loss": 1.1907, "loss/crossentropy": 2.500945568084717, "loss/hidden": 1.0234375, "loss/logits": 0.16551794111728668, "loss/reg": 0.0001730988296912983, "step": 512 }, { "epoch": 0.064125, "grad_norm": 2.449188470840454, "grad_norm_var": 1.5248332553676305, "learning_rate": 0.0001, "loss": 1.4152, "loss/crossentropy": 2.4291043281555176, "loss/hidden": 1.2265625, "loss/logits": 0.18695488572120667, "loss/reg": 0.00017305357323493809, "step": 513 }, { "epoch": 0.06425, "grad_norm": 2.403683662414551, "grad_norm_var": 1.519735802018065, "learning_rate": 0.0001, "loss": 1.2866, "loss/crossentropy": 2.5314974784851074, "loss/hidden": 1.09375, "loss/logits": 0.19113026559352875, "loss/reg": 0.00017301547632087022, "step": 514 }, { "epoch": 0.064375, "grad_norm": 2.6757254600524902, "grad_norm_var": 1.496902185277708, "learning_rate": 0.0001, "loss": 1.2652, "loss/crossentropy": 2.360200881958008, "loss/hidden": 1.0859375, "loss/logits": 0.17756882309913635, "loss/reg": 0.00017296474834438413, "step": 515 }, { "epoch": 0.0645, "grad_norm": 2.432098150253296, "grad_norm_var": 1.4910784201631766, "learning_rate": 0.0001, "loss": 1.3328, "loss/crossentropy": 2.2926135063171387, "loss/hidden": 1.15625, "loss/logits": 0.17481115460395813, "loss/reg": 0.00017292361008003354, "step": 516 }, { "epoch": 0.064625, "grad_norm": 2.576810121536255, "grad_norm_var": 1.4985112951366995, "learning_rate": 0.0001, "loss": 1.1377, "loss/crossentropy": 2.6856186389923096, "loss/hidden": 0.984375, "loss/logits": 0.1515754908323288, "loss/reg": 0.00017287083028350025, "step": 517 }, { "epoch": 0.06475, "grad_norm": 3.813851833343506, "grad_norm_var": 0.3377773464674575, "learning_rate": 0.0001, "loss": 1.5407, "loss/crossentropy": 2.821399688720703, "loss/hidden": 1.265625, "loss/logits": 0.27337172627449036, "loss/reg": 0.00017283220950048417, "step": 518 }, { "epoch": 0.064875, "grad_norm": 2.4768738746643066, "grad_norm_var": 0.3423769270252989, "learning_rate": 0.0001, "loss": 1.4961, "loss/crossentropy": 2.2091104984283447, "loss/hidden": 1.28125, "loss/logits": 0.21315285563468933, "loss/reg": 0.00017277758161071688, "step": 519 }, { "epoch": 0.065, "grad_norm": 2.4452717304229736, "grad_norm_var": 0.34329804505939254, "learning_rate": 0.0001, "loss": 1.2949, "loss/crossentropy": 2.3737871646881104, "loss/hidden": 1.109375, "loss/logits": 0.1838245391845703, "loss/reg": 0.000172733940416947, "step": 520 }, { "epoch": 0.065125, "grad_norm": 2.700247287750244, "grad_norm_var": 0.3432733633959595, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.703601598739624, "loss/hidden": 1.0859375, "loss/logits": 0.19057917594909668, "loss/reg": 0.0001726971531752497, "step": 521 }, { "epoch": 0.06525, "grad_norm": 2.2924041748046875, "grad_norm_var": 0.1946606389373267, "learning_rate": 0.0001, "loss": 1.2343, "loss/crossentropy": 2.479201316833496, "loss/hidden": 1.0625, "loss/logits": 0.1701183170080185, "loss/reg": 0.0001726403716020286, "step": 522 }, { "epoch": 0.065375, "grad_norm": 2.685793399810791, "grad_norm_var": 0.19243772626851877, "learning_rate": 0.0001, "loss": 1.4527, "loss/crossentropy": 2.712714672088623, "loss/hidden": 1.234375, "loss/logits": 0.21664533019065857, "loss/reg": 0.00017258411389775574, "step": 523 }, { "epoch": 0.0655, "grad_norm": 2.176912546157837, "grad_norm_var": 0.2031912541905872, "learning_rate": 0.0001, "loss": 1.2739, "loss/crossentropy": 2.6760213375091553, "loss/hidden": 1.0625, "loss/logits": 0.2096371352672577, "loss/reg": 0.000172529456904158, "step": 524 }, { "epoch": 0.065625, "grad_norm": 2.0959558486938477, "grad_norm_var": 0.16984495296527288, "learning_rate": 0.0001, "loss": 1.3593, "loss/crossentropy": 2.6553218364715576, "loss/hidden": 1.1640625, "loss/logits": 0.19346752762794495, "loss/reg": 0.00017247024516109377, "step": 525 }, { "epoch": 0.06575, "grad_norm": 8.542657852172852, "grad_norm_var": 2.456370936221297, "learning_rate": 0.0001, "loss": 1.9331, "loss/crossentropy": 2.337883710861206, "loss/hidden": 1.625, "loss/logits": 0.3063379228115082, "loss/reg": 0.00017242114699911326, "step": 526 }, { "epoch": 0.065875, "grad_norm": 3.128777265548706, "grad_norm_var": 2.397160487154512, "learning_rate": 0.0001, "loss": 1.443, "loss/crossentropy": 2.422700881958008, "loss/hidden": 1.2578125, "loss/logits": 0.18349535763263702, "loss/reg": 0.00017235790437553078, "step": 527 }, { "epoch": 0.066, "grad_norm": 2.870034694671631, "grad_norm_var": 2.3629757829101123, "learning_rate": 0.0001, "loss": 1.32, "loss/crossentropy": 2.599033832550049, "loss/hidden": 1.125, "loss/logits": 0.19324183464050293, "loss/reg": 0.00017229218792635947, "step": 528 }, { "epoch": 0.066125, "grad_norm": 2.2301864624023438, "grad_norm_var": 2.381630713264915, "learning_rate": 0.0001, "loss": 1.1189, "loss/crossentropy": 2.3612945079803467, "loss/hidden": 0.96875, "loss/logits": 0.14839375019073486, "loss/reg": 0.00017223399481736124, "step": 529 }, { "epoch": 0.06625, "grad_norm": 2.746755838394165, "grad_norm_var": 2.363003882652128, "learning_rate": 0.0001, "loss": 1.2935, "loss/crossentropy": 2.693129777908325, "loss/hidden": 1.109375, "loss/logits": 0.18237757682800293, "loss/reg": 0.00017216459673363715, "step": 530 }, { "epoch": 0.066375, "grad_norm": 2.2904253005981445, "grad_norm_var": 2.388589419863691, "learning_rate": 0.0001, "loss": 1.3138, "loss/crossentropy": 2.388449192047119, "loss/hidden": 1.1328125, "loss/logits": 0.17922545969486237, "loss/reg": 0.00017211108934134245, "step": 531 }, { "epoch": 0.0665, "grad_norm": 2.401120185852051, "grad_norm_var": 2.3908672865623823, "learning_rate": 0.0001, "loss": 1.1468, "loss/crossentropy": 2.6553752422332764, "loss/hidden": 0.98828125, "loss/logits": 0.15683269500732422, "loss/reg": 0.00017204153118655086, "step": 532 }, { "epoch": 0.066625, "grad_norm": 2.633457899093628, "grad_norm_var": 2.388119747491773, "learning_rate": 0.0001, "loss": 1.3705, "loss/crossentropy": 2.6518547534942627, "loss/hidden": 1.15625, "loss/logits": 0.21251234412193298, "loss/reg": 0.00017197159468196332, "step": 533 }, { "epoch": 0.06675, "grad_norm": 2.3268089294433594, "grad_norm_var": 2.3591461867007237, "learning_rate": 0.0001, "loss": 1.2765, "loss/crossentropy": 2.43649959564209, "loss/hidden": 1.1015625, "loss/logits": 0.17321743071079254, "loss/reg": 0.00017191444931086153, "step": 534 }, { "epoch": 0.066875, "grad_norm": 2.4145212173461914, "grad_norm_var": 2.3627217718932303, "learning_rate": 0.0001, "loss": 1.3301, "loss/crossentropy": 2.5456600189208984, "loss/hidden": 1.140625, "loss/logits": 0.18778590857982635, "loss/reg": 0.00017185932665597647, "step": 535 }, { "epoch": 0.067, "grad_norm": 3.6527416706085205, "grad_norm_var": 2.384849077186978, "learning_rate": 0.0001, "loss": 1.7707, "loss/crossentropy": 2.694108724594116, "loss/hidden": 1.4453125, "loss/logits": 0.3236609995365143, "loss/reg": 0.0001718000421533361, "step": 536 }, { "epoch": 0.067125, "grad_norm": 2.442765235900879, "grad_norm_var": 2.397542855138734, "learning_rate": 0.0001, "loss": 1.4162, "loss/crossentropy": 2.423489570617676, "loss/hidden": 1.21875, "loss/logits": 0.1956934630870819, "loss/reg": 0.00017172891239169985, "step": 537 }, { "epoch": 0.06725, "grad_norm": 2.602630615234375, "grad_norm_var": 2.377052002120225, "learning_rate": 0.0001, "loss": 1.1458, "loss/crossentropy": 2.823305368423462, "loss/hidden": 0.96875, "loss/logits": 0.17532846331596375, "loss/reg": 0.00017167623445857316, "step": 538 }, { "epoch": 0.067375, "grad_norm": 4.694828033447266, "grad_norm_var": 2.5578468568103476, "learning_rate": 0.0001, "loss": 1.9136, "loss/crossentropy": 2.148275136947632, "loss/hidden": 1.5859375, "loss/logits": 0.32593122124671936, "loss/reg": 0.00017162920266855508, "step": 539 }, { "epoch": 0.0675, "grad_norm": 2.7025794982910156, "grad_norm_var": 2.511949663048948, "learning_rate": 0.0001, "loss": 1.2986, "loss/crossentropy": 2.4942023754119873, "loss/hidden": 1.109375, "loss/logits": 0.18750452995300293, "loss/reg": 0.00017157263937406242, "step": 540 }, { "epoch": 0.067625, "grad_norm": 4.450098991394043, "grad_norm_var": 2.5397113002195955, "learning_rate": 0.0001, "loss": 1.6572, "loss/crossentropy": 2.220086097717285, "loss/hidden": 1.4296875, "loss/logits": 0.22576934099197388, "loss/reg": 0.00017152745567727834, "step": 541 }, { "epoch": 0.06775, "grad_norm": 2.070035457611084, "grad_norm_var": 0.5975217697848583, "learning_rate": 0.0001, "loss": 1.2034, "loss/crossentropy": 2.6207101345062256, "loss/hidden": 1.0390625, "loss/logits": 0.1625814139842987, "loss/reg": 0.00017148529877886176, "step": 542 }, { "epoch": 0.067875, "grad_norm": 2.567230463027954, "grad_norm_var": 0.5966276565563391, "learning_rate": 0.0001, "loss": 1.3821, "loss/crossentropy": 2.8155553340911865, "loss/hidden": 1.15625, "loss/logits": 0.22412577271461487, "loss/reg": 0.00017144184675998986, "step": 543 }, { "epoch": 0.068, "grad_norm": 2.597099542617798, "grad_norm_var": 0.5994085905893123, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.6148014068603516, "loss/hidden": 1.171875, "loss/logits": 0.21672894060611725, "loss/reg": 0.00017140124691650271, "step": 544 }, { "epoch": 0.068125, "grad_norm": 2.7950587272644043, "grad_norm_var": 0.5763252739819049, "learning_rate": 0.0001, "loss": 1.4865, "loss/crossentropy": 2.1710782051086426, "loss/hidden": 1.2734375, "loss/logits": 0.2113463282585144, "loss/reg": 0.00017136837414000183, "step": 545 }, { "epoch": 0.06825, "grad_norm": 12.58993911743164, "grad_norm_var": 6.513717875746707, "learning_rate": 0.0001, "loss": 1.7384, "loss/crossentropy": 2.1138253211975098, "loss/hidden": 1.5625, "loss/logits": 0.17419031262397766, "loss/reg": 0.00017131684580817819, "step": 546 }, { "epoch": 0.068375, "grad_norm": 2.6334447860717773, "grad_norm_var": 6.467947957023322, "learning_rate": 0.0001, "loss": 1.2565, "loss/crossentropy": 2.7349722385406494, "loss/hidden": 1.0859375, "loss/logits": 0.16887858510017395, "loss/reg": 0.00017128493345808238, "step": 547 }, { "epoch": 0.0685, "grad_norm": 3.3319921493530273, "grad_norm_var": 6.389018565580082, "learning_rate": 0.0001, "loss": 1.2192, "loss/crossentropy": 2.4898457527160645, "loss/hidden": 1.046875, "loss/logits": 0.17058822512626648, "loss/reg": 0.00017125166777987033, "step": 548 }, { "epoch": 0.068625, "grad_norm": 2.9781105518341064, "grad_norm_var": 6.3551707712299015, "learning_rate": 0.0001, "loss": 1.329, "loss/crossentropy": 2.1881654262542725, "loss/hidden": 1.1640625, "loss/logits": 0.16322261095046997, "loss/reg": 0.00017119194671977311, "step": 549 }, { "epoch": 0.06875, "grad_norm": 2.448098659515381, "grad_norm_var": 6.336258398035525, "learning_rate": 0.0001, "loss": 1.3555, "loss/crossentropy": 2.430213689804077, "loss/hidden": 1.15625, "loss/logits": 0.19751599431037903, "loss/reg": 0.0001711545482976362, "step": 550 }, { "epoch": 0.068875, "grad_norm": 3.612593650817871, "grad_norm_var": 6.242875720589851, "learning_rate": 0.0001, "loss": 1.7351, "loss/crossentropy": 2.5373525619506836, "loss/hidden": 1.46875, "loss/logits": 0.2646685242652893, "loss/reg": 0.0001711150398477912, "step": 551 }, { "epoch": 0.069, "grad_norm": 2.485044002532959, "grad_norm_var": 6.325423313821885, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.5557191371917725, "loss/hidden": 1.2109375, "loss/logits": 0.21031953394412994, "loss/reg": 0.00017106575251091272, "step": 552 }, { "epoch": 0.069125, "grad_norm": 2.404714345932007, "grad_norm_var": 6.3311952176113575, "learning_rate": 0.0001, "loss": 1.4304, "loss/crossentropy": 2.5276741981506348, "loss/hidden": 1.2109375, "loss/logits": 0.21770742535591125, "loss/reg": 0.00017102489073295146, "step": 553 }, { "epoch": 0.06925, "grad_norm": 2.9133265018463135, "grad_norm_var": 6.297559307856219, "learning_rate": 0.0001, "loss": 1.3027, "loss/crossentropy": 2.432421922683716, "loss/hidden": 1.109375, "loss/logits": 0.1916545331478119, "loss/reg": 0.00017097701493185014, "step": 554 }, { "epoch": 0.069375, "grad_norm": 2.1913204193115234, "grad_norm_var": 6.317029358824686, "learning_rate": 0.0001, "loss": 1.1559, "loss/crossentropy": 2.5753936767578125, "loss/hidden": 0.98828125, "loss/logits": 0.1659434586763382, "loss/reg": 0.00017092663620132953, "step": 555 }, { "epoch": 0.0695, "grad_norm": 2.0332748889923096, "grad_norm_var": 6.409333154492003, "learning_rate": 0.0001, "loss": 1.1724, "loss/crossentropy": 2.150702953338623, "loss/hidden": 1.03125, "loss/logits": 0.13947990536689758, "loss/reg": 0.00017088439199142158, "step": 556 }, { "epoch": 0.069625, "grad_norm": 2.1695680618286133, "grad_norm_var": 6.409404998152714, "learning_rate": 0.0001, "loss": 1.1731, "loss/crossentropy": 2.6341793537139893, "loss/hidden": 1.0, "loss/logits": 0.17135721445083618, "loss/reg": 0.00017083503189496696, "step": 557 }, { "epoch": 0.06975, "grad_norm": 2.410304069519043, "grad_norm_var": 6.363615421331677, "learning_rate": 0.0001, "loss": 1.2344, "loss/crossentropy": 2.7346441745758057, "loss/hidden": 1.0546875, "loss/logits": 0.17796292901039124, "loss/reg": 0.0001707820047158748, "step": 558 }, { "epoch": 0.069875, "grad_norm": 2.483283281326294, "grad_norm_var": 6.371810790403832, "learning_rate": 0.0001, "loss": 1.1801, "loss/crossentropy": 2.4949450492858887, "loss/hidden": 1.015625, "loss/logits": 0.1627454161643982, "loss/reg": 0.00017073405615519732, "step": 559 }, { "epoch": 0.07, "grad_norm": 2.1867575645446777, "grad_norm_var": 6.418320129623587, "learning_rate": 0.0001, "loss": 1.1364, "loss/crossentropy": 2.6788835525512695, "loss/hidden": 0.9609375, "loss/logits": 0.1737128049135208, "loss/reg": 0.00017068682063836604, "step": 560 }, { "epoch": 0.070125, "grad_norm": 2.207019329071045, "grad_norm_var": 6.4739691725460995, "learning_rate": 0.0001, "loss": 1.2825, "loss/crossentropy": 2.4662907123565674, "loss/hidden": 1.0859375, "loss/logits": 0.19483013451099396, "loss/reg": 0.0001706261100480333, "step": 561 }, { "epoch": 0.07025, "grad_norm": 3.9981727600097656, "grad_norm_var": 0.32212220830916155, "learning_rate": 0.0001, "loss": 1.4458, "loss/crossentropy": 2.6690235137939453, "loss/hidden": 1.2578125, "loss/logits": 0.18629229068756104, "loss/reg": 0.00017057890363503247, "step": 562 }, { "epoch": 0.070375, "grad_norm": 2.2928075790405273, "grad_norm_var": 0.33037325756801933, "learning_rate": 0.0001, "loss": 1.1349, "loss/crossentropy": 2.551581382751465, "loss/hidden": 0.98046875, "loss/logits": 0.15273499488830566, "loss/reg": 0.00017053158080670983, "step": 563 }, { "epoch": 0.0705, "grad_norm": 2.3306846618652344, "grad_norm_var": 0.2998694227811823, "learning_rate": 0.0001, "loss": 1.2738, "loss/crossentropy": 2.6241610050201416, "loss/hidden": 1.09375, "loss/logits": 0.17837105691432953, "loss/reg": 0.00017048756126314402, "step": 564 }, { "epoch": 0.070625, "grad_norm": 2.9970686435699463, "grad_norm_var": 0.30091952320317217, "learning_rate": 0.0001, "loss": 1.5005, "loss/crossentropy": 2.6400721073150635, "loss/hidden": 1.265625, "loss/logits": 0.23314496874809265, "loss/reg": 0.00017043233674485236, "step": 565 }, { "epoch": 0.07075, "grad_norm": 2.154863119125366, "grad_norm_var": 0.3111674368126273, "learning_rate": 0.0001, "loss": 1.121, "loss/crossentropy": 2.449882745742798, "loss/hidden": 0.97265625, "loss/logits": 0.14665882289409637, "loss/reg": 0.00017037861107382923, "step": 566 }, { "epoch": 0.070875, "grad_norm": 2.3082728385925293, "grad_norm_var": 0.2334702477201887, "learning_rate": 0.0001, "loss": 1.3158, "loss/crossentropy": 2.278141736984253, "loss/hidden": 1.1328125, "loss/logits": 0.18126416206359863, "loss/reg": 0.00017032343021128327, "step": 567 }, { "epoch": 0.071, "grad_norm": 2.3926103115081787, "grad_norm_var": 0.2338546414734573, "learning_rate": 0.0001, "loss": 1.2985, "loss/crossentropy": 2.567387819290161, "loss/hidden": 1.109375, "loss/logits": 0.1873936802148819, "loss/reg": 0.00017027463763952255, "step": 568 }, { "epoch": 0.071125, "grad_norm": 2.1235768795013428, "grad_norm_var": 0.24113411008737154, "learning_rate": 0.0001, "loss": 1.1612, "loss/crossentropy": 2.462425947189331, "loss/hidden": 1.0, "loss/logits": 0.15948078036308289, "loss/reg": 0.00017021458188537508, "step": 569 }, { "epoch": 0.07125, "grad_norm": 2.8290655612945557, "grad_norm_var": 0.23636749952986874, "learning_rate": 0.0001, "loss": 1.7486, "loss/crossentropy": 2.516327142715454, "loss/hidden": 1.421875, "loss/logits": 0.32500743865966797, "loss/reg": 0.00017016558558680117, "step": 570 }, { "epoch": 0.071375, "grad_norm": 2.3170325756073, "grad_norm_var": 0.2331150305308474, "learning_rate": 0.0001, "loss": 1.3307, "loss/crossentropy": 2.597659111022949, "loss/hidden": 1.15625, "loss/logits": 0.17277465760707855, "loss/reg": 0.00017011785530485213, "step": 571 }, { "epoch": 0.0715, "grad_norm": 2.0035555362701416, "grad_norm_var": 0.23483004993557396, "learning_rate": 0.0001, "loss": 1.1944, "loss/crossentropy": 2.4430577754974365, "loss/hidden": 1.015625, "loss/logits": 0.17711691558361053, "loss/reg": 0.00017007363203447312, "step": 572 }, { "epoch": 0.071625, "grad_norm": 2.685924768447876, "grad_norm_var": 0.23216703280492249, "learning_rate": 0.0001, "loss": 1.3944, "loss/crossentropy": 2.324131965637207, "loss/hidden": 1.203125, "loss/logits": 0.1895420104265213, "loss/reg": 0.00017002593085635453, "step": 573 }, { "epoch": 0.07175, "grad_norm": 2.7526416778564453, "grad_norm_var": 0.2361934870674605, "learning_rate": 0.0001, "loss": 1.1758, "loss/crossentropy": 2.7989518642425537, "loss/hidden": 1.0, "loss/logits": 0.17413882911205292, "loss/reg": 0.00016996338672470301, "step": 574 }, { "epoch": 0.071875, "grad_norm": 2.7352824211120605, "grad_norm_var": 0.23946777271065306, "learning_rate": 0.0001, "loss": 1.4508, "loss/crossentropy": 2.579742908477783, "loss/hidden": 1.234375, "loss/logits": 0.21468819677829742, "loss/reg": 0.00016991051961667836, "step": 575 }, { "epoch": 0.072, "grad_norm": 2.4953577518463135, "grad_norm_var": 0.2317200723784725, "learning_rate": 0.0001, "loss": 1.3093, "loss/crossentropy": 2.509127140045166, "loss/hidden": 1.109375, "loss/logits": 0.19822360575199127, "loss/reg": 0.00016985305410344154, "step": 576 }, { "epoch": 0.072125, "grad_norm": 2.5039892196655273, "grad_norm_var": 0.22408707267770347, "learning_rate": 0.0001, "loss": 1.3646, "loss/crossentropy": 2.5353946685791016, "loss/hidden": 1.15625, "loss/logits": 0.20669099688529968, "loss/reg": 0.00016978620260488242, "step": 577 }, { "epoch": 0.07225, "grad_norm": 2.3266947269439697, "grad_norm_var": 0.07764090636938571, "learning_rate": 0.0001, "loss": 1.2418, "loss/crossentropy": 2.764356851577759, "loss/hidden": 1.0703125, "loss/logits": 0.16982382535934448, "loss/reg": 0.00016973311721812934, "step": 578 }, { "epoch": 0.072375, "grad_norm": 2.342703342437744, "grad_norm_var": 0.07673018861235051, "learning_rate": 0.0001, "loss": 1.3301, "loss/crossentropy": 2.312747001647949, "loss/hidden": 1.1328125, "loss/logits": 0.19554780423641205, "loss/reg": 0.00016968151612672955, "step": 579 }, { "epoch": 0.0725, "grad_norm": 11.706256866455078, "grad_norm_var": 5.4136513842247265, "learning_rate": 0.0001, "loss": 1.4191, "loss/crossentropy": 2.6392016410827637, "loss/hidden": 1.2265625, "loss/logits": 0.19086988270282745, "loss/reg": 0.00016962428344413638, "step": 580 }, { "epoch": 0.072625, "grad_norm": 2.3399274349212646, "grad_norm_var": 5.444593737837399, "learning_rate": 0.0001, "loss": 1.2527, "loss/crossentropy": 2.282559394836426, "loss/hidden": 1.0703125, "loss/logits": 0.1806895136833191, "loss/reg": 0.00016956489707808942, "step": 581 }, { "epoch": 0.07275, "grad_norm": 2.605700731277466, "grad_norm_var": 5.406427842961531, "learning_rate": 0.0001, "loss": 1.2855, "loss/crossentropy": 2.590782642364502, "loss/hidden": 1.109375, "loss/logits": 0.17442543804645538, "loss/reg": 0.00016952973965089768, "step": 582 }, { "epoch": 0.072875, "grad_norm": 2.0132784843444824, "grad_norm_var": 5.440226046451991, "learning_rate": 0.0001, "loss": 1.1835, "loss/crossentropy": 2.402843952178955, "loss/hidden": 1.015625, "loss/logits": 0.16617350280284882, "loss/reg": 0.00016945820243563503, "step": 583 }, { "epoch": 0.073, "grad_norm": 2.5126659870147705, "grad_norm_var": 5.4312304590327125, "learning_rate": 0.0001, "loss": 1.1143, "loss/crossentropy": 2.193049430847168, "loss/hidden": 0.98828125, "loss/logits": 0.12434712797403336, "loss/reg": 0.00016938996850512922, "step": 584 }, { "epoch": 0.073125, "grad_norm": 2.286170244216919, "grad_norm_var": 5.413484783911198, "learning_rate": 0.0001, "loss": 1.3935, "loss/crossentropy": 2.5293447971343994, "loss/hidden": 1.1875, "loss/logits": 0.2043066769838333, "loss/reg": 0.0001693202502792701, "step": 585 }, { "epoch": 0.07325, "grad_norm": 4.44750452041626, "grad_norm_var": 5.5341541609838405, "learning_rate": 0.0001, "loss": 1.7059, "loss/crossentropy": 2.1170272827148438, "loss/hidden": 1.4921875, "loss/logits": 0.2119699865579605, "loss/reg": 0.00016926974058151245, "step": 586 }, { "epoch": 0.073375, "grad_norm": 2.5299863815307617, "grad_norm_var": 5.5139146558762295, "learning_rate": 0.0001, "loss": 1.2437, "loss/crossentropy": 2.6854310035705566, "loss/hidden": 1.0546875, "loss/logits": 0.1872776746749878, "loss/reg": 0.00016921921633183956, "step": 587 }, { "epoch": 0.0735, "grad_norm": 2.0135693550109863, "grad_norm_var": 5.5123995944851325, "learning_rate": 0.0001, "loss": 1.1624, "loss/crossentropy": 2.4206924438476562, "loss/hidden": 1.0078125, "loss/logits": 0.15290623903274536, "loss/reg": 0.00016917330503929406, "step": 588 }, { "epoch": 0.073625, "grad_norm": 2.6473913192749023, "grad_norm_var": 5.514843854169513, "learning_rate": 0.0001, "loss": 1.2785, "loss/crossentropy": 2.6918623447418213, "loss/hidden": 1.0859375, "loss/logits": 0.19089436531066895, "loss/reg": 0.00016911346756387502, "step": 589 }, { "epoch": 0.07375, "grad_norm": 3.635427951812744, "grad_norm_var": 5.517816220831408, "learning_rate": 0.0001, "loss": 1.8411, "loss/crossentropy": 2.547456741333008, "loss/hidden": 1.515625, "loss/logits": 0.3237870931625366, "loss/reg": 0.00016906464588828385, "step": 590 }, { "epoch": 0.073875, "grad_norm": 2.401596784591675, "grad_norm_var": 5.5452897557559995, "learning_rate": 0.0001, "loss": 1.1834, "loss/crossentropy": 2.7037479877471924, "loss/hidden": 1.015625, "loss/logits": 0.16612407565116882, "loss/reg": 0.00016901020717341453, "step": 591 }, { "epoch": 0.074, "grad_norm": 2.2436153888702393, "grad_norm_var": 5.572080523738638, "learning_rate": 0.0001, "loss": 1.1794, "loss/crossentropy": 2.605909585952759, "loss/hidden": 1.015625, "loss/logits": 0.16213035583496094, "loss/reg": 0.00016897049499675632, "step": 592 }, { "epoch": 0.074125, "grad_norm": 2.623009443283081, "grad_norm_var": 5.562558906298531, "learning_rate": 0.0001, "loss": 1.3793, "loss/crossentropy": 2.4490256309509277, "loss/hidden": 1.1875, "loss/logits": 0.19010208547115326, "loss/reg": 0.00016892659186851233, "step": 593 }, { "epoch": 0.07425, "grad_norm": 2.6642467975616455, "grad_norm_var": 5.531850830156005, "learning_rate": 0.0001, "loss": 1.2564, "loss/crossentropy": 2.501567840576172, "loss/hidden": 1.0859375, "loss/logits": 0.1687425971031189, "loss/reg": 0.00016887504898477346, "step": 594 }, { "epoch": 0.074375, "grad_norm": 2.694584846496582, "grad_norm_var": 5.499915571271185, "learning_rate": 0.0001, "loss": 1.3388, "loss/crossentropy": 2.3972787857055664, "loss/hidden": 1.140625, "loss/logits": 0.19645318388938904, "loss/reg": 0.00016881282499525696, "step": 595 }, { "epoch": 0.0745, "grad_norm": 2.7409250736236572, "grad_norm_var": 0.36762256393680726, "learning_rate": 0.0001, "loss": 1.3813, "loss/crossentropy": 2.5210189819335938, "loss/hidden": 1.1953125, "loss/logits": 0.18434053659439087, "loss/reg": 0.00016875102301128209, "step": 596 }, { "epoch": 0.074625, "grad_norm": 2.1667938232421875, "grad_norm_var": 0.3766533052768258, "learning_rate": 0.0001, "loss": 1.125, "loss/crossentropy": 2.6511788368225098, "loss/hidden": 0.97265625, "loss/logits": 0.15069039165973663, "loss/reg": 0.0001686899777268991, "step": 597 }, { "epoch": 0.07475, "grad_norm": 1.9829405546188354, "grad_norm_var": 0.4036704931910695, "learning_rate": 0.0001, "loss": 1.2093, "loss/crossentropy": 2.288109540939331, "loss/hidden": 1.0390625, "loss/logits": 0.16858066618442535, "loss/reg": 0.0001686221658019349, "step": 598 }, { "epoch": 0.074875, "grad_norm": 2.0377886295318604, "grad_norm_var": 0.40178986547080575, "learning_rate": 0.0001, "loss": 1.1391, "loss/crossentropy": 2.6447594165802, "loss/hidden": 0.96484375, "loss/logits": 0.1725403070449829, "loss/reg": 0.0001685698953224346, "step": 599 }, { "epoch": 0.075, "grad_norm": 2.328906536102295, "grad_norm_var": 0.4060833394862702, "learning_rate": 0.0001, "loss": 1.1923, "loss/crossentropy": 2.3675525188446045, "loss/hidden": 1.0234375, "loss/logits": 0.1671319305896759, "loss/reg": 0.0001685115130385384, "step": 600 }, { "epoch": 0.075125, "grad_norm": 2.977869987487793, "grad_norm_var": 0.40793948307500183, "learning_rate": 0.0001, "loss": 1.4017, "loss/crossentropy": 2.1144216060638428, "loss/hidden": 1.2109375, "loss/logits": 0.1891048103570938, "loss/reg": 0.0001684577582636848, "step": 601 }, { "epoch": 0.07525, "grad_norm": 2.3289711475372314, "grad_norm_var": 0.17604985801183842, "learning_rate": 0.0001, "loss": 1.3612, "loss/crossentropy": 2.5204598903656006, "loss/hidden": 1.15625, "loss/logits": 0.20325785875320435, "loss/reg": 0.00016840448370203376, "step": 602 }, { "epoch": 0.075375, "grad_norm": 2.1975455284118652, "grad_norm_var": 0.18167683033246687, "learning_rate": 0.0001, "loss": 1.2281, "loss/crossentropy": 2.643584966659546, "loss/hidden": 1.0546875, "loss/logits": 0.1716804951429367, "loss/reg": 0.00016835647693369538, "step": 603 }, { "epoch": 0.0755, "grad_norm": 3.285170555114746, "grad_norm_var": 0.20360067101496113, "learning_rate": 0.0001, "loss": 1.4577, "loss/crossentropy": 2.2101516723632812, "loss/hidden": 1.234375, "loss/logits": 0.22164377570152283, "loss/reg": 0.000168298211065121, "step": 604 }, { "epoch": 0.075625, "grad_norm": 2.19590163230896, "grad_norm_var": 0.211067918812994, "learning_rate": 0.0001, "loss": 1.338, "loss/crossentropy": 2.4247162342071533, "loss/hidden": 1.140625, "loss/logits": 0.19573840498924255, "loss/reg": 0.00016823795158416033, "step": 605 }, { "epoch": 0.07575, "grad_norm": 2.1424643993377686, "grad_norm_var": 0.1306428940858675, "learning_rate": 0.0001, "loss": 1.2706, "loss/crossentropy": 2.282320737838745, "loss/hidden": 1.0859375, "loss/logits": 0.18295730650424957, "loss/reg": 0.00016817716823425144, "step": 606 }, { "epoch": 0.075875, "grad_norm": 2.2740960121154785, "grad_norm_var": 0.13228238228511008, "learning_rate": 0.0001, "loss": 1.2908, "loss/crossentropy": 2.63622784614563, "loss/hidden": 1.09375, "loss/logits": 0.19536375999450684, "loss/reg": 0.00016811213572509587, "step": 607 }, { "epoch": 0.076, "grad_norm": 3.2015061378479004, "grad_norm_var": 0.16578617964588976, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.548574686050415, "loss/hidden": 1.1171875, "loss/logits": 0.20471057295799255, "loss/reg": 0.0001680486893747002, "step": 608 }, { "epoch": 0.076125, "grad_norm": 2.347632884979248, "grad_norm_var": 0.16564824857496294, "learning_rate": 0.0001, "loss": 1.279, "loss/crossentropy": 2.493295192718506, "loss/hidden": 1.078125, "loss/logits": 0.19916461408138275, "loss/reg": 0.0001679811830399558, "step": 609 }, { "epoch": 0.07625, "grad_norm": 2.7271945476531982, "grad_norm_var": 0.16750138435451298, "learning_rate": 0.0001, "loss": 1.2249, "loss/crossentropy": 2.4139223098754883, "loss/hidden": 1.0546875, "loss/logits": 0.1685827374458313, "loss/reg": 0.00016791088273748755, "step": 610 }, { "epoch": 0.076375, "grad_norm": 4.645987033843994, "grad_norm_var": 0.46214004992615393, "learning_rate": 0.0001, "loss": 1.6428, "loss/crossentropy": 2.4112279415130615, "loss/hidden": 1.390625, "loss/logits": 0.25051963329315186, "loss/reg": 0.00016785638581495732, "step": 611 }, { "epoch": 0.0765, "grad_norm": 2.474159002304077, "grad_norm_var": 0.46153457728904496, "learning_rate": 0.0001, "loss": 1.2696, "loss/crossentropy": 2.449897289276123, "loss/hidden": 1.1015625, "loss/logits": 0.16636572778224945, "loss/reg": 0.0001677814725553617, "step": 612 }, { "epoch": 0.076625, "grad_norm": 4.001865386962891, "grad_norm_var": 0.5703666999110495, "learning_rate": 0.0001, "loss": 1.3133, "loss/crossentropy": 2.8769969940185547, "loss/hidden": 1.140625, "loss/logits": 0.17095232009887695, "loss/reg": 0.00016772303206380457, "step": 613 }, { "epoch": 0.07675, "grad_norm": 3.1425790786743164, "grad_norm_var": 0.5440268427608231, "learning_rate": 0.0001, "loss": 1.3433, "loss/crossentropy": 1.9943925142288208, "loss/hidden": 1.1875, "loss/logits": 0.15410134196281433, "loss/reg": 0.00016765583131927997, "step": 614 }, { "epoch": 0.076875, "grad_norm": 2.2765913009643555, "grad_norm_var": 0.5242977612759657, "learning_rate": 0.0001, "loss": 1.123, "loss/crossentropy": 2.6995999813079834, "loss/hidden": 0.96875, "loss/logits": 0.15254710614681244, "loss/reg": 0.00016759400023147464, "step": 615 }, { "epoch": 0.077, "grad_norm": 3.8305952548980713, "grad_norm_var": 0.5740628343768284, "learning_rate": 0.0001, "loss": 1.4399, "loss/crossentropy": 2.586466073989868, "loss/hidden": 1.234375, "loss/logits": 0.20385247468948364, "loss/reg": 0.00016751470684539527, "step": 616 }, { "epoch": 0.077125, "grad_norm": 5.086572647094727, "grad_norm_var": 0.8800190695918066, "learning_rate": 0.0001, "loss": 1.9326, "loss/crossentropy": 2.484973669052124, "loss/hidden": 1.578125, "loss/logits": 0.35283201932907104, "loss/reg": 0.00016745817265473306, "step": 617 }, { "epoch": 0.07725, "grad_norm": 2.2095866203308105, "grad_norm_var": 0.8917492740551815, "learning_rate": 0.0001, "loss": 1.2173, "loss/crossentropy": 2.4899208545684814, "loss/hidden": 1.0546875, "loss/logits": 0.16092899441719055, "loss/reg": 0.0001674194645602256, "step": 618 }, { "epoch": 0.077375, "grad_norm": 3.9603521823883057, "grad_norm_var": 0.8967781256815556, "learning_rate": 0.0001, "loss": 1.4834, "loss/crossentropy": 2.676488161087036, "loss/hidden": 1.265625, "loss/logits": 0.21607547998428345, "loss/reg": 0.00016736592806410044, "step": 619 }, { "epoch": 0.0775, "grad_norm": 2.487009286880493, "grad_norm_var": 0.918233600543351, "learning_rate": 0.0001, "loss": 1.2733, "loss/crossentropy": 2.5772454738616943, "loss/hidden": 1.09375, "loss/logits": 0.17784787714481354, "loss/reg": 0.00016732487711124122, "step": 620 }, { "epoch": 0.077625, "grad_norm": 3.49418306350708, "grad_norm_var": 0.8735234218585182, "learning_rate": 0.0001, "loss": 1.5624, "loss/crossentropy": 2.367485284805298, "loss/hidden": 1.359375, "loss/logits": 0.20139794051647186, "loss/reg": 0.0001672921935096383, "step": 621 }, { "epoch": 0.07775, "grad_norm": 2.7570641040802, "grad_norm_var": 0.8150675806061363, "learning_rate": 0.0001, "loss": 1.3971, "loss/crossentropy": 2.7097396850585938, "loss/hidden": 1.1640625, "loss/logits": 0.23135977983474731, "loss/reg": 0.00016725942259654403, "step": 622 }, { "epoch": 0.077875, "grad_norm": 2.94008207321167, "grad_norm_var": 0.7621408752942889, "learning_rate": 0.0001, "loss": 1.3807, "loss/crossentropy": 2.535205841064453, "loss/hidden": 1.15625, "loss/logits": 0.22281892597675323, "loss/reg": 0.00016722203872632235, "step": 623 }, { "epoch": 0.078, "grad_norm": 4.7731475830078125, "grad_norm_var": 0.9118194021280108, "learning_rate": 0.0001, "loss": 1.3806, "loss/crossentropy": 2.535827159881592, "loss/hidden": 1.171875, "loss/logits": 0.20709550380706787, "loss/reg": 0.00016719059203751385, "step": 624 }, { "epoch": 0.078125, "grad_norm": 6.682031154632568, "grad_norm_var": 1.5228074202759292, "learning_rate": 0.0001, "loss": 1.2882, "loss/crossentropy": 2.547858953475952, "loss/hidden": 1.1171875, "loss/logits": 0.16936041414737701, "loss/reg": 0.00016715959645807743, "step": 625 }, { "epoch": 0.07825, "grad_norm": 2.701521635055542, "grad_norm_var": 1.525812527631956, "learning_rate": 0.0001, "loss": 1.1787, "loss/crossentropy": 2.4940483570098877, "loss/hidden": 1.015625, "loss/logits": 0.16136589646339417, "loss/reg": 0.00016710466297809035, "step": 626 }, { "epoch": 0.078375, "grad_norm": 2.402461528778076, "grad_norm_var": 1.5249520637018918, "learning_rate": 0.0001, "loss": 1.2423, "loss/crossentropy": 2.654344320297241, "loss/hidden": 1.0703125, "loss/logits": 0.17033588886260986, "loss/reg": 0.00016706604219507426, "step": 627 }, { "epoch": 0.0785, "grad_norm": 3.1508126258850098, "grad_norm_var": 1.4654158167347984, "learning_rate": 0.0001, "loss": 1.9202, "loss/crossentropy": 2.158658742904663, "loss/hidden": 1.546875, "loss/logits": 0.37169432640075684, "loss/reg": 0.00016701665299478918, "step": 628 }, { "epoch": 0.078625, "grad_norm": 2.4587316513061523, "grad_norm_var": 1.5096537619464108, "learning_rate": 0.0001, "loss": 1.103, "loss/crossentropy": 2.491288185119629, "loss/hidden": 0.96484375, "loss/logits": 0.13650521636009216, "loss/reg": 0.00016696384409442544, "step": 629 }, { "epoch": 0.07875, "grad_norm": 2.304826021194458, "grad_norm_var": 1.5819462969440072, "learning_rate": 0.0001, "loss": 1.1924, "loss/crossentropy": 2.7701144218444824, "loss/hidden": 1.03125, "loss/logits": 0.15948078036308289, "loss/reg": 0.00016690972552169114, "step": 630 }, { "epoch": 0.078875, "grad_norm": 3.3005666732788086, "grad_norm_var": 1.5016470644880922, "learning_rate": 0.0001, "loss": 2.0456, "loss/crossentropy": 2.482811212539673, "loss/hidden": 1.5, "loss/logits": 0.5439146757125854, "loss/reg": 0.00016687106108292937, "step": 631 }, { "epoch": 0.079, "grad_norm": 2.8882155418395996, "grad_norm_var": 1.504143333129146, "learning_rate": 0.0001, "loss": 1.3746, "loss/crossentropy": 2.2081522941589355, "loss/hidden": 1.1875, "loss/logits": 0.18541193008422852, "loss/reg": 0.00016681908164173365, "step": 632 }, { "epoch": 0.079125, "grad_norm": 10.211669921875, "grad_norm_var": 4.332608818057891, "learning_rate": 0.0001, "loss": 1.6656, "loss/crossentropy": 2.7348687648773193, "loss/hidden": 1.3828125, "loss/logits": 0.281095027923584, "loss/reg": 0.0001667785836616531, "step": 633 }, { "epoch": 0.07925, "grad_norm": 3.1219491958618164, "grad_norm_var": 4.206960096058094, "learning_rate": 0.0001, "loss": 1.2588, "loss/crossentropy": 2.1599411964416504, "loss/hidden": 1.0859375, "loss/logits": 0.1712060123682022, "loss/reg": 0.000166743848240003, "step": 634 }, { "epoch": 0.079375, "grad_norm": 2.623854160308838, "grad_norm_var": 4.277045211346406, "learning_rate": 0.0001, "loss": 1.3474, "loss/crossentropy": 2.3759686946868896, "loss/hidden": 1.15625, "loss/logits": 0.1894654780626297, "loss/reg": 0.00016670754121150821, "step": 635 }, { "epoch": 0.0795, "grad_norm": 5.576538562774658, "grad_norm_var": 4.397163327435782, "learning_rate": 0.0001, "loss": 1.6334, "loss/crossentropy": 3.225729465484619, "loss/hidden": 1.3984375, "loss/logits": 0.2333376556634903, "loss/reg": 0.00016665323346387595, "step": 636 }, { "epoch": 0.079625, "grad_norm": 2.462090253829956, "grad_norm_var": 4.510877787025105, "learning_rate": 0.0001, "loss": 1.368, "loss/crossentropy": 2.6833760738372803, "loss/hidden": 1.15625, "loss/logits": 0.2100624144077301, "loss/reg": 0.00016661611152812839, "step": 637 }, { "epoch": 0.07975, "grad_norm": 2.935678005218506, "grad_norm_var": 4.488695529031612, "learning_rate": 0.0001, "loss": 1.4207, "loss/crossentropy": 2.202238082885742, "loss/hidden": 1.2109375, "loss/logits": 0.20805108547210693, "loss/reg": 0.0001665620948188007, "step": 638 }, { "epoch": 0.079875, "grad_norm": 3.0358095169067383, "grad_norm_var": 4.478504618640515, "learning_rate": 0.0001, "loss": 1.4567, "loss/crossentropy": 2.8320090770721436, "loss/hidden": 1.2265625, "loss/logits": 0.22847674787044525, "loss/reg": 0.00016650540055707097, "step": 639 }, { "epoch": 0.08, "grad_norm": 2.1531565189361572, "grad_norm_var": 4.563861213288183, "learning_rate": 0.0001, "loss": 1.1727, "loss/crossentropy": 2.5067150592803955, "loss/hidden": 1.0078125, "loss/logits": 0.16324225068092346, "loss/reg": 0.00016644694551359862, "step": 640 }, { "epoch": 0.080125, "grad_norm": 6.979793071746826, "grad_norm_var": 4.690746995012082, "learning_rate": 0.0001, "loss": 1.3745, "loss/crossentropy": 2.7025413513183594, "loss/hidden": 1.1796875, "loss/logits": 0.19311562180519104, "loss/reg": 0.00016639451496303082, "step": 641 }, { "epoch": 0.08025, "grad_norm": 2.729844808578491, "grad_norm_var": 4.687237068174059, "learning_rate": 0.0001, "loss": 1.4307, "loss/crossentropy": 2.3578975200653076, "loss/hidden": 1.203125, "loss/logits": 0.2259325385093689, "loss/reg": 0.00016634297207929194, "step": 642 }, { "epoch": 0.080375, "grad_norm": 2.343785285949707, "grad_norm_var": 4.6971810706071775, "learning_rate": 0.0001, "loss": 1.2463, "loss/crossentropy": 2.407282590866089, "loss/hidden": 1.0625, "loss/logits": 0.18211227655410767, "loss/reg": 0.00016628378944005817, "step": 643 }, { "epoch": 0.0805, "grad_norm": 3.551823854446411, "grad_norm_var": 4.680951024735604, "learning_rate": 0.0001, "loss": 1.4017, "loss/crossentropy": 2.5645558834075928, "loss/hidden": 1.1953125, "loss/logits": 0.20475167036056519, "loss/reg": 0.00016621559916529804, "step": 644 }, { "epoch": 0.080625, "grad_norm": 3.353755235671997, "grad_norm_var": 4.586780001463231, "learning_rate": 0.0001, "loss": 1.5083, "loss/crossentropy": 2.4407286643981934, "loss/hidden": 1.265625, "loss/logits": 0.24099557101726532, "loss/reg": 0.00016614363994449377, "step": 645 }, { "epoch": 0.08075, "grad_norm": 2.6528546810150146, "grad_norm_var": 4.528526020816873, "learning_rate": 0.0001, "loss": 1.3029, "loss/crossentropy": 2.426625967025757, "loss/hidden": 1.1171875, "loss/logits": 0.18408827483654022, "loss/reg": 0.0001660897978581488, "step": 646 }, { "epoch": 0.080875, "grad_norm": 2.7911300659179688, "grad_norm_var": 4.574940336167226, "learning_rate": 0.0001, "loss": 1.5606, "loss/crossentropy": 1.9501982927322388, "loss/hidden": 1.328125, "loss/logits": 0.2308322787284851, "loss/reg": 0.0001660331035964191, "step": 647 }, { "epoch": 0.081, "grad_norm": 2.5895497798919678, "grad_norm_var": 4.6133698917856085, "learning_rate": 0.0001, "loss": 1.3276, "loss/crossentropy": 2.4812192916870117, "loss/hidden": 1.140625, "loss/logits": 0.18531008064746857, "loss/reg": 0.00016596495697740465, "step": 648 }, { "epoch": 0.081125, "grad_norm": 2.6669650077819824, "grad_norm_var": 1.6150947924648464, "learning_rate": 0.0001, "loss": 1.2605, "loss/crossentropy": 2.67368221282959, "loss/hidden": 1.1015625, "loss/logits": 0.15727311372756958, "loss/reg": 0.00016591344319749624, "step": 649 }, { "epoch": 0.08125, "grad_norm": 2.3734240531921387, "grad_norm_var": 1.6602017249779197, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.469904661178589, "loss/hidden": 1.1875, "loss/logits": 0.21549656987190247, "loss/reg": 0.00016584685363341123, "step": 650 }, { "epoch": 0.081375, "grad_norm": 5.391051769256592, "grad_norm_var": 1.9349751260079697, "learning_rate": 0.0001, "loss": 1.6563, "loss/crossentropy": 2.576478958129883, "loss/hidden": 1.3828125, "loss/logits": 0.27184876799583435, "loss/reg": 0.00016577586939092726, "step": 651 }, { "epoch": 0.0815, "grad_norm": 3.1808249950408936, "grad_norm_var": 1.5822159903212916, "learning_rate": 0.0001, "loss": 1.2252, "loss/crossentropy": 2.767976760864258, "loss/hidden": 1.046875, "loss/logits": 0.17664310336112976, "loss/reg": 0.00016570698062423617, "step": 652 }, { "epoch": 0.081625, "grad_norm": 2.647900342941284, "grad_norm_var": 1.5661054547944748, "learning_rate": 0.0001, "loss": 1.2317, "loss/crossentropy": 2.7076566219329834, "loss/hidden": 1.0546875, "loss/logits": 0.17536479234695435, "loss/reg": 0.0001656520616961643, "step": 653 }, { "epoch": 0.08175, "grad_norm": 2.4461452960968018, "grad_norm_var": 1.599059141447918, "learning_rate": 0.0001, "loss": 1.235, "loss/crossentropy": 2.608097553253174, "loss/hidden": 1.046875, "loss/logits": 0.1864546835422516, "loss/reg": 0.000165599471074529, "step": 654 }, { "epoch": 0.081875, "grad_norm": 2.379359483718872, "grad_norm_var": 1.6386553172038514, "learning_rate": 0.0001, "loss": 1.6032, "loss/crossentropy": 2.1569581031799316, "loss/hidden": 1.34375, "loss/logits": 0.2578085660934448, "loss/reg": 0.00016554714238736778, "step": 655 }, { "epoch": 0.082, "grad_norm": 2.3222146034240723, "grad_norm_var": 1.618209257330801, "learning_rate": 0.0001, "loss": 1.2162, "loss/crossentropy": 2.790790557861328, "loss/hidden": 1.03125, "loss/logits": 0.18334238231182098, "loss/reg": 0.00016549928113818169, "step": 656 }, { "epoch": 0.082125, "grad_norm": 2.352463960647583, "grad_norm_var": 0.5935913991944024, "learning_rate": 0.0001, "loss": 1.3045, "loss/crossentropy": 2.673133373260498, "loss/hidden": 1.1015625, "loss/logits": 0.2013138085603714, "loss/reg": 0.0001654458319535479, "step": 657 }, { "epoch": 0.08225, "grad_norm": 2.3700876235961914, "grad_norm_var": 0.6079629647508907, "learning_rate": 0.0001, "loss": 1.3166, "loss/crossentropy": 2.2403147220611572, "loss/hidden": 1.1328125, "loss/logits": 0.1820981204509735, "loss/reg": 0.0001653966901358217, "step": 658 }, { "epoch": 0.082375, "grad_norm": 4.848836898803711, "grad_norm_var": 0.8349856810546211, "learning_rate": 0.0001, "loss": 1.7256, "loss/crossentropy": 2.5139729976654053, "loss/hidden": 1.4140625, "loss/logits": 0.3099275827407837, "loss/reg": 0.0001653533399803564, "step": 659 }, { "epoch": 0.0825, "grad_norm": 2.486321449279785, "grad_norm_var": 0.8268210381853313, "learning_rate": 0.0001, "loss": 1.2977, "loss/crossentropy": 2.6133224964141846, "loss/hidden": 1.078125, "loss/logits": 0.21789303421974182, "loss/reg": 0.00016529561253264546, "step": 660 }, { "epoch": 0.082625, "grad_norm": 2.156172513961792, "grad_norm_var": 0.8485239505093195, "learning_rate": 0.0001, "loss": 1.135, "loss/crossentropy": 2.627702474594116, "loss/hidden": 0.9765625, "loss/logits": 0.1567949652671814, "loss/reg": 0.0001652387873036787, "step": 661 }, { "epoch": 0.08275, "grad_norm": 2.5212655067443848, "grad_norm_var": 0.8531257845111268, "learning_rate": 0.0001, "loss": 1.2322, "loss/crossentropy": 2.584390163421631, "loss/hidden": 1.0703125, "loss/logits": 0.16021685302257538, "loss/reg": 0.00016518463962711394, "step": 662 }, { "epoch": 0.082875, "grad_norm": 2.487727165222168, "grad_norm_var": 0.8610677449324041, "learning_rate": 0.0001, "loss": 1.2693, "loss/crossentropy": 2.5323688983917236, "loss/hidden": 1.0859375, "loss/logits": 0.18166188895702362, "loss/reg": 0.0001651348575251177, "step": 663 }, { "epoch": 0.083, "grad_norm": 2.38796067237854, "grad_norm_var": 0.8699703101256145, "learning_rate": 0.0001, "loss": 1.3151, "loss/crossentropy": 2.759453773498535, "loss/hidden": 1.125, "loss/logits": 0.1884392499923706, "loss/reg": 0.00016509178385604173, "step": 664 }, { "epoch": 0.083125, "grad_norm": 3.4156320095062256, "grad_norm_var": 0.8903572693311579, "learning_rate": 0.0001, "loss": 1.3921, "loss/crossentropy": 2.660811424255371, "loss/hidden": 1.203125, "loss/logits": 0.18728655576705933, "loss/reg": 0.0001650632475502789, "step": 665 }, { "epoch": 0.08325, "grad_norm": 3.6222026348114014, "grad_norm_var": 0.9067291298107532, "learning_rate": 0.0001, "loss": 1.5778, "loss/crossentropy": 2.5574145317077637, "loss/hidden": 1.3359375, "loss/logits": 0.24019111692905426, "loss/reg": 0.00016504104132764041, "step": 666 }, { "epoch": 0.083375, "grad_norm": 4.253505706787109, "grad_norm_var": 0.6156209880377387, "learning_rate": 0.0001, "loss": 1.4672, "loss/crossentropy": 2.490506172180176, "loss/hidden": 1.2421875, "loss/logits": 0.22340671718120575, "loss/reg": 0.00016498679178766906, "step": 667 }, { "epoch": 0.0835, "grad_norm": 2.789968729019165, "grad_norm_var": 0.6088358631127597, "learning_rate": 0.0001, "loss": 1.5776, "loss/crossentropy": 2.3684263229370117, "loss/hidden": 1.328125, "loss/logits": 0.24786591529846191, "loss/reg": 0.000164957411470823, "step": 668 }, { "epoch": 0.083625, "grad_norm": 2.1529343128204346, "grad_norm_var": 0.6370225465252605, "learning_rate": 0.0001, "loss": 1.1455, "loss/crossentropy": 2.5240535736083984, "loss/hidden": 0.98046875, "loss/logits": 0.16337648034095764, "loss/reg": 0.00016490306006744504, "step": 669 }, { "epoch": 0.08375, "grad_norm": 2.3900113105773926, "grad_norm_var": 0.6399581101222035, "learning_rate": 0.0001, "loss": 1.2392, "loss/crossentropy": 2.6252262592315674, "loss/hidden": 1.0625, "loss/logits": 0.1750047653913498, "loss/reg": 0.00016484924708493054, "step": 670 }, { "epoch": 0.083875, "grad_norm": 2.620534658432007, "grad_norm_var": 0.629792377475321, "learning_rate": 0.0001, "loss": 1.582, "loss/crossentropy": 2.178114175796509, "loss/hidden": 1.3203125, "loss/logits": 0.2600269317626953, "loss/reg": 0.0001647974131628871, "step": 671 }, { "epoch": 0.084, "grad_norm": 2.413081169128418, "grad_norm_var": 0.6242336858471352, "learning_rate": 0.0001, "loss": 1.0521, "loss/crossentropy": 2.8187384605407715, "loss/hidden": 0.9140625, "loss/logits": 0.13643045723438263, "loss/reg": 0.00016475330630782992, "step": 672 }, { "epoch": 0.084125, "grad_norm": 2.781545400619507, "grad_norm_var": 0.6084607516908203, "learning_rate": 0.0001, "loss": 1.2818, "loss/crossentropy": 2.488860607147217, "loss/hidden": 1.0859375, "loss/logits": 0.1942487508058548, "loss/reg": 0.000164700104505755, "step": 673 }, { "epoch": 0.08425, "grad_norm": 2.2359659671783447, "grad_norm_var": 0.6182765548801226, "learning_rate": 0.0001, "loss": 1.3119, "loss/crossentropy": 2.148397207260132, "loss/hidden": 1.109375, "loss/logits": 0.2008332908153534, "loss/reg": 0.00016463726933579892, "step": 674 }, { "epoch": 0.084375, "grad_norm": 2.1891558170318604, "grad_norm_var": 0.35075439144963716, "learning_rate": 0.0001, "loss": 1.1196, "loss/crossentropy": 2.57894229888916, "loss/hidden": 0.96484375, "loss/logits": 0.1531408429145813, "loss/reg": 0.0001645681622903794, "step": 675 }, { "epoch": 0.0845, "grad_norm": 3.4695358276367188, "grad_norm_var": 0.38558694028322404, "learning_rate": 0.0001, "loss": 1.5266, "loss/crossentropy": 2.669329881668091, "loss/hidden": 1.3046875, "loss/logits": 0.22030791640281677, "loss/reg": 0.00016448195674456656, "step": 676 }, { "epoch": 0.084625, "grad_norm": 4.914409160614014, "grad_norm_var": 0.6452826360757415, "learning_rate": 0.0001, "loss": 1.6439, "loss/crossentropy": 2.462794542312622, "loss/hidden": 1.3671875, "loss/logits": 0.27508145570755005, "loss/reg": 0.00016439952014479786, "step": 677 }, { "epoch": 0.08475, "grad_norm": 30.328147888183594, "grad_norm_var": 47.51063837654985, "learning_rate": 0.0001, "loss": 1.5824, "loss/crossentropy": 2.5709619522094727, "loss/hidden": 1.34375, "loss/logits": 0.23705410957336426, "loss/reg": 0.00016432724078185856, "step": 678 }, { "epoch": 0.084875, "grad_norm": 2.876424789428711, "grad_norm_var": 47.40784906616088, "learning_rate": 0.0001, "loss": 1.4631, "loss/crossentropy": 2.68898606300354, "loss/hidden": 1.25, "loss/logits": 0.21146059036254883, "loss/reg": 0.00016424572095274925, "step": 679 }, { "epoch": 0.085, "grad_norm": 2.843522071838379, "grad_norm_var": 47.281746121966535, "learning_rate": 0.0001, "loss": 1.1706, "loss/crossentropy": 2.6871814727783203, "loss/hidden": 1.015625, "loss/logits": 0.15333782136440277, "loss/reg": 0.0001641718263272196, "step": 680 }, { "epoch": 0.085125, "grad_norm": 2.263585329055786, "grad_norm_var": 47.56291094253462, "learning_rate": 0.0001, "loss": 1.3514, "loss/crossentropy": 2.4708120822906494, "loss/hidden": 1.1484375, "loss/logits": 0.2013145238161087, "loss/reg": 0.0001640972914174199, "step": 681 }, { "epoch": 0.08525, "grad_norm": 2.5409631729125977, "grad_norm_var": 47.78184918017327, "learning_rate": 0.0001, "loss": 1.4749, "loss/crossentropy": 2.671372175216675, "loss/hidden": 1.234375, "loss/logits": 0.2388349324464798, "loss/reg": 0.0001640205446165055, "step": 682 }, { "epoch": 0.085375, "grad_norm": 2.637148857116699, "grad_norm_var": 48.01258245528209, "learning_rate": 0.0001, "loss": 1.2865, "loss/crossentropy": 2.707645893096924, "loss/hidden": 1.109375, "loss/logits": 0.175527885556221, "loss/reg": 0.00016393957776017487, "step": 683 }, { "epoch": 0.0855, "grad_norm": 2.2043187618255615, "grad_norm_var": 48.16485051728318, "learning_rate": 0.0001, "loss": 1.2791, "loss/crossentropy": 2.6515603065490723, "loss/hidden": 1.078125, "loss/logits": 0.1993018090724945, "loss/reg": 0.00016388599760830402, "step": 684 }, { "epoch": 0.085625, "grad_norm": 3.103644609451294, "grad_norm_var": 47.932845449085484, "learning_rate": 0.0001, "loss": 1.3835, "loss/crossentropy": 2.4236719608306885, "loss/hidden": 1.1875, "loss/logits": 0.19437812268733978, "loss/reg": 0.0001638105750316754, "step": 685 }, { "epoch": 0.08575, "grad_norm": 3.1297998428344727, "grad_norm_var": 47.760083867177244, "learning_rate": 0.0001, "loss": 1.2937, "loss/crossentropy": 3.008211612701416, "loss/hidden": 1.109375, "loss/logits": 0.18267163634300232, "loss/reg": 0.00016373269318137318, "step": 686 }, { "epoch": 0.085875, "grad_norm": 2.5769779682159424, "grad_norm_var": 47.771317828670746, "learning_rate": 0.0001, "loss": 1.2788, "loss/crossentropy": 2.582740545272827, "loss/hidden": 1.1015625, "loss/logits": 0.1755845546722412, "loss/reg": 0.00016368045180570334, "step": 687 }, { "epoch": 0.086, "grad_norm": 2.388911247253418, "grad_norm_var": 47.77818212723798, "learning_rate": 0.0001, "loss": 1.3653, "loss/crossentropy": 2.301215171813965, "loss/hidden": 1.1640625, "loss/logits": 0.19958043098449707, "loss/reg": 0.00016360357403755188, "step": 688 }, { "epoch": 0.086125, "grad_norm": 2.3237059116363525, "grad_norm_var": 47.898033541986095, "learning_rate": 0.0001, "loss": 1.3205, "loss/crossentropy": 2.423755407333374, "loss/hidden": 1.1328125, "loss/logits": 0.1860751211643219, "loss/reg": 0.00016352770035155118, "step": 689 }, { "epoch": 0.08625, "grad_norm": 3.021902322769165, "grad_norm_var": 47.69921627605013, "learning_rate": 0.0001, "loss": 1.4144, "loss/crossentropy": 3.0128273963928223, "loss/hidden": 1.1875, "loss/logits": 0.22526130080223083, "loss/reg": 0.00016344760661013424, "step": 690 }, { "epoch": 0.086375, "grad_norm": 3.2941322326660156, "grad_norm_var": 47.42759155009845, "learning_rate": 0.0001, "loss": 1.393, "loss/crossentropy": 2.560335636138916, "loss/hidden": 1.1640625, "loss/logits": 0.22734451293945312, "loss/reg": 0.00016339511785190552, "step": 691 }, { "epoch": 0.0865, "grad_norm": 2.4065542221069336, "grad_norm_var": 47.66124304582527, "learning_rate": 0.0001, "loss": 1.4288, "loss/crossentropy": 2.377492666244507, "loss/hidden": 1.2109375, "loss/logits": 0.21625959873199463, "loss/reg": 0.0001633405772736296, "step": 692 }, { "epoch": 0.086625, "grad_norm": 2.5693113803863525, "grad_norm_var": 47.8920756161344, "learning_rate": 0.0001, "loss": 1.2561, "loss/crossentropy": 2.4969563484191895, "loss/hidden": 1.0859375, "loss/logits": 0.1685582548379898, "loss/reg": 0.000163278091349639, "step": 693 }, { "epoch": 0.08675, "grad_norm": 3.0437076091766357, "grad_norm_var": 0.11974605968780073, "learning_rate": 0.0001, "loss": 1.2887, "loss/crossentropy": 2.7765395641326904, "loss/hidden": 1.1171875, "loss/logits": 0.1699219048023224, "loss/reg": 0.00016320630675181746, "step": 694 }, { "epoch": 0.086875, "grad_norm": 3.106893301010132, "grad_norm_var": 0.12843990838297803, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.4358479976654053, "loss/hidden": 1.40625, "loss/logits": 0.28088831901550293, "loss/reg": 0.00016313180094584823, "step": 695 }, { "epoch": 0.087, "grad_norm": 2.2542333602905273, "grad_norm_var": 0.14011956658400565, "learning_rate": 0.0001, "loss": 1.255, "loss/crossentropy": 2.747295618057251, "loss/hidden": 1.0703125, "loss/logits": 0.1830083131790161, "loss/reg": 0.0001630594051675871, "step": 696 }, { "epoch": 0.087125, "grad_norm": 2.3912386894226074, "grad_norm_var": 0.13406557973471156, "learning_rate": 0.0001, "loss": 1.1938, "loss/crossentropy": 2.653669834136963, "loss/hidden": 1.03125, "loss/logits": 0.16088822484016418, "loss/reg": 0.00016298312402796, "step": 697 }, { "epoch": 0.08725, "grad_norm": 2.4358601570129395, "grad_norm_var": 0.1368037807350286, "learning_rate": 0.0001, "loss": 1.1978, "loss/crossentropy": 2.8287577629089355, "loss/hidden": 1.015625, "loss/logits": 0.1805681735277176, "loss/reg": 0.00016289895575027913, "step": 698 }, { "epoch": 0.087375, "grad_norm": 2.815246820449829, "grad_norm_var": 0.13775627233840904, "learning_rate": 0.0001, "loss": 1.4834, "loss/crossentropy": 2.7523555755615234, "loss/hidden": 1.2265625, "loss/logits": 0.2551818788051605, "loss/reg": 0.00016284023877233267, "step": 699 }, { "epoch": 0.0875, "grad_norm": 2.2527101039886475, "grad_norm_var": 0.13475826610412747, "learning_rate": 0.0001, "loss": 1.1716, "loss/crossentropy": 2.627030849456787, "loss/hidden": 1.0, "loss/logits": 0.16992975771427155, "loss/reg": 0.00016276550013571978, "step": 700 }, { "epoch": 0.087625, "grad_norm": 2.4228506088256836, "grad_norm_var": 0.12660275696210402, "learning_rate": 0.0001, "loss": 1.2538, "loss/crossentropy": 2.767721176147461, "loss/hidden": 1.0703125, "loss/logits": 0.18186557292938232, "loss/reg": 0.00016268961189780384, "step": 701 }, { "epoch": 0.08775, "grad_norm": 2.2339673042297363, "grad_norm_var": 0.1197047145201509, "learning_rate": 0.0001, "loss": 1.1975, "loss/crossentropy": 2.5431220531463623, "loss/hidden": 1.046875, "loss/logits": 0.14897748827934265, "loss/reg": 0.00016263082216028124, "step": 702 }, { "epoch": 0.087875, "grad_norm": 4.207867622375488, "grad_norm_var": 0.2817759593744974, "learning_rate": 0.0001, "loss": 1.4572, "loss/crossentropy": 2.8638172149658203, "loss/hidden": 1.140625, "loss/logits": 0.31495168805122375, "loss/reg": 0.00016255886293947697, "step": 703 }, { "epoch": 0.088, "grad_norm": 2.3787028789520264, "grad_norm_var": 0.282203271097409, "learning_rate": 0.0001, "loss": 1.2463, "loss/crossentropy": 2.7839977741241455, "loss/hidden": 1.0703125, "loss/logits": 0.17434002459049225, "loss/reg": 0.00016250263433903456, "step": 704 }, { "epoch": 0.088125, "grad_norm": 2.548332452774048, "grad_norm_var": 0.2741637170718292, "learning_rate": 0.0001, "loss": 1.3931, "loss/crossentropy": 2.3706233501434326, "loss/hidden": 1.1796875, "loss/logits": 0.21178531646728516, "loss/reg": 0.00016242072160821408, "step": 705 }, { "epoch": 0.08825, "grad_norm": 4.417368412017822, "grad_norm_var": 0.45363137527352604, "learning_rate": 0.0001, "loss": 1.5718, "loss/crossentropy": 2.6671862602233887, "loss/hidden": 1.328125, "loss/logits": 0.24208636581897736, "loss/reg": 0.00016233540372923017, "step": 706 }, { "epoch": 0.088375, "grad_norm": 2.3606135845184326, "grad_norm_var": 0.446429677936672, "learning_rate": 0.0001, "loss": 1.3522, "loss/crossentropy": 2.570408821105957, "loss/hidden": 1.140625, "loss/logits": 0.2099515199661255, "loss/reg": 0.0001622470299480483, "step": 707 }, { "epoch": 0.0885, "grad_norm": 3.6912710666656494, "grad_norm_var": 0.4924095372859841, "learning_rate": 0.0001, "loss": 1.2406, "loss/crossentropy": 2.812623977661133, "loss/hidden": 1.0546875, "loss/logits": 0.18432584404945374, "loss/reg": 0.00016217907250393182, "step": 708 }, { "epoch": 0.088625, "grad_norm": 2.1196298599243164, "grad_norm_var": 0.5201166816670356, "learning_rate": 0.0001, "loss": 1.109, "loss/crossentropy": 2.6465704441070557, "loss/hidden": 0.9609375, "loss/logits": 0.14639706909656525, "loss/reg": 0.00016210008470807225, "step": 709 }, { "epoch": 0.08875, "grad_norm": 2.818333625793457, "grad_norm_var": 0.5157434440596752, "learning_rate": 0.0001, "loss": 1.341, "loss/crossentropy": 2.5667293071746826, "loss/hidden": 1.140625, "loss/logits": 0.19876635074615479, "loss/reg": 0.00016204149869736284, "step": 710 }, { "epoch": 0.088875, "grad_norm": 2.93424391746521, "grad_norm_var": 0.51004557905098, "learning_rate": 0.0001, "loss": 1.3484, "loss/crossentropy": 2.8390839099884033, "loss/hidden": 1.15625, "loss/logits": 0.1905781477689743, "loss/reg": 0.00016198218509089202, "step": 711 }, { "epoch": 0.089, "grad_norm": 2.8454763889312744, "grad_norm_var": 0.4914193839330096, "learning_rate": 0.0001, "loss": 1.469, "loss/crossentropy": 2.333479166030884, "loss/hidden": 1.2265625, "loss/logits": 0.24086281657218933, "loss/reg": 0.00016192808107007295, "step": 712 }, { "epoch": 0.089125, "grad_norm": 2.444822072982788, "grad_norm_var": 0.4886455422549922, "learning_rate": 0.0001, "loss": 1.3439, "loss/crossentropy": 2.4458630084991455, "loss/hidden": 1.15625, "loss/logits": 0.18607404828071594, "loss/reg": 0.00016187904111575335, "step": 713 }, { "epoch": 0.08925, "grad_norm": 3.3376550674438477, "grad_norm_var": 0.4947321127919165, "learning_rate": 0.0001, "loss": 1.2053, "loss/crossentropy": 2.9290921688079834, "loss/hidden": 1.0234375, "loss/logits": 0.18021196126937866, "loss/reg": 0.00016181879618670791, "step": 714 }, { "epoch": 0.089375, "grad_norm": 2.488980293273926, "grad_norm_var": 0.5035199429563126, "learning_rate": 0.0001, "loss": 1.3415, "loss/crossentropy": 2.546008825302124, "loss/hidden": 1.140625, "loss/logits": 0.19930589199066162, "loss/reg": 0.00016176272765733302, "step": 715 }, { "epoch": 0.0895, "grad_norm": 2.4051930904388428, "grad_norm_var": 0.49295307378371545, "learning_rate": 0.0001, "loss": 1.3515, "loss/crossentropy": 2.3664801120758057, "loss/hidden": 1.140625, "loss/logits": 0.20927459001541138, "loss/reg": 0.00016171703464351594, "step": 716 }, { "epoch": 0.089625, "grad_norm": 2.0203142166137695, "grad_norm_var": 0.5261915819729202, "learning_rate": 0.0001, "loss": 1.0943, "loss/crossentropy": 2.7026448249816895, "loss/hidden": 0.93359375, "loss/logits": 0.15908381342887878, "loss/reg": 0.00016167921421583742, "step": 717 }, { "epoch": 0.08975, "grad_norm": 2.502314805984497, "grad_norm_var": 0.5094272678862407, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.2777092456817627, "loss/hidden": 1.21875, "loss/logits": 0.1900528073310852, "loss/reg": 0.00016162224346771836, "step": 718 }, { "epoch": 0.089875, "grad_norm": 4.398810386657715, "grad_norm_var": 0.5464014778025195, "learning_rate": 0.0001, "loss": 1.3635, "loss/crossentropy": 2.718095064163208, "loss/hidden": 1.15625, "loss/logits": 0.20566391944885254, "loss/reg": 0.00016156445781234652, "step": 719 }, { "epoch": 0.09, "grad_norm": 2.0090291500091553, "grad_norm_var": 0.5785180198259352, "learning_rate": 0.0001, "loss": 1.3268, "loss/crossentropy": 2.381427049636841, "loss/hidden": 1.1484375, "loss/logits": 0.17673441767692566, "loss/reg": 0.000161523770657368, "step": 720 }, { "epoch": 0.090125, "grad_norm": 2.4056437015533447, "grad_norm_var": 0.5852234812324276, "learning_rate": 0.0001, "loss": 1.4883, "loss/crossentropy": 2.268463134765625, "loss/hidden": 1.2578125, "loss/logits": 0.22889988124370575, "loss/reg": 0.00016146755660884082, "step": 721 }, { "epoch": 0.09025, "grad_norm": 2.2853894233703613, "grad_norm_var": 0.4166487458483059, "learning_rate": 0.0001, "loss": 1.2535, "loss/crossentropy": 2.6456010341644287, "loss/hidden": 1.0703125, "loss/logits": 0.18154165148735046, "loss/reg": 0.00016142483218573034, "step": 722 }, { "epoch": 0.090375, "grad_norm": 2.0186755657196045, "grad_norm_var": 0.4390526343750705, "learning_rate": 0.0001, "loss": 1.2649, "loss/crossentropy": 2.639620304107666, "loss/hidden": 1.1015625, "loss/logits": 0.16174210608005524, "loss/reg": 0.00016137036436703056, "step": 723 }, { "epoch": 0.0905, "grad_norm": 2.2525289058685303, "grad_norm_var": 0.37258288768871706, "learning_rate": 0.0001, "loss": 1.1584, "loss/crossentropy": 2.6910252571105957, "loss/hidden": 1.0, "loss/logits": 0.1568310707807541, "loss/reg": 0.00016133110329974443, "step": 724 }, { "epoch": 0.090625, "grad_norm": 2.51094388961792, "grad_norm_var": 0.3581104399692471, "learning_rate": 0.0001, "loss": 1.1866, "loss/crossentropy": 2.4803168773651123, "loss/hidden": 1.0234375, "loss/logits": 0.16152504086494446, "loss/reg": 0.00016127487469930202, "step": 725 }, { "epoch": 0.09075, "grad_norm": 3.1180760860443115, "grad_norm_var": 0.37225591603486236, "learning_rate": 0.0001, "loss": 1.4877, "loss/crossentropy": 2.847247362136841, "loss/hidden": 1.28125, "loss/logits": 0.20486503839492798, "loss/reg": 0.00016123637033160776, "step": 726 }, { "epoch": 0.090875, "grad_norm": 3.4448623657226562, "grad_norm_var": 0.4096989033794536, "learning_rate": 0.0001, "loss": 1.2583, "loss/crossentropy": 2.724292278289795, "loss/hidden": 1.078125, "loss/logits": 0.17855608463287354, "loss/reg": 0.00016118075291160494, "step": 727 }, { "epoch": 0.091, "grad_norm": 2.411865711212158, "grad_norm_var": 0.41046918843090435, "learning_rate": 0.0001, "loss": 1.3846, "loss/crossentropy": 2.656142473220825, "loss/hidden": 1.171875, "loss/logits": 0.21108870208263397, "loss/reg": 0.0001611240004422143, "step": 728 }, { "epoch": 0.091125, "grad_norm": 6.291276454925537, "grad_norm_var": 1.2409974232011693, "learning_rate": 0.0001, "loss": 1.6045, "loss/crossentropy": 2.660390615463257, "loss/hidden": 1.21875, "loss/logits": 0.38411080837249756, "loss/reg": 0.00016106815019156784, "step": 729 }, { "epoch": 0.09125, "grad_norm": 4.307939052581787, "grad_norm_var": 1.3604883152401572, "learning_rate": 0.0001, "loss": 1.309, "loss/crossentropy": 2.7089574337005615, "loss/hidden": 1.1328125, "loss/logits": 0.17455777525901794, "loss/reg": 0.00016100869106594473, "step": 730 }, { "epoch": 0.091375, "grad_norm": 2.078972101211548, "grad_norm_var": 1.3950766741367981, "learning_rate": 0.0001, "loss": 1.1573, "loss/crossentropy": 2.287379026412964, "loss/hidden": 1.0, "loss/logits": 0.15567877888679504, "loss/reg": 0.00016094985767267644, "step": 731 }, { "epoch": 0.0915, "grad_norm": 2.298635244369507, "grad_norm_var": 1.4028713178017937, "learning_rate": 0.0001, "loss": 1.334, "loss/crossentropy": 2.508922576904297, "loss/hidden": 1.125, "loss/logits": 0.2074166238307953, "loss/reg": 0.0001608804304851219, "step": 732 }, { "epoch": 0.091625, "grad_norm": 3.388340473175049, "grad_norm_var": 1.3598918924423051, "learning_rate": 0.0001, "loss": 1.1525, "loss/crossentropy": 2.957367420196533, "loss/hidden": 1.0, "loss/logits": 0.1509062647819519, "loss/reg": 0.00016081798821687698, "step": 733 }, { "epoch": 0.09175, "grad_norm": 2.255815267562866, "grad_norm_var": 1.3794783615555655, "learning_rate": 0.0001, "loss": 1.407, "loss/crossentropy": 2.586848735809326, "loss/hidden": 1.1640625, "loss/logits": 0.24134507775306702, "loss/reg": 0.0001607486919965595, "step": 734 }, { "epoch": 0.091875, "grad_norm": 1.9890577793121338, "grad_norm_var": 1.2824653793507383, "learning_rate": 0.0001, "loss": 1.1376, "loss/crossentropy": 2.4694676399230957, "loss/hidden": 0.9765625, "loss/logits": 0.15938282012939453, "loss/reg": 0.0001606686710147187, "step": 735 }, { "epoch": 0.092, "grad_norm": 2.2284231185913086, "grad_norm_var": 1.2618475934622733, "learning_rate": 0.0001, "loss": 1.2971, "loss/crossentropy": 2.340135097503662, "loss/hidden": 1.1171875, "loss/logits": 0.17832466959953308, "loss/reg": 0.00016059167683124542, "step": 736 }, { "epoch": 0.092125, "grad_norm": 2.066023826599121, "grad_norm_var": 1.288290665730081, "learning_rate": 0.0001, "loss": 1.3057, "loss/crossentropy": 2.612424850463867, "loss/hidden": 1.1171875, "loss/logits": 0.18688137829303741, "loss/reg": 0.00016051169950515032, "step": 737 }, { "epoch": 0.09225, "grad_norm": 2.7061967849731445, "grad_norm_var": 1.269969627480242, "learning_rate": 0.0001, "loss": 1.2155, "loss/crossentropy": 2.7299489974975586, "loss/hidden": 1.046875, "loss/logits": 0.16700507700443268, "loss/reg": 0.00016043275536503643, "step": 738 }, { "epoch": 0.092375, "grad_norm": 2.9510438442230225, "grad_norm_var": 1.2227602359060785, "learning_rate": 0.0001, "loss": 1.3198, "loss/crossentropy": 2.6093971729278564, "loss/hidden": 1.140625, "loss/logits": 0.17752662301063538, "loss/reg": 0.0001603730779606849, "step": 739 }, { "epoch": 0.0925, "grad_norm": 2.4394514560699463, "grad_norm_var": 1.2089628293596242, "learning_rate": 0.0001, "loss": 1.3005, "loss/crossentropy": 2.4582626819610596, "loss/hidden": 1.1171875, "loss/logits": 0.1816726177930832, "loss/reg": 0.00016031460836529732, "step": 740 }, { "epoch": 0.092625, "grad_norm": 2.9025044441223145, "grad_norm_var": 1.1979498067853738, "learning_rate": 0.0001, "loss": 1.3853, "loss/crossentropy": 2.7229855060577393, "loss/hidden": 1.1796875, "loss/logits": 0.2040165662765503, "loss/reg": 0.0001602490374352783, "step": 741 }, { "epoch": 0.09275, "grad_norm": 2.2374203205108643, "grad_norm_var": 1.2243268037269728, "learning_rate": 0.0001, "loss": 1.2548, "loss/crossentropy": 2.5456337928771973, "loss/hidden": 1.0703125, "loss/logits": 0.18293476104736328, "loss/reg": 0.0001601918920641765, "step": 742 }, { "epoch": 0.092875, "grad_norm": 2.66164231300354, "grad_norm_var": 1.2031418812806314, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.324861526489258, "loss/hidden": 1.1484375, "loss/logits": 0.20170405507087708, "loss/reg": 0.00016013518325053155, "step": 743 }, { "epoch": 0.093, "grad_norm": 2.270768404006958, "grad_norm_var": 1.212175620638671, "learning_rate": 0.0001, "loss": 1.1252, "loss/crossentropy": 2.450507640838623, "loss/hidden": 0.9609375, "loss/logits": 0.16261890530586243, "loss/reg": 0.00016008685634005815, "step": 744 }, { "epoch": 0.093125, "grad_norm": 2.200423240661621, "grad_norm_var": 0.3631356282941565, "learning_rate": 0.0001, "loss": 1.2617, "loss/crossentropy": 2.6792309284210205, "loss/hidden": 1.0703125, "loss/logits": 0.18975603580474854, "loss/reg": 0.00016004228382371366, "step": 745 }, { "epoch": 0.09325, "grad_norm": 3.2866992950439453, "grad_norm_var": 0.19050297049615186, "learning_rate": 0.0001, "loss": 1.5215, "loss/crossentropy": 2.6857693195343018, "loss/hidden": 1.2890625, "loss/logits": 0.2308271825313568, "loss/reg": 0.0001599832612555474, "step": 746 }, { "epoch": 0.093375, "grad_norm": 2.3304667472839355, "grad_norm_var": 0.18041875939156474, "learning_rate": 0.0001, "loss": 1.2801, "loss/crossentropy": 2.2642483711242676, "loss/hidden": 1.1015625, "loss/logits": 0.17696912586688995, "loss/reg": 0.0001599361130502075, "step": 747 }, { "epoch": 0.0935, "grad_norm": 2.9594101905822754, "grad_norm_var": 0.1887944312030373, "learning_rate": 0.0001, "loss": 1.4722, "loss/crossentropy": 2.640739679336548, "loss/hidden": 1.2265625, "loss/logits": 0.2439916729927063, "loss/reg": 0.00015988021914381534, "step": 748 }, { "epoch": 0.093625, "grad_norm": 2.8214306831359863, "grad_norm_var": 0.1458607624334602, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.4347331523895264, "loss/hidden": 1.2265625, "loss/logits": 0.23584049940109253, "loss/reg": 0.0001598329981788993, "step": 749 }, { "epoch": 0.09375, "grad_norm": 2.58860182762146, "grad_norm_var": 0.14109682788710315, "learning_rate": 0.0001, "loss": 1.2944, "loss/crossentropy": 2.473512649536133, "loss/hidden": 1.0859375, "loss/logits": 0.20684140920639038, "loss/reg": 0.0001597863738425076, "step": 750 }, { "epoch": 0.093875, "grad_norm": 2.1102545261383057, "grad_norm_var": 0.13311232136032888, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.224846839904785, "loss/hidden": 1.15625, "loss/logits": 0.19080884754657745, "loss/reg": 0.0001597255322849378, "step": 751 }, { "epoch": 0.094, "grad_norm": 3.6358067989349365, "grad_norm_var": 0.1970238013293046, "learning_rate": 0.0001, "loss": 1.4033, "loss/crossentropy": 2.4640469551086426, "loss/hidden": 1.1796875, "loss/logits": 0.2220308482646942, "loss/reg": 0.000159663672093302, "step": 752 }, { "epoch": 0.094125, "grad_norm": 1.9362329244613647, "grad_norm_var": 0.20793185623656232, "learning_rate": 0.0001, "loss": 1.2684, "loss/crossentropy": 2.349578380584717, "loss/hidden": 1.0859375, "loss/logits": 0.18084318935871124, "loss/reg": 0.00015960917517077178, "step": 753 }, { "epoch": 0.09425, "grad_norm": 2.8127987384796143, "grad_norm_var": 0.20976213130276158, "learning_rate": 0.0001, "loss": 1.3767, "loss/crossentropy": 2.7700722217559814, "loss/hidden": 1.15625, "loss/logits": 0.21881088614463806, "loss/reg": 0.0001595582434674725, "step": 754 }, { "epoch": 0.094375, "grad_norm": 2.624014139175415, "grad_norm_var": 0.20262464540361258, "learning_rate": 0.0001, "loss": 1.0452, "loss/crossentropy": 2.5594871044158936, "loss/hidden": 0.9140625, "loss/logits": 0.12955060601234436, "loss/reg": 0.00015951337991282344, "step": 755 }, { "epoch": 0.0945, "grad_norm": 2.647692918777466, "grad_norm_var": 0.2004990349344301, "learning_rate": 0.0001, "loss": 1.3711, "loss/crossentropy": 2.8837127685546875, "loss/hidden": 1.1484375, "loss/logits": 0.22106823325157166, "loss/reg": 0.0001594527711858973, "step": 756 }, { "epoch": 0.094625, "grad_norm": 4.303273677825928, "grad_norm_var": 0.37465752410703357, "learning_rate": 0.0001, "loss": 1.6256, "loss/crossentropy": 2.1526553630828857, "loss/hidden": 1.4140625, "loss/logits": 0.2099636346101761, "loss/reg": 0.00015939251170493662, "step": 757 }, { "epoch": 0.09475, "grad_norm": 5.404844760894775, "grad_norm_var": 0.8003454239234145, "learning_rate": 0.0001, "loss": 2.4086, "loss/crossentropy": 2.3594839572906494, "loss/hidden": 1.953125, "loss/logits": 0.4539313316345215, "loss/reg": 0.00015932397218421102, "step": 758 }, { "epoch": 0.094875, "grad_norm": 2.540015697479248, "grad_norm_var": 0.8053324028011833, "learning_rate": 0.0001, "loss": 1.1342, "loss/crossentropy": 2.6021182537078857, "loss/hidden": 0.98046875, "loss/logits": 0.15212813019752502, "loss/reg": 0.00015925623301882297, "step": 759 }, { "epoch": 0.095, "grad_norm": 2.0890510082244873, "grad_norm_var": 0.8227520149486531, "learning_rate": 0.0001, "loss": 1.267, "loss/crossentropy": 2.664130449295044, "loss/hidden": 1.078125, "loss/logits": 0.18725955486297607, "loss/reg": 0.0001591854525031522, "step": 760 }, { "epoch": 0.095125, "grad_norm": 2.0796992778778076, "grad_norm_var": 0.834814023981355, "learning_rate": 0.0001, "loss": 1.2805, "loss/crossentropy": 2.3574774265289307, "loss/hidden": 1.109375, "loss/logits": 0.1694914698600769, "loss/reg": 0.00015911730588413775, "step": 761 }, { "epoch": 0.09525, "grad_norm": 2.3075294494628906, "grad_norm_var": 0.8423771182670705, "learning_rate": 0.0001, "loss": 1.3681, "loss/crossentropy": 2.704540252685547, "loss/hidden": 1.15625, "loss/logits": 0.21024569869041443, "loss/reg": 0.00015904406609479338, "step": 762 }, { "epoch": 0.095375, "grad_norm": 2.4182631969451904, "grad_norm_var": 0.8370762744334944, "learning_rate": 0.0001, "loss": 1.4621, "loss/crossentropy": 2.2880709171295166, "loss/hidden": 1.2265625, "loss/logits": 0.23398488759994507, "loss/reg": 0.00015898223500698805, "step": 763 }, { "epoch": 0.0955, "grad_norm": 2.4664182662963867, "grad_norm_var": 0.8437554777382047, "learning_rate": 0.0001, "loss": 1.3703, "loss/crossentropy": 2.382688522338867, "loss/hidden": 1.1796875, "loss/logits": 0.18898102641105652, "loss/reg": 0.00015890307258814573, "step": 764 }, { "epoch": 0.095625, "grad_norm": 2.9035651683807373, "grad_norm_var": 0.8444214321374729, "learning_rate": 0.0001, "loss": 1.3555, "loss/crossentropy": 2.5646626949310303, "loss/hidden": 1.15625, "loss/logits": 0.19766995310783386, "loss/reg": 0.00015881822037044913, "step": 765 }, { "epoch": 0.09575, "grad_norm": 2.3307271003723145, "grad_norm_var": 0.855992472268653, "learning_rate": 0.0001, "loss": 1.1844, "loss/crossentropy": 2.306934118270874, "loss/hidden": 1.015625, "loss/logits": 0.1672147810459137, "loss/reg": 0.0001587588049005717, "step": 766 }, { "epoch": 0.095875, "grad_norm": 4.602903842926025, "grad_norm_var": 1.019027413118511, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.7331838607788086, "loss/hidden": 1.2421875, "loss/logits": 0.1773134469985962, "loss/reg": 0.0001586988364579156, "step": 767 }, { "epoch": 0.096, "grad_norm": 3.3415300846099854, "grad_norm_var": 0.9972926341432582, "learning_rate": 0.0001, "loss": 1.2484, "loss/crossentropy": 2.641475200653076, "loss/hidden": 1.0703125, "loss/logits": 0.17650385200977325, "loss/reg": 0.00015862745931372046, "step": 768 }, { "epoch": 0.096125, "grad_norm": 2.530545473098755, "grad_norm_var": 0.9409741440179211, "learning_rate": 0.0001, "loss": 1.2638, "loss/crossentropy": 2.7084455490112305, "loss/hidden": 1.09375, "loss/logits": 0.16844099760055542, "loss/reg": 0.00015855921083129942, "step": 769 }, { "epoch": 0.09625, "grad_norm": 2.2985293865203857, "grad_norm_var": 0.9677809187941456, "learning_rate": 0.0001, "loss": 1.2952, "loss/crossentropy": 2.5004665851593018, "loss/hidden": 1.09375, "loss/logits": 0.19982460141181946, "loss/reg": 0.00015850854106247425, "step": 770 }, { "epoch": 0.096375, "grad_norm": 3.4938621520996094, "grad_norm_var": 0.9795201184664014, "learning_rate": 0.0001, "loss": 1.3691, "loss/crossentropy": 2.5238912105560303, "loss/hidden": 1.15625, "loss/logits": 0.21127717196941376, "loss/reg": 0.000158456910867244, "step": 771 }, { "epoch": 0.0965, "grad_norm": 2.524238348007202, "grad_norm_var": 0.986023369913004, "learning_rate": 0.0001, "loss": 1.3384, "loss/crossentropy": 2.9121556282043457, "loss/hidden": 1.125, "loss/logits": 0.2118336409330368, "loss/reg": 0.0001583997072884813, "step": 772 }, { "epoch": 0.096625, "grad_norm": 3.3034844398498535, "grad_norm_var": 0.8717227763043857, "learning_rate": 0.0001, "loss": 1.3087, "loss/crossentropy": 2.3371756076812744, "loss/hidden": 1.140625, "loss/logits": 0.1664540320634842, "loss/reg": 0.00015834005898796022, "step": 773 }, { "epoch": 0.09675, "grad_norm": 2.4156699180603027, "grad_norm_var": 0.4377071137671076, "learning_rate": 0.0001, "loss": 1.3046, "loss/crossentropy": 2.5928711891174316, "loss/hidden": 1.125, "loss/logits": 0.17806413769721985, "loss/reg": 0.00015828636242076755, "step": 774 }, { "epoch": 0.096875, "grad_norm": 2.9043703079223633, "grad_norm_var": 0.43687783638188193, "learning_rate": 0.0001, "loss": 1.5698, "loss/crossentropy": 2.098407030105591, "loss/hidden": 1.3359375, "loss/logits": 0.23231714963912964, "loss/reg": 0.0001582311379024759, "step": 775 }, { "epoch": 0.097, "grad_norm": 2.595289707183838, "grad_norm_var": 0.408238305676165, "learning_rate": 0.0001, "loss": 1.2996, "loss/crossentropy": 2.7181334495544434, "loss/hidden": 1.1171875, "loss/logits": 0.18086370825767517, "loss/reg": 0.0001581712276674807, "step": 776 }, { "epoch": 0.097125, "grad_norm": 2.545562982559204, "grad_norm_var": 0.37816113596360096, "learning_rate": 0.0001, "loss": 1.3531, "loss/crossentropy": 2.326741933822632, "loss/hidden": 1.171875, "loss/logits": 0.17961636185646057, "loss/reg": 0.00015810313925612718, "step": 777 }, { "epoch": 0.09725, "grad_norm": 2.5478177070617676, "grad_norm_var": 0.3656263854679572, "learning_rate": 0.0001, "loss": 1.5645, "loss/crossentropy": 2.5691230297088623, "loss/hidden": 1.3046875, "loss/logits": 0.25823599100112915, "loss/reg": 0.00015804090071469545, "step": 778 }, { "epoch": 0.097375, "grad_norm": 2.9159414768218994, "grad_norm_var": 0.35402227055322416, "learning_rate": 0.0001, "loss": 1.2089, "loss/crossentropy": 2.5886998176574707, "loss/hidden": 1.0390625, "loss/logits": 0.16828888654708862, "loss/reg": 0.00015798755339346826, "step": 779 }, { "epoch": 0.0975, "grad_norm": 2.3789730072021484, "grad_norm_var": 0.35906028599656165, "learning_rate": 0.0001, "loss": 1.2613, "loss/crossentropy": 2.6034371852874756, "loss/hidden": 1.078125, "loss/logits": 0.1816016435623169, "loss/reg": 0.00015793280908837914, "step": 780 }, { "epoch": 0.097625, "grad_norm": 5.179575443267822, "grad_norm_var": 0.6984534122826441, "learning_rate": 0.0001, "loss": 1.5792, "loss/crossentropy": 2.607119083404541, "loss/hidden": 1.3515625, "loss/logits": 0.2260763943195343, "loss/reg": 0.00015787192387506366, "step": 781 }, { "epoch": 0.09775, "grad_norm": 2.9297902584075928, "grad_norm_var": 0.6678791552519799, "learning_rate": 0.0001, "loss": 1.4466, "loss/crossentropy": 2.407424211502075, "loss/hidden": 1.2265625, "loss/logits": 0.2184680700302124, "loss/reg": 0.00015780895773787051, "step": 782 }, { "epoch": 0.097875, "grad_norm": 2.98895001411438, "grad_norm_var": 0.4925805925935644, "learning_rate": 0.0001, "loss": 1.3869, "loss/crossentropy": 2.744739055633545, "loss/hidden": 1.1875, "loss/logits": 0.1978280395269394, "loss/reg": 0.00015775540668983012, "step": 783 }, { "epoch": 0.098, "grad_norm": 3.9746737480163574, "grad_norm_var": 0.5523014894046144, "learning_rate": 0.0001, "loss": 1.715, "loss/crossentropy": 2.1144697666168213, "loss/hidden": 1.4765625, "loss/logits": 0.23690226674079895, "loss/reg": 0.00015769751917105168, "step": 784 }, { "epoch": 0.098125, "grad_norm": 2.7441892623901367, "grad_norm_var": 0.5426230369519733, "learning_rate": 0.0001, "loss": 1.4605, "loss/crossentropy": 2.533891439437866, "loss/hidden": 1.21875, "loss/logits": 0.24014262855052948, "loss/reg": 0.00015763672126922756, "step": 785 }, { "epoch": 0.09825, "grad_norm": 2.886592149734497, "grad_norm_var": 0.5105051205161686, "learning_rate": 0.0001, "loss": 1.5767, "loss/crossentropy": 1.7247384786605835, "loss/hidden": 1.3671875, "loss/logits": 0.20795351266860962, "loss/reg": 0.00015757934306748211, "step": 786 }, { "epoch": 0.098375, "grad_norm": 2.8609161376953125, "grad_norm_var": 0.49560072717516257, "learning_rate": 0.0001, "loss": 1.3977, "loss/crossentropy": 2.516383409500122, "loss/hidden": 1.1875, "loss/logits": 0.20862555503845215, "loss/reg": 0.00015751863247714937, "step": 787 }, { "epoch": 0.0985, "grad_norm": 2.4624829292297363, "grad_norm_var": 0.4996001043209463, "learning_rate": 0.0001, "loss": 1.2412, "loss/crossentropy": 2.999992847442627, "loss/hidden": 1.0625, "loss/logits": 0.1771094799041748, "loss/reg": 0.0001574623747728765, "step": 788 }, { "epoch": 0.098625, "grad_norm": 6.9875078201293945, "grad_norm_var": 1.5081520648287794, "learning_rate": 0.0001, "loss": 1.6105, "loss/crossentropy": 2.4183802604675293, "loss/hidden": 1.3203125, "loss/logits": 0.28856509923934937, "loss/reg": 0.00015740639355499297, "step": 789 }, { "epoch": 0.09875, "grad_norm": 2.1079306602478027, "grad_norm_var": 1.5465569717877712, "learning_rate": 0.0001, "loss": 1.1826, "loss/crossentropy": 2.517399787902832, "loss/hidden": 1.0234375, "loss/logits": 0.15760257840156555, "loss/reg": 0.0001573533081682399, "step": 790 }, { "epoch": 0.098875, "grad_norm": 2.4908204078674316, "grad_norm_var": 1.5728941140646608, "learning_rate": 0.0001, "loss": 1.3931, "loss/crossentropy": 2.5084221363067627, "loss/hidden": 1.203125, "loss/logits": 0.1883927285671234, "loss/reg": 0.00015728619473520666, "step": 791 }, { "epoch": 0.099, "grad_norm": 3.32619571685791, "grad_norm_var": 1.5510242896474682, "learning_rate": 0.0001, "loss": 1.4941, "loss/crossentropy": 2.375476598739624, "loss/hidden": 1.3125, "loss/logits": 0.18002313375473022, "loss/reg": 0.00015722097305115312, "step": 792 }, { "epoch": 0.099125, "grad_norm": 3.3583014011383057, "grad_norm_var": 1.5205237483948524, "learning_rate": 0.0001, "loss": 1.7693, "loss/crossentropy": 2.742772102355957, "loss/hidden": 1.5, "loss/logits": 0.2677229642868042, "loss/reg": 0.0001571673055877909, "step": 793 }, { "epoch": 0.09925, "grad_norm": 2.7061352729797363, "grad_norm_var": 1.5070823323117468, "learning_rate": 0.0001, "loss": 1.3389, "loss/crossentropy": 2.2309160232543945, "loss/hidden": 1.1640625, "loss/logits": 0.1732320487499237, "loss/reg": 0.0001571090833749622, "step": 794 }, { "epoch": 0.099375, "grad_norm": 3.7295544147491455, "grad_norm_var": 1.5101888757233228, "learning_rate": 0.0001, "loss": 1.2634, "loss/crossentropy": 2.6407904624938965, "loss/hidden": 1.078125, "loss/logits": 0.18367145955562592, "loss/reg": 0.00015706109115853906, "step": 795 }, { "epoch": 0.0995, "grad_norm": 3.2890892028808594, "grad_norm_var": 1.447822032889314, "learning_rate": 0.0001, "loss": 1.5751, "loss/crossentropy": 2.553081750869751, "loss/hidden": 1.3515625, "loss/logits": 0.2219996154308319, "loss/reg": 0.00015700374206062406, "step": 796 }, { "epoch": 0.099625, "grad_norm": 2.9799745082855225, "grad_norm_var": 1.2213823688837426, "learning_rate": 0.0001, "loss": 1.3383, "loss/crossentropy": 2.4798338413238525, "loss/hidden": 1.140625, "loss/logits": 0.1960686892271042, "loss/reg": 0.00015694284229539335, "step": 797 }, { "epoch": 0.09975, "grad_norm": 3.9751150608062744, "grad_norm_var": 1.246587556275107, "learning_rate": 0.0001, "loss": 1.6814, "loss/crossentropy": 2.334136486053467, "loss/hidden": 1.4296875, "loss/logits": 0.25015050172805786, "loss/reg": 0.00015689186693634838, "step": 798 }, { "epoch": 0.099875, "grad_norm": 2.823457717895508, "grad_norm_var": 1.255257174584494, "learning_rate": 0.0001, "loss": 1.3489, "loss/crossentropy": 2.544034957885742, "loss/hidden": 1.140625, "loss/logits": 0.20675182342529297, "loss/reg": 0.0001568411971675232, "step": 799 }, { "epoch": 0.1, "grad_norm": 3.4553818702697754, "grad_norm_var": 1.2249774622026561, "learning_rate": 0.0001, "loss": 1.5263, "loss/crossentropy": 2.3714699745178223, "loss/hidden": 1.28125, "loss/logits": 0.24343347549438477, "loss/reg": 0.00015678441559430212, "step": 800 }, { "epoch": 0.100125, "grad_norm": 9.315553665161133, "grad_norm_var": 3.4706654946725655, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.46150279045105, "loss/hidden": 1.203125, "loss/logits": 0.2164209634065628, "loss/reg": 0.00015673183952458203, "step": 801 }, { "epoch": 0.10025, "grad_norm": 8.424959182739258, "grad_norm_var": 4.807637367376958, "learning_rate": 0.0001, "loss": 1.5307, "loss/crossentropy": 2.5745885372161865, "loss/hidden": 1.3125, "loss/logits": 0.2166537046432495, "loss/reg": 0.0001566773426020518, "step": 802 }, { "epoch": 0.100375, "grad_norm": 2.473189115524292, "grad_norm_var": 4.876868193390911, "learning_rate": 0.0001, "loss": 1.3944, "loss/crossentropy": 2.408083438873291, "loss/hidden": 1.1875, "loss/logits": 0.2053414136171341, "loss/reg": 0.00015662483929190785, "step": 803 }, { "epoch": 0.1005, "grad_norm": 2.5679922103881836, "grad_norm_var": 4.856017271227494, "learning_rate": 0.0001, "loss": 1.2197, "loss/crossentropy": 2.750852346420288, "loss/hidden": 1.03125, "loss/logits": 0.18685811758041382, "loss/reg": 0.000156574125867337, "step": 804 }, { "epoch": 0.100625, "grad_norm": 2.0632970333099365, "grad_norm_var": 4.410483461032395, "learning_rate": 0.0001, "loss": 1.0807, "loss/crossentropy": 2.6440253257751465, "loss/hidden": 0.94140625, "loss/logits": 0.13775327801704407, "loss/reg": 0.00015652149158995599, "step": 805 }, { "epoch": 0.10075, "grad_norm": 2.2623279094696045, "grad_norm_var": 4.3793440094266955, "learning_rate": 0.0001, "loss": 1.3287, "loss/crossentropy": 2.5161526203155518, "loss/hidden": 1.1328125, "loss/logits": 0.19432221353054047, "loss/reg": 0.0001564609701745212, "step": 806 }, { "epoch": 0.100875, "grad_norm": 5.586756706237793, "grad_norm_var": 4.478189620682763, "learning_rate": 0.0001, "loss": 2.3625, "loss/crossentropy": 2.3109660148620605, "loss/hidden": 1.8515625, "loss/logits": 0.5093801021575928, "loss/reg": 0.00015640609490219504, "step": 807 }, { "epoch": 0.101, "grad_norm": 3.319744825363159, "grad_norm_var": 4.478682389834329, "learning_rate": 0.0001, "loss": 1.2178, "loss/crossentropy": 2.4249396324157715, "loss/hidden": 1.046875, "loss/logits": 0.1694084256887436, "loss/reg": 0.0001563557452755049, "step": 808 }, { "epoch": 0.101125, "grad_norm": 3.047255039215088, "grad_norm_var": 4.50701574652457, "learning_rate": 0.0001, "loss": 1.5435, "loss/crossentropy": 2.2202775478363037, "loss/hidden": 1.34375, "loss/logits": 0.19818627834320068, "loss/reg": 0.00015629793051630259, "step": 809 }, { "epoch": 0.10125, "grad_norm": 2.7431864738464355, "grad_norm_var": 4.501321058661267, "learning_rate": 0.0001, "loss": 1.2541, "loss/crossentropy": 2.699061155319214, "loss/hidden": 1.0859375, "loss/logits": 0.16662535071372986, "loss/reg": 0.0001562309480505064, "step": 810 }, { "epoch": 0.101375, "grad_norm": 2.6741671562194824, "grad_norm_var": 4.591902913146055, "learning_rate": 0.0001, "loss": 1.2735, "loss/crossentropy": 2.6062464714050293, "loss/hidden": 1.0859375, "loss/logits": 0.18601274490356445, "loss/reg": 0.00015617247845511883, "step": 811 }, { "epoch": 0.1015, "grad_norm": 2.555079698562622, "grad_norm_var": 4.676810023548718, "learning_rate": 0.0001, "loss": 1.7222, "loss/crossentropy": 2.4174118041992188, "loss/hidden": 1.4609375, "loss/logits": 0.259733647108078, "loss/reg": 0.0001561163371661678, "step": 812 }, { "epoch": 0.101625, "grad_norm": 2.786980628967285, "grad_norm_var": 4.699382748720115, "learning_rate": 0.0001, "loss": 1.4643, "loss/crossentropy": 2.640596628189087, "loss/hidden": 1.21875, "loss/logits": 0.2439698427915573, "loss/reg": 0.0001560602686367929, "step": 813 }, { "epoch": 0.10175, "grad_norm": 2.6986706256866455, "grad_norm_var": 4.7636935996229175, "learning_rate": 0.0001, "loss": 1.5069, "loss/crossentropy": 2.7341158390045166, "loss/hidden": 1.265625, "loss/logits": 0.23967362940311432, "loss/reg": 0.00015599608013872057, "step": 814 }, { "epoch": 0.101875, "grad_norm": 2.478461742401123, "grad_norm_var": 4.810297226266997, "learning_rate": 0.0001, "loss": 1.2154, "loss/crossentropy": 2.4954257011413574, "loss/hidden": 1.046875, "loss/logits": 0.16694118082523346, "loss/reg": 0.0001559416705276817, "step": 815 }, { "epoch": 0.102, "grad_norm": 2.5957534313201904, "grad_norm_var": 4.879168559668788, "learning_rate": 0.0001, "loss": 1.417, "loss/crossentropy": 2.539020538330078, "loss/hidden": 1.1953125, "loss/logits": 0.22013987600803375, "loss/reg": 0.0001558883668622002, "step": 816 }, { "epoch": 0.102125, "grad_norm": 2.2633743286132812, "grad_norm_var": 2.6128250733116127, "learning_rate": 0.0001, "loss": 1.2702, "loss/crossentropy": 2.2717041969299316, "loss/hidden": 1.0703125, "loss/logits": 0.19831937551498413, "loss/reg": 0.00015582211199216545, "step": 817 }, { "epoch": 0.10225, "grad_norm": 2.632050037384033, "grad_norm_var": 0.6426889092702404, "learning_rate": 0.0001, "loss": 1.3941, "loss/crossentropy": 2.526956558227539, "loss/hidden": 1.171875, "loss/logits": 0.22068020701408386, "loss/reg": 0.000155754474690184, "step": 818 }, { "epoch": 0.102375, "grad_norm": 2.763049840927124, "grad_norm_var": 0.6354344062606077, "learning_rate": 0.0001, "loss": 1.2244, "loss/crossentropy": 2.791860342025757, "loss/hidden": 1.0625, "loss/logits": 0.16038620471954346, "loss/reg": 0.00015570268442388624, "step": 819 }, { "epoch": 0.1025, "grad_norm": 2.6946897506713867, "grad_norm_var": 0.6322669247084849, "learning_rate": 0.0001, "loss": 1.1555, "loss/crossentropy": 2.622080087661743, "loss/hidden": 0.9921875, "loss/logits": 0.16179564595222473, "loss/reg": 0.0001556438219267875, "step": 820 }, { "epoch": 0.102625, "grad_norm": 3.2547173500061035, "grad_norm_var": 0.600332488502597, "learning_rate": 0.0001, "loss": 1.4699, "loss/crossentropy": 2.640681505203247, "loss/hidden": 1.2421875, "loss/logits": 0.22610792517662048, "loss/reg": 0.00015557486040052027, "step": 821 }, { "epoch": 0.10275, "grad_norm": 2.2784531116485596, "grad_norm_var": 0.5989836045932786, "learning_rate": 0.0001, "loss": 1.2457, "loss/crossentropy": 2.459578275680542, "loss/hidden": 1.078125, "loss/logits": 0.1659996658563614, "loss/reg": 0.00015552052354905754, "step": 822 }, { "epoch": 0.102875, "grad_norm": 2.866572856903076, "grad_norm_var": 0.08635730352705101, "learning_rate": 0.0001, "loss": 1.4594, "loss/crossentropy": 2.900744915008545, "loss/hidden": 1.21875, "loss/logits": 0.2391422986984253, "loss/reg": 0.00015544889902230352, "step": 823 }, { "epoch": 0.103, "grad_norm": 2.2067203521728516, "grad_norm_var": 0.0760059277298294, "learning_rate": 0.0001, "loss": 1.1862, "loss/crossentropy": 2.3179521560668945, "loss/hidden": 1.0234375, "loss/logits": 0.16124112904071808, "loss/reg": 0.00015537750732619315, "step": 824 }, { "epoch": 0.103125, "grad_norm": 2.75836443901062, "grad_norm_var": 0.06625534346652937, "learning_rate": 0.0001, "loss": 1.3593, "loss/crossentropy": 2.621162176132202, "loss/hidden": 1.1640625, "loss/logits": 0.1937066614627838, "loss/reg": 0.0001553032052470371, "step": 825 }, { "epoch": 0.10325, "grad_norm": 2.4691476821899414, "grad_norm_var": 0.0672021456196101, "learning_rate": 0.0001, "loss": 1.1782, "loss/crossentropy": 2.3111352920532227, "loss/hidden": 1.03125, "loss/logits": 0.14542686939239502, "loss/reg": 0.00015522754983976483, "step": 826 }, { "epoch": 0.103375, "grad_norm": 2.242722749710083, "grad_norm_var": 0.07592239779072708, "learning_rate": 0.0001, "loss": 1.2963, "loss/crossentropy": 2.6022891998291016, "loss/hidden": 1.09375, "loss/logits": 0.20099574327468872, "loss/reg": 0.0001551712048240006, "step": 827 }, { "epoch": 0.1035, "grad_norm": 16.502452850341797, "grad_norm_var": 12.156877274023865, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.4874210357666016, "loss/hidden": 1.1875, "loss/logits": 0.1657799482345581, "loss/reg": 0.0001551109744468704, "step": 828 }, { "epoch": 0.103625, "grad_norm": 3.020467758178711, "grad_norm_var": 12.139075168014692, "learning_rate": 0.0001, "loss": 1.3161, "loss/crossentropy": 2.5590686798095703, "loss/hidden": 1.1171875, "loss/logits": 0.19737553596496582, "loss/reg": 0.0001550390152260661, "step": 829 }, { "epoch": 0.10375, "grad_norm": 2.3817336559295654, "grad_norm_var": 12.178491473135303, "learning_rate": 0.0001, "loss": 1.274, "loss/crossentropy": 2.345802068710327, "loss/hidden": 1.1015625, "loss/logits": 0.17091208696365356, "loss/reg": 0.00015496429114136845, "step": 830 }, { "epoch": 0.103875, "grad_norm": 2.4110162258148193, "grad_norm_var": 12.187629882808864, "learning_rate": 0.0001, "loss": 1.4273, "loss/crossentropy": 2.476094961166382, "loss/hidden": 1.2109375, "loss/logits": 0.2147751897573471, "loss/reg": 0.00015489156066905707, "step": 831 }, { "epoch": 0.104, "grad_norm": 3.0275886058807373, "grad_norm_var": 12.149590718067628, "learning_rate": 0.0001, "loss": 1.2915, "loss/crossentropy": 2.3147025108337402, "loss/hidden": 1.125, "loss/logits": 0.16493496298789978, "loss/reg": 0.0001548164727864787, "step": 832 }, { "epoch": 0.104125, "grad_norm": 2.8160605430603027, "grad_norm_var": 12.078598239549114, "learning_rate": 0.0001, "loss": 1.7475, "loss/crossentropy": 2.259359836578369, "loss/hidden": 1.46875, "loss/logits": 0.27723509073257446, "loss/reg": 0.00015476063708774745, "step": 833 }, { "epoch": 0.10425, "grad_norm": 2.3662846088409424, "grad_norm_var": 12.114490409757988, "learning_rate": 0.0001, "loss": 1.1816, "loss/crossentropy": 2.8441381454467773, "loss/hidden": 1.015625, "loss/logits": 0.1644306480884552, "loss/reg": 0.00015470368089154363, "step": 834 }, { "epoch": 0.104375, "grad_norm": 2.4449284076690674, "grad_norm_var": 12.15223327950075, "learning_rate": 0.0001, "loss": 1.2178, "loss/crossentropy": 2.6379342079162598, "loss/hidden": 1.0546875, "loss/logits": 0.16153493523597717, "loss/reg": 0.00015461975999642164, "step": 835 }, { "epoch": 0.1045, "grad_norm": 3.113579034805298, "grad_norm_var": 12.11912282075488, "learning_rate": 0.0001, "loss": 1.2506, "loss/crossentropy": 2.4042086601257324, "loss/hidden": 1.0859375, "loss/logits": 0.1630953848361969, "loss/reg": 0.0001545300183352083, "step": 836 }, { "epoch": 0.104625, "grad_norm": 2.785813093185425, "grad_norm_var": 12.148828353064777, "learning_rate": 0.0001, "loss": 1.3299, "loss/crossentropy": 2.268047332763672, "loss/hidden": 1.1171875, "loss/logits": 0.2111961543560028, "loss/reg": 0.00015444493328686804, "step": 837 }, { "epoch": 0.10475, "grad_norm": 2.966097831726074, "grad_norm_var": 12.068148598044699, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.605198383331299, "loss/hidden": 1.078125, "loss/logits": 0.1649276167154312, "loss/reg": 0.00015438608534168452, "step": 838 }, { "epoch": 0.104875, "grad_norm": 2.4610483646392822, "grad_norm_var": 12.113958735429613, "learning_rate": 0.0001, "loss": 1.3404, "loss/crossentropy": 2.2986252307891846, "loss/hidden": 1.1484375, "loss/logits": 0.1903916299343109, "loss/reg": 0.00015430156781803817, "step": 839 }, { "epoch": 0.105, "grad_norm": 3.7536721229553223, "grad_norm_var": 11.997107641332153, "learning_rate": 0.0001, "loss": 1.4539, "loss/crossentropy": 2.5108516216278076, "loss/hidden": 1.21875, "loss/logits": 0.23361173272132874, "loss/reg": 0.0001542152022011578, "step": 840 }, { "epoch": 0.105125, "grad_norm": 2.7188615798950195, "grad_norm_var": 12.001612092573852, "learning_rate": 0.0001, "loss": 1.5241, "loss/crossentropy": 2.3506808280944824, "loss/hidden": 1.296875, "loss/logits": 0.22568640112876892, "loss/reg": 0.00015415725647471845, "step": 841 }, { "epoch": 0.10525, "grad_norm": 2.4441373348236084, "grad_norm_var": 12.005397552416609, "learning_rate": 0.0001, "loss": 1.1992, "loss/crossentropy": 2.965500831604004, "loss/hidden": 1.0234375, "loss/logits": 0.1742335855960846, "loss/reg": 0.00015409871411975473, "step": 842 }, { "epoch": 0.105375, "grad_norm": 2.153806686401367, "grad_norm_var": 12.021876493317956, "learning_rate": 0.0001, "loss": 1.2382, "loss/crossentropy": 2.554860830307007, "loss/hidden": 1.0859375, "loss/logits": 0.1507563591003418, "loss/reg": 0.00015404151054099202, "step": 843 }, { "epoch": 0.1055, "grad_norm": 2.62217116355896, "grad_norm_var": 0.15775381088085302, "learning_rate": 0.0001, "loss": 1.4434, "loss/crossentropy": 2.5468316078186035, "loss/hidden": 1.234375, "loss/logits": 0.20744654536247253, "loss/reg": 0.00015396089293062687, "step": 844 }, { "epoch": 0.105625, "grad_norm": 3.028557062149048, "grad_norm_var": 0.15808418391255236, "learning_rate": 0.0001, "loss": 1.3275, "loss/crossentropy": 2.5722765922546387, "loss/hidden": 1.140625, "loss/logits": 0.18531131744384766, "loss/reg": 0.00015387858729809523, "step": 845 }, { "epoch": 0.10575, "grad_norm": 3.1140003204345703, "grad_norm_var": 0.1587211470307087, "learning_rate": 0.0001, "loss": 1.5073, "loss/crossentropy": 2.2211999893188477, "loss/hidden": 1.28125, "loss/logits": 0.22455669939517975, "loss/reg": 0.00015380082186311483, "step": 846 }, { "epoch": 0.105875, "grad_norm": 2.7188003063201904, "grad_norm_var": 0.1501468397164937, "learning_rate": 0.0001, "loss": 1.348, "loss/crossentropy": 2.621995687484741, "loss/hidden": 1.1484375, "loss/logits": 0.19803985953330994, "loss/reg": 0.00015374379290733486, "step": 847 }, { "epoch": 0.106, "grad_norm": 2.165024995803833, "grad_norm_var": 0.16857131665523467, "learning_rate": 0.0001, "loss": 1.2247, "loss/crossentropy": 2.4396474361419678, "loss/hidden": 1.0625, "loss/logits": 0.1606440246105194, "loss/reg": 0.00015368600725196302, "step": 848 }, { "epoch": 0.106125, "grad_norm": 3.2530934810638428, "grad_norm_var": 0.18554958110323672, "learning_rate": 0.0001, "loss": 1.2512, "loss/crossentropy": 2.5811870098114014, "loss/hidden": 1.0703125, "loss/logits": 0.17938640713691711, "loss/reg": 0.00015363001148216426, "step": 849 }, { "epoch": 0.10625, "grad_norm": 2.5239920616149902, "grad_norm_var": 0.17889101900169255, "learning_rate": 0.0001, "loss": 1.4329, "loss/crossentropy": 2.4825353622436523, "loss/hidden": 1.203125, "loss/logits": 0.2282102406024933, "loss/reg": 0.00015357449592556804, "step": 850 }, { "epoch": 0.106375, "grad_norm": 3.0814735889434814, "grad_norm_var": 0.17690372248029387, "learning_rate": 0.0001, "loss": 1.2092, "loss/crossentropy": 2.747859239578247, "loss/hidden": 1.03125, "loss/logits": 0.17641031742095947, "loss/reg": 0.0001535168121336028, "step": 851 }, { "epoch": 0.1065, "grad_norm": 3.3972651958465576, "grad_norm_var": 0.19354849199297158, "learning_rate": 0.0001, "loss": 1.5051, "loss/crossentropy": 2.7524545192718506, "loss/hidden": 1.2265625, "loss/logits": 0.27705004811286926, "loss/reg": 0.00015345441352110356, "step": 852 }, { "epoch": 0.106625, "grad_norm": 1.956974744796753, "grad_norm_var": 0.24073075947492933, "learning_rate": 0.0001, "loss": 1.2681, "loss/crossentropy": 2.2648580074310303, "loss/hidden": 1.109375, "loss/logits": 0.15722431242465973, "loss/reg": 0.000153396773384884, "step": 853 }, { "epoch": 0.10675, "grad_norm": 2.570080518722534, "grad_norm_var": 0.24030682749846974, "learning_rate": 0.0001, "loss": 1.2576, "loss/crossentropy": 2.6702942848205566, "loss/hidden": 1.09375, "loss/logits": 0.1623595654964447, "loss/reg": 0.0001533414761070162, "step": 854 }, { "epoch": 0.106875, "grad_norm": 2.7701046466827393, "grad_norm_var": 0.23446498492980722, "learning_rate": 0.0001, "loss": 1.4018, "loss/crossentropy": 2.5442535877227783, "loss/hidden": 1.2109375, "loss/logits": 0.18928521871566772, "loss/reg": 0.0001532830501673743, "step": 855 }, { "epoch": 0.107, "grad_norm": 2.6539034843444824, "grad_norm_var": 0.16537684665656016, "learning_rate": 0.0001, "loss": 1.2351, "loss/crossentropy": 2.3728694915771484, "loss/hidden": 1.078125, "loss/logits": 0.15545538067817688, "loss/reg": 0.00015323622210416943, "step": 856 }, { "epoch": 0.107125, "grad_norm": 3.8023223876953125, "grad_norm_var": 0.24172015114670345, "learning_rate": 0.0001, "loss": 1.3129, "loss/crossentropy": 2.5650129318237305, "loss/hidden": 1.125, "loss/logits": 0.18641337752342224, "loss/reg": 0.0001531776797492057, "step": 857 }, { "epoch": 0.10725, "grad_norm": 2.3649542331695557, "grad_norm_var": 0.24550997572969563, "learning_rate": 0.0001, "loss": 1.237, "loss/crossentropy": 2.609403610229492, "loss/hidden": 1.0625, "loss/logits": 0.17298534512519836, "loss/reg": 0.00015311813331209123, "step": 858 }, { "epoch": 0.107375, "grad_norm": 2.6221439838409424, "grad_norm_var": 0.22130049617418307, "learning_rate": 0.0001, "loss": 1.2412, "loss/crossentropy": 2.3091585636138916, "loss/hidden": 1.078125, "loss/logits": 0.1614995002746582, "loss/reg": 0.00015306533896364272, "step": 859 }, { "epoch": 0.1075, "grad_norm": 2.57521653175354, "grad_norm_var": 0.22249090694117418, "learning_rate": 0.0001, "loss": 1.1912, "loss/crossentropy": 2.4844014644622803, "loss/hidden": 1.0234375, "loss/logits": 0.16619476675987244, "loss/reg": 0.00015301072562579066, "step": 860 }, { "epoch": 0.107625, "grad_norm": 2.5931003093719482, "grad_norm_var": 0.22033873522295835, "learning_rate": 0.0001, "loss": 1.2213, "loss/crossentropy": 2.6779448986053467, "loss/hidden": 1.0546875, "loss/logits": 0.16506552696228027, "loss/reg": 0.00015295080083888024, "step": 861 }, { "epoch": 0.10775, "grad_norm": 11.062914848327637, "grad_norm_var": 4.544443133592642, "learning_rate": 0.0001, "loss": 1.2929, "loss/crossentropy": 2.4246323108673096, "loss/hidden": 1.1171875, "loss/logits": 0.1741706132888794, "loss/reg": 0.00015289057046175003, "step": 862 }, { "epoch": 0.107875, "grad_norm": 3.0183844566345215, "grad_norm_var": 4.528555988151701, "learning_rate": 0.0001, "loss": 1.2358, "loss/crossentropy": 2.384451150894165, "loss/hidden": 1.0703125, "loss/logits": 0.16397619247436523, "loss/reg": 0.00015282341337297112, "step": 863 }, { "epoch": 0.108, "grad_norm": 3.690757989883423, "grad_norm_var": 4.448104696647963, "learning_rate": 0.0001, "loss": 1.5422, "loss/crossentropy": 2.663633346557617, "loss/hidden": 1.296875, "loss/logits": 0.24384552240371704, "loss/reg": 0.00015275817713700235, "step": 864 }, { "epoch": 0.108125, "grad_norm": 2.2269880771636963, "grad_norm_var": 4.530047569879394, "learning_rate": 0.0001, "loss": 1.3416, "loss/crossentropy": 2.3885631561279297, "loss/hidden": 1.15625, "loss/logits": 0.1838582158088684, "loss/reg": 0.0001527029526187107, "step": 865 }, { "epoch": 0.10825, "grad_norm": 2.363738536834717, "grad_norm_var": 4.548381381738593, "learning_rate": 0.0001, "loss": 1.1994, "loss/crossentropy": 2.5490643978118896, "loss/hidden": 1.03125, "loss/logits": 0.16663864254951477, "loss/reg": 0.0001526353444205597, "step": 866 }, { "epoch": 0.108375, "grad_norm": 2.250906229019165, "grad_norm_var": 4.615352805596595, "learning_rate": 0.0001, "loss": 1.237, "loss/crossentropy": 2.549473762512207, "loss/hidden": 1.0625, "loss/logits": 0.17293354868888855, "loss/reg": 0.00015257827180903405, "step": 867 }, { "epoch": 0.1085, "grad_norm": 2.1149637699127197, "grad_norm_var": 4.692085442261248, "learning_rate": 0.0001, "loss": 1.2365, "loss/crossentropy": 2.3521182537078857, "loss/hidden": 1.0625, "loss/logits": 0.17247089743614197, "loss/reg": 0.00015251578588504344, "step": 868 }, { "epoch": 0.108625, "grad_norm": 2.3415005207061768, "grad_norm_var": 4.639399272930234, "learning_rate": 0.0001, "loss": 1.272, "loss/crossentropy": 2.6056532859802246, "loss/hidden": 1.078125, "loss/logits": 0.19234131276607513, "loss/reg": 0.00015245236863847822, "step": 869 }, { "epoch": 0.10875, "grad_norm": 3.373487710952759, "grad_norm_var": 4.61345498986966, "learning_rate": 0.0001, "loss": 1.646, "loss/crossentropy": 2.084838390350342, "loss/hidden": 1.3671875, "loss/logits": 0.27726030349731445, "loss/reg": 0.00015239673666656017, "step": 870 }, { "epoch": 0.108875, "grad_norm": 2.4091196060180664, "grad_norm_var": 4.644172112975123, "learning_rate": 0.0001, "loss": 1.3183, "loss/crossentropy": 2.6128547191619873, "loss/hidden": 1.1171875, "loss/logits": 0.19963675737380981, "loss/reg": 0.0001523350365459919, "step": 871 }, { "epoch": 0.109, "grad_norm": 3.5518054962158203, "grad_norm_var": 4.627204145610123, "learning_rate": 0.0001, "loss": 1.3989, "loss/crossentropy": 2.658078193664551, "loss/hidden": 1.171875, "loss/logits": 0.22548414766788483, "loss/reg": 0.000152284512296319, "step": 872 }, { "epoch": 0.109125, "grad_norm": 4.572037220001221, "grad_norm_var": 4.718593123740408, "learning_rate": 0.0001, "loss": 1.4377, "loss/crossentropy": 2.151373863220215, "loss/hidden": 1.234375, "loss/logits": 0.20181819796562195, "loss/reg": 0.00015224071103148162, "step": 873 }, { "epoch": 0.10925, "grad_norm": 3.1162211894989014, "grad_norm_var": 4.6581270921324975, "learning_rate": 0.0001, "loss": 1.8494, "loss/crossentropy": 2.135683298110962, "loss/hidden": 1.515625, "loss/logits": 0.3322503864765167, "loss/reg": 0.0001521995000075549, "step": 874 }, { "epoch": 0.109375, "grad_norm": 2.7984983921051025, "grad_norm_var": 4.642539824536569, "learning_rate": 0.0001, "loss": 1.2677, "loss/crossentropy": 2.458695411682129, "loss/hidden": 1.09375, "loss/logits": 0.17247238755226135, "loss/reg": 0.00015215885650832206, "step": 875 }, { "epoch": 0.1095, "grad_norm": 2.5824549198150635, "grad_norm_var": 4.641767615897932, "learning_rate": 0.0001, "loss": 1.462, "loss/crossentropy": 2.2612884044647217, "loss/hidden": 1.2578125, "loss/logits": 0.20269830524921417, "loss/reg": 0.00015210473793558776, "step": 876 }, { "epoch": 0.109625, "grad_norm": 2.2584636211395264, "grad_norm_var": 4.6838399515196745, "learning_rate": 0.0001, "loss": 1.2975, "loss/crossentropy": 2.4042654037475586, "loss/hidden": 1.1171875, "loss/logits": 0.17884109914302826, "loss/reg": 0.00015205070667434484, "step": 877 }, { "epoch": 0.10975, "grad_norm": 2.086137533187866, "grad_norm_var": 0.498524866245526, "learning_rate": 0.0001, "loss": 1.1044, "loss/crossentropy": 2.5678513050079346, "loss/hidden": 0.9609375, "loss/logits": 0.14195486903190613, "loss/reg": 0.00015200056077446789, "step": 878 }, { "epoch": 0.109875, "grad_norm": 2.412999153137207, "grad_norm_var": 0.5035783389301057, "learning_rate": 0.0001, "loss": 1.376, "loss/crossentropy": 2.4384567737579346, "loss/hidden": 1.15625, "loss/logits": 0.21818344295024872, "loss/reg": 0.00015194181469269097, "step": 879 }, { "epoch": 0.11, "grad_norm": 2.976552724838257, "grad_norm_var": 0.44676623604769905, "learning_rate": 0.0001, "loss": 1.6032, "loss/crossentropy": 2.2460405826568604, "loss/hidden": 1.3671875, "loss/logits": 0.23447275161743164, "loss/reg": 0.00015188820543698967, "step": 880 }, { "epoch": 0.110125, "grad_norm": 2.7214362621307373, "grad_norm_var": 0.4298902906488439, "learning_rate": 0.0001, "loss": 1.3043, "loss/crossentropy": 2.1849935054779053, "loss/hidden": 1.1328125, "loss/logits": 0.16993211209774017, "loss/reg": 0.00015183616778813303, "step": 881 }, { "epoch": 0.11025, "grad_norm": 2.4439332485198975, "grad_norm_var": 0.4262086543796452, "learning_rate": 0.0001, "loss": 1.208, "loss/crossentropy": 2.6016674041748047, "loss/hidden": 1.046875, "loss/logits": 0.15959714353084564, "loss/reg": 0.0001517821365268901, "step": 882 }, { "epoch": 0.110375, "grad_norm": 2.450092315673828, "grad_norm_var": 0.4154158852755015, "learning_rate": 0.0001, "loss": 1.3565, "loss/crossentropy": 2.1921534538269043, "loss/hidden": 1.171875, "loss/logits": 0.18311545252799988, "loss/reg": 0.0001517308846814558, "step": 883 }, { "epoch": 0.1105, "grad_norm": 1.957759141921997, "grad_norm_var": 0.4305459373717949, "learning_rate": 0.0001, "loss": 1.1768, "loss/crossentropy": 2.6606931686401367, "loss/hidden": 1.0078125, "loss/logits": 0.16743463277816772, "loss/reg": 0.0001516815973445773, "step": 884 }, { "epoch": 0.110625, "grad_norm": 2.24867582321167, "grad_norm_var": 0.4361809193879623, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.735870122909546, "loss/hidden": 1.1015625, "loss/logits": 0.17929066717624664, "loss/reg": 0.00015161729243118316, "step": 885 }, { "epoch": 0.11075, "grad_norm": 4.560276508331299, "grad_norm_var": 0.6232687284063025, "learning_rate": 0.0001, "loss": 1.5532, "loss/crossentropy": 2.43245792388916, "loss/hidden": 1.328125, "loss/logits": 0.22360533475875854, "loss/reg": 0.00015155358414631337, "step": 886 }, { "epoch": 0.110875, "grad_norm": 2.38519287109375, "grad_norm_var": 0.6246205889238219, "learning_rate": 0.0001, "loss": 1.2289, "loss/crossentropy": 2.7643001079559326, "loss/hidden": 1.046875, "loss/logits": 0.18053092062473297, "loss/reg": 0.00015148980310186744, "step": 887 }, { "epoch": 0.111, "grad_norm": 3.681467294692993, "grad_norm_var": 0.6383202385604402, "learning_rate": 0.0001, "loss": 1.599, "loss/crossentropy": 2.4496991634368896, "loss/hidden": 1.3359375, "loss/logits": 0.26154834032058716, "loss/reg": 0.0001514303294243291, "step": 888 }, { "epoch": 0.111125, "grad_norm": 9.071762084960938, "grad_norm_var": 2.9499914088007007, "learning_rate": 0.0001, "loss": 1.7924, "loss/crossentropy": 2.018651247024536, "loss/hidden": 1.5859375, "loss/logits": 0.20496124029159546, "loss/reg": 0.00015137075388338417, "step": 889 }, { "epoch": 0.11125, "grad_norm": 2.2700443267822266, "grad_norm_var": 2.993983512946158, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.2610602378845215, "loss/hidden": 1.140625, "loss/logits": 0.18059572577476501, "loss/reg": 0.00015131563122849911, "step": 890 }, { "epoch": 0.111375, "grad_norm": 3.127906084060669, "grad_norm_var": 2.9894288634523964, "learning_rate": 0.0001, "loss": 1.4859, "loss/crossentropy": 2.588167905807495, "loss/hidden": 1.2734375, "loss/logits": 0.21090903878211975, "loss/reg": 0.00015125975187402219, "step": 891 }, { "epoch": 0.1115, "grad_norm": 2.2491743564605713, "grad_norm_var": 3.018356170529423, "learning_rate": 0.0001, "loss": 1.328, "loss/crossentropy": 2.481363296508789, "loss/hidden": 1.125, "loss/logits": 0.20145750045776367, "loss/reg": 0.0001511952723376453, "step": 892 }, { "epoch": 0.111625, "grad_norm": 2.470684289932251, "grad_norm_var": 2.9985934737489477, "learning_rate": 0.0001, "loss": 1.2376, "loss/crossentropy": 2.4518485069274902, "loss/hidden": 1.0546875, "loss/logits": 0.1814129650592804, "loss/reg": 0.00015114144480321556, "step": 893 }, { "epoch": 0.11175, "grad_norm": 2.268484115600586, "grad_norm_var": 2.9767600626835087, "learning_rate": 0.0001, "loss": 1.2324, "loss/crossentropy": 2.4263572692871094, "loss/hidden": 1.0546875, "loss/logits": 0.17615455389022827, "loss/reg": 0.00015106963110156357, "step": 894 }, { "epoch": 0.111875, "grad_norm": 3.0133771896362305, "grad_norm_var": 2.9458124817155817, "learning_rate": 0.0001, "loss": 1.2304, "loss/crossentropy": 2.695824146270752, "loss/hidden": 1.0625, "loss/logits": 0.16641905903816223, "loss/reg": 0.0001510050060460344, "step": 895 }, { "epoch": 0.112, "grad_norm": 2.391716957092285, "grad_norm_var": 2.9782623053100448, "learning_rate": 0.0001, "loss": 1.1129, "loss/crossentropy": 2.694415330886841, "loss/hidden": 0.97265625, "loss/logits": 0.1387733668088913, "loss/reg": 0.00015094956324901432, "step": 896 }, { "epoch": 0.112125, "grad_norm": 2.405618667602539, "grad_norm_var": 2.9996790402730955, "learning_rate": 0.0001, "loss": 1.3141, "loss/crossentropy": 2.386396646499634, "loss/hidden": 1.140625, "loss/logits": 0.17200858891010284, "loss/reg": 0.0001508957357145846, "step": 897 }, { "epoch": 0.11225, "grad_norm": 2.339975357055664, "grad_norm_var": 3.0089251569192372, "learning_rate": 0.0001, "loss": 1.1665, "loss/crossentropy": 2.853754758834839, "loss/hidden": 1.0078125, "loss/logits": 0.1571904718875885, "loss/reg": 0.0001508403947809711, "step": 898 }, { "epoch": 0.112375, "grad_norm": 2.3195629119873047, "grad_norm_var": 3.020531071010045, "learning_rate": 0.0001, "loss": 1.3257, "loss/crossentropy": 2.605168342590332, "loss/hidden": 1.140625, "loss/logits": 0.18355190753936768, "loss/reg": 0.00015079096192494035, "step": 899 }, { "epoch": 0.1125, "grad_norm": 2.1336631774902344, "grad_norm_var": 2.996903858717451, "learning_rate": 0.0001, "loss": 1.1899, "loss/crossentropy": 2.6083362102508545, "loss/hidden": 1.0, "loss/logits": 0.1883779913187027, "loss/reg": 0.00015073452959768474, "step": 900 }, { "epoch": 0.112625, "grad_norm": 1.9317057132720947, "grad_norm_var": 3.03741275675761, "learning_rate": 0.0001, "loss": 1.183, "loss/crossentropy": 2.36804461479187, "loss/hidden": 1.015625, "loss/logits": 0.16587695479393005, "loss/reg": 0.000150699372170493, "step": 901 }, { "epoch": 0.11275, "grad_norm": 2.2094831466674805, "grad_norm_var": 2.905908161240213, "learning_rate": 0.0001, "loss": 1.2755, "loss/crossentropy": 2.3709306716918945, "loss/hidden": 1.1015625, "loss/logits": 0.1724090874195099, "loss/reg": 0.00015065635670907795, "step": 902 }, { "epoch": 0.112875, "grad_norm": 6.73660945892334, "grad_norm_var": 3.7953700501366976, "learning_rate": 0.0001, "loss": 1.9898, "loss/crossentropy": 2.709784984588623, "loss/hidden": 1.6875, "loss/logits": 0.30080917477607727, "loss/reg": 0.00015060049190651625, "step": 903 }, { "epoch": 0.113, "grad_norm": 2.7410106658935547, "grad_norm_var": 3.785739642069757, "learning_rate": 0.0001, "loss": 1.4295, "loss/crossentropy": 2.2827908992767334, "loss/hidden": 1.203125, "loss/logits": 0.22488418221473694, "loss/reg": 0.00015055955736897886, "step": 904 }, { "epoch": 0.113125, "grad_norm": 2.3902018070220947, "grad_norm_var": 1.2603485684773013, "learning_rate": 0.0001, "loss": 1.2708, "loss/crossentropy": 2.530529260635376, "loss/hidden": 1.078125, "loss/logits": 0.1912027895450592, "loss/reg": 0.00015051972877699882, "step": 905 }, { "epoch": 0.11325, "grad_norm": 2.857919216156006, "grad_norm_var": 1.2492306426764683, "learning_rate": 0.0001, "loss": 1.2628, "loss/crossentropy": 2.6943166255950928, "loss/hidden": 1.078125, "loss/logits": 0.18316224217414856, "loss/reg": 0.00015046296175569296, "step": 906 }, { "epoch": 0.113375, "grad_norm": 4.934794902801514, "grad_norm_var": 1.5505454702081958, "learning_rate": 0.0001, "loss": 2.058, "loss/crossentropy": 2.7537894248962402, "loss/hidden": 1.640625, "loss/logits": 0.4158385396003723, "loss/reg": 0.00015040210564620793, "step": 907 }, { "epoch": 0.1135, "grad_norm": 3.091104745864868, "grad_norm_var": 1.528846718849233, "learning_rate": 0.0001, "loss": 1.5923, "loss/crossentropy": 2.435492992401123, "loss/hidden": 1.296875, "loss/logits": 0.2939673364162445, "loss/reg": 0.000150355976074934, "step": 908 }, { "epoch": 0.113625, "grad_norm": 2.9427144527435303, "grad_norm_var": 1.5163979560166192, "learning_rate": 0.0001, "loss": 1.3449, "loss/crossentropy": 2.2136189937591553, "loss/hidden": 1.1640625, "loss/logits": 0.17936971783638, "loss/reg": 0.0001503032835898921, "step": 909 }, { "epoch": 0.11375, "grad_norm": 2.8414390087127686, "grad_norm_var": 1.4872009627733023, "learning_rate": 0.0001, "loss": 1.3254, "loss/crossentropy": 2.7871148586273193, "loss/hidden": 1.140625, "loss/logits": 0.18329131603240967, "loss/reg": 0.00015026138862594962, "step": 910 }, { "epoch": 0.113875, "grad_norm": 3.5800082683563232, "grad_norm_var": 1.511674093118965, "learning_rate": 0.0001, "loss": 1.3549, "loss/crossentropy": 2.7723023891448975, "loss/hidden": 1.140625, "loss/logits": 0.21274316310882568, "loss/reg": 0.00015020478167571127, "step": 911 }, { "epoch": 0.114, "grad_norm": 2.4420511722564697, "grad_norm_var": 1.5078140667984805, "learning_rate": 0.0001, "loss": 1.2617, "loss/crossentropy": 2.705547571182251, "loss/hidden": 1.0703125, "loss/logits": 0.18991434574127197, "loss/reg": 0.00015014635573606938, "step": 912 }, { "epoch": 0.114125, "grad_norm": 2.490351676940918, "grad_norm_var": 1.5016197544719638, "learning_rate": 0.0001, "loss": 1.3055, "loss/crossentropy": 2.588766098022461, "loss/hidden": 1.1015625, "loss/logits": 0.20243065059185028, "loss/reg": 0.00015010114293545485, "step": 913 }, { "epoch": 0.11425, "grad_norm": 2.2149529457092285, "grad_norm_var": 1.5135809174202883, "learning_rate": 0.0001, "loss": 1.3597, "loss/crossentropy": 2.444214344024658, "loss/hidden": 1.140625, "loss/logits": 0.21759945154190063, "loss/reg": 0.0001500437210779637, "step": 914 }, { "epoch": 0.114375, "grad_norm": 2.3366408348083496, "grad_norm_var": 1.5120700218817014, "learning_rate": 0.0001, "loss": 1.3578, "loss/crossentropy": 2.3637807369232178, "loss/hidden": 1.171875, "loss/logits": 0.1844063699245453, "loss/reg": 0.0001499952340964228, "step": 915 }, { "epoch": 0.1145, "grad_norm": 2.4280269145965576, "grad_norm_var": 1.4837907127262306, "learning_rate": 0.0001, "loss": 1.3025, "loss/crossentropy": 2.5469183921813965, "loss/hidden": 1.125, "loss/logits": 0.17600534856319427, "loss/reg": 0.0001499430654803291, "step": 916 }, { "epoch": 0.114625, "grad_norm": 2.4685251712799072, "grad_norm_var": 1.4245814161871104, "learning_rate": 0.0001, "loss": 1.2751, "loss/crossentropy": 2.401733160018921, "loss/hidden": 1.109375, "loss/logits": 0.16419798135757446, "loss/reg": 0.00014990134513936937, "step": 917 }, { "epoch": 0.11475, "grad_norm": 2.449252128601074, "grad_norm_var": 1.4014919895573825, "learning_rate": 0.0001, "loss": 1.4316, "loss/crossentropy": 2.2170298099517822, "loss/hidden": 1.21875, "loss/logits": 0.21135510504245758, "loss/reg": 0.00014985818415880203, "step": 918 }, { "epoch": 0.114875, "grad_norm": 2.6959264278411865, "grad_norm_var": 0.4406503872682907, "learning_rate": 0.0001, "loss": 1.5224, "loss/crossentropy": 2.3303816318511963, "loss/hidden": 1.28125, "loss/logits": 0.23968294262886047, "loss/reg": 0.00014980306150391698, "step": 919 }, { "epoch": 0.115, "grad_norm": 2.823617696762085, "grad_norm_var": 0.44035493155154865, "learning_rate": 0.0001, "loss": 1.5936, "loss/crossentropy": 2.505021095275879, "loss/hidden": 1.359375, "loss/logits": 0.23276162147521973, "loss/reg": 0.00014975851809140295, "step": 920 }, { "epoch": 0.115125, "grad_norm": 2.3678812980651855, "grad_norm_var": 0.44164053748940524, "learning_rate": 0.0001, "loss": 1.3888, "loss/crossentropy": 2.6819827556610107, "loss/hidden": 1.1875, "loss/logits": 0.1997576355934143, "loss/reg": 0.00014970805204939097, "step": 921 }, { "epoch": 0.11525, "grad_norm": 2.670546054840088, "grad_norm_var": 0.44264579155321804, "learning_rate": 0.0001, "loss": 1.214, "loss/crossentropy": 2.3917977809906006, "loss/hidden": 1.0546875, "loss/logits": 0.1578504592180252, "loss/reg": 0.000149664978380315, "step": 922 }, { "epoch": 0.115375, "grad_norm": 3.0978972911834717, "grad_norm_var": 0.1303402458161789, "learning_rate": 0.0001, "loss": 1.3489, "loss/crossentropy": 2.5284368991851807, "loss/hidden": 1.140625, "loss/logits": 0.20675615966320038, "loss/reg": 0.00014962753630243242, "step": 923 }, { "epoch": 0.1155, "grad_norm": 3.1418039798736572, "grad_norm_var": 0.13325417757332994, "learning_rate": 0.0001, "loss": 1.2896, "loss/crossentropy": 2.4776134490966797, "loss/hidden": 1.109375, "loss/logits": 0.17872066795825958, "loss/reg": 0.00014957292296458036, "step": 924 }, { "epoch": 0.115625, "grad_norm": 3.0424060821533203, "grad_norm_var": 0.13727464390597272, "learning_rate": 0.0001, "loss": 1.3307, "loss/crossentropy": 2.367518901824951, "loss/hidden": 1.15625, "loss/logits": 0.17294055223464966, "loss/reg": 0.0001495258038630709, "step": 925 }, { "epoch": 0.11575, "grad_norm": 3.4168553352355957, "grad_norm_var": 0.16934125140121903, "learning_rate": 0.0001, "loss": 1.7875, "loss/crossentropy": 2.6515471935272217, "loss/hidden": 1.53125, "loss/logits": 0.25479888916015625, "loss/reg": 0.00014946791634429246, "step": 926 }, { "epoch": 0.115875, "grad_norm": 3.301290512084961, "grad_norm_var": 0.14257736528365186, "learning_rate": 0.0001, "loss": 1.4197, "loss/crossentropy": 2.5125603675842285, "loss/hidden": 1.21875, "loss/logits": 0.19941125810146332, "loss/reg": 0.0001494106400059536, "step": 927 }, { "epoch": 0.116, "grad_norm": 2.221191644668579, "grad_norm_var": 0.15356816953164198, "learning_rate": 0.0001, "loss": 1.245, "loss/crossentropy": 2.541898488998413, "loss/hidden": 1.0625, "loss/logits": 0.1809685081243515, "loss/reg": 0.00014935850049369037, "step": 928 }, { "epoch": 0.116125, "grad_norm": 3.8448290824890137, "grad_norm_var": 0.23073998675936855, "learning_rate": 0.0001, "loss": 1.6885, "loss/crossentropy": 3.0520853996276855, "loss/hidden": 1.125, "loss/logits": 0.561988353729248, "loss/reg": 0.0001492997835157439, "step": 929 }, { "epoch": 0.11625, "grad_norm": 2.722247838973999, "grad_norm_var": 0.20842879984021845, "learning_rate": 0.0001, "loss": 1.1592, "loss/crossentropy": 2.44636869430542, "loss/hidden": 1.0078125, "loss/logits": 0.14993345737457275, "loss/reg": 0.00014924356946721673, "step": 930 }, { "epoch": 0.116375, "grad_norm": 2.0081658363342285, "grad_norm_var": 0.236092546569652, "learning_rate": 0.0001, "loss": 1.1461, "loss/crossentropy": 2.5273056030273438, "loss/hidden": 0.99609375, "loss/logits": 0.14855071902275085, "loss/reg": 0.00014918258239049464, "step": 931 }, { "epoch": 0.1165, "grad_norm": 2.3488662242889404, "grad_norm_var": 0.24034462204346027, "learning_rate": 0.0001, "loss": 1.2849, "loss/crossentropy": 2.4500248432159424, "loss/hidden": 1.109375, "loss/logits": 0.1739884912967682, "loss/reg": 0.00014912831829860806, "step": 932 }, { "epoch": 0.116625, "grad_norm": 2.7105705738067627, "grad_norm_var": 0.23366909184424003, "learning_rate": 0.0001, "loss": 1.2757, "loss/crossentropy": 2.4227216243743896, "loss/hidden": 1.1171875, "loss/logits": 0.15702617168426514, "loss/reg": 0.00014907700824551284, "step": 933 }, { "epoch": 0.11675, "grad_norm": 1.8577007055282593, "grad_norm_var": 0.2835169100884736, "learning_rate": 0.0001, "loss": 1.1809, "loss/crossentropy": 2.497603178024292, "loss/hidden": 1.03125, "loss/logits": 0.148163303732872, "loss/reg": 0.0001490200957050547, "step": 934 }, { "epoch": 0.116875, "grad_norm": 2.1681952476501465, "grad_norm_var": 0.3059233099388772, "learning_rate": 0.0001, "loss": 1.2118, "loss/crossentropy": 2.529005527496338, "loss/hidden": 1.046875, "loss/logits": 0.1634415090084076, "loss/reg": 0.00014895829372107983, "step": 935 }, { "epoch": 0.117, "grad_norm": 2.1570377349853516, "grad_norm_var": 0.32572924463601627, "learning_rate": 0.0001, "loss": 1.2363, "loss/crossentropy": 2.529071092605591, "loss/hidden": 1.0625, "loss/logits": 0.17230528593063354, "loss/reg": 0.00014889417798258364, "step": 936 }, { "epoch": 0.117125, "grad_norm": 2.0456464290618896, "grad_norm_var": 0.3461593278951819, "learning_rate": 0.0001, "loss": 1.2757, "loss/crossentropy": 2.3965156078338623, "loss/hidden": 1.078125, "loss/logits": 0.19612111151218414, "loss/reg": 0.0001488511188654229, "step": 937 }, { "epoch": 0.11725, "grad_norm": 2.402035713195801, "grad_norm_var": 0.35072476729742913, "learning_rate": 0.0001, "loss": 1.3166, "loss/crossentropy": 2.943227529525757, "loss/hidden": 1.1171875, "loss/logits": 0.19789212942123413, "loss/reg": 0.00014880574599374086, "step": 938 }, { "epoch": 0.117375, "grad_norm": 2.7098913192749023, "grad_norm_var": 0.3372429448317106, "learning_rate": 0.0001, "loss": 1.1745, "loss/crossentropy": 2.561946392059326, "loss/hidden": 1.015625, "loss/logits": 0.15740427374839783, "loss/reg": 0.00014876823115628213, "step": 939 }, { "epoch": 0.1175, "grad_norm": 3.8419198989868164, "grad_norm_var": 0.4155450691524208, "learning_rate": 0.0001, "loss": 1.8435, "loss/crossentropy": 2.494044780731201, "loss/hidden": 1.453125, "loss/logits": 0.38888823986053467, "loss/reg": 0.00014872981410007924, "step": 940 }, { "epoch": 0.117625, "grad_norm": 4.364389896392822, "grad_norm_var": 0.5895459640292117, "learning_rate": 0.0001, "loss": 2.0013, "loss/crossentropy": 2.4075002670288086, "loss/hidden": 1.6640625, "loss/logits": 0.33570897579193115, "loss/reg": 0.00014869867300149053, "step": 941 }, { "epoch": 0.11775, "grad_norm": 2.411377429962158, "grad_norm_var": 0.5643439361590751, "learning_rate": 0.0001, "loss": 1.0814, "loss/crossentropy": 2.4931085109710693, "loss/hidden": 0.94140625, "loss/logits": 0.13845817744731903, "loss/reg": 0.00014864426339045167, "step": 942 }, { "epoch": 0.117875, "grad_norm": 3.5457420349121094, "grad_norm_var": 0.5878493323033112, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.0147228240966797, "loss/hidden": 1.328125, "loss/logits": 0.2578049898147583, "loss/reg": 0.00014859173097647727, "step": 943 }, { "epoch": 0.118, "grad_norm": 2.3871850967407227, "grad_norm_var": 0.5787531810576927, "learning_rate": 0.0001, "loss": 1.2401, "loss/crossentropy": 2.6504478454589844, "loss/hidden": 1.0703125, "loss/logits": 0.16834473609924316, "loss/reg": 0.0001485378888901323, "step": 944 }, { "epoch": 0.118125, "grad_norm": 2.712791919708252, "grad_norm_var": 0.48912238841050587, "learning_rate": 0.0001, "loss": 1.3422, "loss/crossentropy": 2.171556234359741, "loss/hidden": 1.171875, "loss/logits": 0.1688719093799591, "loss/reg": 0.00014849254512228072, "step": 945 }, { "epoch": 0.11825, "grad_norm": 2.4678895473480225, "grad_norm_var": 0.49070255890865294, "learning_rate": 0.0001, "loss": 1.1581, "loss/crossentropy": 2.8594019412994385, "loss/hidden": 1.0, "loss/logits": 0.15663599967956543, "loss/reg": 0.00014843452663626522, "step": 946 }, { "epoch": 0.118375, "grad_norm": 2.1392078399658203, "grad_norm_var": 0.48084608478125684, "learning_rate": 0.0001, "loss": 1.2969, "loss/crossentropy": 2.3068058490753174, "loss/hidden": 1.125, "loss/logits": 0.17045344412326813, "loss/reg": 0.00014837455819360912, "step": 947 }, { "epoch": 0.1185, "grad_norm": 3.274775266647339, "grad_norm_var": 0.49825108989664757, "learning_rate": 0.0001, "loss": 1.5165, "loss/crossentropy": 2.6716904640197754, "loss/hidden": 1.28125, "loss/logits": 0.233724445104599, "loss/reg": 0.0001483206870034337, "step": 948 }, { "epoch": 0.118625, "grad_norm": 7.424248695373535, "grad_norm_var": 1.8937102968327844, "learning_rate": 0.0001, "loss": 2.0225, "loss/crossentropy": 2.5134148597717285, "loss/hidden": 1.6328125, "loss/logits": 0.3882126808166504, "loss/reg": 0.00014826534606982023, "step": 949 }, { "epoch": 0.11875, "grad_norm": 2.4559290409088135, "grad_norm_var": 1.825412008655575, "learning_rate": 0.0001, "loss": 1.3578, "loss/crossentropy": 2.448934316635132, "loss/hidden": 1.15625, "loss/logits": 0.20004288852214813, "loss/reg": 0.00014820077922195196, "step": 950 }, { "epoch": 0.118875, "grad_norm": 4.2732038497924805, "grad_norm_var": 1.859976694890262, "learning_rate": 0.0001, "loss": 1.689, "loss/crossentropy": 2.7985315322875977, "loss/hidden": 1.4453125, "loss/logits": 0.24216260015964508, "loss/reg": 0.000148153500049375, "step": 951 }, { "epoch": 0.119, "grad_norm": 3.409996271133423, "grad_norm_var": 1.7899835186384694, "learning_rate": 0.0001, "loss": 1.4178, "loss/crossentropy": 2.5561344623565674, "loss/hidden": 1.21875, "loss/logits": 0.19758714735507965, "loss/reg": 0.00014810002176091075, "step": 952 }, { "epoch": 0.119125, "grad_norm": 2.5779950618743896, "grad_norm_var": 1.7228043479274937, "learning_rate": 0.0001, "loss": 1.3552, "loss/crossentropy": 2.709462881088257, "loss/hidden": 1.140625, "loss/logits": 0.21312478184700012, "loss/reg": 0.00014803760859649628, "step": 953 }, { "epoch": 0.11925, "grad_norm": 2.2351348400115967, "grad_norm_var": 1.7439698321178563, "learning_rate": 0.0001, "loss": 1.2243, "loss/crossentropy": 2.610175609588623, "loss/hidden": 1.0546875, "loss/logits": 0.16812941431999207, "loss/reg": 0.0001479786296840757, "step": 954 }, { "epoch": 0.119375, "grad_norm": 3.175199270248413, "grad_norm_var": 1.723094538227237, "learning_rate": 0.0001, "loss": 1.4206, "loss/crossentropy": 2.215555429458618, "loss/hidden": 1.234375, "loss/logits": 0.18472649157047272, "loss/reg": 0.00014792606816627085, "step": 955 }, { "epoch": 0.1195, "grad_norm": 2.332378387451172, "grad_norm_var": 1.7551449841277718, "learning_rate": 0.0001, "loss": 1.1306, "loss/crossentropy": 2.428434133529663, "loss/hidden": 0.98046875, "loss/logits": 0.14862212538719177, "loss/reg": 0.00014787526743020862, "step": 956 }, { "epoch": 0.119625, "grad_norm": 2.2376108169555664, "grad_norm_var": 1.7074351121266504, "learning_rate": 0.0001, "loss": 1.2301, "loss/crossentropy": 2.388077974319458, "loss/hidden": 1.046875, "loss/logits": 0.1817147433757782, "loss/reg": 0.00014781762729398906, "step": 957 }, { "epoch": 0.11975, "grad_norm": 2.3867716789245605, "grad_norm_var": 1.709621572934992, "learning_rate": 0.0001, "loss": 1.2682, "loss/crossentropy": 2.3468499183654785, "loss/hidden": 1.0859375, "loss/logits": 0.18082332611083984, "loss/reg": 0.00014776407624594867, "step": 958 }, { "epoch": 0.119875, "grad_norm": 2.6116244792938232, "grad_norm_var": 1.70425093197908, "learning_rate": 0.0001, "loss": 1.8446, "loss/crossentropy": 2.268183469772339, "loss/hidden": 1.53125, "loss/logits": 0.31190574169158936, "loss/reg": 0.0001477039622841403, "step": 959 }, { "epoch": 0.12, "grad_norm": 3.1699044704437256, "grad_norm_var": 1.6779216532387236, "learning_rate": 0.0001, "loss": 1.4486, "loss/crossentropy": 2.4038498401641846, "loss/hidden": 1.2578125, "loss/logits": 0.18929308652877808, "loss/reg": 0.00014764105435460806, "step": 960 }, { "epoch": 0.120125, "grad_norm": 5.222793102264404, "grad_norm_var": 1.9570550810201663, "learning_rate": 0.0001, "loss": 1.5336, "loss/crossentropy": 2.8360648155212402, "loss/hidden": 1.296875, "loss/logits": 0.2351997345685959, "loss/reg": 0.00014758027100469917, "step": 961 }, { "epoch": 0.12025, "grad_norm": 2.709230899810791, "grad_norm_var": 1.9367454626425484, "learning_rate": 0.0001, "loss": 1.2892, "loss/crossentropy": 2.529167413711548, "loss/hidden": 1.09375, "loss/logits": 0.19396063685417175, "loss/reg": 0.00014752705465070903, "step": 962 }, { "epoch": 0.120375, "grad_norm": 2.3209972381591797, "grad_norm_var": 1.9124383142767698, "learning_rate": 0.0001, "loss": 1.1866, "loss/crossentropy": 2.4871509075164795, "loss/hidden": 1.015625, "loss/logits": 0.16945767402648926, "loss/reg": 0.0001474728196626529, "step": 963 }, { "epoch": 0.1205, "grad_norm": 2.009201765060425, "grad_norm_var": 2.006440793779589, "learning_rate": 0.0001, "loss": 1.3007, "loss/crossentropy": 2.7829058170318604, "loss/hidden": 1.1171875, "loss/logits": 0.18204209208488464, "loss/reg": 0.0001474215096095577, "step": 964 }, { "epoch": 0.120625, "grad_norm": 2.7747392654418945, "grad_norm_var": 0.7137051972196019, "learning_rate": 0.0001, "loss": 1.2522, "loss/crossentropy": 2.5970277786254883, "loss/hidden": 1.0859375, "loss/logits": 0.1647624373435974, "loss/reg": 0.00014737599121872336, "step": 965 }, { "epoch": 0.12075, "grad_norm": 2.899587631225586, "grad_norm_var": 0.701577026723718, "learning_rate": 0.0001, "loss": 1.4427, "loss/crossentropy": 2.6986851692199707, "loss/hidden": 1.21875, "loss/logits": 0.22245578467845917, "loss/reg": 0.00014732121780980378, "step": 966 }, { "epoch": 0.120875, "grad_norm": 2.635730028152466, "grad_norm_var": 0.5686163506293487, "learning_rate": 0.0001, "loss": 1.1734, "loss/crossentropy": 2.6989428997039795, "loss/hidden": 1.015625, "loss/logits": 0.15630163252353668, "loss/reg": 0.00014726696826983243, "step": 967 }, { "epoch": 0.121, "grad_norm": 3.7007603645324707, "grad_norm_var": 0.5977697538945428, "learning_rate": 0.0001, "loss": 1.3282, "loss/crossentropy": 2.8170976638793945, "loss/hidden": 1.1484375, "loss/logits": 0.17827363312244415, "loss/reg": 0.00014721987827215344, "step": 968 }, { "epoch": 0.121125, "grad_norm": 3.2693941593170166, "grad_norm_var": 0.606030561047087, "learning_rate": 0.0001, "loss": 1.3601, "loss/crossentropy": 2.5672309398651123, "loss/hidden": 1.171875, "loss/logits": 0.18672838807106018, "loss/reg": 0.00014716546866111457, "step": 969 }, { "epoch": 0.12125, "grad_norm": 4.545632362365723, "grad_norm_var": 0.7485079772549009, "learning_rate": 0.0001, "loss": 1.4174, "loss/crossentropy": 2.4874889850616455, "loss/hidden": 1.21875, "loss/logits": 0.19721585512161255, "loss/reg": 0.00014711670519318432, "step": 970 }, { "epoch": 0.121375, "grad_norm": 2.418478488922119, "grad_norm_var": 0.7666300113773291, "learning_rate": 0.0001, "loss": 1.3676, "loss/crossentropy": 2.4417524337768555, "loss/hidden": 1.1640625, "loss/logits": 0.2021140456199646, "loss/reg": 0.0001470686256652698, "step": 971 }, { "epoch": 0.1215, "grad_norm": 2.9754316806793213, "grad_norm_var": 0.7392794477143515, "learning_rate": 0.0001, "loss": 1.6187, "loss/crossentropy": 2.4728047847747803, "loss/hidden": 1.375, "loss/logits": 0.2422417551279068, "loss/reg": 0.0001470104034524411, "step": 972 }, { "epoch": 0.121625, "grad_norm": 2.6117324829101562, "grad_norm_var": 0.7103467397437261, "learning_rate": 0.0001, "loss": 1.3077, "loss/crossentropy": 2.5421762466430664, "loss/hidden": 1.1171875, "loss/logits": 0.18908782303333282, "loss/reg": 0.00014695858408231288, "step": 973 }, { "epoch": 0.12175, "grad_norm": 2.6674041748046875, "grad_norm_var": 0.6917105916342811, "learning_rate": 0.0001, "loss": 1.4841, "loss/crossentropy": 2.3860273361206055, "loss/hidden": 1.2890625, "loss/logits": 0.19360515475273132, "loss/reg": 0.0001469059061491862, "step": 974 }, { "epoch": 0.121875, "grad_norm": 3.3765125274658203, "grad_norm_var": 0.6852091033743872, "learning_rate": 0.0001, "loss": 1.4032, "loss/crossentropy": 2.3495490550994873, "loss/hidden": 1.1640625, "loss/logits": 0.23769448697566986, "loss/reg": 0.00014685816131532192, "step": 975 }, { "epoch": 0.122, "grad_norm": 2.4105846881866455, "grad_norm_var": 0.7123165505574147, "learning_rate": 0.0001, "loss": 1.1448, "loss/crossentropy": 2.578697443008423, "loss/hidden": 0.99609375, "loss/logits": 0.14723609387874603, "loss/reg": 0.00014681293396279216, "step": 976 }, { "epoch": 0.122125, "grad_norm": 2.4042739868164062, "grad_norm_var": 0.3863645525793475, "learning_rate": 0.0001, "loss": 1.3246, "loss/crossentropy": 2.5354533195495605, "loss/hidden": 1.1328125, "loss/logits": 0.19027411937713623, "loss/reg": 0.00014675200509373099, "step": 977 }, { "epoch": 0.12225, "grad_norm": 3.220013380050659, "grad_norm_var": 0.392531703022154, "learning_rate": 0.0001, "loss": 1.6537, "loss/crossentropy": 2.398271083831787, "loss/hidden": 1.3984375, "loss/logits": 0.2537580132484436, "loss/reg": 0.00014670556993223727, "step": 978 }, { "epoch": 0.122375, "grad_norm": 2.3881642818450928, "grad_norm_var": 0.387717636373484, "learning_rate": 0.0001, "loss": 1.4603, "loss/crossentropy": 2.569382667541504, "loss/hidden": 1.2109375, "loss/logits": 0.24794355034828186, "loss/reg": 0.00014665084017906338, "step": 979 }, { "epoch": 0.1225, "grad_norm": 6.624855041503906, "grad_norm_var": 1.17457061120339, "learning_rate": 0.0001, "loss": 1.8472, "loss/crossentropy": 2.4095969200134277, "loss/hidden": 1.5859375, "loss/logits": 0.25981831550598145, "loss/reg": 0.0001465911918785423, "step": 980 }, { "epoch": 0.122625, "grad_norm": 3.4011709690093994, "grad_norm_var": 1.1650215550513543, "learning_rate": 0.0001, "loss": 1.496, "loss/crossentropy": 2.9182286262512207, "loss/hidden": 1.28125, "loss/logits": 0.21328431367874146, "loss/reg": 0.00014653308608103544, "step": 981 }, { "epoch": 0.12275, "grad_norm": 3.284522771835327, "grad_norm_var": 1.15774207678105, "learning_rate": 0.0001, "loss": 1.3609, "loss/crossentropy": 2.656087636947632, "loss/hidden": 1.1796875, "loss/logits": 0.17970861494541168, "loss/reg": 0.00014647809439338744, "step": 982 }, { "epoch": 0.122875, "grad_norm": 3.3832926750183105, "grad_norm_var": 1.131849863957196, "learning_rate": 0.0001, "loss": 1.3712, "loss/crossentropy": 2.6478617191314697, "loss/hidden": 1.140625, "loss/logits": 0.22913646697998047, "loss/reg": 0.00014642714813817292, "step": 983 }, { "epoch": 0.123, "grad_norm": 2.8698549270629883, "grad_norm_var": 1.1297854031190542, "learning_rate": 0.0001, "loss": 1.2087, "loss/crossentropy": 2.720348834991455, "loss/hidden": 1.03125, "loss/logits": 0.17595639824867249, "loss/reg": 0.00014636846026405692, "step": 984 }, { "epoch": 0.123125, "grad_norm": 2.4347264766693115, "grad_norm_var": 1.1701347668171098, "learning_rate": 0.0001, "loss": 1.314, "loss/crossentropy": 2.8232991695404053, "loss/hidden": 1.109375, "loss/logits": 0.2031865417957306, "loss/reg": 0.00014630808436777443, "step": 985 }, { "epoch": 0.12325, "grad_norm": 2.9429919719696045, "grad_norm_var": 1.0406726219439018, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.5900399684906006, "loss/hidden": 1.1953125, "loss/logits": 0.22433730959892273, "loss/reg": 0.00014624485629610717, "step": 986 }, { "epoch": 0.123375, "grad_norm": 1.982164740562439, "grad_norm_var": 1.0915421066841848, "learning_rate": 0.0001, "loss": 1.1942, "loss/crossentropy": 2.5158679485321045, "loss/hidden": 1.046875, "loss/logits": 0.145903080701828, "loss/reg": 0.00014618723071180284, "step": 987 }, { "epoch": 0.1235, "grad_norm": 2.650486946105957, "grad_norm_var": 1.10185334884634, "learning_rate": 0.0001, "loss": 1.2462, "loss/crossentropy": 2.2769696712493896, "loss/hidden": 1.078125, "loss/logits": 0.16664837300777435, "loss/reg": 0.00014612126688007265, "step": 988 }, { "epoch": 0.123625, "grad_norm": 3.147040605545044, "grad_norm_var": 1.0891387933103824, "learning_rate": 0.0001, "loss": 1.5204, "loss/crossentropy": 2.4655423164367676, "loss/hidden": 1.2734375, "loss/logits": 0.2455189824104309, "loss/reg": 0.0001460643543396145, "step": 989 }, { "epoch": 0.12375, "grad_norm": 3.033003091812134, "grad_norm_var": 1.0776602014398222, "learning_rate": 0.0001, "loss": 1.448, "loss/crossentropy": 2.562800168991089, "loss/hidden": 1.234375, "loss/logits": 0.21214446425437927, "loss/reg": 0.00014600624854210764, "step": 990 }, { "epoch": 0.123875, "grad_norm": 4.266551971435547, "grad_norm_var": 1.1603288242162992, "learning_rate": 0.0001, "loss": 1.3264, "loss/crossentropy": 2.132007122039795, "loss/hidden": 1.140625, "loss/logits": 0.1842939853668213, "loss/reg": 0.0001459480554331094, "step": 991 }, { "epoch": 0.124, "grad_norm": 3.529345750808716, "grad_norm_var": 1.1278508591841108, "learning_rate": 0.0001, "loss": 1.4896, "loss/crossentropy": 2.628926992416382, "loss/hidden": 1.2421875, "loss/logits": 0.2459276020526886, "loss/reg": 0.00014588667545467615, "step": 992 }, { "epoch": 0.124125, "grad_norm": 4.71246337890625, "grad_norm_var": 1.2089711105568222, "learning_rate": 0.0001, "loss": 1.4666, "loss/crossentropy": 2.228299140930176, "loss/hidden": 1.265625, "loss/logits": 0.19956105947494507, "loss/reg": 0.00014582481526304036, "step": 993 }, { "epoch": 0.12425, "grad_norm": 2.3028690814971924, "grad_norm_var": 1.2795072809719072, "learning_rate": 0.0001, "loss": 1.48, "loss/crossentropy": 2.3957412242889404, "loss/hidden": 1.2578125, "loss/logits": 0.22072714567184448, "loss/reg": 0.0001457597827538848, "step": 994 }, { "epoch": 0.124375, "grad_norm": 2.758638620376587, "grad_norm_var": 1.2425700057088225, "learning_rate": 0.0001, "loss": 1.4803, "loss/crossentropy": 2.3968119621276855, "loss/hidden": 1.265625, "loss/logits": 0.2131744921207428, "loss/reg": 0.00014569915947504342, "step": 995 }, { "epoch": 0.1245, "grad_norm": 2.5863823890686035, "grad_norm_var": 0.4892213110580333, "learning_rate": 0.0001, "loss": 1.4225, "loss/crossentropy": 2.4983115196228027, "loss/hidden": 1.2109375, "loss/logits": 0.2101190984249115, "loss/reg": 0.00014564381854142994, "step": 996 }, { "epoch": 0.124625, "grad_norm": 2.3363261222839355, "grad_norm_var": 0.5145389486743288, "learning_rate": 0.0001, "loss": 1.3241, "loss/crossentropy": 2.5176138877868652, "loss/hidden": 1.1328125, "loss/logits": 0.1897883117198944, "loss/reg": 0.0001455901947338134, "step": 997 }, { "epoch": 0.12475, "grad_norm": 3.9978582859039307, "grad_norm_var": 0.5720915664337515, "learning_rate": 0.0001, "loss": 1.6215, "loss/crossentropy": 2.3367302417755127, "loss/hidden": 1.3671875, "loss/logits": 0.2528332769870758, "loss/reg": 0.00014553169603459537, "step": 998 }, { "epoch": 0.124875, "grad_norm": 6.48536491394043, "grad_norm_var": 1.3079089839962283, "learning_rate": 0.0001, "loss": 1.9506, "loss/crossentropy": 2.708216905593872, "loss/hidden": 1.5703125, "loss/logits": 0.37884700298309326, "loss/reg": 0.000145476617035456, "step": 999 }, { "epoch": 0.125, "grad_norm": 3.2179746627807617, "grad_norm_var": 1.2977337562016709, "learning_rate": 0.0001, "loss": 1.6279, "loss/crossentropy": 2.2986085414886475, "loss/hidden": 1.4296875, "loss/logits": 0.1967754364013672, "loss/reg": 0.00014541221025865525, "step": 1000 }, { "epoch": 0.125125, "grad_norm": 2.9530444145202637, "grad_norm_var": 1.25652237968204, "learning_rate": 0.0001, "loss": 1.2811, "loss/crossentropy": 2.5369064807891846, "loss/hidden": 1.1015625, "loss/logits": 0.17807644605636597, "loss/reg": 0.0001453455479349941, "step": 1001 }, { "epoch": 0.12525, "grad_norm": 2.215806722640991, "grad_norm_var": 1.3248082546112594, "learning_rate": 0.0001, "loss": 1.2207, "loss/crossentropy": 2.415578603744507, "loss/hidden": 1.046875, "loss/logits": 0.17236942052841187, "loss/reg": 0.00014528623432852328, "step": 1002 }, { "epoch": 0.125375, "grad_norm": 2.3455402851104736, "grad_norm_var": 1.2711032590741032, "learning_rate": 0.0001, "loss": 1.2687, "loss/crossentropy": 2.403144121170044, "loss/hidden": 1.0703125, "loss/logits": 0.19696345925331116, "loss/reg": 0.00014522341371048242, "step": 1003 }, { "epoch": 0.1255, "grad_norm": 2.4876372814178467, "grad_norm_var": 1.2865092154229403, "learning_rate": 0.0001, "loss": 1.458, "loss/crossentropy": 2.2569031715393066, "loss/hidden": 1.265625, "loss/logits": 0.19091898202896118, "loss/reg": 0.0001451568678021431, "step": 1004 }, { "epoch": 0.125625, "grad_norm": 2.3009023666381836, "grad_norm_var": 1.3455219612070162, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.730410575866699, "loss/hidden": 1.1640625, "loss/logits": 0.19117164611816406, "loss/reg": 0.00014509812172036618, "step": 1005 }, { "epoch": 0.12575, "grad_norm": 18.976280212402344, "grad_norm_var": 16.83347483577364, "learning_rate": 0.0001, "loss": 1.279, "loss/crossentropy": 2.369011640548706, "loss/hidden": 1.0859375, "loss/logits": 0.19158004224300385, "loss/reg": 0.00014503761485684663, "step": 1006 }, { "epoch": 0.125875, "grad_norm": 2.385678291320801, "grad_norm_var": 17.042168860176346, "learning_rate": 0.0001, "loss": 1.319, "loss/crossentropy": 2.527763843536377, "loss/hidden": 1.140625, "loss/logits": 0.17694416642189026, "loss/reg": 0.00014497899974230677, "step": 1007 }, { "epoch": 0.126, "grad_norm": 3.135558843612671, "grad_norm_var": 17.081796892075086, "learning_rate": 0.0001, "loss": 1.3286, "loss/crossentropy": 2.307640552520752, "loss/hidden": 1.15625, "loss/logits": 0.17092537879943848, "loss/reg": 0.0001449301780667156, "step": 1008 }, { "epoch": 0.126125, "grad_norm": 2.454467535018921, "grad_norm_var": 17.20850584621065, "learning_rate": 0.0001, "loss": 1.1284, "loss/crossentropy": 2.7389183044433594, "loss/hidden": 0.96875, "loss/logits": 0.15822961926460266, "loss/reg": 0.00014488757005892694, "step": 1009 }, { "epoch": 0.12625, "grad_norm": 2.403377056121826, "grad_norm_var": 17.18728139720366, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.4705183506011963, "loss/hidden": 1.140625, "loss/logits": 0.18289461731910706, "loss/reg": 0.00014484470011666417, "step": 1010 }, { "epoch": 0.126375, "grad_norm": 2.2817978858947754, "grad_norm_var": 17.276605292092572, "learning_rate": 0.0001, "loss": 1.3779, "loss/crossentropy": 2.4313480854034424, "loss/hidden": 1.1640625, "loss/logits": 0.212357297539711, "loss/reg": 0.00014480575919151306, "step": 1011 }, { "epoch": 0.1265, "grad_norm": 2.3840107917785645, "grad_norm_var": 17.314886689351543, "learning_rate": 0.0001, "loss": 1.2505, "loss/crossentropy": 2.232851028442383, "loss/hidden": 1.0859375, "loss/logits": 0.1630796194076538, "loss/reg": 0.00014474951603915542, "step": 1012 }, { "epoch": 0.126625, "grad_norm": 2.1883866786956787, "grad_norm_var": 17.347051132769963, "learning_rate": 0.0001, "loss": 1.3501, "loss/crossentropy": 2.6184241771698, "loss/hidden": 1.140625, "loss/logits": 0.20805229246616364, "loss/reg": 0.00014469443704001606, "step": 1013 }, { "epoch": 0.12675, "grad_norm": 1.9212366342544556, "grad_norm_var": 17.586253997374705, "learning_rate": 0.0001, "loss": 1.1864, "loss/crossentropy": 2.8416061401367188, "loss/hidden": 1.03125, "loss/logits": 0.15368226170539856, "loss/reg": 0.00014464251580648124, "step": 1014 }, { "epoch": 0.126875, "grad_norm": 6.187258243560791, "grad_norm_var": 17.483424650191953, "learning_rate": 0.0001, "loss": 1.6247, "loss/crossentropy": 2.5149781703948975, "loss/hidden": 1.3671875, "loss/logits": 0.25602880120277405, "loss/reg": 0.0001445778034394607, "step": 1015 }, { "epoch": 0.127, "grad_norm": 2.336730718612671, "grad_norm_var": 17.593291483552747, "learning_rate": 0.0001, "loss": 1.2191, "loss/crossentropy": 2.478747844696045, "loss/hidden": 1.0546875, "loss/logits": 0.16301372647285461, "loss/reg": 0.00014452172035817057, "step": 1016 }, { "epoch": 0.127125, "grad_norm": 2.878291130065918, "grad_norm_var": 17.600934790703377, "learning_rate": 0.0001, "loss": 1.3852, "loss/crossentropy": 2.6063098907470703, "loss/hidden": 1.171875, "loss/logits": 0.21186596155166626, "loss/reg": 0.0001444646914023906, "step": 1017 }, { "epoch": 0.12725, "grad_norm": 2.3468618392944336, "grad_norm_var": 17.576419686393706, "learning_rate": 0.0001, "loss": 1.2377, "loss/crossentropy": 2.3034865856170654, "loss/hidden": 1.0625, "loss/logits": 0.1737387478351593, "loss/reg": 0.00014440054656006396, "step": 1018 }, { "epoch": 0.127375, "grad_norm": 3.0489449501037598, "grad_norm_var": 17.48140239945023, "learning_rate": 0.0001, "loss": 1.4616, "loss/crossentropy": 2.5037310123443604, "loss/hidden": 1.234375, "loss/logits": 0.2258032262325287, "loss/reg": 0.00014434837794397026, "step": 1019 }, { "epoch": 0.1275, "grad_norm": 2.7019476890563965, "grad_norm_var": 17.44870596084123, "learning_rate": 0.0001, "loss": 1.3326, "loss/crossentropy": 2.778315305709839, "loss/hidden": 1.140625, "loss/logits": 0.19048453867435455, "loss/reg": 0.00014428474241867661, "step": 1020 }, { "epoch": 0.127625, "grad_norm": 2.3664348125457764, "grad_norm_var": 17.436349927574003, "learning_rate": 0.0001, "loss": 1.2709, "loss/crossentropy": 2.407701253890991, "loss/hidden": 1.0859375, "loss/logits": 0.18356312811374664, "loss/reg": 0.0001442142529413104, "step": 1021 }, { "epoch": 0.12775, "grad_norm": 2.219855308532715, "grad_norm_var": 0.9661759649162353, "learning_rate": 0.0001, "loss": 1.2407, "loss/crossentropy": 2.6576895713806152, "loss/hidden": 1.0390625, "loss/logits": 0.2001841962337494, "loss/reg": 0.00014415892655961215, "step": 1022 }, { "epoch": 0.127875, "grad_norm": 2.5545461177825928, "grad_norm_var": 0.9608235907308965, "learning_rate": 0.0001, "loss": 1.2311, "loss/crossentropy": 2.538710355758667, "loss/hidden": 1.0625, "loss/logits": 0.16719122231006622, "loss/reg": 0.0001441039639757946, "step": 1023 }, { "epoch": 0.128, "grad_norm": 2.989229679107666, "grad_norm_var": 0.9539195776823702, "learning_rate": 0.0001, "loss": 1.7663, "loss/crossentropy": 2.8465256690979004, "loss/hidden": 1.4921875, "loss/logits": 0.27264857292175293, "loss/reg": 0.0001440496271243319, "step": 1024 }, { "epoch": 0.128125, "grad_norm": 2.2875874042510986, "grad_norm_var": 0.9612115405736976, "learning_rate": 0.0001, "loss": 1.1713, "loss/crossentropy": 2.9393043518066406, "loss/hidden": 1.015625, "loss/logits": 0.154221773147583, "loss/reg": 0.0001439889456378296, "step": 1025 }, { "epoch": 0.12825, "grad_norm": 2.8194057941436768, "grad_norm_var": 0.9559340478642477, "learning_rate": 0.0001, "loss": 1.3965, "loss/crossentropy": 2.196493625640869, "loss/hidden": 1.203125, "loss/logits": 0.1919146627187729, "loss/reg": 0.00014392408775165677, "step": 1026 }, { "epoch": 0.128375, "grad_norm": 2.300347089767456, "grad_norm_var": 0.9548729344003244, "learning_rate": 0.0001, "loss": 1.2197, "loss/crossentropy": 2.272184371948242, "loss/hidden": 1.046875, "loss/logits": 0.17136628925800323, "loss/reg": 0.00014387199189513922, "step": 1027 }, { "epoch": 0.1285, "grad_norm": 34.56549072265625, "grad_norm_var": 64.23819704773994, "learning_rate": 0.0001, "loss": 1.2114, "loss/crossentropy": 2.3336105346679688, "loss/hidden": 1.046875, "loss/logits": 0.16311419010162354, "loss/reg": 0.0001438146864529699, "step": 1028 }, { "epoch": 0.128625, "grad_norm": 3.153364896774292, "grad_norm_var": 63.969120661272974, "learning_rate": 0.0001, "loss": 1.3725, "loss/crossentropy": 2.5941293239593506, "loss/hidden": 1.15625, "loss/logits": 0.2148231565952301, "loss/reg": 0.00014375426690094173, "step": 1029 }, { "epoch": 0.12875, "grad_norm": 5.7905192375183105, "grad_norm_var": 63.42361219572334, "learning_rate": 0.0001, "loss": 1.917, "loss/crossentropy": 2.8984906673431396, "loss/hidden": 1.5625, "loss/logits": 0.35301950573921204, "loss/reg": 0.00014369298878591508, "step": 1030 }, { "epoch": 0.128875, "grad_norm": 2.0816190242767334, "grad_norm_var": 63.84591064632968, "learning_rate": 0.0001, "loss": 1.1345, "loss/crossentropy": 2.4390463829040527, "loss/hidden": 0.97265625, "loss/logits": 0.16045506298542023, "loss/reg": 0.00014363866648636758, "step": 1031 }, { "epoch": 0.129, "grad_norm": 4.089993953704834, "grad_norm_var": 63.46743940032793, "learning_rate": 0.0001, "loss": 1.4399, "loss/crossentropy": 2.6033990383148193, "loss/hidden": 1.2421875, "loss/logits": 0.19623170793056488, "loss/reg": 0.0001435737794963643, "step": 1032 }, { "epoch": 0.129125, "grad_norm": 4.725457668304443, "grad_norm_var": 63.18593071913756, "learning_rate": 0.0001, "loss": 1.9456, "loss/crossentropy": 3.303938627243042, "loss/hidden": 1.5234375, "loss/logits": 0.4207460284233093, "loss/reg": 0.00014351541176438332, "step": 1033 }, { "epoch": 0.12925, "grad_norm": 2.583591938018799, "grad_norm_var": 63.10560752389551, "learning_rate": 0.0001, "loss": 1.3248, "loss/crossentropy": 2.3864002227783203, "loss/hidden": 1.1328125, "loss/logits": 0.1905287653207779, "loss/reg": 0.00014345927047543228, "step": 1034 }, { "epoch": 0.129375, "grad_norm": 2.682438850402832, "grad_norm_var": 63.21019618707384, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.593626022338867, "loss/hidden": 1.1015625, "loss/logits": 0.1750161349773407, "loss/reg": 0.00014340343477670103, "step": 1035 }, { "epoch": 0.1295, "grad_norm": 3.5153167247772217, "grad_norm_var": 63.00291993734814, "learning_rate": 0.0001, "loss": 1.6521, "loss/crossentropy": 2.541039228439331, "loss/hidden": 1.359375, "loss/logits": 0.29124292731285095, "loss/reg": 0.00014334172010421753, "step": 1036 }, { "epoch": 0.129625, "grad_norm": 2.5193445682525635, "grad_norm_var": 62.94976414954182, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.5253329277038574, "loss/hidden": 1.140625, "loss/logits": 0.18151576817035675, "loss/reg": 0.00014328851830214262, "step": 1037 }, { "epoch": 0.12975, "grad_norm": 2.3693583011627197, "grad_norm_var": 62.894648436582095, "learning_rate": 0.0001, "loss": 1.2415, "loss/crossentropy": 2.545581579208374, "loss/hidden": 1.0703125, "loss/logits": 0.1697414517402649, "loss/reg": 0.0001432363933417946, "step": 1038 }, { "epoch": 0.129875, "grad_norm": 2.849060535430908, "grad_norm_var": 62.80151802999233, "learning_rate": 0.0001, "loss": 1.4527, "loss/crossentropy": 2.410926342010498, "loss/hidden": 1.2421875, "loss/logits": 0.20909011363983154, "loss/reg": 0.00014317978639155626, "step": 1039 }, { "epoch": 0.13, "grad_norm": 3.325183868408203, "grad_norm_var": 62.714800428512305, "learning_rate": 0.0001, "loss": 1.3604, "loss/crossentropy": 2.827643394470215, "loss/hidden": 1.140625, "loss/logits": 0.21830135583877563, "loss/reg": 0.00014313000428956002, "step": 1040 }, { "epoch": 0.130125, "grad_norm": 3.1623077392578125, "grad_norm_var": 62.43418810696358, "learning_rate": 0.0001, "loss": 1.6402, "loss/crossentropy": 2.4908640384674072, "loss/hidden": 1.390625, "loss/logits": 0.24815502762794495, "loss/reg": 0.00014307738456409425, "step": 1041 }, { "epoch": 0.13025, "grad_norm": 3.234795570373535, "grad_norm_var": 62.31543203000146, "learning_rate": 0.0001, "loss": 1.3887, "loss/crossentropy": 2.335465669631958, "loss/hidden": 1.1875, "loss/logits": 0.19978907704353333, "loss/reg": 0.00014302735507953912, "step": 1042 }, { "epoch": 0.130375, "grad_norm": 4.320650577545166, "grad_norm_var": 61.793682574945976, "learning_rate": 0.0001, "loss": 1.5641, "loss/crossentropy": 2.1389238834381104, "loss/hidden": 1.3828125, "loss/logits": 0.17987647652626038, "loss/reg": 0.0001429805561201647, "step": 1043 }, { "epoch": 0.1305, "grad_norm": 3.154794216156006, "grad_norm_var": 0.9356848822638367, "learning_rate": 0.0001, "loss": 1.7099, "loss/crossentropy": 2.3197882175445557, "loss/hidden": 1.40625, "loss/logits": 0.3022366762161255, "loss/reg": 0.00014293599815573543, "step": 1044 }, { "epoch": 0.130625, "grad_norm": 2.5388293266296387, "grad_norm_var": 0.9751840336299513, "learning_rate": 0.0001, "loss": 1.288, "loss/crossentropy": 2.5088329315185547, "loss/hidden": 1.109375, "loss/logits": 0.17718367278575897, "loss/reg": 0.00014288499369286, "step": 1045 }, { "epoch": 0.13075, "grad_norm": 2.3715758323669434, "grad_norm_var": 0.5745132078775441, "learning_rate": 0.0001, "loss": 1.1955, "loss/crossentropy": 2.702469825744629, "loss/hidden": 1.03125, "loss/logits": 0.1628141701221466, "loss/reg": 0.00014283911150414497, "step": 1046 }, { "epoch": 0.130875, "grad_norm": 2.893900156021118, "grad_norm_var": 0.5059681482884789, "learning_rate": 0.0001, "loss": 1.3486, "loss/crossentropy": 2.5923421382904053, "loss/hidden": 1.15625, "loss/logits": 0.19094862043857574, "loss/reg": 0.00014280076720751822, "step": 1047 }, { "epoch": 0.131, "grad_norm": 2.677845001220703, "grad_norm_var": 0.45286915544523176, "learning_rate": 0.0001, "loss": 1.4025, "loss/crossentropy": 2.3695199489593506, "loss/hidden": 1.1875, "loss/logits": 0.2135327011346817, "loss/reg": 0.0001427609968231991, "step": 1048 }, { "epoch": 0.131125, "grad_norm": 2.7685861587524414, "grad_norm_var": 0.2570786898784708, "learning_rate": 0.0001, "loss": 1.4944, "loss/crossentropy": 2.7170326709747314, "loss/hidden": 1.2578125, "loss/logits": 0.23512059450149536, "loss/reg": 0.00014270858082454652, "step": 1049 }, { "epoch": 0.13125, "grad_norm": 2.4609200954437256, "grad_norm_var": 0.2637746784050703, "learning_rate": 0.0001, "loss": 1.162, "loss/crossentropy": 2.41953444480896, "loss/hidden": 1.0234375, "loss/logits": 0.13712382316589355, "loss/reg": 0.00014265488425735384, "step": 1050 }, { "epoch": 0.131375, "grad_norm": 2.3637502193450928, "grad_norm_var": 0.2805484578627547, "learning_rate": 0.0001, "loss": 1.3195, "loss/crossentropy": 2.7971181869506836, "loss/hidden": 1.140625, "loss/logits": 0.177445650100708, "loss/reg": 0.00014259127783589065, "step": 1051 }, { "epoch": 0.1315, "grad_norm": 2.3424313068389893, "grad_norm_var": 0.27153475454197556, "learning_rate": 0.0001, "loss": 1.0484, "loss/crossentropy": 2.638460397720337, "loss/hidden": 0.91796875, "loss/logits": 0.12901164591312408, "loss/reg": 0.00014253993867896497, "step": 1052 }, { "epoch": 0.131625, "grad_norm": 2.4308948516845703, "grad_norm_var": 0.27574141809065106, "learning_rate": 0.0001, "loss": 1.1918, "loss/crossentropy": 2.587552785873413, "loss/hidden": 1.03125, "loss/logits": 0.15914621949195862, "loss/reg": 0.00014247227227315307, "step": 1053 }, { "epoch": 0.13175, "grad_norm": 2.7306430339813232, "grad_norm_var": 0.2617551363804047, "learning_rate": 0.0001, "loss": 1.339, "loss/crossentropy": 2.3140664100646973, "loss/hidden": 1.140625, "loss/logits": 0.19697260856628418, "loss/reg": 0.00014240805467125028, "step": 1054 }, { "epoch": 0.131875, "grad_norm": 3.1046218872070312, "grad_norm_var": 0.26574936909714947, "learning_rate": 0.0001, "loss": 1.2837, "loss/crossentropy": 2.705944776535034, "loss/hidden": 1.109375, "loss/logits": 0.1729246824979782, "loss/reg": 0.00014233090041670948, "step": 1055 }, { "epoch": 0.132, "grad_norm": 2.8519394397735596, "grad_norm_var": 0.25087419704861985, "learning_rate": 0.0001, "loss": 1.3356, "loss/crossentropy": 2.6108601093292236, "loss/hidden": 1.125, "loss/logits": 0.2091948539018631, "loss/reg": 0.00014227778592612594, "step": 1056 }, { "epoch": 0.132125, "grad_norm": 2.189276933670044, "grad_norm_var": 0.2679775862441846, "learning_rate": 0.0001, "loss": 1.3398, "loss/crossentropy": 2.6916613578796387, "loss/hidden": 1.1328125, "loss/logits": 0.20556402206420898, "loss/reg": 0.00014222465688362718, "step": 1057 }, { "epoch": 0.13225, "grad_norm": 3.149076461791992, "grad_norm_var": 0.26320704554729724, "learning_rate": 0.0001, "loss": 1.4362, "loss/crossentropy": 2.641360282897949, "loss/hidden": 1.234375, "loss/logits": 0.20044422149658203, "loss/reg": 0.0001421700872015208, "step": 1058 }, { "epoch": 0.132375, "grad_norm": 3.255315065383911, "grad_norm_var": 0.11414301553138936, "learning_rate": 0.0001, "loss": 1.6363, "loss/crossentropy": 2.2705044746398926, "loss/hidden": 1.4375, "loss/logits": 0.197383850812912, "loss/reg": 0.00014210060180630535, "step": 1059 }, { "epoch": 0.1325, "grad_norm": 2.9358930587768555, "grad_norm_var": 0.10401783590944641, "learning_rate": 0.0001, "loss": 1.3488, "loss/crossentropy": 2.774256467819214, "loss/hidden": 1.1640625, "loss/logits": 0.1832944005727768, "loss/reg": 0.00014203076716512442, "step": 1060 }, { "epoch": 0.132625, "grad_norm": 2.5962319374084473, "grad_norm_var": 0.1030545674710074, "learning_rate": 0.0001, "loss": 1.4162, "loss/crossentropy": 2.5801644325256348, "loss/hidden": 1.1875, "loss/logits": 0.22723841667175293, "loss/reg": 0.00014197806012816727, "step": 1061 }, { "epoch": 0.13275, "grad_norm": 3.9817557334899902, "grad_norm_var": 0.19562194669480562, "learning_rate": 0.0001, "loss": 1.733, "loss/crossentropy": 2.521998167037964, "loss/hidden": 1.375, "loss/logits": 0.3565419018268585, "loss/reg": 0.00014191192167345434, "step": 1062 }, { "epoch": 0.132875, "grad_norm": 2.8030755519866943, "grad_norm_var": 0.1949497412171354, "learning_rate": 0.0001, "loss": 1.3186, "loss/crossentropy": 2.140580654144287, "loss/hidden": 1.1484375, "loss/logits": 0.16878274083137512, "loss/reg": 0.00014185633335728198, "step": 1063 }, { "epoch": 0.133, "grad_norm": 2.73686146736145, "grad_norm_var": 0.1942837830749035, "learning_rate": 0.0001, "loss": 1.344, "loss/crossentropy": 2.594520092010498, "loss/hidden": 1.171875, "loss/logits": 0.17074783146381378, "loss/reg": 0.00014179061690811068, "step": 1064 }, { "epoch": 0.133125, "grad_norm": 3.490253448486328, "grad_norm_var": 0.22440503316954405, "learning_rate": 0.0001, "loss": 1.3733, "loss/crossentropy": 2.588472366333008, "loss/hidden": 1.171875, "loss/logits": 0.19999830424785614, "loss/reg": 0.0001417383609805256, "step": 1065 }, { "epoch": 0.13325, "grad_norm": 2.604966402053833, "grad_norm_var": 0.21844167012626273, "learning_rate": 0.0001, "loss": 1.3394, "loss/crossentropy": 2.610567331314087, "loss/hidden": 1.15625, "loss/logits": 0.1817035973072052, "loss/reg": 0.00014168783673085272, "step": 1066 }, { "epoch": 0.133375, "grad_norm": 2.4641871452331543, "grad_norm_var": 0.21258811707699862, "learning_rate": 0.0001, "loss": 1.4833, "loss/crossentropy": 2.336571216583252, "loss/hidden": 1.28125, "loss/logits": 0.20059654116630554, "loss/reg": 0.00014163984451442957, "step": 1067 }, { "epoch": 0.1335, "grad_norm": 4.1514458656311035, "grad_norm_var": 0.2936784967920249, "learning_rate": 0.0001, "loss": 1.5455, "loss/crossentropy": 3.126817464828491, "loss/hidden": 1.3125, "loss/logits": 0.23153823614120483, "loss/reg": 0.00014158482372295111, "step": 1068 }, { "epoch": 0.133625, "grad_norm": 3.086136817932129, "grad_norm_var": 0.2736509938223982, "learning_rate": 0.0001, "loss": 1.6165, "loss/crossentropy": 2.307713270187378, "loss/hidden": 1.3828125, "loss/logits": 0.23230654001235962, "loss/reg": 0.00014153325173538178, "step": 1069 }, { "epoch": 0.13375, "grad_norm": 5.534700393676758, "grad_norm_var": 0.6612894560862136, "learning_rate": 0.0001, "loss": 1.3566, "loss/crossentropy": 2.7461800575256348, "loss/hidden": 1.171875, "loss/logits": 0.1832621693611145, "loss/reg": 0.0001414819562342018, "step": 1070 }, { "epoch": 0.133875, "grad_norm": 2.2858574390411377, "grad_norm_var": 0.7117971297364664, "learning_rate": 0.0001, "loss": 1.364, "loss/crossentropy": 2.641616106033325, "loss/hidden": 1.15625, "loss/logits": 0.20629598200321198, "loss/reg": 0.0001414327707607299, "step": 1071 }, { "epoch": 0.134, "grad_norm": 2.7599470615386963, "grad_norm_var": 0.7157649794625662, "learning_rate": 0.0001, "loss": 1.3193, "loss/crossentropy": 2.567641019821167, "loss/hidden": 1.140625, "loss/logits": 0.1772143840789795, "loss/reg": 0.00014137876860331744, "step": 1072 }, { "epoch": 0.134125, "grad_norm": 2.364522933959961, "grad_norm_var": 0.6957837155141723, "learning_rate": 0.0001, "loss": 1.2606, "loss/crossentropy": 2.3941338062286377, "loss/hidden": 1.0703125, "loss/logits": 0.1888333559036255, "loss/reg": 0.0001413320715073496, "step": 1073 }, { "epoch": 0.13425, "grad_norm": 3.9050779342651367, "grad_norm_var": 0.7326703355440335, "learning_rate": 0.0001, "loss": 1.2918, "loss/crossentropy": 2.6505584716796875, "loss/hidden": 1.125, "loss/logits": 0.16539788246154785, "loss/reg": 0.00014129700139164925, "step": 1074 }, { "epoch": 0.134375, "grad_norm": 2.2486608028411865, "grad_norm_var": 0.7865355174243092, "learning_rate": 0.0001, "loss": 1.2443, "loss/crossentropy": 2.3849587440490723, "loss/hidden": 1.0703125, "loss/logits": 0.17252877354621887, "loss/reg": 0.00014124637527856976, "step": 1075 }, { "epoch": 0.1345, "grad_norm": 3.4393551349639893, "grad_norm_var": 0.7898947863912305, "learning_rate": 0.0001, "loss": 1.3532, "loss/crossentropy": 2.4399914741516113, "loss/hidden": 1.171875, "loss/logits": 0.17988482117652893, "loss/reg": 0.0001412063866155222, "step": 1076 }, { "epoch": 0.134625, "grad_norm": 2.7268998622894287, "grad_norm_var": 0.7812562039627664, "learning_rate": 0.0001, "loss": 1.2561, "loss/crossentropy": 2.55763840675354, "loss/hidden": 1.109375, "loss/logits": 0.14528216421604156, "loss/reg": 0.00014116587408352643, "step": 1077 }, { "epoch": 0.13475, "grad_norm": 2.4400649070739746, "grad_norm_var": 0.7611922985390265, "learning_rate": 0.0001, "loss": 1.3308, "loss/crossentropy": 2.4494190216064453, "loss/hidden": 1.140625, "loss/logits": 0.18872657418251038, "loss/reg": 0.0001411273260600865, "step": 1078 }, { "epoch": 0.134875, "grad_norm": 2.5993704795837402, "grad_norm_var": 0.7709032459129274, "learning_rate": 0.0001, "loss": 1.3278, "loss/crossentropy": 2.661067008972168, "loss/hidden": 1.109375, "loss/logits": 0.21702289581298828, "loss/reg": 0.0001410842960467562, "step": 1079 }, { "epoch": 0.135, "grad_norm": 2.5341849327087402, "grad_norm_var": 0.7819974193083984, "learning_rate": 0.0001, "loss": 1.5862, "loss/crossentropy": 2.6597087383270264, "loss/hidden": 1.3359375, "loss/logits": 0.24883466958999634, "loss/reg": 0.00014104568981565535, "step": 1080 }, { "epoch": 0.135125, "grad_norm": 2.3037710189819336, "grad_norm_var": 0.7987089710033046, "learning_rate": 0.0001, "loss": 1.2866, "loss/crossentropy": 2.611107110977173, "loss/hidden": 1.109375, "loss/logits": 0.17586109042167664, "loss/reg": 0.00014100705448072404, "step": 1081 }, { "epoch": 0.13525, "grad_norm": 2.4578065872192383, "grad_norm_var": 0.8071380219160704, "learning_rate": 0.0001, "loss": 1.3082, "loss/crossentropy": 2.57969069480896, "loss/hidden": 1.125, "loss/logits": 0.18178004026412964, "loss/reg": 0.00014096140512265265, "step": 1082 }, { "epoch": 0.135375, "grad_norm": 2.438591718673706, "grad_norm_var": 0.8088586660824073, "learning_rate": 0.0001, "loss": 1.1343, "loss/crossentropy": 2.6585121154785156, "loss/hidden": 0.97265625, "loss/logits": 0.16023336350917816, "loss/reg": 0.0001409082324244082, "step": 1083 }, { "epoch": 0.1355, "grad_norm": 2.1233022212982178, "grad_norm_var": 0.7423412565521155, "learning_rate": 0.0001, "loss": 1.1339, "loss/crossentropy": 2.57684326171875, "loss/hidden": 0.9765625, "loss/logits": 0.15594279766082764, "loss/reg": 0.00014085885777603835, "step": 1084 }, { "epoch": 0.135625, "grad_norm": 1.9893174171447754, "grad_norm_var": 0.7797812477175263, "learning_rate": 0.0001, "loss": 1.1893, "loss/crossentropy": 2.489999532699585, "loss/hidden": 1.0078125, "loss/logits": 0.18003268539905548, "loss/reg": 0.000140800402732566, "step": 1085 }, { "epoch": 0.13575, "grad_norm": 3.270289659500122, "grad_norm_var": 0.26235028124153054, "learning_rate": 0.0001, "loss": 1.3787, "loss/crossentropy": 2.483642339706421, "loss/hidden": 1.203125, "loss/logits": 0.1741553544998169, "loss/reg": 0.00014074533828534186, "step": 1086 }, { "epoch": 0.135875, "grad_norm": 3.129045009613037, "grad_norm_var": 0.26945136589962787, "learning_rate": 0.0001, "loss": 1.7181, "loss/crossentropy": 2.3532562255859375, "loss/hidden": 1.4296875, "loss/logits": 0.2870248556137085, "loss/reg": 0.00014069535245653242, "step": 1087 }, { "epoch": 0.136, "grad_norm": 2.5498738288879395, "grad_norm_var": 0.26970801226627733, "learning_rate": 0.0001, "loss": 1.207, "loss/crossentropy": 2.521801233291626, "loss/hidden": 1.046875, "loss/logits": 0.15868797898292542, "loss/reg": 0.0001406477822456509, "step": 1088 }, { "epoch": 0.136125, "grad_norm": 2.259587049484253, "grad_norm_var": 0.2744955254187912, "learning_rate": 0.0001, "loss": 1.2987, "loss/crossentropy": 2.6568217277526855, "loss/hidden": 1.1171875, "loss/logits": 0.1800576150417328, "loss/reg": 0.00014058791566640139, "step": 1089 }, { "epoch": 0.13625, "grad_norm": 2.3829431533813477, "grad_norm_var": 0.16477450061941543, "learning_rate": 0.0001, "loss": 1.2115, "loss/crossentropy": 2.554049491882324, "loss/hidden": 1.03125, "loss/logits": 0.1788145899772644, "loss/reg": 0.00014052398910280317, "step": 1090 }, { "epoch": 0.136375, "grad_norm": 2.431690216064453, "grad_norm_var": 0.15937243272840743, "learning_rate": 0.0001, "loss": 1.4766, "loss/crossentropy": 2.3914289474487305, "loss/hidden": 1.265625, "loss/logits": 0.20959001779556274, "loss/reg": 0.0001404571085004136, "step": 1091 }, { "epoch": 0.1365, "grad_norm": 2.4380593299865723, "grad_norm_var": 0.10560392919011458, "learning_rate": 0.0001, "loss": 1.3616, "loss/crossentropy": 2.5094845294952393, "loss/hidden": 1.15625, "loss/logits": 0.20389944314956665, "loss/reg": 0.00014038202061783522, "step": 1092 }, { "epoch": 0.136625, "grad_norm": 3.403883457183838, "grad_norm_var": 0.1543071296249451, "learning_rate": 0.0001, "loss": 1.6094, "loss/crossentropy": 2.5270915031433105, "loss/hidden": 1.3046875, "loss/logits": 0.3033224940299988, "loss/reg": 0.00014032720355316997, "step": 1093 }, { "epoch": 0.13675, "grad_norm": 2.4035472869873047, "grad_norm_var": 0.1549110776919709, "learning_rate": 0.0001, "loss": 1.3178, "loss/crossentropy": 2.521379232406616, "loss/hidden": 1.109375, "loss/logits": 0.20702823996543884, "loss/reg": 0.00014025846030563116, "step": 1094 }, { "epoch": 0.136875, "grad_norm": 3.7205774784088135, "grad_norm_var": 0.2416524797655993, "learning_rate": 0.0001, "loss": 1.4267, "loss/crossentropy": 2.261558771133423, "loss/hidden": 1.234375, "loss/logits": 0.19096454977989197, "loss/reg": 0.00014020170783624053, "step": 1095 }, { "epoch": 0.137, "grad_norm": 2.688718557357788, "grad_norm_var": 0.2414844125274044, "learning_rate": 0.0001, "loss": 1.3291, "loss/crossentropy": 2.5607919692993164, "loss/hidden": 1.125, "loss/logits": 0.20268316566944122, "loss/reg": 0.00014013091276865453, "step": 1096 }, { "epoch": 0.137125, "grad_norm": 2.432055711746216, "grad_norm_var": 0.23702808827864388, "learning_rate": 0.0001, "loss": 1.1324, "loss/crossentropy": 2.813500165939331, "loss/hidden": 0.9921875, "loss/logits": 0.138791024684906, "loss/reg": 0.00014006906712893397, "step": 1097 }, { "epoch": 0.13725, "grad_norm": 2.5421934127807617, "grad_norm_var": 0.23550808317392935, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.271918535232544, "loss/hidden": 1.140625, "loss/logits": 0.17024344205856323, "loss/reg": 0.00014001200906932354, "step": 1098 }, { "epoch": 0.137375, "grad_norm": 2.3173115253448486, "grad_norm_var": 0.23964758786650364, "learning_rate": 0.0001, "loss": 1.2274, "loss/crossentropy": 2.255692720413208, "loss/hidden": 1.078125, "loss/logits": 0.14787998795509338, "loss/reg": 0.00013994732580613345, "step": 1099 }, { "epoch": 0.1375, "grad_norm": 2.630260944366455, "grad_norm_var": 0.22145043398181674, "learning_rate": 0.0001, "loss": 1.341, "loss/crossentropy": 2.512317419052124, "loss/hidden": 1.140625, "loss/logits": 0.19899873435497284, "loss/reg": 0.00013988044520374388, "step": 1100 }, { "epoch": 0.137625, "grad_norm": 2.3226044178009033, "grad_norm_var": 0.19850744884658386, "learning_rate": 0.0001, "loss": 1.3032, "loss/crossentropy": 2.5138912200927734, "loss/hidden": 1.109375, "loss/logits": 0.1924511343240738, "loss/reg": 0.00013980967923998833, "step": 1101 }, { "epoch": 0.13775, "grad_norm": 2.4632811546325684, "grad_norm_var": 0.17598229654805947, "learning_rate": 0.0001, "loss": 1.3857, "loss/crossentropy": 2.4244821071624756, "loss/hidden": 1.1875, "loss/logits": 0.19683147966861725, "loss/reg": 0.00013975510955788195, "step": 1102 }, { "epoch": 0.137875, "grad_norm": 2.609951972961426, "grad_norm_var": 0.15843742841547054, "learning_rate": 0.0001, "loss": 1.5206, "loss/crossentropy": 2.4863991737365723, "loss/hidden": 1.28125, "loss/logits": 0.23795738816261292, "loss/reg": 0.00013969963765703142, "step": 1103 }, { "epoch": 0.138, "grad_norm": 2.9891414642333984, "grad_norm_var": 0.16757400865144423, "learning_rate": 0.0001, "loss": 1.3303, "loss/crossentropy": 2.7802960872650146, "loss/hidden": 1.1328125, "loss/logits": 0.19604817032814026, "loss/reg": 0.00013964292884338647, "step": 1104 }, { "epoch": 0.138125, "grad_norm": 2.412811279296875, "grad_norm_var": 0.16153029263209623, "learning_rate": 0.0001, "loss": 1.4545, "loss/crossentropy": 2.4836504459381104, "loss/hidden": 1.2109375, "loss/logits": 0.2421611249446869, "loss/reg": 0.00013958517229184508, "step": 1105 }, { "epoch": 0.13825, "grad_norm": 2.5901825428009033, "grad_norm_var": 0.15719960163401842, "learning_rate": 0.0001, "loss": 1.4617, "loss/crossentropy": 2.707066774368286, "loss/hidden": 1.234375, "loss/logits": 0.22596776485443115, "loss/reg": 0.0001395278231939301, "step": 1106 }, { "epoch": 0.138375, "grad_norm": 2.5841996669769287, "grad_norm_var": 0.15421879626547533, "learning_rate": 0.0001, "loss": 1.1785, "loss/crossentropy": 2.1721110343933105, "loss/hidden": 1.0234375, "loss/logits": 0.15365329384803772, "loss/reg": 0.00013946513354312629, "step": 1107 }, { "epoch": 0.1385, "grad_norm": 2.478071689605713, "grad_norm_var": 0.15313854984074146, "learning_rate": 0.0001, "loss": 1.1921, "loss/crossentropy": 2.78641414642334, "loss/hidden": 1.0390625, "loss/logits": 0.1516191065311432, "loss/reg": 0.00013940469943918288, "step": 1108 }, { "epoch": 0.138625, "grad_norm": 3.260493755340576, "grad_norm_var": 0.14023596210993006, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.4489269256591797, "loss/hidden": 1.2109375, "loss/logits": 0.21812792122364044, "loss/reg": 0.0001393466372974217, "step": 1109 }, { "epoch": 0.13875, "grad_norm": 2.498387336730957, "grad_norm_var": 0.1376457650450824, "learning_rate": 0.0001, "loss": 1.2656, "loss/crossentropy": 2.54548978805542, "loss/hidden": 1.09375, "loss/logits": 0.1704767495393753, "loss/reg": 0.0001392918056808412, "step": 1110 }, { "epoch": 0.138875, "grad_norm": 2.2783641815185547, "grad_norm_var": 0.06346315627596652, "learning_rate": 0.0001, "loss": 1.1791, "loss/crossentropy": 2.758366823196411, "loss/hidden": 1.015625, "loss/logits": 0.16205796599388123, "loss/reg": 0.0001392312115058303, "step": 1111 }, { "epoch": 0.139, "grad_norm": 2.485659122467041, "grad_norm_var": 0.06278879328245139, "learning_rate": 0.0001, "loss": 1.4563, "loss/crossentropy": 2.255074977874756, "loss/hidden": 1.234375, "loss/logits": 0.2204999327659607, "loss/reg": 0.0001391780679114163, "step": 1112 }, { "epoch": 0.139125, "grad_norm": 2.534553289413452, "grad_norm_var": 0.06175241724844014, "learning_rate": 0.0001, "loss": 1.2676, "loss/crossentropy": 2.395563840866089, "loss/hidden": 1.078125, "loss/logits": 0.18806511163711548, "loss/reg": 0.0001391225669067353, "step": 1113 }, { "epoch": 0.13925, "grad_norm": 2.956914186477661, "grad_norm_var": 0.07138787606588343, "learning_rate": 0.0001, "loss": 1.1978, "loss/crossentropy": 2.7679617404937744, "loss/hidden": 1.03125, "loss/logits": 0.1651535928249359, "loss/reg": 0.00013906652748119086, "step": 1114 }, { "epoch": 0.139375, "grad_norm": 1.9856147766113281, "grad_norm_var": 0.09024740616237802, "learning_rate": 0.0001, "loss": 1.2167, "loss/crossentropy": 2.6154086589813232, "loss/hidden": 1.0234375, "loss/logits": 0.19184809923171997, "loss/reg": 0.00013900973135605454, "step": 1115 }, { "epoch": 0.1395, "grad_norm": 2.673013210296631, "grad_norm_var": 0.09071922206564006, "learning_rate": 0.0001, "loss": 1.4693, "loss/crossentropy": 2.9229443073272705, "loss/hidden": 1.2421875, "loss/logits": 0.22574329376220703, "loss/reg": 0.0001389500976074487, "step": 1116 }, { "epoch": 0.139625, "grad_norm": 2.74776291847229, "grad_norm_var": 0.08798090155865618, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.518014907836914, "loss/hidden": 1.0859375, "loss/logits": 0.17920026183128357, "loss/reg": 0.00013891411072108895, "step": 1117 }, { "epoch": 0.13975, "grad_norm": 2.830657720565796, "grad_norm_var": 0.08987723868542616, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.5110037326812744, "loss/hidden": 1.2109375, "loss/logits": 0.2116960883140564, "loss/reg": 0.00013888246030546725, "step": 1118 }, { "epoch": 0.139875, "grad_norm": 3.392299175262451, "grad_norm_var": 0.12711081412016842, "learning_rate": 0.0001, "loss": 1.4222, "loss/crossentropy": 2.5967905521392822, "loss/hidden": 1.1953125, "loss/logits": 0.22547084093093872, "loss/reg": 0.00013885533553548157, "step": 1119 }, { "epoch": 0.14, "grad_norm": 3.3788132667541504, "grad_norm_var": 0.1532534914907842, "learning_rate": 0.0001, "loss": 1.6399, "loss/crossentropy": 2.199849843978882, "loss/hidden": 1.3984375, "loss/logits": 0.2400365173816681, "loss/reg": 0.0001387996453559026, "step": 1120 }, { "epoch": 0.140125, "grad_norm": 2.760607957839966, "grad_norm_var": 0.14782107384839166, "learning_rate": 0.0001, "loss": 1.3111, "loss/crossentropy": 2.729834794998169, "loss/hidden": 1.140625, "loss/logits": 0.16906651854515076, "loss/reg": 0.00013875472359359264, "step": 1121 }, { "epoch": 0.14025, "grad_norm": 2.9688501358032227, "grad_norm_var": 0.1504948862248587, "learning_rate": 0.0001, "loss": 1.3507, "loss/crossentropy": 2.761674642562866, "loss/hidden": 1.1484375, "loss/logits": 0.2008332908153534, "loss/reg": 0.00013871857663616538, "step": 1122 }, { "epoch": 0.140375, "grad_norm": 2.4969496726989746, "grad_norm_var": 0.152764435021394, "learning_rate": 0.0001, "loss": 1.1722, "loss/crossentropy": 2.6266415119171143, "loss/hidden": 1.0, "loss/logits": 0.170831099152565, "loss/reg": 0.0001386810909025371, "step": 1123 }, { "epoch": 0.1405, "grad_norm": 2.458451271057129, "grad_norm_var": 0.15345524028281615, "learning_rate": 0.0001, "loss": 1.2287, "loss/crossentropy": 2.1737289428710938, "loss/hidden": 1.046875, "loss/logits": 0.18041090667247772, "loss/reg": 0.00013864520587958395, "step": 1124 }, { "epoch": 0.140625, "grad_norm": 2.8649933338165283, "grad_norm_var": 0.1353470723879506, "learning_rate": 0.0001, "loss": 1.5076, "loss/crossentropy": 1.9967268705368042, "loss/hidden": 1.2890625, "loss/logits": 0.2171594500541687, "loss/reg": 0.00013861124170944095, "step": 1125 }, { "epoch": 0.14075, "grad_norm": 5.948707580566406, "grad_norm_var": 0.7834238193192846, "learning_rate": 0.0001, "loss": 2.7463, "loss/crossentropy": 2.2537713050842285, "loss/hidden": 2.28125, "loss/logits": 0.4636331796646118, "loss/reg": 0.0001385835639666766, "step": 1126 }, { "epoch": 0.140875, "grad_norm": 4.61808967590332, "grad_norm_var": 0.9245786065903947, "learning_rate": 0.0001, "loss": 1.5048, "loss/crossentropy": 3.158489942550659, "loss/hidden": 1.296875, "loss/logits": 0.20657947659492493, "loss/reg": 0.00013852660777047276, "step": 1127 }, { "epoch": 0.141, "grad_norm": 5.949882507324219, "grad_norm_var": 1.4052478013994754, "learning_rate": 0.0001, "loss": 1.8777, "loss/crossentropy": 2.4392154216766357, "loss/hidden": 1.5546875, "loss/logits": 0.32164543867111206, "loss/reg": 0.00013849088281858712, "step": 1128 }, { "epoch": 0.141125, "grad_norm": 2.6664278507232666, "grad_norm_var": 1.3931326515716629, "learning_rate": 0.0001, "loss": 1.3169, "loss/crossentropy": 2.6932904720306396, "loss/hidden": 1.125, "loss/logits": 0.19055697321891785, "loss/reg": 0.00013843220949638635, "step": 1129 }, { "epoch": 0.14125, "grad_norm": 3.053105592727661, "grad_norm_var": 1.389392430934447, "learning_rate": 0.0001, "loss": 1.2675, "loss/crossentropy": 2.684875726699829, "loss/hidden": 1.09375, "loss/logits": 0.17233556509017944, "loss/reg": 0.00013837449660059065, "step": 1130 }, { "epoch": 0.141375, "grad_norm": 2.267932653427124, "grad_norm_var": 1.3449108823372378, "learning_rate": 0.0001, "loss": 1.2769, "loss/crossentropy": 2.4949803352355957, "loss/hidden": 1.09375, "loss/logits": 0.18172959983348846, "loss/reg": 0.0001383376365993172, "step": 1131 }, { "epoch": 0.1415, "grad_norm": 2.1992413997650146, "grad_norm_var": 1.3996379292328243, "learning_rate": 0.0001, "loss": 1.0513, "loss/crossentropy": 2.5984036922454834, "loss/hidden": 0.91796875, "loss/logits": 0.131981760263443, "loss/reg": 0.00013828086957801133, "step": 1132 }, { "epoch": 0.141625, "grad_norm": 2.3870694637298584, "grad_norm_var": 1.4337347832574239, "learning_rate": 0.0001, "loss": 1.2912, "loss/crossentropy": 2.666065216064453, "loss/hidden": 1.1171875, "loss/logits": 0.17264345288276672, "loss/reg": 0.00013823062181472778, "step": 1133 }, { "epoch": 0.14175, "grad_norm": 3.0439159870147705, "grad_norm_var": 1.4242232535715478, "learning_rate": 0.0001, "loss": 1.2962, "loss/crossentropy": 2.795403003692627, "loss/hidden": 1.109375, "loss/logits": 0.18547967076301575, "loss/reg": 0.00013817368017043918, "step": 1134 }, { "epoch": 0.141875, "grad_norm": 2.7333335876464844, "grad_norm_var": 1.4413607114673739, "learning_rate": 0.0001, "loss": 1.3734, "loss/crossentropy": 2.3955838680267334, "loss/hidden": 1.171875, "loss/logits": 0.20017856359481812, "loss/reg": 0.0001381170586682856, "step": 1135 }, { "epoch": 0.142, "grad_norm": 2.2776148319244385, "grad_norm_var": 1.496368766119796, "learning_rate": 0.0001, "loss": 1.1546, "loss/crossentropy": 2.4887545108795166, "loss/hidden": 1.0, "loss/logits": 0.15322545170783997, "loss/reg": 0.00013807426148559898, "step": 1136 }, { "epoch": 0.142125, "grad_norm": 2.641200065612793, "grad_norm_var": 1.5037531545300227, "learning_rate": 0.0001, "loss": 1.2616, "loss/crossentropy": 2.2680163383483887, "loss/hidden": 1.1015625, "loss/logits": 0.15868523716926575, "loss/reg": 0.0001380166650051251, "step": 1137 }, { "epoch": 0.14225, "grad_norm": 2.4466216564178467, "grad_norm_var": 1.5341767802145037, "learning_rate": 0.0001, "loss": 1.3614, "loss/crossentropy": 2.4957382678985596, "loss/hidden": 1.171875, "loss/logits": 0.18814434111118317, "loss/reg": 0.00013795678387396038, "step": 1138 }, { "epoch": 0.142375, "grad_norm": 1.9342833757400513, "grad_norm_var": 1.6013325950757458, "learning_rate": 0.0001, "loss": 1.1822, "loss/crossentropy": 2.5197877883911133, "loss/hidden": 1.0234375, "loss/logits": 0.1573667824268341, "loss/reg": 0.00013788689102511853, "step": 1139 }, { "epoch": 0.1425, "grad_norm": 2.2716784477233887, "grad_norm_var": 1.6193195131802391, "learning_rate": 0.0001, "loss": 1.3071, "loss/crossentropy": 2.6087090969085693, "loss/hidden": 1.125, "loss/logits": 0.18069323897361755, "loss/reg": 0.00013783156464342028, "step": 1140 }, { "epoch": 0.142625, "grad_norm": 2.875831127166748, "grad_norm_var": 1.6190139848378209, "learning_rate": 0.0001, "loss": 1.6097, "loss/crossentropy": 2.712416648864746, "loss/hidden": 1.3515625, "loss/logits": 0.25671347975730896, "loss/reg": 0.00013776691048406065, "step": 1141 }, { "epoch": 0.14275, "grad_norm": 2.370041608810425, "grad_norm_var": 1.0516644879839874, "learning_rate": 0.0001, "loss": 1.2493, "loss/crossentropy": 2.206761360168457, "loss/hidden": 1.0859375, "loss/logits": 0.16200338304042816, "loss/reg": 0.0001377137377858162, "step": 1142 }, { "epoch": 0.142875, "grad_norm": 2.88512921333313, "grad_norm_var": 0.8327921373576174, "learning_rate": 0.0001, "loss": 1.2695, "loss/crossentropy": 2.4962573051452637, "loss/hidden": 1.0859375, "loss/logits": 0.1821853518486023, "loss/reg": 0.00013765328912995756, "step": 1143 }, { "epoch": 0.143, "grad_norm": 2.235801935195923, "grad_norm_var": 0.1104280267630528, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.5026495456695557, "loss/hidden": 1.234375, "loss/logits": 0.18889817595481873, "loss/reg": 0.00013758997374679893, "step": 1144 }, { "epoch": 0.143125, "grad_norm": 3.192683219909668, "grad_norm_var": 0.1381464763052425, "learning_rate": 0.0001, "loss": 1.6177, "loss/crossentropy": 2.3173067569732666, "loss/hidden": 1.34375, "loss/logits": 0.27255359292030334, "loss/reg": 0.00013753565144725144, "step": 1145 }, { "epoch": 0.14325, "grad_norm": 2.295013427734375, "grad_norm_var": 0.12330989194307158, "learning_rate": 0.0001, "loss": 1.2218, "loss/crossentropy": 2.848573684692383, "loss/hidden": 1.046875, "loss/logits": 0.17354759573936462, "loss/reg": 0.0001374850980937481, "step": 1146 }, { "epoch": 0.143375, "grad_norm": 2.6460840702056885, "grad_norm_var": 0.12036556114496046, "learning_rate": 0.0001, "loss": 1.1628, "loss/crossentropy": 2.7620790004730225, "loss/hidden": 1.015625, "loss/logits": 0.14582540094852448, "loss/reg": 0.000137420924147591, "step": 1147 }, { "epoch": 0.1435, "grad_norm": 2.483330488204956, "grad_norm_var": 0.11298631663166195, "learning_rate": 0.0001, "loss": 1.2347, "loss/crossentropy": 2.581028461456299, "loss/hidden": 1.0625, "loss/logits": 0.17081311345100403, "loss/reg": 0.0001373584382236004, "step": 1148 }, { "epoch": 0.143625, "grad_norm": 2.128574848175049, "grad_norm_var": 0.12260496741522023, "learning_rate": 0.0001, "loss": 1.1658, "loss/crossentropy": 2.5505971908569336, "loss/hidden": 0.9921875, "loss/logits": 0.1722814440727234, "loss/reg": 0.0001372901169816032, "step": 1149 }, { "epoch": 0.14375, "grad_norm": 2.2990832328796387, "grad_norm_var": 0.10612385291917731, "learning_rate": 0.0001, "loss": 1.3075, "loss/crossentropy": 2.6589207649230957, "loss/hidden": 1.125, "loss/logits": 0.1811218112707138, "loss/reg": 0.00013722767471335828, "step": 1150 }, { "epoch": 0.143875, "grad_norm": 3.161498785018921, "grad_norm_var": 0.1319146377915643, "learning_rate": 0.0001, "loss": 1.5832, "loss/crossentropy": 2.4901185035705566, "loss/hidden": 1.3515625, "loss/logits": 0.23031097650527954, "loss/reg": 0.00013715388195123523, "step": 1151 }, { "epoch": 0.144, "grad_norm": 17.34389305114746, "grad_norm_var": 13.854086688379954, "learning_rate": 0.0001, "loss": 1.525, "loss/crossentropy": 2.3399879932403564, "loss/hidden": 1.28125, "loss/logits": 0.24234414100646973, "loss/reg": 0.00013709689665120095, "step": 1152 }, { "epoch": 0.144125, "grad_norm": 3.452894687652588, "grad_norm_var": 13.807658852881636, "learning_rate": 0.0001, "loss": 1.3556, "loss/crossentropy": 2.591115951538086, "loss/hidden": 1.1875, "loss/logits": 0.16673940420150757, "loss/reg": 0.0001370416284771636, "step": 1153 }, { "epoch": 0.14425, "grad_norm": 2.4593427181243896, "grad_norm_var": 13.805879909006796, "learning_rate": 0.0001, "loss": 1.2617, "loss/crossentropy": 2.537099838256836, "loss/hidden": 1.09375, "loss/logits": 0.16658425331115723, "loss/reg": 0.00013697674148716033, "step": 1154 }, { "epoch": 0.144375, "grad_norm": 2.7449913024902344, "grad_norm_var": 13.677475118087726, "learning_rate": 0.0001, "loss": 1.217, "loss/crossentropy": 2.788774013519287, "loss/hidden": 1.0390625, "loss/logits": 0.17656010389328003, "loss/reg": 0.00013691138883586973, "step": 1155 }, { "epoch": 0.1445, "grad_norm": 3.310128927230835, "grad_norm_var": 13.567480380209659, "learning_rate": 0.0001, "loss": 1.5905, "loss/crossentropy": 2.3263795375823975, "loss/hidden": 1.3671875, "loss/logits": 0.22196845710277557, "loss/reg": 0.00013684302393812686, "step": 1156 }, { "epoch": 0.144625, "grad_norm": 2.9665098190307617, "grad_norm_var": 13.559023886093758, "learning_rate": 0.0001, "loss": 1.3229, "loss/crossentropy": 2.670069694519043, "loss/hidden": 1.125, "loss/logits": 0.19656628370285034, "loss/reg": 0.00013678865798283368, "step": 1157 }, { "epoch": 0.14475, "grad_norm": 2.5074424743652344, "grad_norm_var": 13.53724146455332, "learning_rate": 0.0001, "loss": 1.0958, "loss/crossentropy": 2.710556983947754, "loss/hidden": 0.94140625, "loss/logits": 0.1530512571334839, "loss/reg": 0.00013672743807546794, "step": 1158 }, { "epoch": 0.144875, "grad_norm": 2.719710111618042, "grad_norm_var": 13.555425129051761, "learning_rate": 0.0001, "loss": 1.1819, "loss/crossentropy": 2.3385159969329834, "loss/hidden": 1.03125, "loss/logits": 0.14927588403224945, "loss/reg": 0.0001366620563203469, "step": 1159 }, { "epoch": 0.145, "grad_norm": 3.7135472297668457, "grad_norm_var": 13.418843751446328, "learning_rate": 0.0001, "loss": 1.6313, "loss/crossentropy": 2.5336427688598633, "loss/hidden": 1.3984375, "loss/logits": 0.23144850134849548, "loss/reg": 0.00013660837430506945, "step": 1160 }, { "epoch": 0.145125, "grad_norm": 2.395505905151367, "grad_norm_var": 13.513977741169754, "learning_rate": 0.0001, "loss": 1.1662, "loss/crossentropy": 2.542593479156494, "loss/hidden": 1.0078125, "loss/logits": 0.15698453783988953, "loss/reg": 0.00013655259681399912, "step": 1161 }, { "epoch": 0.14525, "grad_norm": 2.7838761806488037, "grad_norm_var": 13.43966707644489, "learning_rate": 0.0001, "loss": 1.1918, "loss/crossentropy": 2.3821299076080322, "loss/hidden": 1.03125, "loss/logits": 0.15919485688209534, "loss/reg": 0.0001364961644867435, "step": 1162 }, { "epoch": 0.145375, "grad_norm": 2.3815958499908447, "grad_norm_var": 13.481021419439019, "learning_rate": 0.0001, "loss": 1.4695, "loss/crossentropy": 2.259742259979248, "loss/hidden": 1.25, "loss/logits": 0.21814435720443726, "loss/reg": 0.00013643868442159146, "step": 1163 }, { "epoch": 0.1455, "grad_norm": 2.608128070831299, "grad_norm_var": 13.462111823556892, "learning_rate": 0.0001, "loss": 1.2584, "loss/crossentropy": 2.453218698501587, "loss/hidden": 1.078125, "loss/logits": 0.17890435457229614, "loss/reg": 0.0001363928458886221, "step": 1164 }, { "epoch": 0.145625, "grad_norm": 2.8419406414031982, "grad_norm_var": 13.34577801938156, "learning_rate": 0.0001, "loss": 1.265, "loss/crossentropy": 2.79464054107666, "loss/hidden": 1.0859375, "loss/logits": 0.17774024605751038, "loss/reg": 0.00013633618073072284, "step": 1165 }, { "epoch": 0.14575, "grad_norm": 2.8397867679595947, "grad_norm_var": 13.260844845423556, "learning_rate": 0.0001, "loss": 1.3585, "loss/crossentropy": 2.33562970161438, "loss/hidden": 1.1640625, "loss/logits": 0.19309183955192566, "loss/reg": 0.0001362798793707043, "step": 1166 }, { "epoch": 0.145875, "grad_norm": 2.6030402183532715, "grad_norm_var": 13.325231633435052, "learning_rate": 0.0001, "loss": 1.2291, "loss/crossentropy": 2.5360007286071777, "loss/hidden": 1.0703125, "loss/logits": 0.15742647647857666, "loss/reg": 0.0001362244802294299, "step": 1167 }, { "epoch": 0.146, "grad_norm": 2.5892679691314697, "grad_norm_var": 0.1480890851131022, "learning_rate": 0.0001, "loss": 1.1563, "loss/crossentropy": 2.503377914428711, "loss/hidden": 1.0, "loss/logits": 0.15494558215141296, "loss/reg": 0.00013617172953672707, "step": 1168 }, { "epoch": 0.146125, "grad_norm": 2.169832229614258, "grad_norm_var": 0.14054427483489462, "learning_rate": 0.0001, "loss": 1.1679, "loss/crossentropy": 2.554893732070923, "loss/hidden": 1.0078125, "loss/logits": 0.1587696522474289, "loss/reg": 0.00013611878966912627, "step": 1169 }, { "epoch": 0.14625, "grad_norm": 3.72001314163208, "grad_norm_var": 0.19485674329269287, "learning_rate": 0.0001, "loss": 1.7481, "loss/crossentropy": 2.5990328788757324, "loss/hidden": 1.4609375, "loss/logits": 0.2858317196369171, "loss/reg": 0.00013606912398245186, "step": 1170 }, { "epoch": 0.146375, "grad_norm": 2.3007895946502686, "grad_norm_var": 0.21079976746106296, "learning_rate": 0.0001, "loss": 1.3781, "loss/crossentropy": 2.4757027626037598, "loss/hidden": 1.171875, "loss/logits": 0.2048284411430359, "loss/reg": 0.00013601657701656222, "step": 1171 }, { "epoch": 0.1465, "grad_norm": 5.02172327041626, "grad_norm_var": 0.5152910500696917, "learning_rate": 0.0001, "loss": 1.2631, "loss/crossentropy": 2.521611213684082, "loss/hidden": 1.109375, "loss/logits": 0.1523493528366089, "loss/reg": 0.0001359651068923995, "step": 1172 }, { "epoch": 0.146625, "grad_norm": 2.5816080570220947, "grad_norm_var": 0.5203759730868484, "learning_rate": 0.0001, "loss": 1.241, "loss/crossentropy": 2.554847478866577, "loss/hidden": 1.046875, "loss/logits": 0.19280372560024261, "loss/reg": 0.00013591660535894334, "step": 1173 }, { "epoch": 0.14675, "grad_norm": 2.435493230819702, "grad_norm_var": 0.5240923598385959, "learning_rate": 0.0001, "loss": 1.2834, "loss/crossentropy": 2.398463010787964, "loss/hidden": 1.1015625, "loss/logits": 0.18044692277908325, "loss/reg": 0.00013586532440967858, "step": 1174 }, { "epoch": 0.146875, "grad_norm": 3.173790216445923, "grad_norm_var": 0.5286903148261556, "learning_rate": 0.0001, "loss": 1.2692, "loss/crossentropy": 2.698183059692383, "loss/hidden": 1.078125, "loss/logits": 0.18972782790660858, "loss/reg": 0.00013580928498413414, "step": 1175 }, { "epoch": 0.147, "grad_norm": 2.556936025619507, "grad_norm_var": 0.48452479724022623, "learning_rate": 0.0001, "loss": 1.2295, "loss/crossentropy": 2.5440683364868164, "loss/hidden": 1.0625, "loss/logits": 0.16567812860012054, "loss/reg": 0.00013575928460340947, "step": 1176 }, { "epoch": 0.147125, "grad_norm": 2.4822487831115723, "grad_norm_var": 0.4801698267392254, "learning_rate": 0.0001, "loss": 1.4901, "loss/crossentropy": 2.2783827781677246, "loss/hidden": 1.2734375, "loss/logits": 0.21532906591892242, "loss/reg": 0.0001357072324026376, "step": 1177 }, { "epoch": 0.14725, "grad_norm": 2.409527540206909, "grad_norm_var": 0.4906380689474863, "learning_rate": 0.0001, "loss": 1.4483, "loss/crossentropy": 2.6060824394226074, "loss/hidden": 1.2265625, "loss/logits": 0.22040531039237976, "loss/reg": 0.00013565810513682663, "step": 1178 }, { "epoch": 0.147375, "grad_norm": 2.468101978302002, "grad_norm_var": 0.48634059440448874, "learning_rate": 0.0001, "loss": 1.2981, "loss/crossentropy": 2.418004274368286, "loss/hidden": 1.1171875, "loss/logits": 0.1795501410961151, "loss/reg": 0.00013561184459831566, "step": 1179 }, { "epoch": 0.1475, "grad_norm": 2.7479474544525146, "grad_norm_var": 0.4839828513356148, "learning_rate": 0.0001, "loss": 1.6565, "loss/crossentropy": 2.561248302459717, "loss/hidden": 1.359375, "loss/logits": 0.2958047389984131, "loss/reg": 0.0001355713466182351, "step": 1180 }, { "epoch": 0.147625, "grad_norm": 3.4296419620513916, "grad_norm_var": 0.5081607026950972, "learning_rate": 0.0001, "loss": 1.4678, "loss/crossentropy": 2.72118878364563, "loss/hidden": 1.2421875, "loss/logits": 0.22430044412612915, "loss/reg": 0.0001355288695776835, "step": 1181 }, { "epoch": 0.14775, "grad_norm": 2.5870907306671143, "grad_norm_var": 0.5123478348270045, "learning_rate": 0.0001, "loss": 1.3042, "loss/crossentropy": 2.3236382007598877, "loss/hidden": 1.125, "loss/logits": 0.1778583824634552, "loss/reg": 0.00013548559218179435, "step": 1182 }, { "epoch": 0.147875, "grad_norm": 2.9039268493652344, "grad_norm_var": 0.5089083015178137, "learning_rate": 0.0001, "loss": 1.3207, "loss/crossentropy": 2.162961006164551, "loss/hidden": 1.140625, "loss/logits": 0.1786920726299286, "loss/reg": 0.00013541235239244998, "step": 1183 }, { "epoch": 0.148, "grad_norm": 2.55702543258667, "grad_norm_var": 0.5100882360989959, "learning_rate": 0.0001, "loss": 1.3182, "loss/crossentropy": 2.7152206897735596, "loss/hidden": 1.140625, "loss/logits": 0.17617307603359222, "loss/reg": 0.00013535344623960555, "step": 1184 }, { "epoch": 0.148125, "grad_norm": 2.3750383853912354, "grad_norm_var": 0.49420299731367073, "learning_rate": 0.0001, "loss": 1.1263, "loss/crossentropy": 2.632753849029541, "loss/hidden": 0.9765625, "loss/logits": 0.1483670175075531, "loss/reg": 0.00013529676652979106, "step": 1185 }, { "epoch": 0.14825, "grad_norm": 2.5940427780151367, "grad_norm_var": 0.4442424735461259, "learning_rate": 0.0001, "loss": 1.3401, "loss/crossentropy": 2.142656087875366, "loss/hidden": 1.1484375, "loss/logits": 0.1903262734413147, "loss/reg": 0.0001352417457383126, "step": 1186 }, { "epoch": 0.148375, "grad_norm": 3.9694278240203857, "grad_norm_var": 0.5096320665386922, "learning_rate": 0.0001, "loss": 1.5608, "loss/crossentropy": 2.4541471004486084, "loss/hidden": 1.3046875, "loss/logits": 0.254780113697052, "loss/reg": 0.00013517274055629969, "step": 1187 }, { "epoch": 0.1485, "grad_norm": 2.466203451156616, "grad_norm_var": 0.19258569198738443, "learning_rate": 0.0001, "loss": 1.3854, "loss/crossentropy": 2.4137227535247803, "loss/hidden": 1.1875, "loss/logits": 0.19650119543075562, "loss/reg": 0.00013511099677998573, "step": 1188 }, { "epoch": 0.148625, "grad_norm": 2.077420949935913, "grad_norm_var": 0.21869302596546328, "learning_rate": 0.0001, "loss": 1.223, "loss/crossentropy": 2.5744669437408447, "loss/hidden": 1.0546875, "loss/logits": 0.16699844598770142, "loss/reg": 0.00013503149966709316, "step": 1189 }, { "epoch": 0.14875, "grad_norm": 3.1861753463745117, "grad_norm_var": 0.22722667996163562, "learning_rate": 0.0001, "loss": 1.3791, "loss/crossentropy": 2.6713171005249023, "loss/hidden": 1.1640625, "loss/logits": 0.21371984481811523, "loss/reg": 0.00013494817540049553, "step": 1190 }, { "epoch": 0.148875, "grad_norm": 2.916372299194336, "grad_norm_var": 0.2167895345555903, "learning_rate": 0.0001, "loss": 1.2991, "loss/crossentropy": 2.588320255279541, "loss/hidden": 1.1171875, "loss/logits": 0.18051624298095703, "loss/reg": 0.00013486265379469842, "step": 1191 }, { "epoch": 0.149, "grad_norm": 2.8979673385620117, "grad_norm_var": 0.2160551334747955, "learning_rate": 0.0001, "loss": 1.302, "loss/crossentropy": 2.763566732406616, "loss/hidden": 1.125, "loss/logits": 0.1756616234779358, "loss/reg": 0.00013480265624821186, "step": 1192 }, { "epoch": 0.149125, "grad_norm": 2.264946460723877, "grad_norm_var": 0.22688755644441397, "learning_rate": 0.0001, "loss": 1.1639, "loss/crossentropy": 2.469449520111084, "loss/hidden": 1.0078125, "loss/logits": 0.15473109483718872, "loss/reg": 0.0001347199286101386, "step": 1193 }, { "epoch": 0.14925, "grad_norm": 2.6669046878814697, "grad_norm_var": 0.21966365009754157, "learning_rate": 0.0001, "loss": 1.3249, "loss/crossentropy": 2.6232573986053467, "loss/hidden": 1.125, "loss/logits": 0.19851422309875488, "loss/reg": 0.00013465284428093582, "step": 1194 }, { "epoch": 0.149375, "grad_norm": 3.952681064605713, "grad_norm_var": 0.30027308867669544, "learning_rate": 0.0001, "loss": 1.5957, "loss/crossentropy": 2.58945369720459, "loss/hidden": 1.3828125, "loss/logits": 0.21150372922420502, "loss/reg": 0.00013456812303047627, "step": 1195 }, { "epoch": 0.1495, "grad_norm": 2.4492130279541016, "grad_norm_var": 0.30989771926534004, "learning_rate": 0.0001, "loss": 1.3587, "loss/crossentropy": 2.6759867668151855, "loss/hidden": 1.15625, "loss/logits": 0.20107804238796234, "loss/reg": 0.00013448372192215174, "step": 1196 }, { "epoch": 0.149625, "grad_norm": 3.0652377605438232, "grad_norm_var": 0.28910493306883966, "learning_rate": 0.0001, "loss": 1.2333, "loss/crossentropy": 2.8479163646698, "loss/hidden": 1.0625, "loss/logits": 0.16946375370025635, "loss/reg": 0.0001344182383036241, "step": 1197 }, { "epoch": 0.14975, "grad_norm": 3.185439109802246, "grad_norm_var": 0.29384878933973746, "learning_rate": 0.0001, "loss": 1.2967, "loss/crossentropy": 2.4679136276245117, "loss/hidden": 1.109375, "loss/logits": 0.18601495027542114, "loss/reg": 0.0001343381591141224, "step": 1198 }, { "epoch": 0.149875, "grad_norm": 2.72277569770813, "grad_norm_var": 0.29448859530144184, "learning_rate": 0.0001, "loss": 1.2721, "loss/crossentropy": 2.5996079444885254, "loss/hidden": 1.0859375, "loss/logits": 0.18481579422950745, "loss/reg": 0.00013426129589788616, "step": 1199 }, { "epoch": 0.15, "grad_norm": 2.4012537002563477, "grad_norm_var": 0.3017615160651585, "learning_rate": 0.0001, "loss": 1.3213, "loss/crossentropy": 2.360085964202881, "loss/hidden": 1.1484375, "loss/logits": 0.17153772711753845, "loss/reg": 0.0001342054601991549, "step": 1200 }, { "epoch": 0.150125, "grad_norm": 2.443404197692871, "grad_norm_var": 0.29795710552664995, "learning_rate": 0.0001, "loss": 1.3169, "loss/crossentropy": 2.3268697261810303, "loss/hidden": 1.125, "loss/logits": 0.1905977874994278, "loss/reg": 0.00013413169654086232, "step": 1201 }, { "epoch": 0.15025, "grad_norm": 1.977095603942871, "grad_norm_var": 0.34105027466049764, "learning_rate": 0.0001, "loss": 1.1827, "loss/crossentropy": 2.583897113800049, "loss/hidden": 1.015625, "loss/logits": 0.16575005650520325, "loss/reg": 0.00013405534264165908, "step": 1202 }, { "epoch": 0.150375, "grad_norm": 3.518091917037964, "grad_norm_var": 0.2828155032343593, "learning_rate": 0.0001, "loss": 1.5366, "loss/crossentropy": 2.80460786819458, "loss/hidden": 1.296875, "loss/logits": 0.23834118247032166, "loss/reg": 0.00013399054296314716, "step": 1203 }, { "epoch": 0.1505, "grad_norm": 3.222933769226074, "grad_norm_var": 0.28876561060554284, "learning_rate": 0.0001, "loss": 1.2125, "loss/crossentropy": 2.760080337524414, "loss/hidden": 1.0546875, "loss/logits": 0.15649531781673431, "loss/reg": 0.00013391199172474444, "step": 1204 }, { "epoch": 0.150625, "grad_norm": 2.2016725540161133, "grad_norm_var": 0.2776064801276637, "learning_rate": 0.0001, "loss": 1.2438, "loss/crossentropy": 2.620450735092163, "loss/hidden": 1.078125, "loss/logits": 0.16430824995040894, "loss/reg": 0.00013385553029365838, "step": 1205 }, { "epoch": 0.15075, "grad_norm": 3.7037551403045654, "grad_norm_var": 0.3198258497783511, "learning_rate": 0.0001, "loss": 1.3619, "loss/crossentropy": 2.400062084197998, "loss/hidden": 1.1953125, "loss/logits": 0.1652124524116516, "loss/reg": 0.00013380117889028043, "step": 1206 }, { "epoch": 0.150875, "grad_norm": 2.8617329597473145, "grad_norm_var": 0.3195242326965245, "learning_rate": 0.0001, "loss": 1.3437, "loss/crossentropy": 2.319291591644287, "loss/hidden": 1.171875, "loss/logits": 0.17051541805267334, "loss/reg": 0.00013374279660638422, "step": 1207 }, { "epoch": 0.151, "grad_norm": 2.992048501968384, "grad_norm_var": 0.3207300248309205, "learning_rate": 0.0001, "loss": 1.3527, "loss/crossentropy": 2.7394344806671143, "loss/hidden": 1.15625, "loss/logits": 0.19507554173469543, "loss/reg": 0.00013368793588597327, "step": 1208 }, { "epoch": 0.151125, "grad_norm": 2.0492210388183594, "grad_norm_var": 0.34051920710794925, "learning_rate": 0.0001, "loss": 1.1625, "loss/crossentropy": 2.405010223388672, "loss/hidden": 1.0, "loss/logits": 0.1611155867576599, "loss/reg": 0.00013362312165554613, "step": 1209 }, { "epoch": 0.15125, "grad_norm": 2.209076404571533, "grad_norm_var": 0.3640847652187306, "learning_rate": 0.0001, "loss": 1.2133, "loss/crossentropy": 2.723270893096924, "loss/hidden": 1.0234375, "loss/logits": 0.18850207328796387, "loss/reg": 0.00013356722774915397, "step": 1210 }, { "epoch": 0.151375, "grad_norm": 2.0855367183685303, "grad_norm_var": 0.2974326601661526, "learning_rate": 0.0001, "loss": 1.3757, "loss/crossentropy": 2.317182779312134, "loss/hidden": 1.171875, "loss/logits": 0.20251649618148804, "loss/reg": 0.0001334957778453827, "step": 1211 }, { "epoch": 0.1515, "grad_norm": 2.122633218765259, "grad_norm_var": 0.3147153404789357, "learning_rate": 0.0001, "loss": 1.3026, "loss/crossentropy": 2.4646530151367188, "loss/hidden": 1.125, "loss/logits": 0.1762724220752716, "loss/reg": 0.00013341823068913072, "step": 1212 }, { "epoch": 0.151625, "grad_norm": 2.7330989837646484, "grad_norm_var": 0.304222924151577, "learning_rate": 0.0001, "loss": 1.3067, "loss/crossentropy": 2.6515402793884277, "loss/hidden": 1.1171875, "loss/logits": 0.18822193145751953, "loss/reg": 0.00013333684182725847, "step": 1213 }, { "epoch": 0.15175, "grad_norm": 1.7902295589447021, "grad_norm_var": 0.3266255177822944, "learning_rate": 0.0001, "loss": 1.1264, "loss/crossentropy": 2.328126907348633, "loss/hidden": 0.96875, "loss/logits": 0.15635313093662262, "loss/reg": 0.0001332624233327806, "step": 1214 }, { "epoch": 0.151875, "grad_norm": 3.4579367637634277, "grad_norm_var": 0.37590311404607823, "learning_rate": 0.0001, "loss": 1.2153, "loss/crossentropy": 2.7028801441192627, "loss/hidden": 1.0625, "loss/logits": 0.1514454334974289, "loss/reg": 0.000133179928525351, "step": 1215 }, { "epoch": 0.152, "grad_norm": 2.7429351806640625, "grad_norm_var": 0.37366210857489174, "learning_rate": 0.0001, "loss": 1.3943, "loss/crossentropy": 2.650686502456665, "loss/hidden": 1.1875, "loss/logits": 0.20544981956481934, "loss/reg": 0.00013312778901308775, "step": 1216 }, { "epoch": 0.152125, "grad_norm": 2.156160593032837, "grad_norm_var": 0.38604054230545987, "learning_rate": 0.0001, "loss": 1.1771, "loss/crossentropy": 2.36908221244812, "loss/hidden": 1.0234375, "loss/logits": 0.1522974669933319, "loss/reg": 0.00013307285553310066, "step": 1217 }, { "epoch": 0.15225, "grad_norm": 2.511603593826294, "grad_norm_var": 0.3585052771520537, "learning_rate": 0.0001, "loss": 1.3329, "loss/crossentropy": 2.6030852794647217, "loss/hidden": 1.140625, "loss/logits": 0.19095377624034882, "loss/reg": 0.00013300571299623698, "step": 1218 }, { "epoch": 0.152375, "grad_norm": 2.8069794178009033, "grad_norm_var": 0.3075572664012422, "learning_rate": 0.0001, "loss": 1.2637, "loss/crossentropy": 2.722777843475342, "loss/hidden": 1.09375, "loss/logits": 0.16862529516220093, "loss/reg": 0.00013293074152898043, "step": 1219 }, { "epoch": 0.1525, "grad_norm": 2.4026052951812744, "grad_norm_var": 0.2818063191756188, "learning_rate": 0.0001, "loss": 1.4556, "loss/crossentropy": 2.5976243019104004, "loss/hidden": 1.2421875, "loss/logits": 0.212128147482872, "loss/reg": 0.00013285950990393758, "step": 1220 }, { "epoch": 0.152625, "grad_norm": 3.155329465866089, "grad_norm_var": 0.29413997815926224, "learning_rate": 0.0001, "loss": 1.3657, "loss/crossentropy": 2.3139760494232178, "loss/hidden": 1.1796875, "loss/logits": 0.18470482528209686, "loss/reg": 0.00013278215192258358, "step": 1221 }, { "epoch": 0.15275, "grad_norm": 2.6437668800354004, "grad_norm_var": 0.2099655378788931, "learning_rate": 0.0001, "loss": 1.4202, "loss/crossentropy": 2.4007277488708496, "loss/hidden": 1.2109375, "loss/logits": 0.2078927457332611, "loss/reg": 0.0001327060890616849, "step": 1222 }, { "epoch": 0.152875, "grad_norm": 2.125824451446533, "grad_norm_var": 0.21274040988610235, "learning_rate": 0.0001, "loss": 1.3709, "loss/crossentropy": 2.3286030292510986, "loss/hidden": 1.15625, "loss/logits": 0.21329466998577118, "loss/reg": 0.000132655244669877, "step": 1223 }, { "epoch": 0.153, "grad_norm": 2.814157009124756, "grad_norm_var": 0.20302515690395923, "learning_rate": 0.0001, "loss": 1.4608, "loss/crossentropy": 2.698427677154541, "loss/hidden": 1.234375, "loss/logits": 0.22507141530513763, "loss/reg": 0.00013260301784612238, "step": 1224 }, { "epoch": 0.153125, "grad_norm": 2.5524885654449463, "grad_norm_var": 0.18941574820577572, "learning_rate": 0.0001, "loss": 1.3323, "loss/crossentropy": 2.941849708557129, "loss/hidden": 1.140625, "loss/logits": 0.19035504758358002, "loss/reg": 0.0001325382909271866, "step": 1225 }, { "epoch": 0.15325, "grad_norm": 2.5318713188171387, "grad_norm_var": 0.1825720178283497, "learning_rate": 0.0001, "loss": 1.1052, "loss/crossentropy": 2.652430772781372, "loss/hidden": 0.96484375, "loss/logits": 0.13903579115867615, "loss/reg": 0.00013248201867099851, "step": 1226 }, { "epoch": 0.153375, "grad_norm": 3.8172354698181152, "grad_norm_var": 0.2651620867786998, "learning_rate": 0.0001, "loss": 1.4834, "loss/crossentropy": 2.3483917713165283, "loss/hidden": 1.265625, "loss/logits": 0.21648316085338593, "loss/reg": 0.00013241711712908, "step": 1227 }, { "epoch": 0.1535, "grad_norm": 3.619731903076172, "grad_norm_var": 0.30041272082792314, "learning_rate": 0.0001, "loss": 1.5171, "loss/crossentropy": 2.7781167030334473, "loss/hidden": 1.3125, "loss/logits": 0.20323674380779266, "loss/reg": 0.00013236266386229545, "step": 1228 }, { "epoch": 0.153625, "grad_norm": 2.363253116607666, "grad_norm_var": 0.30936981667010904, "learning_rate": 0.0001, "loss": 1.1376, "loss/crossentropy": 2.244126558303833, "loss/hidden": 0.98046875, "loss/logits": 0.1558556854724884, "loss/reg": 0.00013230668264441192, "step": 1229 }, { "epoch": 0.15375, "grad_norm": 2.9101548194885254, "grad_norm_var": 0.249183202218948, "learning_rate": 0.0001, "loss": 1.3481, "loss/crossentropy": 2.692718505859375, "loss/hidden": 1.15625, "loss/logits": 0.19055041670799255, "loss/reg": 0.00013224301801528782, "step": 1230 }, { "epoch": 0.153875, "grad_norm": 3.0857272148132324, "grad_norm_var": 0.2246068795279295, "learning_rate": 0.0001, "loss": 1.343, "loss/crossentropy": 2.6534008979797363, "loss/hidden": 1.1484375, "loss/logits": 0.19319745898246765, "loss/reg": 0.00013219112588558346, "step": 1231 }, { "epoch": 0.154, "grad_norm": 2.4666452407836914, "grad_norm_var": 0.23019032150964827, "learning_rate": 0.0001, "loss": 1.2727, "loss/crossentropy": 2.40718936920166, "loss/hidden": 1.09375, "loss/logits": 0.17764654755592346, "loss/reg": 0.0001321397430729121, "step": 1232 }, { "epoch": 0.154125, "grad_norm": 2.8066935539245605, "grad_norm_var": 0.2053292955064722, "learning_rate": 0.0001, "loss": 1.3376, "loss/crossentropy": 1.9716176986694336, "loss/hidden": 1.15625, "loss/logits": 0.18000416457653046, "loss/reg": 0.0001320884475717321, "step": 1233 }, { "epoch": 0.15425, "grad_norm": 2.586003303527832, "grad_norm_var": 0.20292964943450897, "learning_rate": 0.0001, "loss": 1.5561, "loss/crossentropy": 2.3284027576446533, "loss/hidden": 1.3359375, "loss/logits": 0.21880823373794556, "loss/reg": 0.00013204067363403738, "step": 1234 }, { "epoch": 0.154375, "grad_norm": 2.3122806549072266, "grad_norm_var": 0.21730492377663282, "learning_rate": 0.0001, "loss": 1.4781, "loss/crossentropy": 2.4504876136779785, "loss/hidden": 1.25, "loss/logits": 0.22682304680347443, "loss/reg": 0.00013199099339544773, "step": 1235 }, { "epoch": 0.1545, "grad_norm": 2.5657150745391846, "grad_norm_var": 0.2111492148088395, "learning_rate": 0.0001, "loss": 1.4129, "loss/crossentropy": 2.5228676795959473, "loss/hidden": 1.203125, "loss/logits": 0.20841020345687866, "loss/reg": 0.00013194471830502152, "step": 1236 }, { "epoch": 0.154625, "grad_norm": 2.305783748626709, "grad_norm_var": 0.21287095702704448, "learning_rate": 0.0001, "loss": 1.3023, "loss/crossentropy": 2.5582048892974854, "loss/hidden": 1.1171875, "loss/logits": 0.18379032611846924, "loss/reg": 0.0001318956637987867, "step": 1237 }, { "epoch": 0.15475, "grad_norm": 6.51556921005249, "grad_norm_var": 1.1108534004440487, "learning_rate": 0.0001, "loss": 1.5037, "loss/crossentropy": 2.5778305530548096, "loss/hidden": 1.3125, "loss/logits": 0.1898770034313202, "loss/reg": 0.0001318511349381879, "step": 1238 }, { "epoch": 0.154875, "grad_norm": 2.255342721939087, "grad_norm_var": 1.097475721203551, "learning_rate": 0.0001, "loss": 1.2752, "loss/crossentropy": 2.427241325378418, "loss/hidden": 1.09375, "loss/logits": 0.1800854206085205, "loss/reg": 0.00013180031965021044, "step": 1239 }, { "epoch": 0.155, "grad_norm": 2.0070910453796387, "grad_norm_var": 1.1548791992379106, "learning_rate": 0.0001, "loss": 1.1802, "loss/crossentropy": 2.5689311027526855, "loss/hidden": 1.03125, "loss/logits": 0.14763739705085754, "loss/reg": 0.00013174394553061575, "step": 1240 }, { "epoch": 0.155125, "grad_norm": 1.9197287559509277, "grad_norm_var": 1.2108123637238188, "learning_rate": 0.0001, "loss": 1.1474, "loss/crossentropy": 2.6826183795928955, "loss/hidden": 0.9921875, "loss/logits": 0.15393030643463135, "loss/reg": 0.00013169572048354894, "step": 1241 }, { "epoch": 0.15525, "grad_norm": 3.082594156265259, "grad_norm_var": 1.2042566289612882, "learning_rate": 0.0001, "loss": 1.3082, "loss/crossentropy": 2.668745517730713, "loss/hidden": 1.1328125, "loss/logits": 0.17404061555862427, "loss/reg": 0.00013164858683012426, "step": 1242 }, { "epoch": 0.155375, "grad_norm": 2.144428253173828, "grad_norm_var": 1.17762883625489, "learning_rate": 0.0001, "loss": 1.2974, "loss/crossentropy": 2.5141208171844482, "loss/hidden": 1.1015625, "loss/logits": 0.19456884264945984, "loss/reg": 0.00013160427624825388, "step": 1243 }, { "epoch": 0.1555, "grad_norm": 2.3358635902404785, "grad_norm_var": 1.1418949794687323, "learning_rate": 0.0001, "loss": 1.4272, "loss/crossentropy": 2.2969248294830322, "loss/hidden": 1.2421875, "loss/logits": 0.18371343612670898, "loss/reg": 0.000131547189084813, "step": 1244 }, { "epoch": 0.155625, "grad_norm": 2.552175760269165, "grad_norm_var": 1.134914437715967, "learning_rate": 0.0001, "loss": 1.4681, "loss/crossentropy": 2.5100483894348145, "loss/hidden": 1.265625, "loss/logits": 0.20113971829414368, "loss/reg": 0.000131497741676867, "step": 1245 }, { "epoch": 0.15575, "grad_norm": 2.735105276107788, "grad_norm_var": 1.1328753899838278, "learning_rate": 0.0001, "loss": 1.4007, "loss/crossentropy": 2.3812525272369385, "loss/hidden": 1.203125, "loss/logits": 0.19628703594207764, "loss/reg": 0.00013145487173460424, "step": 1246 }, { "epoch": 0.155875, "grad_norm": 2.213937759399414, "grad_norm_var": 1.1390035833902816, "learning_rate": 0.0001, "loss": 1.4141, "loss/crossentropy": 2.236989736557007, "loss/hidden": 1.21875, "loss/logits": 0.19403648376464844, "loss/reg": 0.00013139851216692477, "step": 1247 }, { "epoch": 0.156, "grad_norm": 2.708592176437378, "grad_norm_var": 1.1359307923141908, "learning_rate": 0.0001, "loss": 1.3108, "loss/crossentropy": 2.858665943145752, "loss/hidden": 1.140625, "loss/logits": 0.16884824633598328, "loss/reg": 0.00013134452456142753, "step": 1248 }, { "epoch": 0.156125, "grad_norm": 2.7696828842163086, "grad_norm_var": 1.1354426796260404, "learning_rate": 0.0001, "loss": 1.5933, "loss/crossentropy": 2.1554453372955322, "loss/hidden": 1.328125, "loss/logits": 0.2638967037200928, "loss/reg": 0.00013130102888680995, "step": 1249 }, { "epoch": 0.15625, "grad_norm": 2.8501203060150146, "grad_norm_var": 1.1362064972022834, "learning_rate": 0.0001, "loss": 1.483, "loss/crossentropy": 2.5494585037231445, "loss/hidden": 1.234375, "loss/logits": 0.24731585383415222, "loss/reg": 0.00013124916586093605, "step": 1250 }, { "epoch": 0.156375, "grad_norm": 3.5710434913635254, "grad_norm_var": 1.1693874895407588, "learning_rate": 0.0001, "loss": 1.9298, "loss/crossentropy": 2.0444109439849854, "loss/hidden": 1.671875, "loss/logits": 0.2566450536251068, "loss/reg": 0.00013119846698828042, "step": 1251 }, { "epoch": 0.1565, "grad_norm": 2.7254796028137207, "grad_norm_var": 1.166347837510988, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.4209351539611816, "loss/hidden": 1.1171875, "loss/logits": 0.20506344735622406, "loss/reg": 0.0001311425439780578, "step": 1252 }, { "epoch": 0.156625, "grad_norm": 2.5141329765319824, "grad_norm_var": 1.1555182273977447, "learning_rate": 0.0001, "loss": 1.4183, "loss/crossentropy": 2.3118011951446533, "loss/hidden": 1.21875, "loss/logits": 0.19824537634849548, "loss/reg": 0.0001310960651608184, "step": 1253 }, { "epoch": 0.15675, "grad_norm": 2.9120190143585205, "grad_norm_var": 0.18491420642144263, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.4693243503570557, "loss/hidden": 1.2265625, "loss/logits": 0.19295932352542877, "loss/reg": 0.00013103905075695366, "step": 1254 }, { "epoch": 0.156875, "grad_norm": 3.4383442401885986, "grad_norm_var": 0.22100223131199428, "learning_rate": 0.0001, "loss": 1.4055, "loss/crossentropy": 2.5472207069396973, "loss/hidden": 1.171875, "loss/logits": 0.23234573006629944, "loss/reg": 0.00013098205090500414, "step": 1255 }, { "epoch": 0.157, "grad_norm": 2.3200254440307617, "grad_norm_var": 0.20008810942297614, "learning_rate": 0.0001, "loss": 1.2699, "loss/crossentropy": 2.771556854248047, "loss/hidden": 1.09375, "loss/logits": 0.17485584318637848, "loss/reg": 0.00013092001609038562, "step": 1256 }, { "epoch": 0.157125, "grad_norm": 2.514660358428955, "grad_norm_var": 0.162331635078944, "learning_rate": 0.0001, "loss": 1.3298, "loss/crossentropy": 2.408695697784424, "loss/hidden": 1.1328125, "loss/logits": 0.195638045668602, "loss/reg": 0.0001308704522671178, "step": 1257 }, { "epoch": 0.15725, "grad_norm": 3.057460069656372, "grad_norm_var": 0.16112838350247383, "learning_rate": 0.0001, "loss": 1.651, "loss/crossentropy": 2.2750370502471924, "loss/hidden": 1.40625, "loss/logits": 0.2434910386800766, "loss/reg": 0.0001308250502916053, "step": 1258 }, { "epoch": 0.157375, "grad_norm": 3.642026901245117, "grad_norm_var": 0.18833189738190845, "learning_rate": 0.0001, "loss": 1.4482, "loss/crossentropy": 2.28401780128479, "loss/hidden": 1.2421875, "loss/logits": 0.20467320084571838, "loss/reg": 0.00013078213669359684, "step": 1259 }, { "epoch": 0.1575, "grad_norm": 2.368194580078125, "grad_norm_var": 0.18638008361255606, "learning_rate": 0.0001, "loss": 1.3675, "loss/crossentropy": 2.653162956237793, "loss/hidden": 1.15625, "loss/logits": 0.20993557572364807, "loss/reg": 0.00013073279114905745, "step": 1260 }, { "epoch": 0.157625, "grad_norm": 2.2031679153442383, "grad_norm_var": 0.20579581905459085, "learning_rate": 0.0001, "loss": 1.478, "loss/crossentropy": 2.4636590480804443, "loss/hidden": 1.25, "loss/logits": 0.22670403122901917, "loss/reg": 0.00013067574764136225, "step": 1261 }, { "epoch": 0.15775, "grad_norm": 2.631377935409546, "grad_norm_var": 0.20714450236721405, "learning_rate": 0.0001, "loss": 1.3078, "loss/crossentropy": 2.599713087081909, "loss/hidden": 1.125, "loss/logits": 0.181521475315094, "loss/reg": 0.00013061884965281934, "step": 1262 }, { "epoch": 0.157875, "grad_norm": 2.171218156814575, "grad_norm_var": 0.21046867787754853, "learning_rate": 0.0001, "loss": 1.3096, "loss/crossentropy": 2.25119686126709, "loss/hidden": 1.125, "loss/logits": 0.1833014190196991, "loss/reg": 0.00013056538591627032, "step": 1263 }, { "epoch": 0.158, "grad_norm": 2.443354368209839, "grad_norm_var": 0.2172087127229074, "learning_rate": 0.0001, "loss": 1.1614, "loss/crossentropy": 2.7536213397979736, "loss/hidden": 0.99609375, "loss/logits": 0.16401053965091705, "loss/reg": 0.0001305162877542898, "step": 1264 }, { "epoch": 0.158125, "grad_norm": 2.5962986946105957, "grad_norm_var": 0.21882373373926095, "learning_rate": 0.0001, "loss": 1.2329, "loss/crossentropy": 2.4206035137176514, "loss/hidden": 1.0625, "loss/logits": 0.169047549366951, "loss/reg": 0.00013046478852629662, "step": 1265 }, { "epoch": 0.15825, "grad_norm": 2.7338926792144775, "grad_norm_var": 0.2180766868279323, "learning_rate": 0.0001, "loss": 1.3565, "loss/crossentropy": 2.7029006481170654, "loss/hidden": 1.15625, "loss/logits": 0.19893750548362732, "loss/reg": 0.00013040259364061058, "step": 1266 }, { "epoch": 0.158375, "grad_norm": 2.3763999938964844, "grad_norm_var": 0.17492842155059876, "learning_rate": 0.0001, "loss": 1.4126, "loss/crossentropy": 2.456965208053589, "loss/hidden": 1.21875, "loss/logits": 0.19252705574035645, "loss/reg": 0.00013035195297561586, "step": 1267 }, { "epoch": 0.1585, "grad_norm": 2.6634342670440674, "grad_norm_var": 0.17467285645274444, "learning_rate": 0.0001, "loss": 1.5335, "loss/crossentropy": 3.042233467102051, "loss/hidden": 1.25, "loss/logits": 0.28217369318008423, "loss/reg": 0.0001303115568589419, "step": 1268 }, { "epoch": 0.158625, "grad_norm": 3.157491683959961, "grad_norm_var": 0.1878901803747188, "learning_rate": 0.0001, "loss": 1.2198, "loss/crossentropy": 2.7382898330688477, "loss/hidden": 1.046875, "loss/logits": 0.171668142080307, "loss/reg": 0.00013027561362832785, "step": 1269 }, { "epoch": 0.15875, "grad_norm": 2.4261109828948975, "grad_norm_var": 0.1890295225800846, "learning_rate": 0.0001, "loss": 1.3294, "loss/crossentropy": 2.89754319190979, "loss/hidden": 1.1328125, "loss/logits": 0.19530890882015228, "loss/reg": 0.00013022631173953414, "step": 1270 }, { "epoch": 0.158875, "grad_norm": 3.107105016708374, "grad_norm_var": 0.16201763909256972, "learning_rate": 0.0001, "loss": 1.4813, "loss/crossentropy": 2.6303040981292725, "loss/hidden": 1.21875, "loss/logits": 0.2612677216529846, "loss/reg": 0.00013018911704421043, "step": 1271 }, { "epoch": 0.159, "grad_norm": 2.9736440181732178, "grad_norm_var": 0.15989516181449492, "learning_rate": 0.0001, "loss": 1.6058, "loss/crossentropy": 2.8325958251953125, "loss/hidden": 1.3359375, "loss/logits": 0.26858165860176086, "loss/reg": 0.00013013428542762995, "step": 1272 }, { "epoch": 0.159125, "grad_norm": 2.6073713302612305, "grad_norm_var": 0.15824495318188359, "learning_rate": 0.0001, "loss": 1.3127, "loss/crossentropy": 2.6577377319335938, "loss/hidden": 1.125, "loss/logits": 0.1864062249660492, "loss/reg": 0.00013007981760893017, "step": 1273 }, { "epoch": 0.15925, "grad_norm": 3.8253326416015625, "grad_norm_var": 0.2319598038283213, "learning_rate": 0.0001, "loss": 1.8366, "loss/crossentropy": 2.977168321609497, "loss/hidden": 1.4921875, "loss/logits": 0.34315794706344604, "loss/reg": 0.00013002852210775018, "step": 1274 }, { "epoch": 0.159375, "grad_norm": 3.0556631088256836, "grad_norm_var": 0.1833488732902121, "learning_rate": 0.0001, "loss": 1.5639, "loss/crossentropy": 2.3977134227752686, "loss/hidden": 1.296875, "loss/logits": 0.26569706201553345, "loss/reg": 0.00012998130114283413, "step": 1275 }, { "epoch": 0.1595, "grad_norm": 2.2236955165863037, "grad_norm_var": 0.19121526631862076, "learning_rate": 0.0001, "loss": 1.3187, "loss/crossentropy": 2.3185195922851562, "loss/hidden": 1.125, "loss/logits": 0.19236713647842407, "loss/reg": 0.00012993151904083788, "step": 1276 }, { "epoch": 0.159625, "grad_norm": 3.845000982284546, "grad_norm_var": 0.2509899799612024, "learning_rate": 0.0001, "loss": 1.8386, "loss/crossentropy": 2.461397409439087, "loss/hidden": 1.546875, "loss/logits": 0.29045963287353516, "loss/reg": 0.00012988719390705228, "step": 1277 }, { "epoch": 0.15975, "grad_norm": 2.4181408882141113, "grad_norm_var": 0.25869249706339753, "learning_rate": 0.0001, "loss": 1.3244, "loss/crossentropy": 2.566429853439331, "loss/hidden": 1.1171875, "loss/logits": 0.20595690608024597, "loss/reg": 0.00012983269698452204, "step": 1278 }, { "epoch": 0.159875, "grad_norm": 2.89695405960083, "grad_norm_var": 0.23183032275595963, "learning_rate": 0.0001, "loss": 1.316, "loss/crossentropy": 2.567094326019287, "loss/hidden": 1.125, "loss/logits": 0.18970975279808044, "loss/reg": 0.00012977914593648165, "step": 1279 }, { "epoch": 0.16, "grad_norm": 2.577812910079956, "grad_norm_var": 0.22595024760163654, "learning_rate": 0.0001, "loss": 1.3687, "loss/crossentropy": 2.8782851696014404, "loss/hidden": 1.1796875, "loss/logits": 0.18776485323905945, "loss/reg": 0.00012973452976439148, "step": 1280 }, { "epoch": 0.160125, "grad_norm": 2.6032321453094482, "grad_norm_var": 0.22572539759932866, "learning_rate": 0.0001, "loss": 1.4692, "loss/crossentropy": 2.5821444988250732, "loss/hidden": 1.234375, "loss/logits": 0.2335243672132492, "loss/reg": 0.00012968084774911404, "step": 1281 }, { "epoch": 0.16025, "grad_norm": 2.183288812637329, "grad_norm_var": 0.2526982346235037, "learning_rate": 0.0001, "loss": 1.3595, "loss/crossentropy": 2.485654354095459, "loss/hidden": 1.15625, "loss/logits": 0.2019890993833542, "loss/reg": 0.00012963595509063452, "step": 1282 }, { "epoch": 0.160375, "grad_norm": 2.9796805381774902, "grad_norm_var": 0.24066442479889527, "learning_rate": 0.0001, "loss": 1.5, "loss/crossentropy": 2.5237677097320557, "loss/hidden": 1.2578125, "loss/logits": 0.24090701341629028, "loss/reg": 0.00012958442675881088, "step": 1283 }, { "epoch": 0.1605, "grad_norm": 2.9147651195526123, "grad_norm_var": 0.238477785149833, "learning_rate": 0.0001, "loss": 1.3264, "loss/crossentropy": 3.1219584941864014, "loss/hidden": 1.125, "loss/logits": 0.2001497447490692, "loss/reg": 0.00012952966790180653, "step": 1284 }, { "epoch": 0.160625, "grad_norm": 2.7882018089294434, "grad_norm_var": 0.23246173572475162, "learning_rate": 0.0001, "loss": 1.4386, "loss/crossentropy": 2.2280428409576416, "loss/hidden": 1.265625, "loss/logits": 0.1717071682214737, "loss/reg": 0.00012947934737894684, "step": 1285 }, { "epoch": 0.16075, "grad_norm": 2.386415481567383, "grad_norm_var": 0.23474619211517525, "learning_rate": 0.0001, "loss": 1.1904, "loss/crossentropy": 2.713867425918579, "loss/hidden": 1.0390625, "loss/logits": 0.1500096619129181, "loss/reg": 0.00012942435569129884, "step": 1286 }, { "epoch": 0.160875, "grad_norm": 2.693232297897339, "grad_norm_var": 0.23052699945596847, "learning_rate": 0.0001, "loss": 1.2645, "loss/crossentropy": 2.5223896503448486, "loss/hidden": 1.1015625, "loss/logits": 0.16161540150642395, "loss/reg": 0.00012936738494317979, "step": 1287 }, { "epoch": 0.161, "grad_norm": 2.1552627086639404, "grad_norm_var": 0.25461460197200175, "learning_rate": 0.0001, "loss": 1.3657, "loss/crossentropy": 2.4290008544921875, "loss/hidden": 1.15625, "loss/logits": 0.20817135274410248, "loss/reg": 0.00012930792581755668, "step": 1288 }, { "epoch": 0.161125, "grad_norm": 2.5147931575775146, "grad_norm_var": 0.2570296928988377, "learning_rate": 0.0001, "loss": 1.2111, "loss/crossentropy": 2.454050302505493, "loss/hidden": 1.03125, "loss/logits": 0.17852438986301422, "loss/reg": 0.0001292482775170356, "step": 1289 }, { "epoch": 0.16125, "grad_norm": 2.903743267059326, "grad_norm_var": 0.17844937818686024, "learning_rate": 0.0001, "loss": 1.4787, "loss/crossentropy": 2.6197073459625244, "loss/hidden": 1.25, "loss/logits": 0.2273719757795334, "loss/reg": 0.00012918114953208715, "step": 1290 }, { "epoch": 0.161375, "grad_norm": 4.084197044372559, "grad_norm_var": 0.29385715513967603, "learning_rate": 0.0001, "loss": 1.4822, "loss/crossentropy": 2.8456625938415527, "loss/hidden": 1.21875, "loss/logits": 0.262115478515625, "loss/reg": 0.00012912609963677824, "step": 1291 }, { "epoch": 0.1615, "grad_norm": 2.139212131500244, "grad_norm_var": 0.30035034666765625, "learning_rate": 0.0001, "loss": 1.1946, "loss/crossentropy": 2.4731454849243164, "loss/hidden": 1.03125, "loss/logits": 0.16209183633327484, "loss/reg": 0.00012907307245768607, "step": 1292 }, { "epoch": 0.161625, "grad_norm": 3.6615920066833496, "grad_norm_var": 0.2758033248990195, "learning_rate": 0.0001, "loss": 1.5982, "loss/crossentropy": 2.2579526901245117, "loss/hidden": 1.3671875, "loss/logits": 0.22969064116477966, "loss/reg": 0.0001290114742005244, "step": 1293 }, { "epoch": 0.16175, "grad_norm": 2.791762113571167, "grad_norm_var": 0.2683056467845963, "learning_rate": 0.0001, "loss": 1.2839, "loss/crossentropy": 2.3603663444519043, "loss/hidden": 1.125, "loss/logits": 0.157606840133667, "loss/reg": 0.0001289561332669109, "step": 1294 }, { "epoch": 0.161875, "grad_norm": 2.6849453449249268, "grad_norm_var": 0.26744514936397246, "learning_rate": 0.0001, "loss": 1.3492, "loss/crossentropy": 2.3472578525543213, "loss/hidden": 1.15625, "loss/logits": 0.19165396690368652, "loss/reg": 0.00012889599020127207, "step": 1295 }, { "epoch": 0.162, "grad_norm": 2.939249277114868, "grad_norm_var": 0.26712480356963486, "learning_rate": 0.0001, "loss": 1.1803, "loss/crossentropy": 2.7723758220672607, "loss/hidden": 1.03125, "loss/logits": 0.14771811664104462, "loss/reg": 0.00012883119052276015, "step": 1296 }, { "epoch": 0.162125, "grad_norm": 2.64975643157959, "grad_norm_var": 0.26618542907555137, "learning_rate": 0.0001, "loss": 1.229, "loss/crossentropy": 2.313321590423584, "loss/hidden": 1.0703125, "loss/logits": 0.15737277269363403, "loss/reg": 0.00012876400433015078, "step": 1297 }, { "epoch": 0.16225, "grad_norm": 2.8204500675201416, "grad_norm_var": 0.24091791211750432, "learning_rate": 0.0001, "loss": 1.7007, "loss/crossentropy": 2.386540651321411, "loss/hidden": 1.390625, "loss/logits": 0.3088032603263855, "loss/reg": 0.00012869405327364802, "step": 1298 }, { "epoch": 0.162375, "grad_norm": 2.5313658714294434, "grad_norm_var": 0.2438869887733252, "learning_rate": 0.0001, "loss": 1.4224, "loss/crossentropy": 2.6738691329956055, "loss/hidden": 1.1953125, "loss/logits": 0.22582921385765076, "loss/reg": 0.00012863799929618835, "step": 1299 }, { "epoch": 0.1625, "grad_norm": 2.710449457168579, "grad_norm_var": 0.24312943683976204, "learning_rate": 0.0001, "loss": 1.5886, "loss/crossentropy": 2.1685776710510254, "loss/hidden": 1.34375, "loss/logits": 0.2435504049062729, "loss/reg": 0.00012857552792411298, "step": 1300 }, { "epoch": 0.162625, "grad_norm": 3.143795967102051, "grad_norm_var": 0.25149643895756524, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.500770092010498, "loss/hidden": 1.1953125, "loss/logits": 0.22680968046188354, "loss/reg": 0.0001285048492718488, "step": 1301 }, { "epoch": 0.16275, "grad_norm": 3.1476283073425293, "grad_norm_var": 0.2456701240621205, "learning_rate": 0.0001, "loss": 1.5075, "loss/crossentropy": 2.1270394325256348, "loss/hidden": 1.3046875, "loss/logits": 0.20148611068725586, "loss/reg": 0.0001284321624552831, "step": 1302 }, { "epoch": 0.162875, "grad_norm": 2.4907288551330566, "grad_norm_var": 0.252417698256414, "learning_rate": 0.0001, "loss": 1.6744, "loss/crossentropy": 2.4124698638916016, "loss/hidden": 1.3671875, "loss/logits": 0.30596286058425903, "loss/reg": 0.00012837971735280007, "step": 1303 }, { "epoch": 0.163, "grad_norm": 3.0680148601531982, "grad_norm_var": 0.2216953162841887, "learning_rate": 0.0001, "loss": 1.2421, "loss/crossentropy": 2.7073466777801514, "loss/hidden": 1.0703125, "loss/logits": 0.17053528130054474, "loss/reg": 0.000128315354231745, "step": 1304 }, { "epoch": 0.163125, "grad_norm": 3.8425657749176025, "grad_norm_var": 0.26499509877866456, "learning_rate": 0.0001, "loss": 1.7052, "loss/crossentropy": 2.2696022987365723, "loss/hidden": 1.46875, "loss/logits": 0.23520112037658691, "loss/reg": 0.00012824854638893157, "step": 1305 }, { "epoch": 0.16325, "grad_norm": 2.483870267868042, "grad_norm_var": 0.28003569138028295, "learning_rate": 0.0001, "loss": 1.2635, "loss/crossentropy": 2.5698344707489014, "loss/hidden": 1.078125, "loss/logits": 0.18404905498027802, "loss/reg": 0.00012819665425922722, "step": 1306 }, { "epoch": 0.163375, "grad_norm": 2.8548123836517334, "grad_norm_var": 0.1884753839607202, "learning_rate": 0.0001, "loss": 1.5406, "loss/crossentropy": 2.3930320739746094, "loss/hidden": 1.3203125, "loss/logits": 0.21898691356182098, "loss/reg": 0.00012813655484933406, "step": 1307 }, { "epoch": 0.1635, "grad_norm": 2.498481273651123, "grad_norm_var": 0.1614155721397516, "learning_rate": 0.0001, "loss": 1.4919, "loss/crossentropy": 2.3816494941711426, "loss/hidden": 1.265625, "loss/logits": 0.22494632005691528, "loss/reg": 0.00012808511382900178, "step": 1308 }, { "epoch": 0.163625, "grad_norm": 2.6505322456359863, "grad_norm_var": 0.12195849617634522, "learning_rate": 0.0001, "loss": 1.1997, "loss/crossentropy": 2.4359896183013916, "loss/hidden": 1.03125, "loss/logits": 0.1672007143497467, "loss/reg": 0.00012802917626686394, "step": 1309 }, { "epoch": 0.16375, "grad_norm": 2.3046913146972656, "grad_norm_var": 0.1393844511746579, "learning_rate": 0.0001, "loss": 1.1847, "loss/crossentropy": 2.5967977046966553, "loss/hidden": 1.0546875, "loss/logits": 0.1287800669670105, "loss/reg": 0.00012797542149201035, "step": 1310 }, { "epoch": 0.163875, "grad_norm": 2.7136170864105225, "grad_norm_var": 0.1389908899200833, "learning_rate": 0.0001, "loss": 1.2287, "loss/crossentropy": 2.5475406646728516, "loss/hidden": 1.0625, "loss/logits": 0.1649061143398285, "loss/reg": 0.0001279206044273451, "step": 1311 }, { "epoch": 0.164, "grad_norm": 2.2551400661468506, "grad_norm_var": 0.1558247657863035, "learning_rate": 0.0001, "loss": 1.2788, "loss/crossentropy": 2.7481167316436768, "loss/hidden": 1.0859375, "loss/logits": 0.19162803888320923, "loss/reg": 0.00012786875595338643, "step": 1312 }, { "epoch": 0.164125, "grad_norm": 2.274029016494751, "grad_norm_var": 0.17018930372417246, "learning_rate": 0.0001, "loss": 1.2598, "loss/crossentropy": 2.7532927989959717, "loss/hidden": 1.078125, "loss/logits": 0.1803492307662964, "loss/reg": 0.00012782019621226937, "step": 1313 }, { "epoch": 0.16425, "grad_norm": 2.079087257385254, "grad_norm_var": 0.1962802878084517, "learning_rate": 0.0001, "loss": 1.1634, "loss/crossentropy": 2.4910993576049805, "loss/hidden": 1.0078125, "loss/logits": 0.15434816479682922, "loss/reg": 0.0001277825649594888, "step": 1314 }, { "epoch": 0.164375, "grad_norm": 5.4168291091918945, "grad_norm_var": 0.6554060181215988, "learning_rate": 0.0001, "loss": 1.5576, "loss/crossentropy": 2.4851670265197754, "loss/hidden": 1.3125, "loss/logits": 0.24378037452697754, "loss/reg": 0.0001277408591704443, "step": 1315 }, { "epoch": 0.1645, "grad_norm": 2.9020419120788574, "grad_norm_var": 0.6536016346820094, "learning_rate": 0.0001, "loss": 1.2172, "loss/crossentropy": 2.870258331298828, "loss/hidden": 1.0546875, "loss/logits": 0.1612808108329773, "loss/reg": 0.00012769039312843233, "step": 1316 }, { "epoch": 0.164625, "grad_norm": 2.439941644668579, "grad_norm_var": 0.6600773152385765, "learning_rate": 0.0001, "loss": 1.262, "loss/crossentropy": 2.584825277328491, "loss/hidden": 1.0859375, "loss/logits": 0.1748025119304657, "loss/reg": 0.00012765044812113047, "step": 1317 }, { "epoch": 0.16475, "grad_norm": 2.339271306991577, "grad_norm_var": 0.6676397372184072, "learning_rate": 0.0001, "loss": 1.1637, "loss/crossentropy": 2.7072770595550537, "loss/hidden": 1.0, "loss/logits": 0.16244381666183472, "loss/reg": 0.00012760011304635555, "step": 1318 }, { "epoch": 0.164875, "grad_norm": 5.329165458679199, "grad_norm_var": 1.0585464311946413, "learning_rate": 0.0001, "loss": 1.9104, "loss/crossentropy": 2.6075241565704346, "loss/hidden": 1.6328125, "loss/logits": 0.27628904581069946, "loss/reg": 0.00012754485942423344, "step": 1319 }, { "epoch": 0.165, "grad_norm": 4.464590549468994, "grad_norm_var": 1.1994895998809463, "learning_rate": 0.0001, "loss": 1.6252, "loss/crossentropy": 2.2507588863372803, "loss/hidden": 1.375, "loss/logits": 0.24888555705547333, "loss/reg": 0.0001274889800697565, "step": 1320 }, { "epoch": 0.165125, "grad_norm": 2.8431396484375, "grad_norm_var": 1.156708416781807, "learning_rate": 0.0001, "loss": 1.2876, "loss/crossentropy": 2.3549301624298096, "loss/hidden": 1.109375, "loss/logits": 0.17691946029663086, "loss/reg": 0.0001274366513825953, "step": 1321 }, { "epoch": 0.16525, "grad_norm": 2.583818197250366, "grad_norm_var": 1.15058018713028, "learning_rate": 0.0001, "loss": 1.4045, "loss/crossentropy": 2.6264071464538574, "loss/hidden": 1.1953125, "loss/logits": 0.2078964114189148, "loss/reg": 0.000127394130686298, "step": 1322 }, { "epoch": 0.165375, "grad_norm": 2.4748754501342773, "grad_norm_var": 1.166796266948153, "learning_rate": 0.0001, "loss": 1.4435, "loss/crossentropy": 2.6050422191619873, "loss/hidden": 1.25, "loss/logits": 0.19220617413520813, "loss/reg": 0.00012734108895529062, "step": 1323 }, { "epoch": 0.1655, "grad_norm": 2.0183868408203125, "grad_norm_var": 1.211582113782068, "learning_rate": 0.0001, "loss": 1.0972, "loss/crossentropy": 2.7528345584869385, "loss/hidden": 0.95703125, "loss/logits": 0.13893622159957886, "loss/reg": 0.0001272816734854132, "step": 1324 }, { "epoch": 0.165625, "grad_norm": 3.0193448066711426, "grad_norm_var": 1.2056978723657645, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.208976984024048, "loss/hidden": 1.453125, "loss/logits": 0.23441238701343536, "loss/reg": 0.00012723378313239664, "step": 1325 }, { "epoch": 0.16575, "grad_norm": 3.5611555576324463, "grad_norm_var": 1.1935580529740868, "learning_rate": 0.0001, "loss": 1.3402, "loss/crossentropy": 2.3863472938537598, "loss/hidden": 1.1328125, "loss/logits": 0.20608696341514587, "loss/reg": 0.00012717633217107505, "step": 1326 }, { "epoch": 0.165875, "grad_norm": 2.73720383644104, "grad_norm_var": 1.1925517518214346, "learning_rate": 0.0001, "loss": 1.2955, "loss/crossentropy": 2.7537636756896973, "loss/hidden": 1.109375, "loss/logits": 0.18484225869178772, "loss/reg": 0.00012713104661088437, "step": 1327 }, { "epoch": 0.166, "grad_norm": 77.81947326660156, "grad_norm_var": 350.0962004485157, "learning_rate": 0.0001, "loss": 1.4591, "loss/crossentropy": 2.6545777320861816, "loss/hidden": 1.296875, "loss/logits": 0.16092628240585327, "loss/reg": 0.0001270852517336607, "step": 1328 }, { "epoch": 0.166125, "grad_norm": 3.1249680519104004, "grad_norm_var": 349.5180168247422, "learning_rate": 0.0001, "loss": 1.4949, "loss/crossentropy": 2.5301332473754883, "loss/hidden": 1.265625, "loss/logits": 0.22799977660179138, "loss/reg": 0.0001270338980248198, "step": 1329 }, { "epoch": 0.16625, "grad_norm": 3.399801731109619, "grad_norm_var": 348.6157207845411, "learning_rate": 0.0001, "loss": 1.3205, "loss/crossentropy": 2.6434967517852783, "loss/hidden": 1.140625, "loss/logits": 0.17864950001239777, "loss/reg": 0.00012698805949185044, "step": 1330 }, { "epoch": 0.166375, "grad_norm": 2.3725063800811768, "grad_norm_var": 350.20478525317105, "learning_rate": 0.0001, "loss": 1.2966, "loss/crossentropy": 2.446640968322754, "loss/hidden": 1.1171875, "loss/logits": 0.1781914234161377, "loss/reg": 0.0001269360218429938, "step": 1331 }, { "epoch": 0.1665, "grad_norm": 2.300149917602539, "grad_norm_var": 350.6136264294236, "learning_rate": 0.0001, "loss": 1.2311, "loss/crossentropy": 2.46268630027771, "loss/hidden": 1.046875, "loss/logits": 0.18293406069278717, "loss/reg": 0.00012688270362559706, "step": 1332 }, { "epoch": 0.166625, "grad_norm": 2.4820268154144287, "grad_norm_var": 350.5843516032387, "learning_rate": 0.0001, "loss": 1.3156, "loss/crossentropy": 2.578028917312622, "loss/hidden": 1.140625, "loss/logits": 0.17366383969783783, "loss/reg": 0.0001268255291506648, "step": 1333 }, { "epoch": 0.16675, "grad_norm": 4.75213098526001, "grad_norm_var": 349.2302328487316, "learning_rate": 0.0001, "loss": 1.6415, "loss/crossentropy": 2.6102542877197266, "loss/hidden": 1.4296875, "loss/logits": 0.21051731705665588, "loss/reg": 0.00012677241466008127, "step": 1334 }, { "epoch": 0.166875, "grad_norm": 2.4282379150390625, "grad_norm_var": 350.7235589547256, "learning_rate": 0.0001, "loss": 1.1594, "loss/crossentropy": 2.4482662677764893, "loss/hidden": 1.0078125, "loss/logits": 0.15035338699817657, "loss/reg": 0.00012672167213167995, "step": 1335 }, { "epoch": 0.167, "grad_norm": 2.238334894180298, "grad_norm_var": 351.97852298786563, "learning_rate": 0.0001, "loss": 1.1638, "loss/crossentropy": 2.4557130336761475, "loss/hidden": 1.0078125, "loss/logits": 0.15474534034729004, "loss/reg": 0.00012666883412748575, "step": 1336 }, { "epoch": 0.167125, "grad_norm": 3.0490801334381104, "grad_norm_var": 351.853035270601, "learning_rate": 0.0001, "loss": 1.3434, "loss/crossentropy": 2.395153284072876, "loss/hidden": 1.140625, "loss/logits": 0.20155774056911469, "loss/reg": 0.00012660547508858144, "step": 1337 }, { "epoch": 0.16725, "grad_norm": 2.8521084785461426, "grad_norm_var": 351.68086394765527, "learning_rate": 0.0001, "loss": 1.2533, "loss/crossentropy": 2.9391930103302, "loss/hidden": 1.0859375, "loss/logits": 0.16611135005950928, "loss/reg": 0.0001265349128516391, "step": 1338 }, { "epoch": 0.167375, "grad_norm": 2.8729686737060547, "grad_norm_var": 351.42195048890835, "learning_rate": 0.0001, "loss": 1.3914, "loss/crossentropy": 2.5734429359436035, "loss/hidden": 1.21875, "loss/logits": 0.1714039444923401, "loss/reg": 0.00012647398398257792, "step": 1339 }, { "epoch": 0.1675, "grad_norm": 2.1493406295776367, "grad_norm_var": 351.3261888553087, "learning_rate": 0.0001, "loss": 1.2662, "loss/crossentropy": 2.724952459335327, "loss/hidden": 1.078125, "loss/logits": 0.18683390319347382, "loss/reg": 0.0001264003076357767, "step": 1340 }, { "epoch": 0.167625, "grad_norm": 2.4122562408447266, "grad_norm_var": 351.7177735237798, "learning_rate": 0.0001, "loss": 1.2412, "loss/crossentropy": 2.563736915588379, "loss/hidden": 1.0546875, "loss/logits": 0.18529221415519714, "loss/reg": 0.00012633373262360692, "step": 1341 }, { "epoch": 0.16775, "grad_norm": 2.167365312576294, "grad_norm_var": 352.57758741079243, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.3570609092712402, "loss/hidden": 1.1328125, "loss/logits": 0.19091476500034332, "loss/reg": 0.00012628218973986804, "step": 1342 }, { "epoch": 0.167875, "grad_norm": 2.9273107051849365, "grad_norm_var": 352.4604548215283, "learning_rate": 0.0001, "loss": 1.5767, "loss/crossentropy": 2.8456368446350098, "loss/hidden": 1.3359375, "loss/logits": 0.23951734602451324, "loss/reg": 0.00012623280053958297, "step": 1343 }, { "epoch": 0.168, "grad_norm": 2.712193727493286, "grad_norm_var": 0.4207964667369062, "learning_rate": 0.0001, "loss": 1.7486, "loss/crossentropy": 1.8741474151611328, "loss/hidden": 1.4609375, "loss/logits": 0.28636929392814636, "loss/reg": 0.00012617842003237456, "step": 1344 }, { "epoch": 0.168125, "grad_norm": 2.150104284286499, "grad_norm_var": 0.43341096032493925, "learning_rate": 0.0001, "loss": 1.3218, "loss/crossentropy": 2.527280330657959, "loss/hidden": 1.140625, "loss/logits": 0.17993897199630737, "loss/reg": 0.00012612607679329813, "step": 1345 }, { "epoch": 0.16825, "grad_norm": 2.129319190979004, "grad_norm_var": 0.416446928786641, "learning_rate": 0.0001, "loss": 1.1885, "loss/crossentropy": 2.4787251949310303, "loss/hidden": 1.03125, "loss/logits": 0.1559671014547348, "loss/reg": 0.00012608838733285666, "step": 1346 }, { "epoch": 0.168375, "grad_norm": 3.420137405395508, "grad_norm_var": 0.44981310816821346, "learning_rate": 0.0001, "loss": 1.3486, "loss/crossentropy": 2.3244402408599854, "loss/hidden": 1.15625, "loss/logits": 0.19109143316745758, "loss/reg": 0.0001260630670003593, "step": 1347 }, { "epoch": 0.1685, "grad_norm": 3.105586528778076, "grad_norm_var": 0.44847143841342524, "learning_rate": 0.0001, "loss": 1.5578, "loss/crossentropy": 2.773986577987671, "loss/hidden": 1.3203125, "loss/logits": 0.236245796084404, "loss/reg": 0.00012601265916600823, "step": 1348 }, { "epoch": 0.168625, "grad_norm": 2.557159662246704, "grad_norm_var": 0.4462346230410406, "learning_rate": 0.0001, "loss": 1.3656, "loss/crossentropy": 2.7838551998138428, "loss/hidden": 1.1484375, "loss/logits": 0.21587368845939636, "loss/reg": 0.0001259811397176236, "step": 1349 }, { "epoch": 0.16875, "grad_norm": 2.4449706077575684, "grad_norm_var": 0.16155490838788916, "learning_rate": 0.0001, "loss": 1.4412, "loss/crossentropy": 2.4866108894348145, "loss/hidden": 1.234375, "loss/logits": 0.2055819034576416, "loss/reg": 0.00012593007704708725, "step": 1350 }, { "epoch": 0.168875, "grad_norm": 2.6294541358947754, "grad_norm_var": 0.15944960638838074, "learning_rate": 0.0001, "loss": 1.3829, "loss/crossentropy": 2.5440731048583984, "loss/hidden": 1.171875, "loss/logits": 0.2097521424293518, "loss/reg": 0.00012590413098223507, "step": 1351 }, { "epoch": 0.169, "grad_norm": 2.1530978679656982, "grad_norm_var": 0.16416861938656074, "learning_rate": 0.0001, "loss": 1.2267, "loss/crossentropy": 2.2171056270599365, "loss/hidden": 1.0625, "loss/logits": 0.16290676593780518, "loss/reg": 0.00012586945376824588, "step": 1352 }, { "epoch": 0.169125, "grad_norm": 2.949974298477173, "grad_norm_var": 0.15895768844162012, "learning_rate": 0.0001, "loss": 1.4648, "loss/crossentropy": 2.3707659244537354, "loss/hidden": 1.265625, "loss/logits": 0.19793057441711426, "loss/reg": 0.00012584343494381756, "step": 1353 }, { "epoch": 0.16925, "grad_norm": 6.300345420837402, "grad_norm_var": 1.0170561921155883, "learning_rate": 0.0001, "loss": 1.3685, "loss/crossentropy": 2.1804256439208984, "loss/hidden": 1.2109375, "loss/logits": 0.15635056793689728, "loss/reg": 0.00012579603935591877, "step": 1354 }, { "epoch": 0.169375, "grad_norm": 3.361546039581299, "grad_norm_var": 1.0355824120281263, "learning_rate": 0.0001, "loss": 1.5423, "loss/crossentropy": 2.1024653911590576, "loss/hidden": 1.359375, "loss/logits": 0.18161754310131073, "loss/reg": 0.0001257634867215529, "step": 1355 }, { "epoch": 0.1695, "grad_norm": 2.355725049972534, "grad_norm_var": 1.019015197068226, "learning_rate": 0.0001, "loss": 1.2076, "loss/crossentropy": 2.6107654571533203, "loss/hidden": 1.03125, "loss/logits": 0.1750730276107788, "loss/reg": 0.00012571582919918, "step": 1356 }, { "epoch": 0.169625, "grad_norm": 3.752018690109253, "grad_norm_var": 1.0510329712069637, "learning_rate": 0.0001, "loss": 1.4337, "loss/crossentropy": 2.5731000900268555, "loss/hidden": 1.234375, "loss/logits": 0.19804102182388306, "loss/reg": 0.00012567441444844007, "step": 1357 }, { "epoch": 0.16975, "grad_norm": 3.3008177280426025, "grad_norm_var": 1.0138408949900923, "learning_rate": 0.0001, "loss": 1.4185, "loss/crossentropy": 2.1232943534851074, "loss/hidden": 1.25, "loss/logits": 0.1672396957874298, "loss/reg": 0.00012562419578898698, "step": 1358 }, { "epoch": 0.169875, "grad_norm": 2.22627329826355, "grad_norm_var": 1.0528102243608428, "learning_rate": 0.0001, "loss": 1.2746, "loss/crossentropy": 2.3972342014312744, "loss/hidden": 1.0859375, "loss/logits": 0.1874513179063797, "loss/reg": 0.0001255742972716689, "step": 1359 }, { "epoch": 0.17, "grad_norm": 2.255157947540283, "grad_norm_var": 1.0816849552026724, "learning_rate": 0.0001, "loss": 1.3757, "loss/crossentropy": 2.398299217224121, "loss/hidden": 1.171875, "loss/logits": 0.20254850387573242, "loss/reg": 0.00012553844135254622, "step": 1360 }, { "epoch": 0.170125, "grad_norm": 2.1013174057006836, "grad_norm_var": 1.086992935554847, "learning_rate": 0.0001, "loss": 1.2796, "loss/crossentropy": 2.773859739303589, "loss/hidden": 1.1015625, "loss/logits": 0.17682811617851257, "loss/reg": 0.0001255036477232352, "step": 1361 }, { "epoch": 0.17025, "grad_norm": 3.3749279975891113, "grad_norm_var": 1.0492953305995663, "learning_rate": 0.0001, "loss": 1.444, "loss/crossentropy": 2.579564094543457, "loss/hidden": 1.21875, "loss/logits": 0.22398871183395386, "loss/reg": 0.00012547285587061197, "step": 1362 }, { "epoch": 0.170375, "grad_norm": 3.8883025646209717, "grad_norm_var": 1.0880942337458681, "learning_rate": 0.0001, "loss": 1.4343, "loss/crossentropy": 2.847822666168213, "loss/hidden": 1.2265625, "loss/logits": 0.20643851161003113, "loss/reg": 0.00012544909259304404, "step": 1363 }, { "epoch": 0.1705, "grad_norm": 2.8531761169433594, "grad_norm_var": 1.0901142929326701, "learning_rate": 0.0001, "loss": 1.6629, "loss/crossentropy": 2.248915195465088, "loss/hidden": 1.390625, "loss/logits": 0.2710200846195221, "loss/reg": 0.0001254247035831213, "step": 1364 }, { "epoch": 0.170625, "grad_norm": 3.127364158630371, "grad_norm_var": 1.074371058392131, "learning_rate": 0.0001, "loss": 1.4005, "loss/crossentropy": 2.4211807250976562, "loss/hidden": 1.1875, "loss/logits": 0.21177317202091217, "loss/reg": 0.00012537595466710627, "step": 1365 }, { "epoch": 0.17075, "grad_norm": 3.6994669437408447, "grad_norm_var": 1.0686608306384404, "learning_rate": 0.0001, "loss": 1.5964, "loss/crossentropy": 2.373058557510376, "loss/hidden": 1.390625, "loss/logits": 0.2045435905456543, "loss/reg": 0.00012533632980193943, "step": 1366 }, { "epoch": 0.170875, "grad_norm": 3.4185245037078857, "grad_norm_var": 1.053276117027288, "learning_rate": 0.0001, "loss": 1.4238, "loss/crossentropy": 2.480586528778076, "loss/hidden": 1.2109375, "loss/logits": 0.21158848702907562, "loss/reg": 0.0001252839429071173, "step": 1367 }, { "epoch": 0.171, "grad_norm": 2.8058104515075684, "grad_norm_var": 0.9892388892255884, "learning_rate": 0.0001, "loss": 1.5198, "loss/crossentropy": 2.489130735397339, "loss/hidden": 1.3046875, "loss/logits": 0.2138749212026596, "loss/reg": 0.0001252458750968799, "step": 1368 }, { "epoch": 0.171125, "grad_norm": 2.487428665161133, "grad_norm_var": 1.0202304183485766, "learning_rate": 0.0001, "loss": 1.2818, "loss/crossentropy": 2.4536304473876953, "loss/hidden": 1.1015625, "loss/logits": 0.17900395393371582, "loss/reg": 0.00012519907613750547, "step": 1369 }, { "epoch": 0.17125, "grad_norm": 2.6989829540252686, "grad_norm_var": 0.34536194471611453, "learning_rate": 0.0001, "loss": 1.4601, "loss/crossentropy": 2.6088736057281494, "loss/hidden": 1.21875, "loss/logits": 0.2400750070810318, "loss/reg": 0.00012515111302491277, "step": 1370 }, { "epoch": 0.171375, "grad_norm": 2.389324426651001, "grad_norm_var": 0.35519569069646173, "learning_rate": 0.0001, "loss": 1.1445, "loss/crossentropy": 2.4220409393310547, "loss/hidden": 0.98828125, "loss/logits": 0.15494085848331451, "loss/reg": 0.00012509699445217848, "step": 1371 }, { "epoch": 0.1715, "grad_norm": 2.7900049686431885, "grad_norm_var": 0.3342564547968794, "learning_rate": 0.0001, "loss": 1.5356, "loss/crossentropy": 2.175588607788086, "loss/hidden": 1.3046875, "loss/logits": 0.22961562871932983, "loss/reg": 0.00012504978803917766, "step": 1372 }, { "epoch": 0.171625, "grad_norm": 2.2409114837646484, "grad_norm_var": 0.31498862684810075, "learning_rate": 0.0001, "loss": 1.302, "loss/crossentropy": 2.363229990005493, "loss/hidden": 1.1328125, "loss/logits": 0.16798266768455505, "loss/reg": 0.00012500598677434027, "step": 1373 }, { "epoch": 0.17175, "grad_norm": 2.578244924545288, "grad_norm_var": 0.3045354309955807, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.916712999343872, "loss/hidden": 1.1171875, "loss/logits": 0.20429915189743042, "loss/reg": 0.0001249638880835846, "step": 1374 }, { "epoch": 0.171875, "grad_norm": 2.73020076751709, "grad_norm_var": 0.2812901386304472, "learning_rate": 0.0001, "loss": 1.2721, "loss/crossentropy": 2.622807025909424, "loss/hidden": 1.09375, "loss/logits": 0.1770937442779541, "loss/reg": 0.0001249118213308975, "step": 1375 }, { "epoch": 0.172, "grad_norm": 4.233326435089111, "grad_norm_var": 0.3716206398471738, "learning_rate": 0.0001, "loss": 1.7583, "loss/crossentropy": 2.450744152069092, "loss/hidden": 1.484375, "loss/logits": 0.27268433570861816, "loss/reg": 0.0001248600601684302, "step": 1376 }, { "epoch": 0.172125, "grad_norm": 3.618061065673828, "grad_norm_var": 0.3410246487759288, "learning_rate": 0.0001, "loss": 1.8092, "loss/crossentropy": 2.182755470275879, "loss/hidden": 1.546875, "loss/logits": 0.2611222267150879, "loss/reg": 0.00012481324665714055, "step": 1377 }, { "epoch": 0.17225, "grad_norm": 2.93454909324646, "grad_norm_var": 0.33455861027497635, "learning_rate": 0.0001, "loss": 1.427, "loss/crossentropy": 2.4290924072265625, "loss/hidden": 1.2421875, "loss/logits": 0.18354354798793793, "loss/reg": 0.0001247714681085199, "step": 1378 }, { "epoch": 0.172375, "grad_norm": 2.878164529800415, "grad_norm_var": 0.28284689796550994, "learning_rate": 0.0001, "loss": 1.3262, "loss/crossentropy": 2.5193514823913574, "loss/hidden": 1.15625, "loss/logits": 0.16868917644023895, "loss/reg": 0.00012472183152567595, "step": 1379 }, { "epoch": 0.1725, "grad_norm": 3.589125394821167, "grad_norm_var": 0.3054583015003286, "learning_rate": 0.0001, "loss": 1.8325, "loss/crossentropy": 2.1043519973754883, "loss/hidden": 1.5, "loss/logits": 0.33121782541275024, "loss/reg": 0.00012467816122807562, "step": 1380 }, { "epoch": 0.172625, "grad_norm": 2.244504690170288, "grad_norm_var": 0.3407955627929132, "learning_rate": 0.0001, "loss": 1.281, "loss/crossentropy": 2.5267179012298584, "loss/hidden": 1.09375, "loss/logits": 0.1860395073890686, "loss/reg": 0.00012462481390684843, "step": 1381 }, { "epoch": 0.17275, "grad_norm": 3.8526663780212402, "grad_norm_var": 0.3573970648853918, "learning_rate": 0.0001, "loss": 1.5901, "loss/crossentropy": 2.248718738555908, "loss/hidden": 1.421875, "loss/logits": 0.16695934534072876, "loss/reg": 0.00012457840784918517, "step": 1382 }, { "epoch": 0.172875, "grad_norm": 2.7534327507019043, "grad_norm_var": 0.3451018839959554, "learning_rate": 0.0001, "loss": 1.3489, "loss/crossentropy": 2.3835206031799316, "loss/hidden": 1.1640625, "loss/logits": 0.18362778425216675, "loss/reg": 0.00012452072405721992, "step": 1383 }, { "epoch": 0.173, "grad_norm": 2.8040714263916016, "grad_norm_var": 0.34513006800564955, "learning_rate": 0.0001, "loss": 1.3497, "loss/crossentropy": 2.427971839904785, "loss/hidden": 1.15625, "loss/logits": 0.19217385351657867, "loss/reg": 0.00012445985339581966, "step": 1384 }, { "epoch": 0.173125, "grad_norm": 2.9184110164642334, "grad_norm_var": 0.33151183854841954, "learning_rate": 0.0001, "loss": 1.4787, "loss/crossentropy": 2.529407024383545, "loss/hidden": 1.2578125, "loss/logits": 0.2196352481842041, "loss/reg": 0.0001244109298568219, "step": 1385 }, { "epoch": 0.17325, "grad_norm": 2.2244670391082764, "grad_norm_var": 0.3616796797659615, "learning_rate": 0.0001, "loss": 1.3451, "loss/crossentropy": 2.5614936351776123, "loss/hidden": 1.140625, "loss/logits": 0.20321504771709442, "loss/reg": 0.00012436485849320889, "step": 1386 }, { "epoch": 0.173375, "grad_norm": 2.5242624282836914, "grad_norm_var": 0.35320305380007444, "learning_rate": 0.0001, "loss": 1.2178, "loss/crossentropy": 2.757469415664673, "loss/hidden": 1.0546875, "loss/logits": 0.16182047128677368, "loss/reg": 0.0001243241858901456, "step": 1387 }, { "epoch": 0.1735, "grad_norm": 2.8499906063079834, "grad_norm_var": 0.3522910558536144, "learning_rate": 0.0001, "loss": 1.6543, "loss/crossentropy": 2.4509925842285156, "loss/hidden": 1.390625, "loss/logits": 0.26246190071105957, "loss/reg": 0.00012429323396645486, "step": 1388 }, { "epoch": 0.173625, "grad_norm": 2.553403854370117, "grad_norm_var": 0.32943715155791625, "learning_rate": 0.0001, "loss": 1.2634, "loss/crossentropy": 2.1729512214660645, "loss/hidden": 1.1015625, "loss/logits": 0.1606322079896927, "loss/reg": 0.00012424589658621699, "step": 1389 }, { "epoch": 0.17375, "grad_norm": 2.151810884475708, "grad_norm_var": 0.36224847524056164, "learning_rate": 0.0001, "loss": 1.2616, "loss/crossentropy": 2.289837121963501, "loss/hidden": 1.09375, "loss/logits": 0.1665709912776947, "loss/reg": 0.00012421247083693743, "step": 1390 }, { "epoch": 0.173875, "grad_norm": 2.957038640975952, "grad_norm_var": 0.35945846007691645, "learning_rate": 0.0001, "loss": 1.3735, "loss/crossentropy": 2.5611021518707275, "loss/hidden": 1.171875, "loss/logits": 0.20039844512939453, "loss/reg": 0.00012416629760991782, "step": 1391 }, { "epoch": 0.174, "grad_norm": 2.1540000438690186, "grad_norm_var": 0.2719363409117695, "learning_rate": 0.0001, "loss": 1.2749, "loss/crossentropy": 2.520090103149414, "loss/hidden": 1.0859375, "loss/logits": 0.18771693110466003, "loss/reg": 0.00012413323565851897, "step": 1392 }, { "epoch": 0.174125, "grad_norm": 2.1885299682617188, "grad_norm_var": 0.24621033277204846, "learning_rate": 0.0001, "loss": 1.1467, "loss/crossentropy": 2.5471370220184326, "loss/hidden": 0.99609375, "loss/logits": 0.14937998354434967, "loss/reg": 0.00012408249313011765, "step": 1393 }, { "epoch": 0.17425, "grad_norm": 2.131239652633667, "grad_norm_var": 0.2639531894364978, "learning_rate": 0.0001, "loss": 1.0811, "loss/crossentropy": 2.4523544311523438, "loss/hidden": 0.93359375, "loss/logits": 0.14624947309494019, "loss/reg": 0.0001240312703885138, "step": 1394 }, { "epoch": 0.174375, "grad_norm": 3.0716922283172607, "grad_norm_var": 0.27157652111455566, "learning_rate": 0.0001, "loss": 1.5143, "loss/crossentropy": 2.18898868560791, "loss/hidden": 1.28125, "loss/logits": 0.23179930448532104, "loss/reg": 0.00012399136903695762, "step": 1395 }, { "epoch": 0.1745, "grad_norm": 3.1297645568847656, "grad_norm_var": 0.2294219224221533, "learning_rate": 0.0001, "loss": 1.5361, "loss/crossentropy": 2.652775526046753, "loss/hidden": 1.328125, "loss/logits": 0.20668646693229675, "loss/reg": 0.00012394081568345428, "step": 1396 }, { "epoch": 0.174625, "grad_norm": 2.3542675971984863, "grad_norm_var": 0.22414050698139362, "learning_rate": 0.0001, "loss": 1.2569, "loss/crossentropy": 2.7019245624542236, "loss/hidden": 1.078125, "loss/logits": 0.17754441499710083, "loss/reg": 0.00012390216579660773, "step": 1397 }, { "epoch": 0.17475, "grad_norm": 2.4598019123077393, "grad_norm_var": 0.12458401635635306, "learning_rate": 0.0001, "loss": 1.2307, "loss/crossentropy": 2.4387967586517334, "loss/hidden": 1.0625, "loss/logits": 0.16694127023220062, "loss/reg": 0.0001238500844920054, "step": 1398 }, { "epoch": 0.174875, "grad_norm": 2.140251398086548, "grad_norm_var": 0.13362905826974442, "learning_rate": 0.0001, "loss": 1.2767, "loss/crossentropy": 2.388298749923706, "loss/hidden": 1.0859375, "loss/logits": 0.18952462077140808, "loss/reg": 0.0001237952965311706, "step": 1399 }, { "epoch": 0.175, "grad_norm": 2.443934679031372, "grad_norm_var": 0.12897394879086405, "learning_rate": 0.0001, "loss": 1.2766, "loss/crossentropy": 2.5147831439971924, "loss/hidden": 1.09375, "loss/logits": 0.18164008855819702, "loss/reg": 0.0001237535907421261, "step": 1400 }, { "epoch": 0.175125, "grad_norm": 2.2890682220458984, "grad_norm_var": 0.11994477401951827, "learning_rate": 0.0001, "loss": 1.1625, "loss/crossentropy": 2.4310338497161865, "loss/hidden": 1.015625, "loss/logits": 0.1456294059753418, "loss/reg": 0.00012370246986392885, "step": 1401 }, { "epoch": 0.17525, "grad_norm": 3.0157530307769775, "grad_norm_var": 0.13249057287120253, "learning_rate": 0.0001, "loss": 1.615, "loss/crossentropy": 2.4014968872070312, "loss/hidden": 1.328125, "loss/logits": 0.28561902046203613, "loss/reg": 0.00012366785085760057, "step": 1402 }, { "epoch": 0.175375, "grad_norm": 2.6250622272491455, "grad_norm_var": 0.1331032572676688, "learning_rate": 0.0001, "loss": 1.6478, "loss/crossentropy": 2.055495500564575, "loss/hidden": 1.3984375, "loss/logits": 0.24807974696159363, "loss/reg": 0.00012362911365926266, "step": 1403 }, { "epoch": 0.1755, "grad_norm": 2.489910364151001, "grad_norm_var": 0.12595074821941785, "learning_rate": 0.0001, "loss": 1.3999, "loss/crossentropy": 2.7454919815063477, "loss/hidden": 1.1953125, "loss/logits": 0.20339912176132202, "loss/reg": 0.00012359289394225925, "step": 1404 }, { "epoch": 0.175625, "grad_norm": 2.463088035583496, "grad_norm_var": 0.1259345186411013, "learning_rate": 0.0001, "loss": 1.3475, "loss/crossentropy": 2.831529140472412, "loss/hidden": 1.15625, "loss/logits": 0.18996471166610718, "loss/reg": 0.00012355422950349748, "step": 1405 }, { "epoch": 0.17575, "grad_norm": 3.73136568069458, "grad_norm_var": 0.20768202991946413, "learning_rate": 0.0001, "loss": 1.5968, "loss/crossentropy": 2.3867976665496826, "loss/hidden": 1.4140625, "loss/logits": 0.181511789560318, "loss/reg": 0.00012350117322057486, "step": 1406 }, { "epoch": 0.175875, "grad_norm": 6.24423360824585, "grad_norm_var": 1.0382962690799404, "learning_rate": 0.0001, "loss": 1.5621, "loss/crossentropy": 2.6809537410736084, "loss/hidden": 1.34375, "loss/logits": 0.2171478569507599, "loss/reg": 0.00012346208677627146, "step": 1407 }, { "epoch": 0.176, "grad_norm": 2.218271493911743, "grad_norm_var": 1.0329478525431983, "learning_rate": 0.0001, "loss": 1.2233, "loss/crossentropy": 3.026775598526001, "loss/hidden": 1.0546875, "loss/logits": 0.16736505925655365, "loss/reg": 0.00012341061665210873, "step": 1408 }, { "epoch": 0.176125, "grad_norm": 2.9582364559173584, "grad_norm_var": 1.0059635049116944, "learning_rate": 0.0001, "loss": 1.3083, "loss/crossentropy": 2.7227070331573486, "loss/hidden": 1.125, "loss/logits": 0.18210051953792572, "loss/reg": 0.0001233569928444922, "step": 1409 }, { "epoch": 0.17625, "grad_norm": 2.427577018737793, "grad_norm_var": 0.9826428001340937, "learning_rate": 0.0001, "loss": 1.6232, "loss/crossentropy": 2.4025256633758545, "loss/hidden": 1.3828125, "loss/logits": 0.23915690183639526, "loss/reg": 0.00012331439938861877, "step": 1410 }, { "epoch": 0.176375, "grad_norm": 2.8900115489959717, "grad_norm_var": 0.9800353916225198, "learning_rate": 0.0001, "loss": 1.4759, "loss/crossentropy": 2.611516237258911, "loss/hidden": 1.2421875, "loss/logits": 0.23251909017562866, "loss/reg": 0.00012327195145189762, "step": 1411 }, { "epoch": 0.1765, "grad_norm": 2.6608002185821533, "grad_norm_var": 0.9773841699582476, "learning_rate": 0.0001, "loss": 1.3621, "loss/crossentropy": 2.364351272583008, "loss/hidden": 1.1796875, "loss/logits": 0.1811881959438324, "loss/reg": 0.00012321824033278972, "step": 1412 }, { "epoch": 0.176625, "grad_norm": 2.518124580383301, "grad_norm_var": 0.9684888869916742, "learning_rate": 0.0001, "loss": 1.2861, "loss/crossentropy": 2.388920783996582, "loss/hidden": 1.109375, "loss/logits": 0.17547652125358582, "loss/reg": 0.0001231755450135097, "step": 1413 }, { "epoch": 0.17675, "grad_norm": 3.419569969177246, "grad_norm_var": 0.9763237979514786, "learning_rate": 0.0001, "loss": 1.498, "loss/crossentropy": 2.6352081298828125, "loss/hidden": 1.25, "loss/logits": 0.24680346250534058, "loss/reg": 0.00012313587649259716, "step": 1414 }, { "epoch": 0.176875, "grad_norm": 2.7200961112976074, "grad_norm_var": 0.9379458052708051, "learning_rate": 0.0001, "loss": 1.3753, "loss/crossentropy": 2.4700608253479004, "loss/hidden": 1.1875, "loss/logits": 0.18656814098358154, "loss/reg": 0.00012310186866670847, "step": 1415 }, { "epoch": 0.177, "grad_norm": 2.054546594619751, "grad_norm_var": 0.9734208737035706, "learning_rate": 0.0001, "loss": 1.2408, "loss/crossentropy": 2.4620559215545654, "loss/hidden": 1.078125, "loss/logits": 0.16146335005760193, "loss/reg": 0.00012305451673455536, "step": 1416 }, { "epoch": 0.177125, "grad_norm": 2.3744401931762695, "grad_norm_var": 0.9666904791525733, "learning_rate": 0.0001, "loss": 1.3941, "loss/crossentropy": 2.4547486305236816, "loss/hidden": 1.1875, "loss/logits": 0.2053956836462021, "loss/reg": 0.00012301858805585653, "step": 1417 }, { "epoch": 0.17725, "grad_norm": 6.772999286651611, "grad_norm_var": 1.8941137490096496, "learning_rate": 0.0001, "loss": 1.7879, "loss/crossentropy": 3.083491802215576, "loss/hidden": 1.5625, "loss/logits": 0.22418245673179626, "loss/reg": 0.0001229665067512542, "step": 1418 }, { "epoch": 0.177375, "grad_norm": 2.9948341846466064, "grad_norm_var": 1.876259778206029, "learning_rate": 0.0001, "loss": 1.3538, "loss/crossentropy": 2.885481357574463, "loss/hidden": 1.1484375, "loss/logits": 0.2041017711162567, "loss/reg": 0.00012291650637052953, "step": 1419 }, { "epoch": 0.1775, "grad_norm": 2.330918788909912, "grad_norm_var": 1.8925457838723332, "learning_rate": 0.0001, "loss": 1.3741, "loss/crossentropy": 2.593684673309326, "loss/hidden": 1.1640625, "loss/logits": 0.208843395113945, "loss/reg": 0.00012287317076697946, "step": 1420 }, { "epoch": 0.177625, "grad_norm": 2.5464389324188232, "grad_norm_var": 1.8850827019126235, "learning_rate": 0.0001, "loss": 1.3723, "loss/crossentropy": 2.6521592140197754, "loss/hidden": 1.15625, "loss/logits": 0.21481186151504517, "loss/reg": 0.00012282062380108982, "step": 1421 }, { "epoch": 0.17775, "grad_norm": 2.5340471267700195, "grad_norm_var": 1.886484591617587, "learning_rate": 0.0001, "loss": 1.3204, "loss/crossentropy": 2.4767885208129883, "loss/hidden": 1.1484375, "loss/logits": 0.17074325680732727, "loss/reg": 0.00012277014320716262, "step": 1422 }, { "epoch": 0.177875, "grad_norm": 2.497507333755493, "grad_norm_var": 1.1951466349545437, "learning_rate": 0.0001, "loss": 1.5868, "loss/crossentropy": 2.1008732318878174, "loss/hidden": 1.3671875, "loss/logits": 0.21836718916893005, "loss/reg": 0.00012271860032342374, "step": 1423 }, { "epoch": 0.178, "grad_norm": 2.761888027191162, "grad_norm_var": 1.1663850079438776, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.497096300125122, "loss/hidden": 1.203125, "loss/logits": 0.2155841588973999, "loss/reg": 0.00012265924306120723, "step": 1424 }, { "epoch": 0.178125, "grad_norm": 2.743013381958008, "grad_norm_var": 1.1677201552797356, "learning_rate": 0.0001, "loss": 1.1784, "loss/crossentropy": 2.5461199283599854, "loss/hidden": 1.015625, "loss/logits": 0.1615104079246521, "loss/reg": 0.00012260786024853587, "step": 1425 }, { "epoch": 0.17825, "grad_norm": 4.0257439613342285, "grad_norm_var": 1.2287257976313033, "learning_rate": 0.0001, "loss": 1.4008, "loss/crossentropy": 2.615342140197754, "loss/hidden": 1.1796875, "loss/logits": 0.21985702216625214, "loss/reg": 0.00012254797911737114, "step": 1426 }, { "epoch": 0.178375, "grad_norm": 2.8603885173797607, "grad_norm_var": 1.229176800435842, "learning_rate": 0.0001, "loss": 1.3965, "loss/crossentropy": 2.4568042755126953, "loss/hidden": 1.1953125, "loss/logits": 0.19992274045944214, "loss/reg": 0.00012249914288986474, "step": 1427 }, { "epoch": 0.1785, "grad_norm": 2.4687345027923584, "grad_norm_var": 1.239873334810188, "learning_rate": 0.0001, "loss": 1.2511, "loss/crossentropy": 2.762143850326538, "loss/hidden": 1.078125, "loss/logits": 0.1717434823513031, "loss/reg": 0.00012244738172739744, "step": 1428 }, { "epoch": 0.178625, "grad_norm": 2.4968984127044678, "grad_norm_var": 1.2411986426753872, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.7674665451049805, "loss/hidden": 1.125, "loss/logits": 0.1860542595386505, "loss/reg": 0.00012241210788488388, "step": 1429 }, { "epoch": 0.17875, "grad_norm": 4.7463178634643555, "grad_norm_var": 1.4298363591309242, "learning_rate": 0.0001, "loss": 1.6732, "loss/crossentropy": 2.851639747619629, "loss/hidden": 1.4140625, "loss/logits": 0.25786828994750977, "loss/reg": 0.00012236017209943384, "step": 1430 }, { "epoch": 0.178875, "grad_norm": 2.13455867767334, "grad_norm_var": 1.4776494243756035, "learning_rate": 0.0001, "loss": 1.2534, "loss/crossentropy": 2.5413355827331543, "loss/hidden": 1.078125, "loss/logits": 0.17405973374843597, "loss/reg": 0.0001223067956743762, "step": 1431 }, { "epoch": 0.179, "grad_norm": 2.218266487121582, "grad_norm_var": 1.458217740800644, "learning_rate": 0.0001, "loss": 1.4285, "loss/crossentropy": 2.543544054031372, "loss/hidden": 1.1953125, "loss/logits": 0.23193752765655518, "loss/reg": 0.0001222506252815947, "step": 1432 }, { "epoch": 0.179125, "grad_norm": 2.326692581176758, "grad_norm_var": 1.4625444939866676, "learning_rate": 0.0001, "loss": 1.1934, "loss/crossentropy": 2.5187294483184814, "loss/hidden": 1.015625, "loss/logits": 0.17655548453330994, "loss/reg": 0.00012219331983942538, "step": 1433 }, { "epoch": 0.17925, "grad_norm": 2.542522430419922, "grad_norm_var": 0.4690817271155917, "learning_rate": 0.0001, "loss": 1.5982, "loss/crossentropy": 2.8521389961242676, "loss/hidden": 1.3125, "loss/logits": 0.28449544310569763, "loss/reg": 0.0001221426500706002, "step": 1434 }, { "epoch": 0.179375, "grad_norm": 2.2797062397003174, "grad_norm_var": 0.4790630291179923, "learning_rate": 0.0001, "loss": 1.3248, "loss/crossentropy": 2.541137456893921, "loss/hidden": 1.125, "loss/logits": 0.19856733083724976, "loss/reg": 0.0001220909325638786, "step": 1435 }, { "epoch": 0.1795, "grad_norm": 2.2005088329315186, "grad_norm_var": 0.4868843850416603, "learning_rate": 0.0001, "loss": 1.2925, "loss/crossentropy": 2.5352225303649902, "loss/hidden": 1.1171875, "loss/logits": 0.17413491010665894, "loss/reg": 0.00012203741061966866, "step": 1436 }, { "epoch": 0.179625, "grad_norm": 3.1560230255126953, "grad_norm_var": 0.4966970141493893, "learning_rate": 0.0001, "loss": 1.5889, "loss/crossentropy": 2.4854037761688232, "loss/hidden": 1.328125, "loss/logits": 0.2595502734184265, "loss/reg": 0.0001219876212417148, "step": 1437 }, { "epoch": 0.17975, "grad_norm": 2.4988839626312256, "grad_norm_var": 0.4977846656427855, "learning_rate": 0.0001, "loss": 1.4575, "loss/crossentropy": 2.367790460586548, "loss/hidden": 1.21875, "loss/logits": 0.2375032603740692, "loss/reg": 0.00012194045848445967, "step": 1438 }, { "epoch": 0.179875, "grad_norm": 2.5907726287841797, "grad_norm_var": 0.4952213877784326, "learning_rate": 0.0001, "loss": 1.3505, "loss/crossentropy": 2.6016342639923096, "loss/hidden": 1.125, "loss/logits": 0.22427219152450562, "loss/reg": 0.00012188626715214923, "step": 1439 }, { "epoch": 0.18, "grad_norm": 2.6144635677337646, "grad_norm_var": 0.496408639388598, "learning_rate": 0.0001, "loss": 1.3817, "loss/crossentropy": 2.6020567417144775, "loss/hidden": 1.1796875, "loss/logits": 0.2007896602153778, "loss/reg": 0.00012182991486042738, "step": 1440 }, { "epoch": 0.180125, "grad_norm": 2.820241928100586, "grad_norm_var": 0.49677157052248594, "learning_rate": 0.0001, "loss": 1.3724, "loss/crossentropy": 2.4075067043304443, "loss/hidden": 1.1796875, "loss/logits": 0.1914624571800232, "loss/reg": 0.00012178025644971058, "step": 1441 }, { "epoch": 0.18025, "grad_norm": 2.067556381225586, "grad_norm_var": 0.4030270458141082, "learning_rate": 0.0001, "loss": 1.1079, "loss/crossentropy": 2.5904557704925537, "loss/hidden": 0.953125, "loss/logits": 0.1535588502883911, "loss/reg": 0.00012172715651104227, "step": 1442 }, { "epoch": 0.180375, "grad_norm": 2.372554302215576, "grad_norm_var": 0.4026818070645495, "learning_rate": 0.0001, "loss": 1.414, "loss/crossentropy": 2.3734114170074463, "loss/hidden": 1.2109375, "loss/logits": 0.20185071229934692, "loss/reg": 0.00012166958913439885, "step": 1443 }, { "epoch": 0.1805, "grad_norm": 2.223769426345825, "grad_norm_var": 0.410586397009475, "learning_rate": 0.0001, "loss": 1.4112, "loss/crossentropy": 2.2828826904296875, "loss/hidden": 1.2109375, "loss/logits": 0.19906693696975708, "loss/reg": 0.00012161279300926253, "step": 1444 }, { "epoch": 0.180625, "grad_norm": 2.481344223022461, "grad_norm_var": 0.4107751235842497, "learning_rate": 0.0001, "loss": 1.3509, "loss/crossentropy": 2.5053985118865967, "loss/hidden": 1.15625, "loss/logits": 0.1934780776500702, "loss/reg": 0.00012154504656791687, "step": 1445 }, { "epoch": 0.18075, "grad_norm": 2.631906270980835, "grad_norm_var": 0.07936196312864467, "learning_rate": 0.0001, "loss": 1.3183, "loss/crossentropy": 2.5259923934936523, "loss/hidden": 1.140625, "loss/logits": 0.17645049095153809, "loss/reg": 0.00012149464600952342, "step": 1446 }, { "epoch": 0.180875, "grad_norm": 2.4057629108428955, "grad_norm_var": 0.07264332941360081, "learning_rate": 0.0001, "loss": 1.4878, "loss/crossentropy": 2.4729113578796387, "loss/hidden": 1.265625, "loss/logits": 0.22092443704605103, "loss/reg": 0.000121431534353178, "step": 1447 }, { "epoch": 0.181, "grad_norm": 2.8776566982269287, "grad_norm_var": 0.07817514719749473, "learning_rate": 0.0001, "loss": 1.2072, "loss/crossentropy": 2.9761579036712646, "loss/hidden": 1.0390625, "loss/logits": 0.16689828038215637, "loss/reg": 0.00012136517761973664, "step": 1448 }, { "epoch": 0.181125, "grad_norm": 2.535752534866333, "grad_norm_var": 0.0759184591818664, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.3764472007751465, "loss/hidden": 1.109375, "loss/logits": 0.20777416229248047, "loss/reg": 0.0001212987772305496, "step": 1449 }, { "epoch": 0.18125, "grad_norm": 2.898702383041382, "grad_norm_var": 0.08497814313276801, "learning_rate": 0.0001, "loss": 1.2031, "loss/crossentropy": 2.17466139793396, "loss/hidden": 1.0546875, "loss/logits": 0.14723367989063263, "loss/reg": 0.00012125736975576729, "step": 1450 }, { "epoch": 0.181375, "grad_norm": 2.3114421367645264, "grad_norm_var": 0.08393554321008405, "learning_rate": 0.0001, "loss": 1.2197, "loss/crossentropy": 2.415696144104004, "loss/hidden": 1.0546875, "loss/logits": 0.16376326978206635, "loss/reg": 0.00012121062172809616, "step": 1451 }, { "epoch": 0.1815, "grad_norm": 2.9938294887542725, "grad_norm_var": 0.0870473767514331, "learning_rate": 0.0001, "loss": 1.5457, "loss/crossentropy": 2.4802660942077637, "loss/hidden": 1.2734375, "loss/logits": 0.27103176712989807, "loss/reg": 0.00012116167636122555, "step": 1452 }, { "epoch": 0.181625, "grad_norm": 2.524752616882324, "grad_norm_var": 0.0645258660433446, "learning_rate": 0.0001, "loss": 1.4466, "loss/crossentropy": 2.824394941329956, "loss/hidden": 1.234375, "loss/logits": 0.21098601818084717, "loss/reg": 0.00012112017429899424, "step": 1453 }, { "epoch": 0.18175, "grad_norm": 2.6837897300720215, "grad_norm_var": 0.06532642357219037, "learning_rate": 0.0001, "loss": 1.4733, "loss/crossentropy": 2.283506393432617, "loss/hidden": 1.25, "loss/logits": 0.22205930948257446, "loss/reg": 0.0001210679838550277, "step": 1454 }, { "epoch": 0.181875, "grad_norm": 2.6279468536376953, "grad_norm_var": 0.06554230406006217, "learning_rate": 0.0001, "loss": 1.3167, "loss/crossentropy": 2.4046006202697754, "loss/hidden": 1.1328125, "loss/logits": 0.18264982104301453, "loss/reg": 0.00012102424807380885, "step": 1455 }, { "epoch": 0.182, "grad_norm": 2.3619606494903564, "grad_norm_var": 0.06792809104647948, "learning_rate": 0.0001, "loss": 1.3485, "loss/crossentropy": 2.5968782901763916, "loss/hidden": 1.15625, "loss/logits": 0.1910872459411621, "loss/reg": 0.00012098611477995291, "step": 1456 }, { "epoch": 0.182125, "grad_norm": 5.224443435668945, "grad_norm_var": 0.5154384185752157, "learning_rate": 0.0001, "loss": 1.4436, "loss/crossentropy": 2.640511989593506, "loss/hidden": 1.25, "loss/logits": 0.19236883521080017, "loss/reg": 0.00012093476834706962, "step": 1457 }, { "epoch": 0.18225, "grad_norm": 2.3527910709381104, "grad_norm_var": 0.49641562325917904, "learning_rate": 0.0001, "loss": 1.1632, "loss/crossentropy": 2.7701945304870605, "loss/hidden": 0.9921875, "loss/logits": 0.16985301673412323, "loss/reg": 0.00012088369112461805, "step": 1458 }, { "epoch": 0.182375, "grad_norm": 2.2830605506896973, "grad_norm_var": 0.501053442307519, "learning_rate": 0.0001, "loss": 1.2449, "loss/crossentropy": 2.481513738632202, "loss/hidden": 1.0625, "loss/logits": 0.18122999370098114, "loss/reg": 0.00012082922330591828, "step": 1459 }, { "epoch": 0.1825, "grad_norm": 3.0270626544952393, "grad_norm_var": 0.48891098940987376, "learning_rate": 0.0001, "loss": 1.5512, "loss/crossentropy": 2.6582679748535156, "loss/hidden": 1.3046875, "loss/logits": 0.2452942579984665, "loss/reg": 0.00012077302380930632, "step": 1460 }, { "epoch": 0.182625, "grad_norm": 2.8883237838745117, "grad_norm_var": 0.4839310859835109, "learning_rate": 0.0001, "loss": 1.4253, "loss/crossentropy": 2.529261350631714, "loss/hidden": 1.21875, "loss/logits": 0.2053220272064209, "loss/reg": 0.00012071572564309463, "step": 1461 }, { "epoch": 0.18275, "grad_norm": 2.1943676471710205, "grad_norm_var": 0.5050795996356849, "learning_rate": 0.0001, "loss": 1.1908, "loss/crossentropy": 2.480616331100464, "loss/hidden": 1.015625, "loss/logits": 0.1739429235458374, "loss/reg": 0.00012066281487932429, "step": 1462 }, { "epoch": 0.182875, "grad_norm": 3.0016472339630127, "grad_norm_var": 0.4989702650811286, "learning_rate": 0.0001, "loss": 1.1956, "loss/crossentropy": 2.9199087619781494, "loss/hidden": 1.0234375, "loss/logits": 0.17092075943946838, "loss/reg": 0.00012059887376381084, "step": 1463 }, { "epoch": 0.183, "grad_norm": 2.371258020401001, "grad_norm_var": 0.5097017493080519, "learning_rate": 0.0001, "loss": 1.3844, "loss/crossentropy": 2.4022204875946045, "loss/hidden": 1.203125, "loss/logits": 0.1800795942544937, "loss/reg": 0.00012054949183948338, "step": 1464 }, { "epoch": 0.183125, "grad_norm": 4.382226943969727, "grad_norm_var": 0.6657206483083994, "learning_rate": 0.0001, "loss": 1.8236, "loss/crossentropy": 2.7129218578338623, "loss/hidden": 1.515625, "loss/logits": 0.3067227303981781, "loss/reg": 0.00012049229553667828, "step": 1465 }, { "epoch": 0.18325, "grad_norm": 2.2945079803466797, "grad_norm_var": 0.6872693680143432, "learning_rate": 0.0001, "loss": 1.3318, "loss/crossentropy": 2.4627301692962646, "loss/hidden": 1.1328125, "loss/logits": 0.1978265643119812, "loss/reg": 0.00012042631715303287, "step": 1466 }, { "epoch": 0.183375, "grad_norm": 3.436194896697998, "grad_norm_var": 0.6862881064370808, "learning_rate": 0.0001, "loss": 1.4467, "loss/crossentropy": 2.571751594543457, "loss/hidden": 1.25, "loss/logits": 0.19548696279525757, "loss/reg": 0.00012037341366522014, "step": 1467 }, { "epoch": 0.1835, "grad_norm": 2.577315092086792, "grad_norm_var": 0.6927813913337936, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.3032162189483643, "loss/hidden": 1.140625, "loss/logits": 0.1831728219985962, "loss/reg": 0.00012031103688059375, "step": 1468 }, { "epoch": 0.183625, "grad_norm": 2.532583713531494, "grad_norm_var": 0.6924043975076584, "learning_rate": 0.0001, "loss": 1.4882, "loss/crossentropy": 2.527585983276367, "loss/hidden": 1.25, "loss/logits": 0.2369614690542221, "loss/reg": 0.00012025787873426452, "step": 1469 }, { "epoch": 0.18375, "grad_norm": 2.7767436504364014, "grad_norm_var": 0.6903890866645942, "learning_rate": 0.0001, "loss": 1.3311, "loss/crossentropy": 2.733691453933716, "loss/hidden": 1.1328125, "loss/logits": 0.1971227377653122, "loss/reg": 0.00012021444854326546, "step": 1470 }, { "epoch": 0.183875, "grad_norm": 2.567958116531372, "grad_norm_var": 0.6927562422545518, "learning_rate": 0.0001, "loss": 1.3319, "loss/crossentropy": 2.6208181381225586, "loss/hidden": 1.140625, "loss/logits": 0.1900712549686432, "loss/reg": 0.00012016348773613572, "step": 1471 }, { "epoch": 0.184, "grad_norm": 3.06532883644104, "grad_norm_var": 0.6739656811536233, "learning_rate": 0.0001, "loss": 1.5149, "loss/crossentropy": 2.5983071327209473, "loss/hidden": 1.2734375, "loss/logits": 0.24025274813175201, "loss/reg": 0.0001201113045681268, "step": 1472 }, { "epoch": 0.184125, "grad_norm": 3.0422353744506836, "grad_norm_var": 0.3057413316275526, "learning_rate": 0.0001, "loss": 1.4316, "loss/crossentropy": 2.650599241256714, "loss/hidden": 1.1875, "loss/logits": 0.24290411174297333, "loss/reg": 0.0001200652404804714, "step": 1473 }, { "epoch": 0.18425, "grad_norm": 2.6200764179229736, "grad_norm_var": 0.29428301298909765, "learning_rate": 0.0001, "loss": 1.3336, "loss/crossentropy": 2.451019525527954, "loss/hidden": 1.1328125, "loss/logits": 0.1996142566204071, "loss/reg": 0.0001200192273245193, "step": 1474 }, { "epoch": 0.184375, "grad_norm": 3.16168475151062, "grad_norm_var": 0.2800621830615796, "learning_rate": 0.0001, "loss": 1.3472, "loss/crossentropy": 2.442430257797241, "loss/hidden": 1.140625, "loss/logits": 0.20534521341323853, "loss/reg": 0.00011996590183116496, "step": 1475 }, { "epoch": 0.1845, "grad_norm": 3.115558624267578, "grad_norm_var": 0.2823905172854474, "learning_rate": 0.0001, "loss": 1.4539, "loss/crossentropy": 2.3762309551239014, "loss/hidden": 1.25, "loss/logits": 0.20274010300636292, "loss/reg": 0.00011991594510618597, "step": 1476 }, { "epoch": 0.184625, "grad_norm": 3.178046703338623, "grad_norm_var": 0.28808379321504063, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.7192118167877197, "loss/hidden": 1.0703125, "loss/logits": 0.21090492606163025, "loss/reg": 0.0001198663012473844, "step": 1477 }, { "epoch": 0.18475, "grad_norm": 6.871819496154785, "grad_norm_var": 1.2186253105574754, "learning_rate": 0.0001, "loss": 1.317, "loss/crossentropy": 2.488731861114502, "loss/hidden": 1.15625, "loss/logits": 0.15956056118011475, "loss/reg": 0.00011981191346421838, "step": 1478 }, { "epoch": 0.184875, "grad_norm": 2.760854721069336, "grad_norm_var": 1.228206392384199, "learning_rate": 0.0001, "loss": 1.4068, "loss/crossentropy": 2.1699817180633545, "loss/hidden": 1.203125, "loss/logits": 0.202475443482399, "loss/reg": 0.00011976160749327391, "step": 1479 }, { "epoch": 0.185, "grad_norm": 1.9283795356750488, "grad_norm_var": 1.2877582458297676, "learning_rate": 0.0001, "loss": 1.4261, "loss/crossentropy": 2.0415639877319336, "loss/hidden": 1.1953125, "loss/logits": 0.2295614778995514, "loss/reg": 0.00011970567720709369, "step": 1480 }, { "epoch": 0.185125, "grad_norm": 2.516223430633545, "grad_norm_var": 1.1974267278219553, "learning_rate": 0.0001, "loss": 1.2936, "loss/crossentropy": 2.4886105060577393, "loss/hidden": 1.125, "loss/logits": 0.16744288802146912, "loss/reg": 0.00011964350414928049, "step": 1481 }, { "epoch": 0.18525, "grad_norm": 2.4866883754730225, "grad_norm_var": 1.1809440067797858, "learning_rate": 0.0001, "loss": 1.5609, "loss/crossentropy": 2.334197998046875, "loss/hidden": 1.2890625, "loss/logits": 0.2706204652786255, "loss/reg": 0.00011958623508689925, "step": 1482 }, { "epoch": 0.185375, "grad_norm": 2.653337240219116, "grad_norm_var": 1.1778778522125237, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.6950366497039795, "loss/hidden": 1.1171875, "loss/logits": 0.20879371464252472, "loss/reg": 0.00011952499335166067, "step": 1483 }, { "epoch": 0.1855, "grad_norm": 8.682543754577637, "grad_norm_var": 3.170798606854333, "learning_rate": 0.0001, "loss": 2.0646, "loss/crossentropy": 2.3108787536621094, "loss/hidden": 1.6875, "loss/logits": 0.3758898973464966, "loss/reg": 0.00011947143502766266, "step": 1484 }, { "epoch": 0.185625, "grad_norm": 6.775638580322266, "grad_norm_var": 3.8208412660942144, "learning_rate": 0.0001, "loss": 1.9106, "loss/crossentropy": 2.7394680976867676, "loss/hidden": 1.59375, "loss/logits": 0.31562715768814087, "loss/reg": 0.00011940792319364846, "step": 1485 }, { "epoch": 0.18575, "grad_norm": 3.287755012512207, "grad_norm_var": 3.7785012749604743, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.5342772006988525, "loss/hidden": 1.171875, "loss/logits": 0.18173633515834808, "loss/reg": 0.00011934544454561546, "step": 1486 }, { "epoch": 0.185875, "grad_norm": 3.311138153076172, "grad_norm_var": 3.7038553503453175, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.612750768661499, "loss/hidden": 1.140625, "loss/logits": 0.18920673429965973, "loss/reg": 0.00011929369065910578, "step": 1487 }, { "epoch": 0.186, "grad_norm": 2.4694905281066895, "grad_norm_var": 3.777743446307999, "learning_rate": 0.0001, "loss": 1.3405, "loss/crossentropy": 2.6493754386901855, "loss/hidden": 1.140625, "loss/logits": 0.19864195585250854, "loss/reg": 0.0001192428680951707, "step": 1488 }, { "epoch": 0.186125, "grad_norm": 1.9863967895507812, "grad_norm_var": 3.937038641519946, "learning_rate": 0.0001, "loss": 1.1828, "loss/crossentropy": 2.5557332038879395, "loss/hidden": 1.0234375, "loss/logits": 0.15813088417053223, "loss/reg": 0.00011918347445316613, "step": 1489 }, { "epoch": 0.18625, "grad_norm": 2.4341773986816406, "grad_norm_var": 3.9638060121365144, "learning_rate": 0.0001, "loss": 1.2746, "loss/crossentropy": 2.6224076747894287, "loss/hidden": 1.0859375, "loss/logits": 0.1874784380197525, "loss/reg": 0.00011913350317627192, "step": 1490 }, { "epoch": 0.186375, "grad_norm": 36.63341522216797, "grad_norm_var": 72.02444215106806, "learning_rate": 0.0001, "loss": 1.4208, "loss/crossentropy": 2.608072280883789, "loss/hidden": 1.1953125, "loss/logits": 0.2242889404296875, "loss/reg": 0.00011908278247574344, "step": 1491 }, { "epoch": 0.1865, "grad_norm": 3.1195626258850098, "grad_norm_var": 72.02306702691054, "learning_rate": 0.0001, "loss": 1.3969, "loss/crossentropy": 2.577718496322632, "loss/hidden": 1.1796875, "loss/logits": 0.21601516008377075, "loss/reg": 0.00011903255654033273, "step": 1492 }, { "epoch": 0.186625, "grad_norm": 7.494073390960693, "grad_norm_var": 71.73977310022218, "learning_rate": 0.0001, "loss": 1.4972, "loss/crossentropy": 2.3182387351989746, "loss/hidden": 1.296875, "loss/logits": 0.1990913450717926, "loss/reg": 0.0001189842150779441, "step": 1493 }, { "epoch": 0.18675, "grad_norm": 3.256635904312134, "grad_norm_var": 72.11865215111314, "learning_rate": 0.0001, "loss": 1.4457, "loss/crossentropy": 3.134037733078003, "loss/hidden": 1.203125, "loss/logits": 0.24142670631408691, "loss/reg": 0.00011893283226527274, "step": 1494 }, { "epoch": 0.186875, "grad_norm": 2.4897048473358154, "grad_norm_var": 72.23085455450061, "learning_rate": 0.0001, "loss": 1.3423, "loss/crossentropy": 2.597522497177124, "loss/hidden": 1.140625, "loss/logits": 0.200444757938385, "loss/reg": 0.00011888061271747574, "step": 1495 }, { "epoch": 0.187, "grad_norm": 2.49828839302063, "grad_norm_var": 71.96301272310045, "learning_rate": 0.0001, "loss": 1.4144, "loss/crossentropy": 2.2553422451019287, "loss/hidden": 1.2265625, "loss/logits": 0.18667912483215332, "loss/reg": 0.00011882473336299881, "step": 1496 }, { "epoch": 0.187125, "grad_norm": 3.187157154083252, "grad_norm_var": 71.70132904772959, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.7505416870117188, "loss/hidden": 1.28125, "loss/logits": 0.22329594194889069, "loss/reg": 0.00011877108772750944, "step": 1497 }, { "epoch": 0.18725, "grad_norm": 3.284027338027954, "grad_norm_var": 71.38904494975777, "learning_rate": 0.0001, "loss": 1.9962, "loss/crossentropy": 2.2327938079833984, "loss/hidden": 1.625, "loss/logits": 0.3699975609779358, "loss/reg": 0.00011871519382111728, "step": 1498 }, { "epoch": 0.187375, "grad_norm": 3.3013715744018555, "grad_norm_var": 71.1392833963725, "learning_rate": 0.0001, "loss": 1.7364, "loss/crossentropy": 2.84899640083313, "loss/hidden": 1.3828125, "loss/logits": 0.35243597626686096, "loss/reg": 0.00011866320710396394, "step": 1499 }, { "epoch": 0.1875, "grad_norm": 2.286165952682495, "grad_norm_var": 71.31323875979778, "learning_rate": 0.0001, "loss": 1.2549, "loss/crossentropy": 2.5492546558380127, "loss/hidden": 1.0625, "loss/logits": 0.1911659687757492, "loss/reg": 0.00011861207894980907, "step": 1500 }, { "epoch": 0.187625, "grad_norm": 2.895925283432007, "grad_norm_var": 71.58813685762244, "learning_rate": 0.0001, "loss": 1.6294, "loss/crossentropy": 2.4387454986572266, "loss/hidden": 1.3671875, "loss/logits": 0.2610049843788147, "loss/reg": 0.00011855871707666665, "step": 1501 }, { "epoch": 0.18775, "grad_norm": 12.093132972717285, "grad_norm_var": 74.13502854471473, "learning_rate": 0.0001, "loss": 1.7152, "loss/crossentropy": 2.600273609161377, "loss/hidden": 1.4453125, "loss/logits": 0.2686743140220642, "loss/reg": 0.0001185064684250392, "step": 1502 }, { "epoch": 0.187875, "grad_norm": 4.026681900024414, "grad_norm_var": 73.92993060087568, "learning_rate": 0.0001, "loss": 1.6701, "loss/crossentropy": 2.709118127822876, "loss/hidden": 1.40625, "loss/logits": 0.2626368999481201, "loss/reg": 0.000118453630420845, "step": 1503 }, { "epoch": 0.188, "grad_norm": 3.7459239959716797, "grad_norm_var": 73.4579575423467, "learning_rate": 0.0001, "loss": 1.4324, "loss/crossentropy": 2.4148643016815186, "loss/hidden": 1.2109375, "loss/logits": 0.22032088041305542, "loss/reg": 0.00011840878869406879, "step": 1504 }, { "epoch": 0.188125, "grad_norm": 2.2754406929016113, "grad_norm_var": 73.31155087307008, "learning_rate": 0.0001, "loss": 1.2355, "loss/crossentropy": 2.4492344856262207, "loss/hidden": 1.0703125, "loss/logits": 0.16400852799415588, "loss/reg": 0.00011836851626867428, "step": 1505 }, { "epoch": 0.18825, "grad_norm": 3.067089557647705, "grad_norm_var": 73.04083321883901, "learning_rate": 0.0001, "loss": 1.7007, "loss/crossentropy": 2.1950020790100098, "loss/hidden": 1.4453125, "loss/logits": 0.2542187571525574, "loss/reg": 0.00011831735173473135, "step": 1506 }, { "epoch": 0.188375, "grad_norm": 2.1761672496795654, "grad_norm_var": 6.408932697086608, "learning_rate": 0.0001, "loss": 1.2191, "loss/crossentropy": 2.5266611576080322, "loss/hidden": 1.046875, "loss/logits": 0.17105624079704285, "loss/reg": 0.00011826892296085134, "step": 1507 }, { "epoch": 0.1885, "grad_norm": 2.2936675548553467, "grad_norm_var": 6.529228167338039, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.638796806335449, "loss/hidden": 1.1328125, "loss/logits": 0.19644491374492645, "loss/reg": 0.0001182199630420655, "step": 1508 }, { "epoch": 0.188625, "grad_norm": 2.2385849952697754, "grad_norm_var": 5.648164916291223, "learning_rate": 0.0001, "loss": 1.3797, "loss/crossentropy": 2.5375537872314453, "loss/hidden": 1.1640625, "loss/logits": 0.21441210806369781, "loss/reg": 0.00011817533959401771, "step": 1509 }, { "epoch": 0.18875, "grad_norm": 2.3796937465667725, "grad_norm_var": 5.718224242409011, "learning_rate": 0.0001, "loss": 1.2461, "loss/crossentropy": 2.825817108154297, "loss/hidden": 1.0703125, "loss/logits": 0.174603670835495, "loss/reg": 0.00011812576849479228, "step": 1510 }, { "epoch": 0.188875, "grad_norm": 5.252975940704346, "grad_norm_var": 5.863774655088408, "learning_rate": 0.0001, "loss": 1.9322, "loss/crossentropy": 2.840268611907959, "loss/hidden": 1.546875, "loss/logits": 0.3841322064399719, "loss/reg": 0.00011806859401986003, "step": 1511 }, { "epoch": 0.189, "grad_norm": 4.309655666351318, "grad_norm_var": 5.811781992085654, "learning_rate": 0.0001, "loss": 1.2636, "loss/crossentropy": 2.5870347023010254, "loss/hidden": 1.1015625, "loss/logits": 0.16089820861816406, "loss/reg": 0.00011801971413660794, "step": 1512 }, { "epoch": 0.189125, "grad_norm": 5.7971696853637695, "grad_norm_var": 6.067475064250841, "learning_rate": 0.0001, "loss": 1.475, "loss/crossentropy": 2.144009828567505, "loss/hidden": 1.2734375, "loss/logits": 0.2004067301750183, "loss/reg": 0.00011796548642450944, "step": 1513 }, { "epoch": 0.18925, "grad_norm": 3.2778749465942383, "grad_norm_var": 6.0679326678275265, "learning_rate": 0.0001, "loss": 1.35, "loss/crossentropy": 2.584254264831543, "loss/hidden": 1.15625, "loss/logits": 0.19256888329982758, "loss/reg": 0.00011790933785960078, "step": 1514 }, { "epoch": 0.189375, "grad_norm": 2.486767530441284, "grad_norm_var": 6.167756330415012, "learning_rate": 0.0001, "loss": 1.2907, "loss/crossentropy": 2.7800066471099854, "loss/hidden": 1.09375, "loss/logits": 0.19572564959526062, "loss/reg": 0.0001178508173325099, "step": 1515 }, { "epoch": 0.1895, "grad_norm": 2.4400854110717773, "grad_norm_var": 6.138422018218078, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.5022735595703125, "loss/hidden": 1.09375, "loss/logits": 0.17159509658813477, "loss/reg": 0.00011779481428675354, "step": 1516 }, { "epoch": 0.189625, "grad_norm": 2.335866928100586, "grad_norm_var": 6.225335935433387, "learning_rate": 0.0001, "loss": 1.3175, "loss/crossentropy": 2.675154685974121, "loss/hidden": 1.140625, "loss/logits": 0.17565396428108215, "loss/reg": 0.00011773013829952106, "step": 1517 }, { "epoch": 0.18975, "grad_norm": 4.094714641571045, "grad_norm_var": 1.3392880531383373, "learning_rate": 0.0001, "loss": 1.7635, "loss/crossentropy": 2.6612064838409424, "loss/hidden": 1.4375, "loss/logits": 0.32486772537231445, "loss/reg": 0.00011767914838856086, "step": 1518 }, { "epoch": 0.189875, "grad_norm": 2.263744592666626, "grad_norm_var": 1.3538834088715472, "learning_rate": 0.0001, "loss": 1.2725, "loss/crossentropy": 2.315162420272827, "loss/hidden": 1.1015625, "loss/logits": 0.1697949320077896, "loss/reg": 0.00011761367932194844, "step": 1519 }, { "epoch": 0.19, "grad_norm": 2.4299325942993164, "grad_norm_var": 1.3579473516687186, "learning_rate": 0.0001, "loss": 1.3913, "loss/crossentropy": 2.1836163997650146, "loss/hidden": 1.1875, "loss/logits": 0.20257624983787537, "loss/reg": 0.00011754166189348325, "step": 1520 }, { "epoch": 0.190125, "grad_norm": 2.289334297180176, "grad_norm_var": 1.356487576299067, "learning_rate": 0.0001, "loss": 1.2013, "loss/crossentropy": 2.6502480506896973, "loss/hidden": 1.03125, "loss/logits": 0.16885000467300415, "loss/reg": 0.00011747071403078735, "step": 1521 }, { "epoch": 0.19025, "grad_norm": 3.3030593395233154, "grad_norm_var": 1.3598499115853049, "learning_rate": 0.0001, "loss": 1.3071, "loss/crossentropy": 2.435314655303955, "loss/hidden": 1.1328125, "loss/logits": 0.17315879464149475, "loss/reg": 0.00011739900946849957, "step": 1522 }, { "epoch": 0.190375, "grad_norm": 2.276865243911743, "grad_norm_var": 1.348273515623281, "learning_rate": 0.0001, "loss": 1.2466, "loss/crossentropy": 2.391756296157837, "loss/hidden": 1.0703125, "loss/logits": 0.17516164481639862, "loss/reg": 0.00011735052976291627, "step": 1523 }, { "epoch": 0.1905, "grad_norm": 2.891308307647705, "grad_norm_var": 1.3069914477995115, "learning_rate": 0.0001, "loss": 1.2478, "loss/crossentropy": 2.4250457286834717, "loss/hidden": 1.078125, "loss/logits": 0.16853061318397522, "loss/reg": 0.00011730389087460935, "step": 1524 }, { "epoch": 0.190625, "grad_norm": 3.06881046295166, "grad_norm_var": 1.2514799236747496, "learning_rate": 0.0001, "loss": 1.321, "loss/crossentropy": 2.669633626937866, "loss/hidden": 1.1171875, "loss/logits": 0.20264790952205658, "loss/reg": 0.00011725593503797427, "step": 1525 }, { "epoch": 0.19075, "grad_norm": 2.636446714401245, "grad_norm_var": 1.228164374670668, "learning_rate": 0.0001, "loss": 1.3712, "loss/crossentropy": 2.551330804824829, "loss/hidden": 1.1796875, "loss/logits": 0.19033363461494446, "loss/reg": 0.00011719469330273569, "step": 1526 }, { "epoch": 0.190875, "grad_norm": 2.8944242000579834, "grad_norm_var": 0.929338528522832, "learning_rate": 0.0001, "loss": 1.5083, "loss/crossentropy": 2.475257635116577, "loss/hidden": 1.265625, "loss/logits": 0.24154072999954224, "loss/reg": 0.00011713660205714405, "step": 1527 }, { "epoch": 0.191, "grad_norm": 2.4068422317504883, "grad_norm_var": 0.8359844505058684, "learning_rate": 0.0001, "loss": 1.3441, "loss/crossentropy": 2.7353224754333496, "loss/hidden": 1.140625, "loss/logits": 0.20230522751808167, "loss/reg": 0.00011709082900779322, "step": 1528 }, { "epoch": 0.191125, "grad_norm": 2.0968706607818604, "grad_norm_var": 0.2775715490642386, "learning_rate": 0.0001, "loss": 1.2677, "loss/crossentropy": 2.4381606578826904, "loss/hidden": 1.09375, "loss/logits": 0.17275375127792358, "loss/reg": 0.00011704309144988656, "step": 1529 }, { "epoch": 0.19125, "grad_norm": 2.695347547531128, "grad_norm_var": 0.25386222008695064, "learning_rate": 0.0001, "loss": 1.504, "loss/crossentropy": 2.376267671585083, "loss/hidden": 1.265625, "loss/logits": 0.23716747760772705, "loss/reg": 0.00011700496543198824, "step": 1530 }, { "epoch": 0.191375, "grad_norm": 2.434994697570801, "grad_norm_var": 0.25524733167410596, "learning_rate": 0.0001, "loss": 1.2319, "loss/crossentropy": 2.5715880393981934, "loss/hidden": 1.0546875, "loss/logits": 0.17599859833717346, "loss/reg": 0.0001169489260064438, "step": 1531 }, { "epoch": 0.1915, "grad_norm": 2.7061607837677, "grad_norm_var": 0.2518732387451223, "learning_rate": 0.0001, "loss": 1.2896, "loss/crossentropy": 2.358098030090332, "loss/hidden": 1.109375, "loss/logits": 0.17909148335456848, "loss/reg": 0.00011688924132613465, "step": 1532 }, { "epoch": 0.191625, "grad_norm": 2.229325532913208, "grad_norm_var": 0.2574221923758877, "learning_rate": 0.0001, "loss": 1.2652, "loss/crossentropy": 2.5599749088287354, "loss/hidden": 1.078125, "loss/logits": 0.1858920305967331, "loss/reg": 0.00011683399497997016, "step": 1533 }, { "epoch": 0.19175, "grad_norm": 4.915576934814453, "grad_norm_var": 0.45548066472475524, "learning_rate": 0.0001, "loss": 1.5502, "loss/crossentropy": 2.5302011966705322, "loss/hidden": 1.375, "loss/logits": 0.1739870011806488, "loss/reg": 0.00011678911687340587, "step": 1534 }, { "epoch": 0.191875, "grad_norm": 2.7937123775482178, "grad_norm_var": 0.44071058501582433, "learning_rate": 0.0001, "loss": 1.5972, "loss/crossentropy": 2.394104242324829, "loss/hidden": 1.359375, "loss/logits": 0.2366926670074463, "loss/reg": 0.00011674488632706925, "step": 1535 }, { "epoch": 0.192, "grad_norm": 3.1008052825927734, "grad_norm_var": 0.43982422100014534, "learning_rate": 0.0001, "loss": 1.5254, "loss/crossentropy": 2.3557515144348145, "loss/hidden": 1.296875, "loss/logits": 0.22731639444828033, "loss/reg": 0.0001166999718407169, "step": 1536 }, { "epoch": 0.192125, "grad_norm": 12.463056564331055, "grad_norm_var": 6.221243775086623, "learning_rate": 0.0001, "loss": 1.9888, "loss/crossentropy": 2.481325387954712, "loss/hidden": 1.703125, "loss/logits": 0.2845377027988434, "loss/reg": 0.00011665234342217445, "step": 1537 }, { "epoch": 0.19225, "grad_norm": 2.6797103881835938, "grad_norm_var": 6.256254036917311, "learning_rate": 0.0001, "loss": 1.6258, "loss/crossentropy": 2.221282482147217, "loss/hidden": 1.40625, "loss/logits": 0.2183791697025299, "loss/reg": 0.00011661092139547691, "step": 1538 }, { "epoch": 0.192375, "grad_norm": 2.962153196334839, "grad_norm_var": 6.183609205869228, "learning_rate": 0.0001, "loss": 1.1647, "loss/crossentropy": 2.515779495239258, "loss/hidden": 1.0078125, "loss/logits": 0.15575549006462097, "loss/reg": 0.00011656247806968167, "step": 1539 }, { "epoch": 0.1925, "grad_norm": 4.216954708099365, "grad_norm_var": 6.197172060368344, "learning_rate": 0.0001, "loss": 1.8201, "loss/crossentropy": 2.8789989948272705, "loss/hidden": 1.546875, "loss/logits": 0.27209770679473877, "loss/reg": 0.00011651441309368238, "step": 1540 }, { "epoch": 0.192625, "grad_norm": 2.832493305206299, "grad_norm_var": 6.21484189512464, "learning_rate": 0.0001, "loss": 1.4264, "loss/crossentropy": 2.5738658905029297, "loss/hidden": 1.2109375, "loss/logits": 0.21427616477012634, "loss/reg": 0.00011646730126813054, "step": 1541 }, { "epoch": 0.19275, "grad_norm": 2.7491986751556396, "grad_norm_var": 6.202593191112827, "learning_rate": 0.0001, "loss": 1.283, "loss/crossentropy": 2.7724905014038086, "loss/hidden": 1.1171875, "loss/logits": 0.16460852324962616, "loss/reg": 0.00011642691970337182, "step": 1542 }, { "epoch": 0.192875, "grad_norm": 2.4093189239501953, "grad_norm_var": 6.257188270728466, "learning_rate": 0.0001, "loss": 1.3838, "loss/crossentropy": 2.208812713623047, "loss/hidden": 1.1796875, "loss/logits": 0.20298536121845245, "loss/reg": 0.00011637954594334587, "step": 1543 }, { "epoch": 0.193, "grad_norm": 2.32133150100708, "grad_norm_var": 6.269889732950687, "learning_rate": 0.0001, "loss": 1.2924, "loss/crossentropy": 2.536208391189575, "loss/hidden": 1.109375, "loss/logits": 0.18182924389839172, "loss/reg": 0.00011633076064754277, "step": 1544 }, { "epoch": 0.193125, "grad_norm": 2.1866965293884277, "grad_norm_var": 6.253883222405192, "learning_rate": 0.0001, "loss": 1.307, "loss/crossentropy": 2.3625078201293945, "loss/hidden": 1.1328125, "loss/logits": 0.17297673225402832, "loss/reg": 0.00011628259380813688, "step": 1545 }, { "epoch": 0.19325, "grad_norm": 1.9878499507904053, "grad_norm_var": 6.359285672916484, "learning_rate": 0.0001, "loss": 1.1218, "loss/crossentropy": 2.6263480186462402, "loss/hidden": 0.98046875, "loss/logits": 0.14016547799110413, "loss/reg": 0.00011622869351413101, "step": 1546 }, { "epoch": 0.193375, "grad_norm": 2.360893487930298, "grad_norm_var": 6.369527190765524, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.3654918670654297, "loss/hidden": 1.1796875, "loss/logits": 0.1893061101436615, "loss/reg": 0.00011618361168075353, "step": 1547 }, { "epoch": 0.1935, "grad_norm": 2.217850685119629, "grad_norm_var": 6.431701238842651, "learning_rate": 0.0001, "loss": 1.2321, "loss/crossentropy": 2.663281202316284, "loss/hidden": 1.046875, "loss/logits": 0.18411287665367126, "loss/reg": 0.00011613820970524102, "step": 1548 }, { "epoch": 0.193625, "grad_norm": 2.408940553665161, "grad_norm_var": 6.405641184570458, "learning_rate": 0.0001, "loss": 1.5465, "loss/crossentropy": 2.402618646621704, "loss/hidden": 1.3125, "loss/logits": 0.23284387588500977, "loss/reg": 0.00011609937064349651, "step": 1549 }, { "epoch": 0.19375, "grad_norm": 2.4717278480529785, "grad_norm_var": 6.289276908246276, "learning_rate": 0.0001, "loss": 1.3624, "loss/crossentropy": 2.4588308334350586, "loss/hidden": 1.171875, "loss/logits": 0.18932726979255676, "loss/reg": 0.00011605407053139061, "step": 1550 }, { "epoch": 0.193875, "grad_norm": 2.2029716968536377, "grad_norm_var": 6.347828422083852, "learning_rate": 0.0001, "loss": 1.1282, "loss/crossentropy": 2.5651614665985107, "loss/hidden": 0.9765625, "loss/logits": 0.1505090892314911, "loss/reg": 0.00011602052836678922, "step": 1551 }, { "epoch": 0.194, "grad_norm": 2.3370368480682373, "grad_norm_var": 6.396756268361879, "learning_rate": 0.0001, "loss": 1.2842, "loss/crossentropy": 2.538853168487549, "loss/hidden": 1.109375, "loss/logits": 0.17369768023490906, "loss/reg": 0.00011598801211221144, "step": 1552 }, { "epoch": 0.194125, "grad_norm": 4.071417808532715, "grad_norm_var": 0.40628497723138984, "learning_rate": 0.0001, "loss": 1.8071, "loss/crossentropy": 2.0490260124206543, "loss/hidden": 1.578125, "loss/logits": 0.22785112261772156, "loss/reg": 0.00011595317482715473, "step": 1553 }, { "epoch": 0.19425, "grad_norm": 2.167083740234375, "grad_norm_var": 0.420749078807849, "learning_rate": 0.0001, "loss": 1.2279, "loss/crossentropy": 2.739078998565674, "loss/hidden": 1.0546875, "loss/logits": 0.17200574278831482, "loss/reg": 0.00011590418580453843, "step": 1554 }, { "epoch": 0.194375, "grad_norm": 2.7465193271636963, "grad_norm_var": 0.41378899673199665, "learning_rate": 0.0001, "loss": 1.2495, "loss/crossentropy": 2.8327136039733887, "loss/hidden": 1.0625, "loss/logits": 0.18583707511425018, "loss/reg": 0.00011585705215111375, "step": 1555 }, { "epoch": 0.1945, "grad_norm": 2.9431259632110596, "grad_norm_var": 0.24151136401809764, "learning_rate": 0.0001, "loss": 1.397, "loss/crossentropy": 2.5207533836364746, "loss/hidden": 1.203125, "loss/logits": 0.19267132878303528, "loss/reg": 0.00011580926366150379, "step": 1556 }, { "epoch": 0.194625, "grad_norm": 3.226850748062134, "grad_norm_var": 0.26735201950093007, "learning_rate": 0.0001, "loss": 1.7948, "loss/crossentropy": 2.142216920852661, "loss/hidden": 1.484375, "loss/logits": 0.3093142509460449, "loss/reg": 0.00011576213728403673, "step": 1557 }, { "epoch": 0.19475, "grad_norm": 2.418354034423828, "grad_norm_var": 0.2654302816349012, "learning_rate": 0.0001, "loss": 1.2186, "loss/crossentropy": 2.5046820640563965, "loss/hidden": 1.0390625, "loss/logits": 0.17838521301746368, "loss/reg": 0.0001157145670731552, "step": 1558 }, { "epoch": 0.194875, "grad_norm": 2.0176548957824707, "grad_norm_var": 0.28131339078210355, "learning_rate": 0.0001, "loss": 1.1014, "loss/crossentropy": 2.6053385734558105, "loss/hidden": 0.94921875, "loss/logits": 0.1510506570339203, "loss/reg": 0.00011567320325411856, "step": 1559 }, { "epoch": 0.195, "grad_norm": 2.0956063270568848, "grad_norm_var": 0.29003755665789865, "learning_rate": 0.0001, "loss": 1.2742, "loss/crossentropy": 2.642915964126587, "loss/hidden": 1.109375, "loss/logits": 0.16365128755569458, "loss/reg": 0.000115623050078284, "step": 1560 }, { "epoch": 0.195125, "grad_norm": 2.4613945484161377, "grad_norm_var": 0.2835977175285959, "learning_rate": 0.0001, "loss": 1.3653, "loss/crossentropy": 2.509488582611084, "loss/hidden": 1.15625, "loss/logits": 0.20786544680595398, "loss/reg": 0.00011557457037270069, "step": 1561 }, { "epoch": 0.19525, "grad_norm": 2.5948381423950195, "grad_norm_var": 0.26449140953331685, "learning_rate": 0.0001, "loss": 1.4794, "loss/crossentropy": 2.454765558242798, "loss/hidden": 1.21875, "loss/logits": 0.2595103979110718, "loss/reg": 0.0001155437421402894, "step": 1562 }, { "epoch": 0.195375, "grad_norm": 3.4489541053771973, "grad_norm_var": 0.3115725521229988, "learning_rate": 0.0001, "loss": 1.296, "loss/crossentropy": 2.621476650238037, "loss/hidden": 1.125, "loss/logits": 0.16980065405368805, "loss/reg": 0.00011551759234862402, "step": 1563 }, { "epoch": 0.1955, "grad_norm": 2.0052497386932373, "grad_norm_var": 0.32563827221116526, "learning_rate": 0.0001, "loss": 1.294, "loss/crossentropy": 2.2698540687561035, "loss/hidden": 1.1171875, "loss/logits": 0.17564430832862854, "loss/reg": 0.00011547568283276632, "step": 1564 }, { "epoch": 0.195625, "grad_norm": 2.3177356719970703, "grad_norm_var": 0.32849504781847805, "learning_rate": 0.0001, "loss": 1.5376, "loss/crossentropy": 2.3064796924591064, "loss/hidden": 1.296875, "loss/logits": 0.23954331874847412, "loss/reg": 0.00011542916035978124, "step": 1565 }, { "epoch": 0.19575, "grad_norm": 2.4090170860290527, "grad_norm_var": 0.32977497791559335, "learning_rate": 0.0001, "loss": 1.4166, "loss/crossentropy": 2.236618757247925, "loss/hidden": 1.203125, "loss/logits": 0.21227706968784332, "loss/reg": 0.0001153778939624317, "step": 1566 }, { "epoch": 0.195875, "grad_norm": 2.252762794494629, "grad_norm_var": 0.32735063679906395, "learning_rate": 0.0001, "loss": 1.3985, "loss/crossentropy": 2.537562131881714, "loss/hidden": 1.1796875, "loss/logits": 0.21762613952159882, "loss/reg": 0.00011533160432009026, "step": 1567 }, { "epoch": 0.196, "grad_norm": 4.1365065574646, "grad_norm_var": 0.4679343669431714, "learning_rate": 0.0001, "loss": 1.2637, "loss/crossentropy": 2.5620906352996826, "loss/hidden": 1.0859375, "loss/logits": 0.17659765481948853, "loss/reg": 0.00011528363393153995, "step": 1568 }, { "epoch": 0.196125, "grad_norm": 2.2536633014678955, "grad_norm_var": 0.3437748471810655, "learning_rate": 0.0001, "loss": 1.2721, "loss/crossentropy": 2.765577793121338, "loss/hidden": 1.0859375, "loss/logits": 0.18503984808921814, "loss/reg": 0.00011523719149408862, "step": 1569 }, { "epoch": 0.19625, "grad_norm": 2.901008129119873, "grad_norm_var": 0.3357166985321366, "learning_rate": 0.0001, "loss": 1.3539, "loss/crossentropy": 2.6998634338378906, "loss/hidden": 1.140625, "loss/logits": 0.21210506558418274, "loss/reg": 0.00011518428073031828, "step": 1570 }, { "epoch": 0.196375, "grad_norm": 2.2413582801818848, "grad_norm_var": 0.3444460497642673, "learning_rate": 0.0001, "loss": 1.451, "loss/crossentropy": 2.4969563484191895, "loss/hidden": 1.2265625, "loss/logits": 0.2232953906059265, "loss/reg": 0.00011513246136019006, "step": 1571 }, { "epoch": 0.1965, "grad_norm": 2.589327335357666, "grad_norm_var": 0.3364488876561751, "learning_rate": 0.0001, "loss": 1.4251, "loss/crossentropy": 2.4594039916992188, "loss/hidden": 1.203125, "loss/logits": 0.22080573439598083, "loss/reg": 0.00011508452007547021, "step": 1572 }, { "epoch": 0.196625, "grad_norm": 3.2259724140167236, "grad_norm_var": 0.3363738432142469, "learning_rate": 0.0001, "loss": 1.5479, "loss/crossentropy": 2.465007781982422, "loss/hidden": 1.2734375, "loss/logits": 0.27334755659103394, "loss/reg": 0.00011503412679303437, "step": 1573 }, { "epoch": 0.19675, "grad_norm": 2.536031484603882, "grad_norm_var": 0.3346153911010994, "learning_rate": 0.0001, "loss": 1.2098, "loss/crossentropy": 2.8030953407287598, "loss/hidden": 1.03125, "loss/logits": 0.17740653455257416, "loss/reg": 0.00011498343519633636, "step": 1574 }, { "epoch": 0.196875, "grad_norm": 2.16465425491333, "grad_norm_var": 0.3246903529451153, "learning_rate": 0.0001, "loss": 1.1349, "loss/crossentropy": 2.484870672225952, "loss/hidden": 0.98046875, "loss/logits": 0.15326657891273499, "loss/reg": 0.00011493259808048606, "step": 1575 }, { "epoch": 0.197, "grad_norm": 2.3675003051757812, "grad_norm_var": 0.3109479836552033, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.575305700302124, "loss/hidden": 1.140625, "loss/logits": 0.17440912127494812, "loss/reg": 0.00011488122254377231, "step": 1576 }, { "epoch": 0.197125, "grad_norm": 69.20826721191406, "grad_norm_var": 277.3537902605975, "learning_rate": 0.0001, "loss": 2.6945, "loss/crossentropy": 2.3041789531707764, "loss/hidden": 2.09375, "loss/logits": 0.5996133685112, "loss/reg": 0.00011482146510388702, "step": 1577 }, { "epoch": 0.19725, "grad_norm": 2.447985887527466, "grad_norm_var": 277.43729636161726, "learning_rate": 0.0001, "loss": 1.1016, "loss/crossentropy": 2.5284910202026367, "loss/hidden": 0.9609375, "loss/logits": 0.1395394653081894, "loss/reg": 0.00011477321822894737, "step": 1578 }, { "epoch": 0.197375, "grad_norm": 3.1018998622894287, "grad_norm_var": 277.5990399379963, "learning_rate": 0.0001, "loss": 1.3226, "loss/crossentropy": 2.614349842071533, "loss/hidden": 1.1328125, "loss/logits": 0.18863807618618011, "loss/reg": 0.00011471893958514556, "step": 1579 }, { "epoch": 0.1975, "grad_norm": 2.1523640155792236, "grad_norm_var": 277.5071283474112, "learning_rate": 0.0001, "loss": 1.3284, "loss/crossentropy": 2.5176281929016113, "loss/hidden": 1.1328125, "loss/logits": 0.19448712468147278, "loss/reg": 0.00011466956493677571, "step": 1580 }, { "epoch": 0.197625, "grad_norm": 5.82731819152832, "grad_norm_var": 276.1939474190271, "learning_rate": 0.0001, "loss": 1.8401, "loss/crossentropy": 2.6345596313476562, "loss/hidden": 1.5234375, "loss/logits": 0.3155391812324524, "loss/reg": 0.0001146175927715376, "step": 1581 }, { "epoch": 0.19775, "grad_norm": 2.895268440246582, "grad_norm_var": 275.91182244406446, "learning_rate": 0.0001, "loss": 1.4961, "loss/crossentropy": 2.330193519592285, "loss/hidden": 1.2578125, "loss/logits": 0.2371354103088379, "loss/reg": 0.00011456407810328528, "step": 1582 }, { "epoch": 0.197875, "grad_norm": 2.388038396835327, "grad_norm_var": 275.8270011279102, "learning_rate": 0.0001, "loss": 1.2644, "loss/crossentropy": 2.6678552627563477, "loss/hidden": 1.0859375, "loss/logits": 0.17732274532318115, "loss/reg": 0.00011450510646682233, "step": 1583 }, { "epoch": 0.198, "grad_norm": 2.385690450668335, "grad_norm_var": 276.6934242841179, "learning_rate": 0.0001, "loss": 1.3231, "loss/crossentropy": 2.8147614002227783, "loss/hidden": 1.1171875, "loss/logits": 0.20480355620384216, "loss/reg": 0.00011444108531577513, "step": 1584 }, { "epoch": 0.198125, "grad_norm": 3.9338924884796143, "grad_norm_var": 275.82494159384544, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.281322479248047, "loss/hidden": 1.2265625, "loss/logits": 0.24252253770828247, "loss/reg": 0.00011437801731517538, "step": 1585 }, { "epoch": 0.19825, "grad_norm": 2.84724760055542, "grad_norm_var": 275.8546683217825, "learning_rate": 0.0001, "loss": 1.3544, "loss/crossentropy": 2.699129104614258, "loss/hidden": 1.140625, "loss/logits": 0.21266697347164154, "loss/reg": 0.00011430990707594901, "step": 1586 }, { "epoch": 0.198375, "grad_norm": 2.1566436290740967, "grad_norm_var": 275.90908791520184, "learning_rate": 0.0001, "loss": 1.352, "loss/crossentropy": 2.4530932903289795, "loss/hidden": 1.1484375, "loss/logits": 0.20238140225410461, "loss/reg": 0.000114257782115601, "step": 1587 }, { "epoch": 0.1985, "grad_norm": 5.054636001586914, "grad_norm_var": 274.8344383042295, "learning_rate": 0.0001, "loss": 1.7257, "loss/crossentropy": 2.58975887298584, "loss/hidden": 1.3984375, "loss/logits": 0.32616952061653137, "loss/reg": 0.0001141895554610528, "step": 1588 }, { "epoch": 0.198625, "grad_norm": 2.0736770629882812, "grad_norm_var": 275.5231274704364, "learning_rate": 0.0001, "loss": 1.0843, "loss/crossentropy": 2.5109071731567383, "loss/hidden": 0.94140625, "loss/logits": 0.1417950838804245, "loss/reg": 0.0001141207103501074, "step": 1589 }, { "epoch": 0.19875, "grad_norm": 2.4042294025421143, "grad_norm_var": 275.60435393820967, "learning_rate": 0.0001, "loss": 1.2986, "loss/crossentropy": 2.4533636569976807, "loss/hidden": 1.1015625, "loss/logits": 0.19591335952281952, "loss/reg": 0.00011404707765905187, "step": 1590 }, { "epoch": 0.198875, "grad_norm": 2.18344783782959, "grad_norm_var": 275.59203883326927, "learning_rate": 0.0001, "loss": 1.3317, "loss/crossentropy": 2.5711987018585205, "loss/hidden": 1.1171875, "loss/logits": 0.21332630515098572, "loss/reg": 0.0001139720989158377, "step": 1591 }, { "epoch": 0.199, "grad_norm": 2.1486918926239014, "grad_norm_var": 275.7327858220944, "learning_rate": 0.0001, "loss": 1.3703, "loss/crossentropy": 2.308668375015259, "loss/hidden": 1.1796875, "loss/logits": 0.1895202100276947, "loss/reg": 0.00011392493615858257, "step": 1592 }, { "epoch": 0.199125, "grad_norm": 2.385261297225952, "grad_norm_var": 1.2292051790388732, "learning_rate": 0.0001, "loss": 1.5396, "loss/crossentropy": 2.3078272342681885, "loss/hidden": 1.2890625, "loss/logits": 0.24937817454338074, "loss/reg": 0.0001138773950515315, "step": 1593 }, { "epoch": 0.19925, "grad_norm": 2.633211851119995, "grad_norm_var": 1.2202073284724284, "learning_rate": 0.0001, "loss": 1.2863, "loss/crossentropy": 2.4559550285339355, "loss/hidden": 1.1015625, "loss/logits": 0.1835949420928955, "loss/reg": 0.00011381452350178733, "step": 1594 }, { "epoch": 0.199375, "grad_norm": 2.304307222366333, "grad_norm_var": 1.239635790707166, "learning_rate": 0.0001, "loss": 1.3892, "loss/crossentropy": 2.3330070972442627, "loss/hidden": 1.171875, "loss/logits": 0.21615339815616608, "loss/reg": 0.000113766232971102, "step": 1595 }, { "epoch": 0.1995, "grad_norm": 2.4209237098693848, "grad_norm_var": 1.218773393695564, "learning_rate": 0.0001, "loss": 1.339, "loss/crossentropy": 2.544914722442627, "loss/hidden": 1.1484375, "loss/logits": 0.18944834172725677, "loss/reg": 0.00011370563879609108, "step": 1596 }, { "epoch": 0.199625, "grad_norm": 2.432481050491333, "grad_norm_var": 0.6039308453322131, "learning_rate": 0.0001, "loss": 1.3315, "loss/crossentropy": 2.659142255783081, "loss/hidden": 1.125, "loss/logits": 0.20536407828330994, "loss/reg": 0.0001136428618337959, "step": 1597 }, { "epoch": 0.19975, "grad_norm": 2.6989870071411133, "grad_norm_var": 0.600324933017319, "learning_rate": 0.0001, "loss": 1.1871, "loss/crossentropy": 2.5923893451690674, "loss/hidden": 1.0390625, "loss/logits": 0.14690035581588745, "loss/reg": 0.00011358861229382455, "step": 1598 }, { "epoch": 0.199875, "grad_norm": 2.4925386905670166, "grad_norm_var": 0.5973127117465424, "learning_rate": 0.0001, "loss": 1.3261, "loss/crossentropy": 2.177574396133423, "loss/hidden": 1.15625, "loss/logits": 0.16873914003372192, "loss/reg": 0.00011354185699019581, "step": 1599 }, { "epoch": 0.2, "grad_norm": 1.839694857597351, "grad_norm_var": 0.6358954308741146, "learning_rate": 0.0001, "loss": 1.2803, "loss/crossentropy": 2.5489537715911865, "loss/hidden": 1.09375, "loss/logits": 0.185407817363739, "loss/reg": 0.00011349577107466757, "step": 1600 }, { "epoch": 0.200125, "grad_norm": 2.9289920330047607, "grad_norm_var": 0.5237179341453985, "learning_rate": 0.0001, "loss": 1.2235, "loss/crossentropy": 2.4224767684936523, "loss/hidden": 1.0625, "loss/logits": 0.15989291667938232, "loss/reg": 0.00011344721860950813, "step": 1601 }, { "epoch": 0.20025, "grad_norm": 3.37585186958313, "grad_norm_var": 0.5612291136115255, "learning_rate": 0.0001, "loss": 1.528, "loss/crossentropy": 1.834604263305664, "loss/hidden": 1.3359375, "loss/logits": 0.19095176458358765, "loss/reg": 0.00011339721095282584, "step": 1602 }, { "epoch": 0.200375, "grad_norm": 2.4694302082061768, "grad_norm_var": 0.5490268451747472, "learning_rate": 0.0001, "loss": 1.2063, "loss/crossentropy": 2.4114913940429688, "loss/hidden": 1.046875, "loss/logits": 0.15830372273921967, "loss/reg": 0.00011335093586239964, "step": 1603 }, { "epoch": 0.2005, "grad_norm": 4.270150661468506, "grad_norm_var": 0.3323508568626214, "learning_rate": 0.0001, "loss": 1.7418, "loss/crossentropy": 2.505373477935791, "loss/hidden": 1.4375, "loss/logits": 0.3031439781188965, "loss/reg": 0.00011330132110742852, "step": 1604 }, { "epoch": 0.200625, "grad_norm": 2.5729105472564697, "grad_norm_var": 0.3151323251307279, "learning_rate": 0.0001, "loss": 1.2885, "loss/crossentropy": 2.4896180629730225, "loss/hidden": 1.109375, "loss/logits": 0.17802546918392181, "loss/reg": 0.00011324916704325005, "step": 1605 }, { "epoch": 0.20075, "grad_norm": 2.9794859886169434, "grad_norm_var": 0.32098548753608225, "learning_rate": 0.0001, "loss": 1.5189, "loss/crossentropy": 2.4023597240448, "loss/hidden": 1.2890625, "loss/logits": 0.22872374951839447, "loss/reg": 0.00011319564509904012, "step": 1606 }, { "epoch": 0.200875, "grad_norm": 2.3553476333618164, "grad_norm_var": 0.31251662514723455, "learning_rate": 0.0001, "loss": 1.2699, "loss/crossentropy": 2.6196365356445312, "loss/hidden": 1.0859375, "loss/logits": 0.1828354001045227, "loss/reg": 0.00011314019502606243, "step": 1607 }, { "epoch": 0.201, "grad_norm": 2.4748053550720215, "grad_norm_var": 0.2976150192151564, "learning_rate": 0.0001, "loss": 1.5229, "loss/crossentropy": 2.5705764293670654, "loss/hidden": 1.265625, "loss/logits": 0.2561238408088684, "loss/reg": 0.00011309135152259842, "step": 1608 }, { "epoch": 0.201125, "grad_norm": 2.4888110160827637, "grad_norm_var": 0.2944277792888286, "learning_rate": 0.0001, "loss": 1.3075, "loss/crossentropy": 2.5866353511810303, "loss/hidden": 1.125, "loss/logits": 0.18139275908470154, "loss/reg": 0.00011304724466754124, "step": 1609 }, { "epoch": 0.20125, "grad_norm": 2.9420037269592285, "grad_norm_var": 0.2988265169480598, "learning_rate": 0.0001, "loss": 1.514, "loss/crossentropy": 2.5440328121185303, "loss/hidden": 1.25, "loss/logits": 0.26287442445755005, "loss/reg": 0.00011300118057988584, "step": 1610 }, { "epoch": 0.201375, "grad_norm": 2.568551778793335, "grad_norm_var": 0.28958682761989435, "learning_rate": 0.0001, "loss": 1.268, "loss/crossentropy": 2.396118640899658, "loss/hidden": 1.0859375, "loss/logits": 0.18092429637908936, "loss/reg": 0.00011295860895188525, "step": 1611 }, { "epoch": 0.2015, "grad_norm": 4.15261173248291, "grad_norm_var": 0.4109705586861679, "learning_rate": 0.0001, "loss": 1.6776, "loss/crossentropy": 2.0784783363342285, "loss/hidden": 1.453125, "loss/logits": 0.22337478399276733, "loss/reg": 0.00011291028931736946, "step": 1612 }, { "epoch": 0.201625, "grad_norm": 2.9938302040100098, "grad_norm_var": 0.40202247215598413, "learning_rate": 0.0001, "loss": 1.1989, "loss/crossentropy": 2.934537410736084, "loss/hidden": 1.03125, "loss/logits": 0.1664934754371643, "loss/reg": 0.00011286533845122904, "step": 1613 }, { "epoch": 0.20175, "grad_norm": 2.3741188049316406, "grad_norm_var": 0.4151707619580278, "learning_rate": 0.0001, "loss": 1.3349, "loss/crossentropy": 2.465813398361206, "loss/hidden": 1.140625, "loss/logits": 0.1931827962398529, "loss/reg": 0.00011281906336080283, "step": 1614 }, { "epoch": 0.201875, "grad_norm": 2.3127527236938477, "grad_norm_var": 0.4252790943202569, "learning_rate": 0.0001, "loss": 1.3371, "loss/crossentropy": 2.6733384132385254, "loss/hidden": 1.15625, "loss/logits": 0.1797255575656891, "loss/reg": 0.00011277921294094995, "step": 1615 }, { "epoch": 0.202, "grad_norm": 4.533862590789795, "grad_norm_var": 0.5272539397658247, "learning_rate": 0.0001, "loss": 1.5621, "loss/crossentropy": 3.3387370109558105, "loss/hidden": 1.3125, "loss/logits": 0.24846123158931732, "loss/reg": 0.00011272945994278416, "step": 1616 }, { "epoch": 0.202125, "grad_norm": 2.3726918697357178, "grad_norm_var": 0.5509054842574298, "learning_rate": 0.0001, "loss": 1.1179, "loss/crossentropy": 2.6090011596679688, "loss/hidden": 0.95703125, "loss/logits": 0.1597638726234436, "loss/reg": 0.00011268185335211456, "step": 1617 }, { "epoch": 0.20225, "grad_norm": 2.7269279956817627, "grad_norm_var": 0.5405795688366801, "learning_rate": 0.0001, "loss": 1.1681, "loss/crossentropy": 2.851790189743042, "loss/hidden": 1.0078125, "loss/logits": 0.15917493402957916, "loss/reg": 0.00011262712359894067, "step": 1618 }, { "epoch": 0.202375, "grad_norm": 2.3095877170562744, "grad_norm_var": 0.5516036765960941, "learning_rate": 0.0001, "loss": 1.3965, "loss/crossentropy": 2.562356948852539, "loss/hidden": 1.203125, "loss/logits": 0.19224533438682556, "loss/reg": 0.0001125685012084432, "step": 1619 }, { "epoch": 0.2025, "grad_norm": 2.6173062324523926, "grad_norm_var": 0.42078617735705065, "learning_rate": 0.0001, "loss": 1.5776, "loss/crossentropy": 2.370835542678833, "loss/hidden": 1.3203125, "loss/logits": 0.2561703324317932, "loss/reg": 0.00011251666728639975, "step": 1620 }, { "epoch": 0.202625, "grad_norm": 2.6424407958984375, "grad_norm_var": 0.41899718706470046, "learning_rate": 0.0001, "loss": 1.2265, "loss/crossentropy": 2.5992889404296875, "loss/hidden": 1.0546875, "loss/logits": 0.17063888907432556, "loss/reg": 0.00011245961650274694, "step": 1621 }, { "epoch": 0.20275, "grad_norm": 2.6895523071289062, "grad_norm_var": 0.41742154962171296, "learning_rate": 0.0001, "loss": 1.4186, "loss/crossentropy": 2.477802276611328, "loss/hidden": 1.2109375, "loss/logits": 0.20650723576545715, "loss/reg": 0.0001124002956203185, "step": 1622 }, { "epoch": 0.202875, "grad_norm": 2.5851221084594727, "grad_norm_var": 0.40756741891286763, "learning_rate": 0.0001, "loss": 1.4079, "loss/crossentropy": 2.321220636367798, "loss/hidden": 1.203125, "loss/logits": 0.20361343026161194, "loss/reg": 0.00011235993588343263, "step": 1623 }, { "epoch": 0.203, "grad_norm": 2.2355692386627197, "grad_norm_var": 0.42148769561622973, "learning_rate": 0.0001, "loss": 1.393, "loss/crossentropy": 2.5755577087402344, "loss/hidden": 1.1875, "loss/logits": 0.20434287190437317, "loss/reg": 0.00011231639655306935, "step": 1624 }, { "epoch": 0.203125, "grad_norm": 2.535264015197754, "grad_norm_var": 0.41979356747741553, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.5176937580108643, "loss/hidden": 1.203125, "loss/logits": 0.17192497849464417, "loss/reg": 0.00011227159848203883, "step": 1625 }, { "epoch": 0.20325, "grad_norm": 2.42753529548645, "grad_norm_var": 0.4257041557226065, "learning_rate": 0.0001, "loss": 1.3559, "loss/crossentropy": 2.198944330215454, "loss/hidden": 1.171875, "loss/logits": 0.1829289048910141, "loss/reg": 0.00011221806198591366, "step": 1626 }, { "epoch": 0.203375, "grad_norm": 3.5947937965393066, "grad_norm_var": 0.4660347673224576, "learning_rate": 0.0001, "loss": 1.5219, "loss/crossentropy": 2.79641056060791, "loss/hidden": 1.265625, "loss/logits": 0.25510913133621216, "loss/reg": 0.00011216471466468647, "step": 1627 }, { "epoch": 0.2035, "grad_norm": 2.5547397136688232, "grad_norm_var": 0.3414835708850878, "learning_rate": 0.0001, "loss": 1.2413, "loss/crossentropy": 2.3658392429351807, "loss/hidden": 1.078125, "loss/logits": 0.16203923523426056, "loss/reg": 0.00011210514639969915, "step": 1628 }, { "epoch": 0.203625, "grad_norm": 2.5729472637176514, "grad_norm_var": 0.337139477412313, "learning_rate": 0.0001, "loss": 1.475, "loss/crossentropy": 2.540390729904175, "loss/hidden": 1.2578125, "loss/logits": 0.216062992811203, "loss/reg": 0.00011204430484212935, "step": 1629 }, { "epoch": 0.20375, "grad_norm": 2.2355432510375977, "grad_norm_var": 0.34422834248829093, "learning_rate": 0.0001, "loss": 1.3391, "loss/crossentropy": 2.5381197929382324, "loss/hidden": 1.125, "loss/logits": 0.21300330758094788, "loss/reg": 0.00011199909931747243, "step": 1630 }, { "epoch": 0.203875, "grad_norm": 2.4133527278900146, "grad_norm_var": 0.3398789907531551, "learning_rate": 0.0001, "loss": 1.2994, "loss/crossentropy": 2.542715072631836, "loss/hidden": 1.1171875, "loss/logits": 0.18106934428215027, "loss/reg": 0.00011195150727871805, "step": 1631 }, { "epoch": 0.204, "grad_norm": 3.147031784057617, "grad_norm_var": 0.11921879844417163, "learning_rate": 0.0001, "loss": 1.329, "loss/crossentropy": 2.598322868347168, "loss/hidden": 1.140625, "loss/logits": 0.18727406859397888, "loss/reg": 0.00011191445810254663, "step": 1632 }, { "epoch": 0.204125, "grad_norm": 2.2079691886901855, "grad_norm_var": 0.12598993880634807, "learning_rate": 0.0001, "loss": 1.3776, "loss/crossentropy": 2.605336904525757, "loss/hidden": 1.15625, "loss/logits": 0.22023460268974304, "loss/reg": 0.00011187767086084932, "step": 1633 }, { "epoch": 0.20425, "grad_norm": 3.039386034011841, "grad_norm_var": 0.13765139300498178, "learning_rate": 0.0001, "loss": 1.5335, "loss/crossentropy": 2.668926239013672, "loss/hidden": 1.296875, "loss/logits": 0.2355237603187561, "loss/reg": 0.00011183385504409671, "step": 1634 }, { "epoch": 0.204375, "grad_norm": 2.7993054389953613, "grad_norm_var": 0.1328282648493996, "learning_rate": 0.0001, "loss": 1.4245, "loss/crossentropy": 2.465437650680542, "loss/hidden": 1.2109375, "loss/logits": 0.21241667866706848, "loss/reg": 0.00011179331340827048, "step": 1635 }, { "epoch": 0.2045, "grad_norm": 2.3275532722473145, "grad_norm_var": 0.13909201558219592, "learning_rate": 0.0001, "loss": 1.2738, "loss/crossentropy": 2.5804219245910645, "loss/hidden": 1.0859375, "loss/logits": 0.18669646978378296, "loss/reg": 0.00011175425606779754, "step": 1636 }, { "epoch": 0.204625, "grad_norm": 2.249389410018921, "grad_norm_var": 0.1478601367587276, "learning_rate": 0.0001, "loss": 1.2459, "loss/crossentropy": 2.4399354457855225, "loss/hidden": 1.0703125, "loss/logits": 0.1745118796825409, "loss/reg": 0.00011171126243425533, "step": 1637 }, { "epoch": 0.20475, "grad_norm": 3.0658020973205566, "grad_norm_var": 0.16115321584507736, "learning_rate": 0.0001, "loss": 1.3188, "loss/crossentropy": 2.5174713134765625, "loss/hidden": 1.15625, "loss/logits": 0.1614057421684265, "loss/reg": 0.00011167607590323314, "step": 1638 }, { "epoch": 0.204875, "grad_norm": 3.457867383956909, "grad_norm_var": 0.2041812937125845, "learning_rate": 0.0001, "loss": 1.3348, "loss/crossentropy": 2.500462055206299, "loss/hidden": 1.140625, "loss/logits": 0.1930893361568451, "loss/reg": 0.00011163955059600994, "step": 1639 }, { "epoch": 0.205, "grad_norm": 2.89260196685791, "grad_norm_var": 0.1923153010852102, "learning_rate": 0.0001, "loss": 1.451, "loss/crossentropy": 2.574960470199585, "loss/hidden": 1.234375, "loss/logits": 0.21554163098335266, "loss/reg": 0.00011161128350067884, "step": 1640 }, { "epoch": 0.205125, "grad_norm": 2.4591424465179443, "grad_norm_var": 0.1945531294072519, "learning_rate": 0.0001, "loss": 1.2595, "loss/crossentropy": 2.590061902999878, "loss/hidden": 1.0859375, "loss/logits": 0.1723993420600891, "loss/reg": 0.0001115696577471681, "step": 1641 }, { "epoch": 0.20525, "grad_norm": 2.8086323738098145, "grad_norm_var": 0.18900763256934638, "learning_rate": 0.0001, "loss": 1.5064, "loss/crossentropy": 2.5853464603424072, "loss/hidden": 1.28125, "loss/logits": 0.22407478094100952, "loss/reg": 0.00011152681690873578, "step": 1642 }, { "epoch": 0.205375, "grad_norm": 2.346402645111084, "grad_norm_var": 0.143985352810216, "learning_rate": 0.0001, "loss": 1.2622, "loss/crossentropy": 2.4879636764526367, "loss/hidden": 1.078125, "loss/logits": 0.18297691643238068, "loss/reg": 0.00011149232886964455, "step": 1643 }, { "epoch": 0.2055, "grad_norm": 2.233002185821533, "grad_norm_var": 0.15501790165435428, "learning_rate": 0.0001, "loss": 1.2806, "loss/crossentropy": 2.432508945465088, "loss/hidden": 1.1015625, "loss/logits": 0.17792685329914093, "loss/reg": 0.00011145360622322187, "step": 1644 }, { "epoch": 0.205625, "grad_norm": 2.5552756786346436, "grad_norm_var": 0.15519775570459443, "learning_rate": 0.0001, "loss": 1.3389, "loss/crossentropy": 2.701528549194336, "loss/hidden": 1.140625, "loss/logits": 0.19717130064964294, "loss/reg": 0.00011141985305584967, "step": 1645 }, { "epoch": 0.20575, "grad_norm": 2.407944679260254, "grad_norm_var": 0.14776070985217965, "learning_rate": 0.0001, "loss": 1.2399, "loss/crossentropy": 2.442035675048828, "loss/hidden": 1.0625, "loss/logits": 0.17623703181743622, "loss/reg": 0.00011136785906273872, "step": 1646 }, { "epoch": 0.205875, "grad_norm": 2.563610792160034, "grad_norm_var": 0.14441736751856052, "learning_rate": 0.0001, "loss": 1.1597, "loss/crossentropy": 2.61016583442688, "loss/hidden": 1.0, "loss/logits": 0.15855887532234192, "loss/reg": 0.00011132592771900818, "step": 1647 }, { "epoch": 0.206, "grad_norm": 3.9542369842529297, "grad_norm_var": 0.23755290645463656, "learning_rate": 0.0001, "loss": 1.3873, "loss/crossentropy": 2.562901258468628, "loss/hidden": 1.171875, "loss/logits": 0.2143251895904541, "loss/reg": 0.00011128820915473625, "step": 1648 }, { "epoch": 0.206125, "grad_norm": 2.4580447673797607, "grad_norm_var": 0.22470517207199972, "learning_rate": 0.0001, "loss": 1.3397, "loss/crossentropy": 2.5684804916381836, "loss/hidden": 1.140625, "loss/logits": 0.19793745875358582, "loss/reg": 0.00011123935109935701, "step": 1649 }, { "epoch": 0.20625, "grad_norm": 2.4187676906585693, "grad_norm_var": 0.22285706987883386, "learning_rate": 0.0001, "loss": 1.341, "loss/crossentropy": 2.5495643615722656, "loss/hidden": 1.140625, "loss/logits": 0.1992706060409546, "loss/reg": 0.00011119980626972392, "step": 1650 }, { "epoch": 0.206375, "grad_norm": 2.7590370178222656, "grad_norm_var": 0.2223573072109879, "learning_rate": 0.0001, "loss": 1.1908, "loss/crossentropy": 2.444124460220337, "loss/hidden": 1.0390625, "loss/logits": 0.1506166309118271, "loss/reg": 0.00011116037057945505, "step": 1651 }, { "epoch": 0.2065, "grad_norm": 7.482737064361572, "grad_norm_var": 1.6377739947823364, "learning_rate": 0.0001, "loss": 1.7528, "loss/crossentropy": 2.3700640201568604, "loss/hidden": 1.4296875, "loss/logits": 0.32198458909988403, "loss/reg": 0.00011111405183328316, "step": 1652 }, { "epoch": 0.206625, "grad_norm": 2.4284467697143555, "grad_norm_var": 1.6216896684358326, "learning_rate": 0.0001, "loss": 1.2504, "loss/crossentropy": 2.689178466796875, "loss/hidden": 1.078125, "loss/logits": 0.1711852252483368, "loss/reg": 0.00011107572208857164, "step": 1653 }, { "epoch": 0.20675, "grad_norm": 2.2003014087677, "grad_norm_var": 1.6630171398757017, "learning_rate": 0.0001, "loss": 1.1739, "loss/crossentropy": 2.5429723262786865, "loss/hidden": 1.015625, "loss/logits": 0.15712036192417145, "loss/reg": 0.00011103981523774564, "step": 1654 }, { "epoch": 0.206875, "grad_norm": 2.1543972492218018, "grad_norm_var": 1.6833968924157297, "learning_rate": 0.0001, "loss": 1.3167, "loss/crossentropy": 2.457568645477295, "loss/hidden": 1.1328125, "loss/logits": 0.18275953829288483, "loss/reg": 0.00011100307165179402, "step": 1655 }, { "epoch": 0.207, "grad_norm": 2.7721645832061768, "grad_norm_var": 1.6841438356629916, "learning_rate": 0.0001, "loss": 1.5163, "loss/crossentropy": 2.5110199451446533, "loss/hidden": 1.265625, "loss/logits": 0.24954883754253387, "loss/reg": 0.00011095251829829067, "step": 1656 }, { "epoch": 0.207125, "grad_norm": 2.179582357406616, "grad_norm_var": 1.704534403096188, "learning_rate": 0.0001, "loss": 1.2048, "loss/crossentropy": 2.5216922760009766, "loss/hidden": 1.0546875, "loss/logits": 0.1490367203950882, "loss/reg": 0.00011090424959547818, "step": 1657 }, { "epoch": 0.20725, "grad_norm": 2.802952527999878, "grad_norm_var": 1.7045735497820276, "learning_rate": 0.0001, "loss": 1.3516, "loss/crossentropy": 2.3874335289001465, "loss/hidden": 1.140625, "loss/logits": 0.2099037766456604, "loss/reg": 0.0001108548094634898, "step": 1658 }, { "epoch": 0.207375, "grad_norm": 2.1002986431121826, "grad_norm_var": 1.7251237304534925, "learning_rate": 0.0001, "loss": 1.1078, "loss/crossentropy": 2.384077548980713, "loss/hidden": 0.94921875, "loss/logits": 0.1575223207473755, "loss/reg": 0.0001107970456359908, "step": 1659 }, { "epoch": 0.2075, "grad_norm": 2.8213303089141846, "grad_norm_var": 1.6989906634199785, "learning_rate": 0.0001, "loss": 1.4658, "loss/crossentropy": 2.887385845184326, "loss/hidden": 1.2421875, "loss/logits": 0.2225031703710556, "loss/reg": 0.00011074725625803694, "step": 1660 }, { "epoch": 0.207625, "grad_norm": 2.578094482421875, "grad_norm_var": 1.6980391998074917, "learning_rate": 0.0001, "loss": 1.3008, "loss/crossentropy": 2.343104362487793, "loss/hidden": 1.1171875, "loss/logits": 0.18248461186885834, "loss/reg": 0.00011069087486248463, "step": 1661 }, { "epoch": 0.20775, "grad_norm": 2.040755271911621, "grad_norm_var": 1.7295830740539182, "learning_rate": 0.0001, "loss": 1.1974, "loss/crossentropy": 2.455885171890259, "loss/hidden": 1.0234375, "loss/logits": 0.17284810543060303, "loss/reg": 0.00011063710553571582, "step": 1662 }, { "epoch": 0.207875, "grad_norm": 2.6391217708587646, "grad_norm_var": 1.726983827024882, "learning_rate": 0.0001, "loss": 1.3462, "loss/crossentropy": 2.4627151489257812, "loss/hidden": 1.109375, "loss/logits": 0.2357426881790161, "loss/reg": 0.00011058292875532061, "step": 1663 }, { "epoch": 0.208, "grad_norm": 3.3892858028411865, "grad_norm_var": 1.6646490486753822, "learning_rate": 0.0001, "loss": 1.4722, "loss/crossentropy": 2.5689244270324707, "loss/hidden": 1.2265625, "loss/logits": 0.24456994235515594, "loss/reg": 0.00011052460467908531, "step": 1664 }, { "epoch": 0.208125, "grad_norm": 41.70608901977539, "grad_norm_var": 96.01162619940186, "learning_rate": 0.0001, "loss": 3.7632, "loss/crossentropy": 3.460641860961914, "loss/hidden": 2.421875, "loss/logits": 1.3402395248413086, "loss/reg": 0.00011047819134546444, "step": 1665 }, { "epoch": 0.20825, "grad_norm": 2.8536791801452637, "grad_norm_var": 95.85755430020885, "learning_rate": 0.0001, "loss": 1.3287, "loss/crossentropy": 2.8530144691467285, "loss/hidden": 1.1328125, "loss/logits": 0.19480541348457336, "loss/reg": 0.00011041966354241595, "step": 1666 }, { "epoch": 0.208375, "grad_norm": 2.516788959503174, "grad_norm_var": 95.94351307960362, "learning_rate": 0.0001, "loss": 1.2264, "loss/crossentropy": 2.6264588832855225, "loss/hidden": 1.0546875, "loss/logits": 0.17059212923049927, "loss/reg": 0.00011035893840016797, "step": 1667 }, { "epoch": 0.2085, "grad_norm": 2.9335944652557373, "grad_norm_var": 95.90790852751213, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.4964194297790527, "loss/hidden": 1.2109375, "loss/logits": 0.19362276792526245, "loss/reg": 0.00011033112969016656, "step": 1668 }, { "epoch": 0.208625, "grad_norm": 2.3998830318450928, "grad_norm_var": 95.91778109852285, "learning_rate": 0.0001, "loss": 1.3396, "loss/crossentropy": 2.557877779006958, "loss/hidden": 1.140625, "loss/logits": 0.1978691667318344, "loss/reg": 0.00011028131848433986, "step": 1669 }, { "epoch": 0.20875, "grad_norm": 2.858804225921631, "grad_norm_var": 95.69858348016824, "learning_rate": 0.0001, "loss": 1.4352, "loss/crossentropy": 2.4914534091949463, "loss/hidden": 1.234375, "loss/logits": 0.19969812035560608, "loss/reg": 0.00011023021943401545, "step": 1670 }, { "epoch": 0.208875, "grad_norm": 2.223767042160034, "grad_norm_var": 95.67213266687155, "learning_rate": 0.0001, "loss": 1.0797, "loss/crossentropy": 2.4683454036712646, "loss/hidden": 0.9453125, "loss/logits": 0.13331161439418793, "loss/reg": 0.00011017613724106923, "step": 1671 }, { "epoch": 0.209, "grad_norm": 4.1252546310424805, "grad_norm_var": 95.37542952820918, "learning_rate": 0.0001, "loss": 1.4629, "loss/crossentropy": 2.485999584197998, "loss/hidden": 1.25, "loss/logits": 0.21180002391338348, "loss/reg": 0.00011013224138878286, "step": 1672 }, { "epoch": 0.209125, "grad_norm": 2.279583692550659, "grad_norm_var": 95.33664071533872, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.5324227809906006, "loss/hidden": 1.078125, "loss/logits": 0.16539137065410614, "loss/reg": 0.00011008464935002849, "step": 1673 }, { "epoch": 0.20925, "grad_norm": 2.79790997505188, "grad_norm_var": 95.33821482647477, "learning_rate": 0.0001, "loss": 1.3876, "loss/crossentropy": 2.662733793258667, "loss/hidden": 1.1875, "loss/logits": 0.19899530708789825, "loss/reg": 0.000110039567516651, "step": 1674 }, { "epoch": 0.209375, "grad_norm": 2.2751238346099854, "grad_norm_var": 95.26923423528484, "learning_rate": 0.0001, "loss": 1.2867, "loss/crossentropy": 2.567803382873535, "loss/hidden": 1.1015625, "loss/logits": 0.18400409817695618, "loss/reg": 0.00010999004007317126, "step": 1675 }, { "epoch": 0.2095, "grad_norm": 2.525566816329956, "grad_norm_var": 95.36662917051399, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.7200751304626465, "loss/hidden": 1.2421875, "loss/logits": 0.23828065395355225, "loss/reg": 0.00010994432523148134, "step": 1676 }, { "epoch": 0.209625, "grad_norm": 2.87634015083313, "grad_norm_var": 95.27055194312925, "learning_rate": 0.0001, "loss": 1.4779, "loss/crossentropy": 2.4793484210968018, "loss/hidden": 1.265625, "loss/logits": 0.2111985683441162, "loss/reg": 0.00010990487498929724, "step": 1677 }, { "epoch": 0.20975, "grad_norm": 2.4725472927093506, "grad_norm_var": 95.10304893939114, "learning_rate": 0.0001, "loss": 1.3005, "loss/crossentropy": 2.286247491836548, "loss/hidden": 1.125, "loss/logits": 0.17438845336437225, "loss/reg": 0.00010985445260303095, "step": 1678 }, { "epoch": 0.209875, "grad_norm": 2.4872000217437744, "grad_norm_var": 95.15595164319376, "learning_rate": 0.0001, "loss": 1.2777, "loss/crossentropy": 2.630589485168457, "loss/hidden": 1.1171875, "loss/logits": 0.15936625003814697, "loss/reg": 0.0001098025095416233, "step": 1679 }, { "epoch": 0.21, "grad_norm": 2.5033185482025146, "grad_norm_var": 95.41537466608287, "learning_rate": 0.0001, "loss": 1.0448, "loss/crossentropy": 2.584707498550415, "loss/hidden": 0.91015625, "loss/logits": 0.13357345759868622, "loss/reg": 0.00010976781777571887, "step": 1680 }, { "epoch": 0.210125, "grad_norm": 2.3200080394744873, "grad_norm_var": 0.2105631331709361, "learning_rate": 0.0001, "loss": 1.3514, "loss/crossentropy": 2.4606332778930664, "loss/hidden": 1.1484375, "loss/logits": 0.2018405795097351, "loss/reg": 0.00010973077587550506, "step": 1681 }, { "epoch": 0.21025, "grad_norm": 2.352738857269287, "grad_norm_var": 0.212848904856142, "learning_rate": 0.0001, "loss": 1.249, "loss/crossentropy": 2.3881287574768066, "loss/hidden": 1.078125, "loss/logits": 0.16979584097862244, "loss/reg": 0.00010969497816404328, "step": 1682 }, { "epoch": 0.210375, "grad_norm": 1.8870757818222046, "grad_norm_var": 0.24644754041125758, "learning_rate": 0.0001, "loss": 1.2449, "loss/crossentropy": 2.3981943130493164, "loss/hidden": 1.078125, "loss/logits": 0.16571944952011108, "loss/reg": 0.00010965076216962188, "step": 1683 }, { "epoch": 0.2105, "grad_norm": 2.7646870613098145, "grad_norm_var": 0.24032184666341336, "learning_rate": 0.0001, "loss": 1.3438, "loss/crossentropy": 2.618183135986328, "loss/hidden": 1.1484375, "loss/logits": 0.19428318738937378, "loss/reg": 0.00010959783685393631, "step": 1684 }, { "epoch": 0.210625, "grad_norm": 3.2599589824676514, "grad_norm_var": 0.2668328932481055, "learning_rate": 0.0001, "loss": 1.4152, "loss/crossentropy": 2.632329225540161, "loss/hidden": 1.203125, "loss/logits": 0.2109876424074173, "loss/reg": 0.00010954913886962458, "step": 1685 }, { "epoch": 0.21075, "grad_norm": 2.2210700511932373, "grad_norm_var": 0.27242382186249986, "learning_rate": 0.0001, "loss": 1.2261, "loss/crossentropy": 2.6125760078430176, "loss/hidden": 1.0546875, "loss/logits": 0.1702938675880432, "loss/reg": 0.00010950414434773847, "step": 1686 }, { "epoch": 0.210875, "grad_norm": 2.096109628677368, "grad_norm_var": 0.279603815963487, "learning_rate": 0.0001, "loss": 1.1785, "loss/crossentropy": 2.5393664836883545, "loss/hidden": 1.015625, "loss/logits": 0.16178768873214722, "loss/reg": 0.00010946387192234397, "step": 1687 }, { "epoch": 0.211, "grad_norm": 2.4515318870544434, "grad_norm_var": 0.1093491099329265, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.3360347747802734, "loss/hidden": 1.171875, "loss/logits": 0.20428118109703064, "loss/reg": 0.00010943051165668294, "step": 1688 }, { "epoch": 0.211125, "grad_norm": 2.180295705795288, "grad_norm_var": 0.11252805596727876, "learning_rate": 0.0001, "loss": 1.3498, "loss/crossentropy": 2.5154497623443604, "loss/hidden": 1.1640625, "loss/logits": 0.1846066117286682, "loss/reg": 0.00010938234481727704, "step": 1689 }, { "epoch": 0.21125, "grad_norm": 2.7476320266723633, "grad_norm_var": 0.1104675012533031, "learning_rate": 0.0001, "loss": 1.3704, "loss/crossentropy": 2.812041997909546, "loss/hidden": 1.171875, "loss/logits": 0.19742317497730255, "loss/reg": 0.00010933752491837367, "step": 1690 }, { "epoch": 0.211375, "grad_norm": 4.109058856964111, "grad_norm_var": 0.27453269364514715, "learning_rate": 0.0001, "loss": 1.7929, "loss/crossentropy": 2.410374164581299, "loss/hidden": 1.5390625, "loss/logits": 0.2527104616165161, "loss/reg": 0.00010929087875410914, "step": 1691 }, { "epoch": 0.2115, "grad_norm": 2.9683115482330322, "grad_norm_var": 0.2836625137311993, "learning_rate": 0.0001, "loss": 1.4409, "loss/crossentropy": 2.245936393737793, "loss/hidden": 1.21875, "loss/logits": 0.22108203172683716, "loss/reg": 0.00010923654190264642, "step": 1692 }, { "epoch": 0.211625, "grad_norm": 2.751962661743164, "grad_norm_var": 0.28014809634650956, "learning_rate": 0.0001, "loss": 1.4054, "loss/crossentropy": 2.2262914180755615, "loss/hidden": 1.21875, "loss/logits": 0.18555352091789246, "loss/reg": 0.00010919060150627047, "step": 1693 }, { "epoch": 0.21175, "grad_norm": 9.967276573181152, "grad_norm_var": 3.6651250466713803, "learning_rate": 0.0001, "loss": 1.6523, "loss/crossentropy": 2.260791540145874, "loss/hidden": 1.4453125, "loss/logits": 0.20587801933288574, "loss/reg": 0.00010913325968431309, "step": 1694 }, { "epoch": 0.211875, "grad_norm": 2.7172865867614746, "grad_norm_var": 3.650653777768873, "learning_rate": 0.0001, "loss": 1.332, "loss/crossentropy": 2.7124171257019043, "loss/hidden": 1.1328125, "loss/logits": 0.19809767603874207, "loss/reg": 0.00010908351396210492, "step": 1695 }, { "epoch": 0.212, "grad_norm": 2.614694118499756, "grad_norm_var": 3.642848290779846, "learning_rate": 0.0001, "loss": 1.3248, "loss/crossentropy": 2.441655397415161, "loss/hidden": 1.140625, "loss/logits": 0.18307194113731384, "loss/reg": 0.00010902514623012394, "step": 1696 }, { "epoch": 0.212125, "grad_norm": 4.637229919433594, "grad_norm_var": 3.7411292859528675, "learning_rate": 0.0001, "loss": 1.7092, "loss/crossentropy": 2.4005911350250244, "loss/hidden": 1.359375, "loss/logits": 0.3487456440925598, "loss/reg": 0.00010897710308199748, "step": 1697 }, { "epoch": 0.21225, "grad_norm": 12.33137035369873, "grad_norm_var": 8.793363440078272, "learning_rate": 0.0001, "loss": 1.4774, "loss/crossentropy": 2.4397168159484863, "loss/hidden": 1.2734375, "loss/logits": 0.2029198557138443, "loss/reg": 0.00010893220314756036, "step": 1698 }, { "epoch": 0.212375, "grad_norm": 2.9841771125793457, "grad_norm_var": 8.58048848728944, "learning_rate": 0.0001, "loss": 1.4087, "loss/crossentropy": 2.7524659633636475, "loss/hidden": 1.1875, "loss/logits": 0.22008132934570312, "loss/reg": 0.00010888621909543872, "step": 1699 }, { "epoch": 0.2125, "grad_norm": 2.638474464416504, "grad_norm_var": 8.601013026837087, "learning_rate": 0.0001, "loss": 1.6042, "loss/crossentropy": 2.215057849884033, "loss/hidden": 1.359375, "loss/logits": 0.24374061822891235, "loss/reg": 0.00010884197399718687, "step": 1700 }, { "epoch": 0.212625, "grad_norm": 3.7530136108398438, "grad_norm_var": 8.57299442905108, "learning_rate": 0.0001, "loss": 1.2677, "loss/crossentropy": 2.809962034225464, "loss/hidden": 1.09375, "loss/logits": 0.17281728982925415, "loss/reg": 0.00010879183537326753, "step": 1701 }, { "epoch": 0.21275, "grad_norm": 2.1703004837036133, "grad_norm_var": 8.584846223289562, "learning_rate": 0.0001, "loss": 1.3218, "loss/crossentropy": 2.6305477619171143, "loss/hidden": 1.125, "loss/logits": 0.19571612775325775, "loss/reg": 0.000108753090898972, "step": 1702 }, { "epoch": 0.212875, "grad_norm": 3.5113539695739746, "grad_norm_var": 8.361159319946493, "learning_rate": 0.0001, "loss": 1.5757, "loss/crossentropy": 2.4285690784454346, "loss/hidden": 1.3359375, "loss/logits": 0.2386600822210312, "loss/reg": 0.00010871334961848333, "step": 1703 }, { "epoch": 0.213, "grad_norm": 2.3448433876037598, "grad_norm_var": 8.384372624489822, "learning_rate": 0.0001, "loss": 1.2473, "loss/crossentropy": 2.7019357681274414, "loss/hidden": 1.0859375, "loss/logits": 0.16030922532081604, "loss/reg": 0.00010866751108551398, "step": 1704 }, { "epoch": 0.213125, "grad_norm": 2.35117244720459, "grad_norm_var": 8.344129764188793, "learning_rate": 0.0001, "loss": 1.2196, "loss/crossentropy": 2.3574259281158447, "loss/hidden": 1.0546875, "loss/logits": 0.16384953260421753, "loss/reg": 0.00010863286297535524, "step": 1705 }, { "epoch": 0.21325, "grad_norm": 3.560102939605713, "grad_norm_var": 8.245668351672933, "learning_rate": 0.0001, "loss": 1.6964, "loss/crossentropy": 2.510143756866455, "loss/hidden": 1.421875, "loss/logits": 0.2734162211418152, "loss/reg": 0.00010858573659788817, "step": 1706 }, { "epoch": 0.213375, "grad_norm": 2.139786958694458, "grad_norm_var": 8.48255906841504, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.3996312618255615, "loss/hidden": 0.97265625, "loss/logits": 0.16440927982330322, "loss/reg": 0.00010854018182726577, "step": 1707 }, { "epoch": 0.2135, "grad_norm": 2.5312178134918213, "grad_norm_var": 8.552590865388114, "learning_rate": 0.0001, "loss": 1.3418, "loss/crossentropy": 2.5410661697387695, "loss/hidden": 1.125, "loss/logits": 0.21571648120880127, "loss/reg": 0.00010849641694221646, "step": 1708 }, { "epoch": 0.213625, "grad_norm": 2.2579665184020996, "grad_norm_var": 8.645947211155674, "learning_rate": 0.0001, "loss": 1.0775, "loss/crossentropy": 2.5572733879089355, "loss/hidden": 0.93359375, "loss/logits": 0.14278024435043335, "loss/reg": 0.00010845089855138212, "step": 1709 }, { "epoch": 0.21375, "grad_norm": 2.7395756244659424, "grad_norm_var": 6.070572761054658, "learning_rate": 0.0001, "loss": 1.5432, "loss/crossentropy": 2.4370927810668945, "loss/hidden": 1.3046875, "loss/logits": 0.23737972974777222, "loss/reg": 0.00010840956383617595, "step": 1710 }, { "epoch": 0.213875, "grad_norm": 2.8249099254608154, "grad_norm_var": 6.060708359299905, "learning_rate": 0.0001, "loss": 1.3841, "loss/crossentropy": 2.4933722019195557, "loss/hidden": 1.1875, "loss/logits": 0.19554808735847473, "loss/reg": 0.00010836165893124416, "step": 1711 }, { "epoch": 0.214, "grad_norm": 2.667609214782715, "grad_norm_var": 6.0549061217479565, "learning_rate": 0.0001, "loss": 1.2844, "loss/crossentropy": 2.787307024002075, "loss/hidden": 1.0859375, "loss/logits": 0.19740629196166992, "loss/reg": 0.00010830125393113121, "step": 1712 }, { "epoch": 0.214125, "grad_norm": 3.383188247680664, "grad_norm_var": 5.957223993843578, "learning_rate": 0.0001, "loss": 1.5493, "loss/crossentropy": 2.1179311275482178, "loss/hidden": 1.34375, "loss/logits": 0.20449507236480713, "loss/reg": 0.00010825315257534385, "step": 1713 }, { "epoch": 0.21425, "grad_norm": 2.5583598613739014, "grad_norm_var": 0.27134481029386825, "learning_rate": 0.0001, "loss": 1.3314, "loss/crossentropy": 2.402310371398926, "loss/hidden": 1.140625, "loss/logits": 0.1896945834159851, "loss/reg": 0.0001082055241568014, "step": 1714 }, { "epoch": 0.214375, "grad_norm": 3.098884105682373, "grad_norm_var": 0.2753510320071198, "learning_rate": 0.0001, "loss": 1.2997, "loss/crossentropy": 2.58268141746521, "loss/hidden": 1.1171875, "loss/logits": 0.1813991665840149, "loss/reg": 0.00010815304995048791, "step": 1715 }, { "epoch": 0.2145, "grad_norm": 2.4488911628723145, "grad_norm_var": 0.2812550397715671, "learning_rate": 0.0001, "loss": 1.1564, "loss/crossentropy": 2.533371686935425, "loss/hidden": 1.0078125, "loss/logits": 0.14750328660011292, "loss/reg": 0.00010810411185957491, "step": 1716 }, { "epoch": 0.214625, "grad_norm": 2.460645914077759, "grad_norm_var": 0.21648282250799392, "learning_rate": 0.0001, "loss": 1.3206, "loss/crossentropy": 2.5660560131073, "loss/hidden": 1.1484375, "loss/logits": 0.17107239365577698, "loss/reg": 0.00010805453348439187, "step": 1717 }, { "epoch": 0.21475, "grad_norm": 25.326862335205078, "grad_norm_var": 32.124336453097776, "learning_rate": 0.0001, "loss": 1.2991, "loss/crossentropy": 2.850249767303467, "loss/hidden": 1.1015625, "loss/logits": 0.1965024620294571, "loss/reg": 0.00010800695599755272, "step": 1718 }, { "epoch": 0.214875, "grad_norm": 2.690297842025757, "grad_norm_var": 32.235053325731485, "learning_rate": 0.0001, "loss": 1.397, "loss/crossentropy": 2.6676886081695557, "loss/hidden": 1.1875, "loss/logits": 0.20843768119812012, "loss/reg": 0.00010795376147143543, "step": 1719 }, { "epoch": 0.215, "grad_norm": 2.1675710678100586, "grad_norm_var": 32.27818421664067, "learning_rate": 0.0001, "loss": 1.3116, "loss/crossentropy": 2.602102756500244, "loss/hidden": 1.125, "loss/logits": 0.18547466397285461, "loss/reg": 0.00010790182568598539, "step": 1720 }, { "epoch": 0.215125, "grad_norm": 2.504962921142578, "grad_norm_var": 32.24430564358637, "learning_rate": 0.0001, "loss": 1.2649, "loss/crossentropy": 2.5871379375457764, "loss/hidden": 1.09375, "loss/logits": 0.1701194941997528, "loss/reg": 0.0001078498171409592, "step": 1721 }, { "epoch": 0.21525, "grad_norm": 3.6672990322113037, "grad_norm_var": 32.23752083241161, "learning_rate": 0.0001, "loss": 1.6673, "loss/crossentropy": 2.4313607215881348, "loss/hidden": 1.3828125, "loss/logits": 0.2833651602268219, "loss/reg": 0.00010779568401630968, "step": 1722 }, { "epoch": 0.215375, "grad_norm": 2.3093271255493164, "grad_norm_var": 32.195192465213786, "learning_rate": 0.0001, "loss": 1.2286, "loss/crossentropy": 2.5574374198913574, "loss/hidden": 1.046875, "loss/logits": 0.18068531155586243, "loss/reg": 0.00010774154361570254, "step": 1723 }, { "epoch": 0.2155, "grad_norm": 2.4022529125213623, "grad_norm_var": 32.22324804910634, "learning_rate": 0.0001, "loss": 1.2208, "loss/crossentropy": 2.3605575561523438, "loss/hidden": 1.0390625, "loss/logits": 0.18068303167819977, "loss/reg": 0.00010769553773570806, "step": 1724 }, { "epoch": 0.215625, "grad_norm": 3.056169271469116, "grad_norm_var": 32.06763430587257, "learning_rate": 0.0001, "loss": 1.4906, "loss/crossentropy": 2.630885124206543, "loss/hidden": 1.28125, "loss/logits": 0.2082456350326538, "loss/reg": 0.00010764100443338975, "step": 1725 }, { "epoch": 0.21575, "grad_norm": 2.6524758338928223, "grad_norm_var": 32.08442050050846, "learning_rate": 0.0001, "loss": 1.5044, "loss/crossentropy": 2.5272116661071777, "loss/hidden": 1.28125, "loss/logits": 0.22204947471618652, "loss/reg": 0.00010759227734524757, "step": 1726 }, { "epoch": 0.215875, "grad_norm": 2.092261791229248, "grad_norm_var": 32.24631137135366, "learning_rate": 0.0001, "loss": 1.312, "loss/crossentropy": 2.71053147315979, "loss/hidden": 1.125, "loss/logits": 0.1859012246131897, "loss/reg": 0.00010754590039141476, "step": 1727 }, { "epoch": 0.216, "grad_norm": 2.8914434909820557, "grad_norm_var": 32.20690431341523, "learning_rate": 0.0001, "loss": 1.3532, "loss/crossentropy": 2.653993606567383, "loss/hidden": 1.15625, "loss/logits": 0.19590504467487335, "loss/reg": 0.00010750328510766849, "step": 1728 }, { "epoch": 0.216125, "grad_norm": 3.938722848892212, "grad_norm_var": 32.17258444384326, "learning_rate": 0.0001, "loss": 1.5115, "loss/crossentropy": 2.7737607955932617, "loss/hidden": 1.2734375, "loss/logits": 0.23701538145542145, "loss/reg": 0.00010746272891992703, "step": 1729 }, { "epoch": 0.21625, "grad_norm": 3.2468061447143555, "grad_norm_var": 32.05687198394092, "learning_rate": 0.0001, "loss": 1.6218, "loss/crossentropy": 2.333163261413574, "loss/hidden": 1.3515625, "loss/logits": 0.26916706562042236, "loss/reg": 0.00010742261656560004, "step": 1730 }, { "epoch": 0.216375, "grad_norm": 2.4073538780212402, "grad_norm_var": 32.186875084297824, "learning_rate": 0.0001, "loss": 1.2337, "loss/crossentropy": 2.4928300380706787, "loss/hidden": 1.0546875, "loss/logits": 0.1779213845729828, "loss/reg": 0.00010737736738519743, "step": 1731 }, { "epoch": 0.2165, "grad_norm": 2.5807344913482666, "grad_norm_var": 32.158207664245595, "learning_rate": 0.0001, "loss": 1.2152, "loss/crossentropy": 2.9008967876434326, "loss/hidden": 1.0390625, "loss/logits": 0.17502188682556152, "loss/reg": 0.00010733335511758924, "step": 1732 }, { "epoch": 0.216625, "grad_norm": 2.4438059329986572, "grad_norm_var": 32.16201787164736, "learning_rate": 0.0001, "loss": 1.3343, "loss/crossentropy": 2.4517171382904053, "loss/hidden": 1.140625, "loss/logits": 0.19260676205158234, "loss/reg": 0.00010729050700319931, "step": 1733 }, { "epoch": 0.21675, "grad_norm": 2.3218982219696045, "grad_norm_var": 0.27824576097423587, "learning_rate": 0.0001, "loss": 1.2627, "loss/crossentropy": 2.8024637699127197, "loss/hidden": 1.0859375, "loss/logits": 0.17568574845790863, "loss/reg": 0.0001072497689165175, "step": 1734 }, { "epoch": 0.216875, "grad_norm": 2.12477707862854, "grad_norm_var": 0.29978278538712266, "learning_rate": 0.0001, "loss": 1.4775, "loss/crossentropy": 2.416490316390991, "loss/hidden": 1.203125, "loss/logits": 0.27327853441238403, "loss/reg": 0.00010720891441451386, "step": 1735 }, { "epoch": 0.217, "grad_norm": 2.1573688983917236, "grad_norm_var": 0.30048020919377527, "learning_rate": 0.0001, "loss": 1.2216, "loss/crossentropy": 2.590989112854004, "loss/hidden": 1.0546875, "loss/logits": 0.16587519645690918, "loss/reg": 0.00010715733515098691, "step": 1736 }, { "epoch": 0.217125, "grad_norm": 2.5584542751312256, "grad_norm_var": 0.2994473499973175, "learning_rate": 0.0001, "loss": 1.2038, "loss/crossentropy": 2.6309661865234375, "loss/hidden": 1.03125, "loss/logits": 0.17148445546627045, "loss/reg": 0.00010710606875363737, "step": 1737 }, { "epoch": 0.21725, "grad_norm": 2.129730701446533, "grad_norm_var": 0.24442968525758318, "learning_rate": 0.0001, "loss": 1.1505, "loss/crossentropy": 2.4888367652893066, "loss/hidden": 1.0, "loss/logits": 0.14942702651023865, "loss/reg": 0.00010706311149988323, "step": 1738 }, { "epoch": 0.217375, "grad_norm": 2.2836310863494873, "grad_norm_var": 0.24540550716047632, "learning_rate": 0.0001, "loss": 1.2546, "loss/crossentropy": 2.6207454204559326, "loss/hidden": 1.0703125, "loss/logits": 0.18323281407356262, "loss/reg": 0.00010701623978093266, "step": 1739 }, { "epoch": 0.2175, "grad_norm": 2.887248992919922, "grad_norm_var": 0.24858073747826193, "learning_rate": 0.0001, "loss": 1.4756, "loss/crossentropy": 2.6547434329986572, "loss/hidden": 1.25, "loss/logits": 0.22450393438339233, "loss/reg": 0.00010697107791202143, "step": 1740 }, { "epoch": 0.217625, "grad_norm": 3.9517366886138916, "grad_norm_var": 0.351888775336378, "learning_rate": 0.0001, "loss": 1.879, "loss/crossentropy": 2.2204384803771973, "loss/hidden": 1.59375, "loss/logits": 0.28417372703552246, "loss/reg": 0.00010692681826185435, "step": 1741 }, { "epoch": 0.21775, "grad_norm": 2.5325450897216797, "grad_norm_var": 0.35301644174282104, "learning_rate": 0.0001, "loss": 1.4584, "loss/crossentropy": 2.3659422397613525, "loss/hidden": 1.25, "loss/logits": 0.20734064280986786, "loss/reg": 0.00010687608300941065, "step": 1742 }, { "epoch": 0.217875, "grad_norm": 2.4293994903564453, "grad_norm_var": 0.3346317661471744, "learning_rate": 0.0001, "loss": 1.3196, "loss/crossentropy": 2.532431125640869, "loss/hidden": 1.1328125, "loss/logits": 0.18573454022407532, "loss/reg": 0.00010683256550692022, "step": 1743 }, { "epoch": 0.218, "grad_norm": 2.901585578918457, "grad_norm_var": 0.3349236473416186, "learning_rate": 0.0001, "loss": 1.3132, "loss/crossentropy": 2.770111322402954, "loss/hidden": 1.1171875, "loss/logits": 0.1949133723974228, "loss/reg": 0.00010678686521714553, "step": 1744 }, { "epoch": 0.218125, "grad_norm": 4.848549842834473, "grad_norm_var": 0.5392364338886779, "learning_rate": 0.0001, "loss": 1.1872, "loss/crossentropy": 3.1159143447875977, "loss/hidden": 1.0390625, "loss/logits": 0.14703276753425598, "loss/reg": 0.00010674065561033785, "step": 1745 }, { "epoch": 0.21825, "grad_norm": 2.8269357681274414, "grad_norm_var": 0.521761974301405, "learning_rate": 0.0001, "loss": 1.5073, "loss/crossentropy": 2.622831106185913, "loss/hidden": 1.2734375, "loss/logits": 0.2327616810798645, "loss/reg": 0.00010669450421119109, "step": 1746 }, { "epoch": 0.218375, "grad_norm": 2.957434892654419, "grad_norm_var": 0.5183584105598317, "learning_rate": 0.0001, "loss": 1.418, "loss/crossentropy": 2.3989624977111816, "loss/hidden": 1.203125, "loss/logits": 0.21384263038635254, "loss/reg": 0.0001066496261046268, "step": 1747 }, { "epoch": 0.2185, "grad_norm": 2.228024959564209, "grad_norm_var": 0.533905278438858, "learning_rate": 0.0001, "loss": 1.2596, "loss/crossentropy": 2.6070363521575928, "loss/hidden": 1.09375, "loss/logits": 0.16482186317443848, "loss/reg": 0.00010660703992471099, "step": 1748 }, { "epoch": 0.218625, "grad_norm": 3.0797135829925537, "grad_norm_var": 0.5354265539736806, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.486755847930908, "loss/hidden": 1.1875, "loss/logits": 0.18159964680671692, "loss/reg": 0.00010656205995474011, "step": 1749 }, { "epoch": 0.21875, "grad_norm": 3.0834736824035645, "grad_norm_var": 0.5268153717311347, "learning_rate": 0.0001, "loss": 1.3435, "loss/crossentropy": 2.9270730018615723, "loss/hidden": 1.171875, "loss/logits": 0.1706061214208603, "loss/reg": 0.00010651863703969866, "step": 1750 }, { "epoch": 0.218875, "grad_norm": 4.782742500305176, "grad_norm_var": 0.7250677699536244, "learning_rate": 0.0001, "loss": 2.1383, "loss/crossentropy": 2.5934109687805176, "loss/hidden": 1.7734375, "loss/logits": 0.3637796640396118, "loss/reg": 0.00010646959708537906, "step": 1751 }, { "epoch": 0.219, "grad_norm": 4.486395835876465, "grad_norm_var": 0.8094373214754699, "learning_rate": 0.0001, "loss": 1.5889, "loss/crossentropy": 2.5074267387390137, "loss/hidden": 1.3515625, "loss/logits": 0.23630976676940918, "loss/reg": 0.00010643388668540865, "step": 1752 }, { "epoch": 0.219125, "grad_norm": 3.012922763824463, "grad_norm_var": 0.7881385765961265, "learning_rate": 0.0001, "loss": 1.4438, "loss/crossentropy": 2.4883944988250732, "loss/hidden": 1.2265625, "loss/logits": 0.2161332219839096, "loss/reg": 0.00010638884850777686, "step": 1753 }, { "epoch": 0.21925, "grad_norm": 2.5361666679382324, "grad_norm_var": 0.7430983233668296, "learning_rate": 0.0001, "loss": 1.2527, "loss/crossentropy": 2.8505678176879883, "loss/hidden": 1.0703125, "loss/logits": 0.18133214116096497, "loss/reg": 0.00010634720092639327, "step": 1754 }, { "epoch": 0.219375, "grad_norm": 3.5491437911987305, "grad_norm_var": 0.6924876782051359, "learning_rate": 0.0001, "loss": 1.4709, "loss/crossentropy": 2.4026288986206055, "loss/hidden": 1.2578125, "loss/logits": 0.2120504081249237, "loss/reg": 0.00010630129690980539, "step": 1755 }, { "epoch": 0.2195, "grad_norm": 2.7130327224731445, "grad_norm_var": 0.702947417318656, "learning_rate": 0.0001, "loss": 1.4438, "loss/crossentropy": 2.750009059906006, "loss/hidden": 1.2109375, "loss/logits": 0.23179300129413605, "loss/reg": 0.00010625218419590965, "step": 1756 }, { "epoch": 0.219625, "grad_norm": 3.013866424560547, "grad_norm_var": 0.6695439472292264, "learning_rate": 0.0001, "loss": 1.3467, "loss/crossentropy": 2.8481884002685547, "loss/hidden": 1.171875, "loss/logits": 0.1737177073955536, "loss/reg": 0.00010620335524436086, "step": 1757 }, { "epoch": 0.21975, "grad_norm": 2.9183197021484375, "grad_norm_var": 0.6452147415717491, "learning_rate": 0.0001, "loss": 1.4114, "loss/crossentropy": 2.492070198059082, "loss/hidden": 1.21875, "loss/logits": 0.191619873046875, "loss/reg": 0.00010615569044603035, "step": 1758 }, { "epoch": 0.219875, "grad_norm": 5.9483489990234375, "grad_norm_var": 1.0526740504697727, "learning_rate": 0.0001, "loss": 1.4602, "loss/crossentropy": 2.5709919929504395, "loss/hidden": 1.265625, "loss/logits": 0.19354471564292908, "loss/reg": 0.00010611433390295133, "step": 1759 }, { "epoch": 0.22, "grad_norm": 2.2060651779174805, "grad_norm_var": 1.1319499958763344, "learning_rate": 0.0001, "loss": 1.3273, "loss/crossentropy": 2.6466784477233887, "loss/hidden": 1.140625, "loss/logits": 0.18564406037330627, "loss/reg": 0.0001060725626302883, "step": 1760 }, { "epoch": 0.220125, "grad_norm": 2.482260227203369, "grad_norm_var": 1.0207641981205664, "learning_rate": 0.0001, "loss": 1.284, "loss/crossentropy": 2.619065761566162, "loss/hidden": 1.09375, "loss/logits": 0.18922455608844757, "loss/reg": 0.0001060303402482532, "step": 1761 }, { "epoch": 0.22025, "grad_norm": 2.1800968647003174, "grad_norm_var": 1.0824573597102203, "learning_rate": 0.0001, "loss": 1.241, "loss/crossentropy": 2.4391303062438965, "loss/hidden": 1.078125, "loss/logits": 0.16182076930999756, "loss/reg": 0.00010598796507110819, "step": 1762 }, { "epoch": 0.220375, "grad_norm": 2.6164677143096924, "grad_norm_var": 1.1006886029567717, "learning_rate": 0.0001, "loss": 1.2542, "loss/crossentropy": 2.6189143657684326, "loss/hidden": 1.078125, "loss/logits": 0.17497533559799194, "loss/reg": 0.00010594382183626294, "step": 1763 }, { "epoch": 0.2205, "grad_norm": 2.446408748626709, "grad_norm_var": 1.076028043346279, "learning_rate": 0.0001, "loss": 1.2698, "loss/crossentropy": 2.4670791625976562, "loss/hidden": 1.0859375, "loss/logits": 0.18278223276138306, "loss/reg": 0.00010589415614958853, "step": 1764 }, { "epoch": 0.220625, "grad_norm": 5.594709396362305, "grad_norm_var": 1.4340473491506374, "learning_rate": 0.0001, "loss": 1.2026, "loss/crossentropy": 2.7168948650360107, "loss/hidden": 1.046875, "loss/logits": 0.1546253263950348, "loss/reg": 0.00010584336996544152, "step": 1765 }, { "epoch": 0.22075, "grad_norm": 2.432340145111084, "grad_norm_var": 1.4835245114202016, "learning_rate": 0.0001, "loss": 1.2074, "loss/crossentropy": 2.686396360397339, "loss/hidden": 1.046875, "loss/logits": 0.15943613648414612, "loss/reg": 0.0001057912886608392, "step": 1766 }, { "epoch": 0.220875, "grad_norm": 2.752319812774658, "grad_norm_var": 1.3417938646880876, "learning_rate": 0.0001, "loss": 1.179, "loss/crossentropy": 2.319268226623535, "loss/hidden": 1.03125, "loss/logits": 0.1467100977897644, "loss/reg": 0.00010575036139925942, "step": 1767 }, { "epoch": 0.221, "grad_norm": 2.1969354152679443, "grad_norm_var": 1.2707726040740053, "learning_rate": 0.0001, "loss": 1.136, "loss/crossentropy": 2.8369998931884766, "loss/hidden": 1.0, "loss/logits": 0.13489779829978943, "loss/reg": 0.00010570134327281266, "step": 1768 }, { "epoch": 0.221125, "grad_norm": 2.7919363975524902, "grad_norm_var": 1.2745478579930134, "learning_rate": 0.0001, "loss": 1.3405, "loss/crossentropy": 2.4475255012512207, "loss/hidden": 1.1484375, "loss/logits": 0.19099673628807068, "loss/reg": 0.00010565707634668797, "step": 1769 }, { "epoch": 0.22125, "grad_norm": 2.359417676925659, "grad_norm_var": 1.2879886892848154, "learning_rate": 0.0001, "loss": 1.215, "loss/crossentropy": 2.398890256881714, "loss/hidden": 1.046875, "loss/logits": 0.16706836223602295, "loss/reg": 0.00010560419468674809, "step": 1770 }, { "epoch": 0.221375, "grad_norm": 3.442528247833252, "grad_norm_var": 1.2810719926995071, "learning_rate": 0.0001, "loss": 1.3243, "loss/crossentropy": 2.5889194011688232, "loss/hidden": 1.140625, "loss/logits": 0.18263903260231018, "loss/reg": 0.00010555358312558383, "step": 1771 }, { "epoch": 0.2215, "grad_norm": 5.295337677001953, "grad_norm_var": 1.596990256495034, "learning_rate": 0.0001, "loss": 1.8908, "loss/crossentropy": 2.2776787281036377, "loss/hidden": 1.5546875, "loss/logits": 0.33503007888793945, "loss/reg": 0.00010550576553214341, "step": 1772 }, { "epoch": 0.221625, "grad_norm": 2.8537566661834717, "grad_norm_var": 1.6018686927882884, "learning_rate": 0.0001, "loss": 1.3127, "loss/crossentropy": 2.3155508041381836, "loss/hidden": 1.1328125, "loss/logits": 0.1788116693496704, "loss/reg": 0.00010546801786404103, "step": 1773 }, { "epoch": 0.22175, "grad_norm": 2.827420234680176, "grad_norm_var": 1.6052818766199946, "learning_rate": 0.0001, "loss": 1.1672, "loss/crossentropy": 2.411863088607788, "loss/hidden": 1.015625, "loss/logits": 0.150514617562294, "loss/reg": 0.00010543551616137847, "step": 1774 }, { "epoch": 0.221875, "grad_norm": 1.9824949502944946, "grad_norm_var": 1.1094400939418814, "learning_rate": 0.0001, "loss": 1.2777, "loss/crossentropy": 2.245543956756592, "loss/hidden": 1.1015625, "loss/logits": 0.1750360131263733, "loss/reg": 0.00010538693459238857, "step": 1775 }, { "epoch": 0.222, "grad_norm": 2.7314796447753906, "grad_norm_var": 1.0778152045094558, "learning_rate": 0.0001, "loss": 1.4049, "loss/crossentropy": 2.2997615337371826, "loss/hidden": 1.1953125, "loss/logits": 0.2085481733083725, "loss/reg": 0.00010534859029576182, "step": 1776 }, { "epoch": 0.222125, "grad_norm": 3.557342290878296, "grad_norm_var": 1.0849230136511527, "learning_rate": 0.0001, "loss": 1.5997, "loss/crossentropy": 2.5679614543914795, "loss/hidden": 1.2890625, "loss/logits": 0.30957457423210144, "loss/reg": 0.00010530195868341252, "step": 1777 }, { "epoch": 0.22225, "grad_norm": 2.6340949535369873, "grad_norm_var": 1.047943142678206, "learning_rate": 0.0001, "loss": 1.3979, "loss/crossentropy": 2.5570225715637207, "loss/hidden": 1.171875, "loss/logits": 0.22497732937335968, "loss/reg": 0.00010526271216804162, "step": 1778 }, { "epoch": 0.222375, "grad_norm": 2.9312124252319336, "grad_norm_var": 1.036688603043017, "learning_rate": 0.0001, "loss": 1.2252, "loss/crossentropy": 2.965564727783203, "loss/hidden": 1.0390625, "loss/logits": 0.18505871295928955, "loss/reg": 0.00010521546937525272, "step": 1779 }, { "epoch": 0.2225, "grad_norm": 2.9292547702789307, "grad_norm_var": 1.0122813420464007, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.4473202228546143, "loss/hidden": 1.1640625, "loss/logits": 0.2002868801355362, "loss/reg": 0.0001051818544510752, "step": 1780 }, { "epoch": 0.222625, "grad_norm": 2.1676013469696045, "grad_norm_var": 0.5981878956745593, "learning_rate": 0.0001, "loss": 1.2957, "loss/crossentropy": 2.78108549118042, "loss/hidden": 1.1015625, "loss/logits": 0.19312900304794312, "loss/reg": 0.00010514919995330274, "step": 1781 }, { "epoch": 0.22275, "grad_norm": 2.7551753520965576, "grad_norm_var": 0.5859557603955384, "learning_rate": 0.0001, "loss": 1.2004, "loss/crossentropy": 2.414184331893921, "loss/hidden": 1.0390625, "loss/logits": 0.16027426719665527, "loss/reg": 0.00010511980508454144, "step": 1782 }, { "epoch": 0.222875, "grad_norm": 4.684420585632324, "grad_norm_var": 0.7843111015541504, "learning_rate": 0.0001, "loss": 1.5213, "loss/crossentropy": 2.702359676361084, "loss/hidden": 1.25, "loss/logits": 0.2702234387397766, "loss/reg": 0.00010507923434488475, "step": 1783 }, { "epoch": 0.223, "grad_norm": 3.0283427238464355, "grad_norm_var": 0.7375175085552503, "learning_rate": 0.0001, "loss": 1.3343, "loss/crossentropy": 2.7344627380371094, "loss/hidden": 1.140625, "loss/logits": 0.19262126088142395, "loss/reg": 0.00010503626253921539, "step": 1784 }, { "epoch": 0.223125, "grad_norm": 2.179420232772827, "grad_norm_var": 0.7829187625753585, "learning_rate": 0.0001, "loss": 1.2727, "loss/crossentropy": 2.80378794670105, "loss/hidden": 1.09375, "loss/logits": 0.1778942048549652, "loss/reg": 0.00010499458585400134, "step": 1785 }, { "epoch": 0.22325, "grad_norm": 2.5573503971099854, "grad_norm_var": 0.7678690776000275, "learning_rate": 0.0001, "loss": 1.5436, "loss/crossentropy": 2.5518131256103516, "loss/hidden": 1.265625, "loss/logits": 0.2769346237182617, "loss/reg": 0.00010495285096112639, "step": 1786 }, { "epoch": 0.223375, "grad_norm": 1.9553093910217285, "grad_norm_var": 0.8252623647929581, "learning_rate": 0.0001, "loss": 1.1531, "loss/crossentropy": 2.416489839553833, "loss/hidden": 1.0, "loss/logits": 0.15208527445793152, "loss/reg": 0.00010490811109775677, "step": 1787 }, { "epoch": 0.2235, "grad_norm": 2.066514015197754, "grad_norm_var": 0.46365532464700476, "learning_rate": 0.0001, "loss": 1.1302, "loss/crossentropy": 2.55816912651062, "loss/hidden": 0.97265625, "loss/logits": 0.1564754992723465, "loss/reg": 0.00010486772225704044, "step": 1788 }, { "epoch": 0.223625, "grad_norm": 2.1248667240142822, "grad_norm_var": 0.4858121082796306, "learning_rate": 0.0001, "loss": 1.3433, "loss/crossentropy": 2.2857651710510254, "loss/hidden": 1.140625, "loss/logits": 0.20158007740974426, "loss/reg": 0.00010483019286766648, "step": 1789 }, { "epoch": 0.22375, "grad_norm": 2.6658339500427246, "grad_norm_var": 0.4845806503417184, "learning_rate": 0.0001, "loss": 1.29, "loss/crossentropy": 2.4735755920410156, "loss/hidden": 1.0859375, "loss/logits": 0.20297685265541077, "loss/reg": 0.00010478597687324509, "step": 1790 }, { "epoch": 0.223875, "grad_norm": 2.365380048751831, "grad_norm_var": 0.4579090137834818, "learning_rate": 0.0001, "loss": 1.4283, "loss/crossentropy": 2.4906954765319824, "loss/hidden": 1.21875, "loss/logits": 0.2085450142621994, "loss/reg": 0.00010474467853782699, "step": 1791 }, { "epoch": 0.224, "grad_norm": 2.6895592212677, "grad_norm_var": 0.4578895654010807, "learning_rate": 0.0001, "loss": 1.4949, "loss/crossentropy": 2.631418466567993, "loss/hidden": 1.2890625, "loss/logits": 0.2047949582338333, "loss/reg": 0.00010469168773852289, "step": 1792 }, { "epoch": 0.224125, "grad_norm": 2.1883509159088135, "grad_norm_var": 0.41957648321695823, "learning_rate": 0.0001, "loss": 1.2417, "loss/crossentropy": 2.48760724067688, "loss/hidden": 1.0859375, "loss/logits": 0.15473410487174988, "loss/reg": 0.00010463318903930485, "step": 1793 }, { "epoch": 0.22425, "grad_norm": 3.606192111968994, "grad_norm_var": 0.48044240981690955, "learning_rate": 0.0001, "loss": 1.5114, "loss/crossentropy": 2.104339361190796, "loss/hidden": 1.3046875, "loss/logits": 0.20571112632751465, "loss/reg": 0.0001045740136760287, "step": 1794 }, { "epoch": 0.224375, "grad_norm": 2.88336181640625, "grad_norm_var": 0.47898865447871963, "learning_rate": 0.0001, "loss": 1.2339, "loss/crossentropy": 2.9455947875976562, "loss/hidden": 1.0625, "loss/logits": 0.17031556367874146, "loss/reg": 0.00010453058348502964, "step": 1795 }, { "epoch": 0.2245, "grad_norm": 2.3131027221679688, "grad_norm_var": 0.48206940259985154, "learning_rate": 0.0001, "loss": 1.2737, "loss/crossentropy": 2.6630239486694336, "loss/hidden": 1.078125, "loss/logits": 0.1945057511329651, "loss/reg": 0.00010448443208588287, "step": 1796 }, { "epoch": 0.224625, "grad_norm": 3.4034016132354736, "grad_norm_var": 0.4997757633761987, "learning_rate": 0.0001, "loss": 1.2059, "loss/crossentropy": 2.6262810230255127, "loss/hidden": 1.0390625, "loss/logits": 0.1657547652721405, "loss/reg": 0.00010444205690873787, "step": 1797 }, { "epoch": 0.22475, "grad_norm": 2.6877377033233643, "grad_norm_var": 0.4997136974473165, "learning_rate": 0.0001, "loss": 1.3368, "loss/crossentropy": 2.6557042598724365, "loss/hidden": 1.125, "loss/logits": 0.21079379320144653, "loss/reg": 0.00010438980825711042, "step": 1798 }, { "epoch": 0.224875, "grad_norm": 2.442887544631958, "grad_norm_var": 0.22437708984901003, "learning_rate": 0.0001, "loss": 1.4382, "loss/crossentropy": 2.4040186405181885, "loss/hidden": 1.21875, "loss/logits": 0.21840031445026398, "loss/reg": 0.0001043371157720685, "step": 1799 }, { "epoch": 0.225, "grad_norm": 2.815455675125122, "grad_norm_var": 0.2142663395377478, "learning_rate": 0.0001, "loss": 1.2122, "loss/crossentropy": 2.572850465774536, "loss/hidden": 1.046875, "loss/logits": 0.16424572467803955, "loss/reg": 0.00010429352551000193, "step": 1800 }, { "epoch": 0.225125, "grad_norm": 3.124791383743286, "grad_norm_var": 0.22227271360580497, "learning_rate": 0.0001, "loss": 1.4524, "loss/crossentropy": 2.5623857975006104, "loss/hidden": 1.25, "loss/logits": 0.20132501423358917, "loss/reg": 0.00010425566142657772, "step": 1801 }, { "epoch": 0.22525, "grad_norm": 2.3594796657562256, "grad_norm_var": 0.2263233243007941, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.518688201904297, "loss/hidden": 1.1328125, "loss/logits": 0.1911948323249817, "loss/reg": 0.0001042091753333807, "step": 1802 }, { "epoch": 0.225375, "grad_norm": 2.3640241622924805, "grad_norm_var": 0.20131708695760817, "learning_rate": 0.0001, "loss": 1.2946, "loss/crossentropy": 2.6283557415008545, "loss/hidden": 1.109375, "loss/logits": 0.18418952822685242, "loss/reg": 0.00010416570148663595, "step": 1803 }, { "epoch": 0.2255, "grad_norm": 3.6102724075317383, "grad_norm_var": 0.23401225476206095, "learning_rate": 0.0001, "loss": 1.5205, "loss/crossentropy": 2.584832191467285, "loss/hidden": 1.3046875, "loss/logits": 0.21479302644729614, "loss/reg": 0.0001041177602019161, "step": 1804 }, { "epoch": 0.225625, "grad_norm": 2.2279210090637207, "grad_norm_var": 0.22639145655718734, "learning_rate": 0.0001, "loss": 1.1638, "loss/crossentropy": 2.499499797821045, "loss/hidden": 1.015625, "loss/logits": 0.14717817306518555, "loss/reg": 0.00010407467925688252, "step": 1805 }, { "epoch": 0.22575, "grad_norm": 2.646442174911499, "grad_norm_var": 0.2265918135193753, "learning_rate": 0.0001, "loss": 1.3177, "loss/crossentropy": 2.3247742652893066, "loss/hidden": 1.1328125, "loss/logits": 0.18389290571212769, "loss/reg": 0.0001040282440953888, "step": 1806 }, { "epoch": 0.225875, "grad_norm": 2.671255588531494, "grad_norm_var": 0.2174455923390627, "learning_rate": 0.0001, "loss": 1.4177, "loss/crossentropy": 2.7769968509674072, "loss/hidden": 1.203125, "loss/logits": 0.21350911259651184, "loss/reg": 0.00010398849553894252, "step": 1807 }, { "epoch": 0.226, "grad_norm": 2.480722188949585, "grad_norm_var": 0.2219139493939771, "learning_rate": 0.0001, "loss": 1.2911, "loss/crossentropy": 2.637551784515381, "loss/hidden": 1.0859375, "loss/logits": 0.20416325330734253, "loss/reg": 0.00010395440767752007, "step": 1808 }, { "epoch": 0.226125, "grad_norm": 3.120615005493164, "grad_norm_var": 0.20777613839437664, "learning_rate": 0.0001, "loss": 1.4131, "loss/crossentropy": 2.571394443511963, "loss/hidden": 1.203125, "loss/logits": 0.20897647738456726, "loss/reg": 0.00010392448166385293, "step": 1809 }, { "epoch": 0.22625, "grad_norm": 2.531287908554077, "grad_norm_var": 0.1640666862870866, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.610358953475952, "loss/hidden": 1.0703125, "loss/logits": 0.17328737676143646, "loss/reg": 0.00010389392264187336, "step": 1810 }, { "epoch": 0.226375, "grad_norm": 3.9285495281219482, "grad_norm_var": 0.2536909954329872, "learning_rate": 0.0001, "loss": 1.7071, "loss/crossentropy": 2.4226601123809814, "loss/hidden": 1.4609375, "loss/logits": 0.24509462714195251, "loss/reg": 0.00010385114001110196, "step": 1811 }, { "epoch": 0.2265, "grad_norm": 6.80695104598999, "grad_norm_var": 1.226817361350056, "learning_rate": 0.0001, "loss": 1.4741, "loss/crossentropy": 2.681352376937866, "loss/hidden": 1.2578125, "loss/logits": 0.2152562141418457, "loss/reg": 0.00010381911124568433, "step": 1812 }, { "epoch": 0.226625, "grad_norm": 3.4935665130615234, "grad_norm_var": 1.2312571320772783, "learning_rate": 0.0001, "loss": 1.6717, "loss/crossentropy": 2.249314546585083, "loss/hidden": 1.453125, "loss/logits": 0.21755951642990112, "loss/reg": 0.00010377457510912791, "step": 1813 }, { "epoch": 0.22675, "grad_norm": 2.461994171142578, "grad_norm_var": 1.2463090199211517, "learning_rate": 0.0001, "loss": 1.3287, "loss/crossentropy": 2.482147693634033, "loss/hidden": 1.140625, "loss/logits": 0.18700021505355835, "loss/reg": 0.00010372886754339561, "step": 1814 }, { "epoch": 0.226875, "grad_norm": 2.7086877822875977, "grad_norm_var": 1.2285745767143792, "learning_rate": 0.0001, "loss": 1.4455, "loss/crossentropy": 2.430147409439087, "loss/hidden": 1.234375, "loss/logits": 0.2101128101348877, "loss/reg": 0.00010368255607318133, "step": 1815 }, { "epoch": 0.227, "grad_norm": 3.6032145023345947, "grad_norm_var": 1.2391007795022704, "learning_rate": 0.0001, "loss": 1.5436, "loss/crossentropy": 2.6306979656219482, "loss/hidden": 1.2890625, "loss/logits": 0.2535264492034912, "loss/reg": 0.00010363001638324931, "step": 1816 }, { "epoch": 0.227125, "grad_norm": 2.5286929607391357, "grad_norm_var": 1.2620200240609694, "learning_rate": 0.0001, "loss": 1.3465, "loss/crossentropy": 2.4993367195129395, "loss/hidden": 1.1328125, "loss/logits": 0.21265153586864471, "loss/reg": 0.00010357388237025589, "step": 1817 }, { "epoch": 0.22725, "grad_norm": 4.015971660614014, "grad_norm_var": 1.2707399083377444, "learning_rate": 0.0001, "loss": 1.5333, "loss/crossentropy": 2.685762643814087, "loss/hidden": 1.296875, "loss/logits": 0.23542049527168274, "loss/reg": 0.00010352853132644668, "step": 1818 }, { "epoch": 0.227375, "grad_norm": 3.003467321395874, "grad_norm_var": 1.225019944563566, "learning_rate": 0.0001, "loss": 1.6584, "loss/crossentropy": 2.38911509513855, "loss/hidden": 1.3828125, "loss/logits": 0.27454596757888794, "loss/reg": 0.0001034869346767664, "step": 1819 }, { "epoch": 0.2275, "grad_norm": 2.817819595336914, "grad_norm_var": 1.2251431005774973, "learning_rate": 0.0001, "loss": 1.4207, "loss/crossentropy": 2.42526912689209, "loss/hidden": 1.21875, "loss/logits": 0.20094624161720276, "loss/reg": 0.00010344657493988052, "step": 1820 }, { "epoch": 0.227625, "grad_norm": 2.6171326637268066, "grad_norm_var": 1.1846607572105272, "learning_rate": 0.0001, "loss": 1.3053, "loss/crossentropy": 2.4875502586364746, "loss/hidden": 1.109375, "loss/logits": 0.19491535425186157, "loss/reg": 0.00010340951121179387, "step": 1821 }, { "epoch": 0.22775, "grad_norm": 2.637593984603882, "grad_norm_var": 1.185336143797288, "learning_rate": 0.0001, "loss": 1.2409, "loss/crossentropy": 2.278505325317383, "loss/hidden": 1.078125, "loss/logits": 0.16172973811626434, "loss/reg": 0.00010336627019569278, "step": 1822 }, { "epoch": 0.227875, "grad_norm": 3.2410335540771484, "grad_norm_var": 1.1643773443982106, "learning_rate": 0.0001, "loss": 1.3307, "loss/crossentropy": 2.7665815353393555, "loss/hidden": 1.140625, "loss/logits": 0.1890442967414856, "loss/reg": 0.00010332785313948989, "step": 1823 }, { "epoch": 0.228, "grad_norm": 2.0146126747131348, "grad_norm_var": 1.2257545159651955, "learning_rate": 0.0001, "loss": 1.1518, "loss/crossentropy": 2.2917630672454834, "loss/hidden": 1.0, "loss/logits": 0.15073367953300476, "loss/reg": 0.00010329387441743165, "step": 1824 }, { "epoch": 0.228125, "grad_norm": 2.6497304439544678, "grad_norm_var": 1.2458965442081267, "learning_rate": 0.0001, "loss": 1.3766, "loss/crossentropy": 2.539210319519043, "loss/hidden": 1.1796875, "loss/logits": 0.1959182173013687, "loss/reg": 0.00010324899631086737, "step": 1825 }, { "epoch": 0.22825, "grad_norm": 2.0025196075439453, "grad_norm_var": 1.309901576539058, "learning_rate": 0.0001, "loss": 1.2524, "loss/crossentropy": 2.5563809871673584, "loss/hidden": 1.078125, "loss/logits": 0.17320874333381653, "loss/reg": 0.00010320307774236426, "step": 1826 }, { "epoch": 0.228375, "grad_norm": 3.1805193424224854, "grad_norm_var": 1.268042879227436, "learning_rate": 0.0001, "loss": 1.5313, "loss/crossentropy": 2.423975706100464, "loss/hidden": 1.3046875, "loss/logits": 0.22555269300937653, "loss/reg": 0.00010315808322047815, "step": 1827 }, { "epoch": 0.2285, "grad_norm": 2.7439262866973877, "grad_norm_var": 0.29782563914749455, "learning_rate": 0.0001, "loss": 1.511, "loss/crossentropy": 2.5041115283966064, "loss/hidden": 1.28125, "loss/logits": 0.22867602109909058, "loss/reg": 0.00010311271762475371, "step": 1828 }, { "epoch": 0.228625, "grad_norm": 3.3902804851531982, "grad_norm_var": 0.2897332340026878, "learning_rate": 0.0001, "loss": 1.6404, "loss/crossentropy": 2.6569721698760986, "loss/hidden": 1.34375, "loss/logits": 0.2956143021583557, "loss/reg": 0.00010306618787581101, "step": 1829 }, { "epoch": 0.22875, "grad_norm": 3.047163486480713, "grad_norm_var": 0.2807776056307517, "learning_rate": 0.0001, "loss": 1.3833, "loss/crossentropy": 2.6154046058654785, "loss/hidden": 1.15625, "loss/logits": 0.22602683305740356, "loss/reg": 0.0001030224229907617, "step": 1830 }, { "epoch": 0.228875, "grad_norm": 2.5162124633789062, "grad_norm_var": 0.2876857480920222, "learning_rate": 0.0001, "loss": 1.6281, "loss/crossentropy": 2.301731824874878, "loss/hidden": 1.3984375, "loss/logits": 0.2286001592874527, "loss/reg": 0.00010297903645550832, "step": 1831 }, { "epoch": 0.229, "grad_norm": 3.3600194454193115, "grad_norm_var": 0.26778919426466247, "learning_rate": 0.0001, "loss": 1.5693, "loss/crossentropy": 3.06721830368042, "loss/hidden": 1.3046875, "loss/logits": 0.26360511779785156, "loss/reg": 0.00010292678780388087, "step": 1832 }, { "epoch": 0.229125, "grad_norm": 2.581206798553467, "grad_norm_var": 0.2656388599003281, "learning_rate": 0.0001, "loss": 1.1779, "loss/crossentropy": 2.5028367042541504, "loss/hidden": 1.015625, "loss/logits": 0.1612866371870041, "loss/reg": 0.000102871963463258, "step": 1833 }, { "epoch": 0.22925, "grad_norm": 2.7111976146698, "grad_norm_var": 0.1715806193259861, "learning_rate": 0.0001, "loss": 1.3911, "loss/crossentropy": 2.5683140754699707, "loss/hidden": 1.171875, "loss/logits": 0.2182171642780304, "loss/reg": 0.00010282120638294145, "step": 1834 }, { "epoch": 0.229375, "grad_norm": 2.737333059310913, "grad_norm_var": 0.16815406439081546, "learning_rate": 0.0001, "loss": 1.4433, "loss/crossentropy": 2.5977940559387207, "loss/hidden": 1.2265625, "loss/logits": 0.21571005880832672, "loss/reg": 0.0001027772159432061, "step": 1835 }, { "epoch": 0.2295, "grad_norm": 2.259483814239502, "grad_norm_var": 0.18374422473204863, "learning_rate": 0.0001, "loss": 1.2612, "loss/crossentropy": 2.6560111045837402, "loss/hidden": 1.078125, "loss/logits": 0.18203189969062805, "loss/reg": 0.0001027348407660611, "step": 1836 }, { "epoch": 0.229625, "grad_norm": 3.844223737716675, "grad_norm_var": 0.25928538233781995, "learning_rate": 0.0001, "loss": 1.4775, "loss/crossentropy": 2.147368907928467, "loss/hidden": 1.296875, "loss/logits": 0.17963360249996185, "loss/reg": 0.00010268503683619201, "step": 1837 }, { "epoch": 0.22975, "grad_norm": 3.0884807109832764, "grad_norm_var": 0.2617881696486182, "learning_rate": 0.0001, "loss": 1.6286, "loss/crossentropy": 2.6087253093719482, "loss/hidden": 1.28125, "loss/logits": 0.346326619386673, "loss/reg": 0.00010264750017086044, "step": 1838 }, { "epoch": 0.229875, "grad_norm": 2.630241632461548, "grad_norm_var": 0.25207833957911513, "learning_rate": 0.0001, "loss": 1.4445, "loss/crossentropy": 2.338000535964966, "loss/hidden": 1.2265625, "loss/logits": 0.21692870557308197, "loss/reg": 0.00010260462295264006, "step": 1839 }, { "epoch": 0.23, "grad_norm": 2.6667590141296387, "grad_norm_var": 0.21060046689199982, "learning_rate": 0.0001, "loss": 1.3682, "loss/crossentropy": 2.565429210662842, "loss/hidden": 1.15625, "loss/logits": 0.2109432965517044, "loss/reg": 0.00010256018140353262, "step": 1840 }, { "epoch": 0.230125, "grad_norm": 2.4561452865600586, "grad_norm_var": 0.21780425378768556, "learning_rate": 0.0001, "loss": 1.4542, "loss/crossentropy": 2.2738218307495117, "loss/hidden": 1.2421875, "loss/logits": 0.21101590991020203, "loss/reg": 0.00010251777712255716, "step": 1841 }, { "epoch": 0.23025, "grad_norm": 2.48793888092041, "grad_norm_var": 0.1792346403848036, "learning_rate": 0.0001, "loss": 1.1844, "loss/crossentropy": 2.517746925354004, "loss/hidden": 1.03125, "loss/logits": 0.15212376415729523, "loss/reg": 0.00010248507169308141, "step": 1842 }, { "epoch": 0.230375, "grad_norm": 2.6713204383850098, "grad_norm_var": 0.17342898515069674, "learning_rate": 0.0001, "loss": 1.5759, "loss/crossentropy": 2.7372846603393555, "loss/hidden": 1.3125, "loss/logits": 0.2623566687107086, "loss/reg": 0.00010245316661894321, "step": 1843 }, { "epoch": 0.2305, "grad_norm": 3.1259453296661377, "grad_norm_var": 0.17844626489621288, "learning_rate": 0.0001, "loss": 1.4521, "loss/crossentropy": 2.2056045532226562, "loss/hidden": 1.265625, "loss/logits": 0.18540745973587036, "loss/reg": 0.00010242007556371391, "step": 1844 }, { "epoch": 0.230625, "grad_norm": 3.748760461807251, "grad_norm_var": 0.2123797864726839, "learning_rate": 0.0001, "loss": 1.6286, "loss/crossentropy": 2.5298638343811035, "loss/hidden": 1.34375, "loss/logits": 0.28387510776519775, "loss/reg": 0.0001023921649903059, "step": 1845 }, { "epoch": 0.23075, "grad_norm": 3.8029263019561768, "grad_norm_var": 0.26585255463558966, "learning_rate": 0.0001, "loss": 1.5265, "loss/crossentropy": 2.4724678993225098, "loss/hidden": 1.25, "loss/logits": 0.27545684576034546, "loss/reg": 0.00010235208901576698, "step": 1846 }, { "epoch": 0.230875, "grad_norm": 2.8058130741119385, "grad_norm_var": 0.2555794773681896, "learning_rate": 0.0001, "loss": 1.4602, "loss/crossentropy": 2.516950845718384, "loss/hidden": 1.2265625, "loss/logits": 0.23265376687049866, "loss/reg": 0.00010230591578874737, "step": 1847 }, { "epoch": 0.231, "grad_norm": 2.847660779953003, "grad_norm_var": 0.24302743497048596, "learning_rate": 0.0001, "loss": 1.5456, "loss/crossentropy": 2.5391757488250732, "loss/hidden": 1.265625, "loss/logits": 0.2789611220359802, "loss/reg": 0.00010226416634395719, "step": 1848 }, { "epoch": 0.231125, "grad_norm": 2.421632766723633, "grad_norm_var": 0.2514887594410756, "learning_rate": 0.0001, "loss": 1.3196, "loss/crossentropy": 2.5305466651916504, "loss/hidden": 1.125, "loss/logits": 0.19353465735912323, "loss/reg": 0.00010221732372883707, "step": 1849 }, { "epoch": 0.23125, "grad_norm": 2.7795138359069824, "grad_norm_var": 0.2501142772570288, "learning_rate": 0.0001, "loss": 1.6232, "loss/crossentropy": 2.2129874229431152, "loss/hidden": 1.3515625, "loss/logits": 0.2706400156021118, "loss/reg": 0.00010218070383416489, "step": 1850 }, { "epoch": 0.231375, "grad_norm": 2.8464436531066895, "grad_norm_var": 0.24851533358851158, "learning_rate": 0.0001, "loss": 1.3934, "loss/crossentropy": 2.953432083129883, "loss/hidden": 1.203125, "loss/logits": 0.18923673033714294, "loss/reg": 0.00010213378845946863, "step": 1851 }, { "epoch": 0.2315, "grad_norm": 2.334512233734131, "grad_norm_var": 0.24240749782840113, "learning_rate": 0.0001, "loss": 1.3938, "loss/crossentropy": 2.5913569927215576, "loss/hidden": 1.1875, "loss/logits": 0.20529164373874664, "loss/reg": 0.00010209732863586396, "step": 1852 }, { "epoch": 0.231625, "grad_norm": 2.171050786972046, "grad_norm_var": 0.2089375617552226, "learning_rate": 0.0001, "loss": 1.1773, "loss/crossentropy": 2.775158643722534, "loss/hidden": 1.015625, "loss/logits": 0.160679429769516, "loss/reg": 0.00010205565922660753, "step": 1853 }, { "epoch": 0.23175, "grad_norm": 3.217833995819092, "grad_norm_var": 0.21486700403686973, "learning_rate": 0.0001, "loss": 1.7906, "loss/crossentropy": 2.5315303802490234, "loss/hidden": 1.4765625, "loss/logits": 0.31298625469207764, "loss/reg": 0.00010202094563283026, "step": 1854 }, { "epoch": 0.231875, "grad_norm": 12.01088809967041, "grad_norm_var": 5.485556462732142, "learning_rate": 0.0001, "loss": 1.418, "loss/crossentropy": 2.6089963912963867, "loss/hidden": 1.21875, "loss/logits": 0.1982203722000122, "loss/reg": 0.00010197410301771015, "step": 1855 }, { "epoch": 0.232, "grad_norm": 2.8098154067993164, "grad_norm_var": 5.472855346625818, "learning_rate": 0.0001, "loss": 1.5244, "loss/crossentropy": 2.655829429626465, "loss/hidden": 1.2734375, "loss/logits": 0.24995499849319458, "loss/reg": 0.00010193647176492959, "step": 1856 }, { "epoch": 0.232125, "grad_norm": 2.4442906379699707, "grad_norm_var": 5.474369658114088, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.849839210510254, "loss/hidden": 1.171875, "loss/logits": 0.19734014570713043, "loss/reg": 0.00010188823944190517, "step": 1857 }, { "epoch": 0.23225, "grad_norm": 3.7021000385284424, "grad_norm_var": 5.4175760972217075, "learning_rate": 0.0001, "loss": 1.9052, "loss/crossentropy": 2.719900131225586, "loss/hidden": 1.5859375, "loss/logits": 0.3182139992713928, "loss/reg": 0.0001018389593809843, "step": 1858 }, { "epoch": 0.232375, "grad_norm": 2.590937376022339, "grad_norm_var": 5.4266876873471235, "learning_rate": 0.0001, "loss": 1.4591, "loss/crossentropy": 2.2560436725616455, "loss/hidden": 1.234375, "loss/logits": 0.22373943030834198, "loss/reg": 0.00010178654338233173, "step": 1859 }, { "epoch": 0.2325, "grad_norm": 3.3076484203338623, "grad_norm_var": 5.420203572696921, "learning_rate": 0.0001, "loss": 1.3941, "loss/crossentropy": 2.5213418006896973, "loss/hidden": 1.1875, "loss/logits": 0.20558680593967438, "loss/reg": 0.00010173388727707788, "step": 1860 }, { "epoch": 0.232625, "grad_norm": 3.2896065711975098, "grad_norm_var": 5.4175455103921095, "learning_rate": 0.0001, "loss": 1.9019, "loss/crossentropy": 1.9756035804748535, "loss/hidden": 1.6484375, "loss/logits": 0.25245004892349243, "loss/reg": 0.00010168523294851184, "step": 1861 }, { "epoch": 0.23275, "grad_norm": 2.2200093269348145, "grad_norm_var": 5.502069721365107, "learning_rate": 0.0001, "loss": 1.4149, "loss/crossentropy": 2.5068764686584473, "loss/hidden": 1.1875, "loss/logits": 0.2263980507850647, "loss/reg": 0.00010163044498767704, "step": 1862 }, { "epoch": 0.232875, "grad_norm": 2.527611494064331, "grad_norm_var": 5.527555906445179, "learning_rate": 0.0001, "loss": 1.324, "loss/crossentropy": 2.872347593307495, "loss/hidden": 1.1328125, "loss/logits": 0.19017130136489868, "loss/reg": 0.0001015712259686552, "step": 1863 }, { "epoch": 0.233, "grad_norm": 31.164201736450195, "grad_norm_var": 53.76362108592601, "learning_rate": 0.0001, "loss": 2.3242, "loss/crossentropy": 2.3194406032562256, "loss/hidden": 2.078125, "loss/logits": 0.24510958790779114, "loss/reg": 0.00010150723392143846, "step": 1864 }, { "epoch": 0.233125, "grad_norm": 2.858231782913208, "grad_norm_var": 53.618752149484166, "learning_rate": 0.0001, "loss": 1.1896, "loss/crossentropy": 2.6053354740142822, "loss/hidden": 1.0390625, "loss/logits": 0.1495300531387329, "loss/reg": 0.00010144842235604301, "step": 1865 }, { "epoch": 0.23325, "grad_norm": 2.6274778842926025, "grad_norm_var": 53.66809129190645, "learning_rate": 0.0001, "loss": 1.3218, "loss/crossentropy": 2.615084648132324, "loss/hidden": 1.1484375, "loss/logits": 0.1723259538412094, "loss/reg": 0.00010140474478248507, "step": 1866 }, { "epoch": 0.233375, "grad_norm": 2.807380199432373, "grad_norm_var": 53.68009436388109, "learning_rate": 0.0001, "loss": 1.3398, "loss/crossentropy": 2.9445853233337402, "loss/hidden": 1.1328125, "loss/logits": 0.20594742894172668, "loss/reg": 0.00010136484343092889, "step": 1867 }, { "epoch": 0.2335, "grad_norm": 3.308789014816284, "grad_norm_var": 53.37624727801399, "learning_rate": 0.0001, "loss": 1.6649, "loss/crossentropy": 2.8572754859924316, "loss/hidden": 1.3984375, "loss/logits": 0.26543131470680237, "loss/reg": 0.00010131901217391714, "step": 1868 }, { "epoch": 0.233625, "grad_norm": 3.2875092029571533, "grad_norm_var": 53.00458178761111, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.709136962890625, "loss/hidden": 1.171875, "loss/logits": 0.19156011939048767, "loss/reg": 0.00010127508721780032, "step": 1869 }, { "epoch": 0.23375, "grad_norm": 2.7487704753875732, "grad_norm_var": 53.146109836258724, "learning_rate": 0.0001, "loss": 1.5502, "loss/crossentropy": 2.696678876876831, "loss/hidden": 1.296875, "loss/logits": 0.2523132562637329, "loss/reg": 0.00010123303945874795, "step": 1870 }, { "epoch": 0.233875, "grad_norm": 2.725707769393921, "grad_norm_var": 50.141573313067674, "learning_rate": 0.0001, "loss": 1.6481, "loss/crossentropy": 2.3140811920166016, "loss/hidden": 1.4140625, "loss/logits": 0.23302508890628815, "loss/reg": 0.0001011961794574745, "step": 1871 }, { "epoch": 0.234, "grad_norm": 2.793262243270874, "grad_norm_var": 50.14565465962646, "learning_rate": 0.0001, "loss": 1.6792, "loss/crossentropy": 2.5104198455810547, "loss/hidden": 1.3984375, "loss/logits": 0.2797708213329315, "loss/reg": 0.00010115499026142061, "step": 1872 }, { "epoch": 0.234125, "grad_norm": 4.7126359939575195, "grad_norm_var": 49.80006669104162, "learning_rate": 0.0001, "loss": 1.7637, "loss/crossentropy": 2.421973705291748, "loss/hidden": 1.5703125, "loss/logits": 0.19241148233413696, "loss/reg": 0.00010111834126291797, "step": 1873 }, { "epoch": 0.23425, "grad_norm": 3.0829038619995117, "grad_norm_var": 49.91401039867657, "learning_rate": 0.0001, "loss": 1.492, "loss/crossentropy": 2.523608684539795, "loss/hidden": 1.2578125, "loss/logits": 0.2331656664609909, "loss/reg": 0.000101075267593842, "step": 1874 }, { "epoch": 0.234375, "grad_norm": 2.282280921936035, "grad_norm_var": 50.00895468972325, "learning_rate": 0.0001, "loss": 1.4459, "loss/crossentropy": 2.2948641777038574, "loss/hidden": 1.203125, "loss/logits": 0.24179668724536896, "loss/reg": 0.00010104035027325153, "step": 1875 }, { "epoch": 0.2345, "grad_norm": 3.0979037284851074, "grad_norm_var": 50.05159357864745, "learning_rate": 0.0001, "loss": 1.4571, "loss/crossentropy": 2.592439651489258, "loss/hidden": 1.234375, "loss/logits": 0.22174429893493652, "loss/reg": 0.00010099709470523521, "step": 1876 }, { "epoch": 0.234625, "grad_norm": 4.0839948654174805, "grad_norm_var": 49.939434789989036, "learning_rate": 0.0001, "loss": 1.888, "loss/crossentropy": 2.4488718509674072, "loss/hidden": 1.59375, "loss/logits": 0.29320117831230164, "loss/reg": 0.00010096084588440135, "step": 1877 }, { "epoch": 0.23475, "grad_norm": 2.9100539684295654, "grad_norm_var": 49.73453071185745, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.78420090675354, "loss/hidden": 1.1953125, "loss/logits": 0.19826452434062958, "loss/reg": 0.00010092945740325376, "step": 1878 }, { "epoch": 0.234875, "grad_norm": 3.360579013824463, "grad_norm_var": 49.524000428302585, "learning_rate": 0.0001, "loss": 1.6672, "loss/crossentropy": 2.28676438331604, "loss/hidden": 1.421875, "loss/logits": 0.24435168504714966, "loss/reg": 0.00010088978160638362, "step": 1879 }, { "epoch": 0.235, "grad_norm": 2.508573055267334, "grad_norm_var": 0.3656682138512745, "learning_rate": 0.0001, "loss": 1.3772, "loss/crossentropy": 2.567340850830078, "loss/hidden": 1.1640625, "loss/logits": 0.2121252715587616, "loss/reg": 0.00010085524263558909, "step": 1880 }, { "epoch": 0.235125, "grad_norm": 3.021430730819702, "grad_norm_var": 0.3626213529430667, "learning_rate": 0.0001, "loss": 1.5435, "loss/crossentropy": 2.155583143234253, "loss/hidden": 1.3203125, "loss/logits": 0.22215692698955536, "loss/reg": 0.00010082768858410418, "step": 1881 }, { "epoch": 0.23525, "grad_norm": 2.571753740310669, "grad_norm_var": 0.3662144168916067, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.819533586502075, "loss/hidden": 1.1640625, "loss/logits": 0.19163212180137634, "loss/reg": 0.00010080639185616747, "step": 1882 }, { "epoch": 0.235375, "grad_norm": 2.8638505935668945, "grad_norm_var": 0.3643499914700205, "learning_rate": 0.0001, "loss": 1.2193, "loss/crossentropy": 2.5291144847869873, "loss/hidden": 1.0390625, "loss/logits": 0.17923784255981445, "loss/reg": 0.00010076520266011357, "step": 1883 }, { "epoch": 0.2355, "grad_norm": 2.934446096420288, "grad_norm_var": 0.3619384333079163, "learning_rate": 0.0001, "loss": 1.6401, "loss/crossentropy": 2.7803385257720947, "loss/hidden": 1.3671875, "loss/logits": 0.27194589376449585, "loss/reg": 0.00010073379235109314, "step": 1884 }, { "epoch": 0.235625, "grad_norm": 2.8970041275024414, "grad_norm_var": 0.35970701272042876, "learning_rate": 0.0001, "loss": 1.2373, "loss/crossentropy": 2.5701301097869873, "loss/hidden": 1.0625, "loss/logits": 0.1737690567970276, "loss/reg": 0.00010069384734379128, "step": 1885 }, { "epoch": 0.23575, "grad_norm": 2.7076284885406494, "grad_norm_var": 0.3613949959215402, "learning_rate": 0.0001, "loss": 1.2029, "loss/crossentropy": 2.528377056121826, "loss/hidden": 1.046875, "loss/logits": 0.15506258606910706, "loss/reg": 0.00010066442337119952, "step": 1886 }, { "epoch": 0.235875, "grad_norm": 2.282021999359131, "grad_norm_var": 0.39197355358761854, "learning_rate": 0.0001, "loss": 1.2113, "loss/crossentropy": 2.3408849239349365, "loss/hidden": 1.0546875, "loss/logits": 0.15561705827713013, "loss/reg": 0.00010063280933536589, "step": 1887 }, { "epoch": 0.236, "grad_norm": 3.6190524101257324, "grad_norm_var": 0.4110720068262251, "learning_rate": 0.0001, "loss": 1.6962, "loss/crossentropy": 2.10512113571167, "loss/hidden": 1.46875, "loss/logits": 0.22647377848625183, "loss/reg": 0.000100590186775662, "step": 1888 }, { "epoch": 0.236125, "grad_norm": 2.0451228618621826, "grad_norm_var": 0.2674772448639563, "learning_rate": 0.0001, "loss": 1.225, "loss/crossentropy": 2.5362894535064697, "loss/hidden": 1.0625, "loss/logits": 0.16144642233848572, "loss/reg": 0.00010054962331196293, "step": 1889 }, { "epoch": 0.23625, "grad_norm": 2.091447353363037, "grad_norm_var": 0.3036493994273305, "learning_rate": 0.0001, "loss": 1.314, "loss/crossentropy": 2.5738067626953125, "loss/hidden": 1.125, "loss/logits": 0.188033327460289, "loss/reg": 0.0001005115409498103, "step": 1890 }, { "epoch": 0.236375, "grad_norm": 2.339015245437622, "grad_norm_var": 0.29970866084346215, "learning_rate": 0.0001, "loss": 1.2515, "loss/crossentropy": 2.361067056655884, "loss/hidden": 1.0859375, "loss/logits": 0.16460666060447693, "loss/reg": 0.00010047376417787746, "step": 1891 }, { "epoch": 0.2365, "grad_norm": 2.537259340286255, "grad_norm_var": 0.2995790189977536, "learning_rate": 0.0001, "loss": 1.4673, "loss/crossentropy": 2.8068201541900635, "loss/hidden": 1.265625, "loss/logits": 0.20069506764411926, "loss/reg": 0.0001004432633635588, "step": 1892 }, { "epoch": 0.236625, "grad_norm": 2.7380168437957764, "grad_norm_var": 0.1820768337099705, "learning_rate": 0.0001, "loss": 1.6009, "loss/crossentropy": 2.458124876022339, "loss/hidden": 1.3203125, "loss/logits": 0.2795846462249756, "loss/reg": 0.00010040310735348612, "step": 1893 }, { "epoch": 0.23675, "grad_norm": 2.423954486846924, "grad_norm_var": 0.18415141914745922, "learning_rate": 0.0001, "loss": 1.2585, "loss/crossentropy": 2.73473858833313, "loss/hidden": 1.0859375, "loss/logits": 0.17156445980072021, "loss/reg": 0.00010036468302132562, "step": 1894 }, { "epoch": 0.236875, "grad_norm": 2.879147529602051, "grad_norm_var": 0.1551958360576909, "learning_rate": 0.0001, "loss": 1.2758, "loss/crossentropy": 2.638183832168579, "loss/hidden": 1.1015625, "loss/logits": 0.17322765290737152, "loss/reg": 0.00010032658610725775, "step": 1895 }, { "epoch": 0.237, "grad_norm": 11.608695030212402, "grad_norm_var": 5.154830057945992, "learning_rate": 0.0001, "loss": 2.1352, "loss/crossentropy": 2.7513511180877686, "loss/hidden": 1.71875, "loss/logits": 0.4154278635978699, "loss/reg": 0.0001002883946057409, "step": 1896 }, { "epoch": 0.237125, "grad_norm": 3.6973209381103516, "grad_norm_var": 5.165262543658859, "learning_rate": 0.0001, "loss": 1.6303, "loss/crossentropy": 3.033592700958252, "loss/hidden": 1.390625, "loss/logits": 0.23864644765853882, "loss/reg": 0.00010025579103967175, "step": 1897 }, { "epoch": 0.23725, "grad_norm": 2.7073252201080322, "grad_norm_var": 5.1538848302006555, "learning_rate": 0.0001, "loss": 1.3073, "loss/crossentropy": 2.816115140914917, "loss/hidden": 1.1171875, "loss/logits": 0.1891186237335205, "loss/reg": 0.00010022510105045512, "step": 1898 }, { "epoch": 0.237375, "grad_norm": 2.7596566677093506, "grad_norm_var": 5.160250344079653, "learning_rate": 0.0001, "loss": 1.5459, "loss/crossentropy": 2.4892849922180176, "loss/hidden": 1.296875, "loss/logits": 0.24804270267486572, "loss/reg": 0.00010018237662734464, "step": 1899 }, { "epoch": 0.2375, "grad_norm": 3.72668719291687, "grad_norm_var": 5.164382086899722, "learning_rate": 0.0001, "loss": 1.4494, "loss/crossentropy": 2.4603869915008545, "loss/hidden": 1.21875, "loss/logits": 0.22965267300605774, "loss/reg": 0.0001001388009171933, "step": 1900 }, { "epoch": 0.237625, "grad_norm": 3.5097012519836426, "grad_norm_var": 5.153598304716092, "learning_rate": 0.0001, "loss": 1.5847, "loss/crossentropy": 2.761608123779297, "loss/hidden": 1.3359375, "loss/logits": 0.24772918224334717, "loss/reg": 0.0001000955599010922, "step": 1901 }, { "epoch": 0.23775, "grad_norm": 2.337001085281372, "grad_norm_var": 5.19415020111913, "learning_rate": 0.0001, "loss": 1.3164, "loss/crossentropy": 2.5078110694885254, "loss/hidden": 1.125, "loss/logits": 0.19041401147842407, "loss/reg": 0.00010004679643316194, "step": 1902 }, { "epoch": 0.237875, "grad_norm": 2.427476167678833, "grad_norm_var": 5.175122168994746, "learning_rate": 0.0001, "loss": 1.4011, "loss/crossentropy": 2.669965982437134, "loss/hidden": 1.1875, "loss/logits": 0.21262109279632568, "loss/reg": 0.00010000415204558522, "step": 1903 }, { "epoch": 0.238, "grad_norm": 2.0555174350738525, "grad_norm_var": 5.269827480842385, "learning_rate": 0.0001, "loss": 1.2281, "loss/crossentropy": 2.496699094772339, "loss/hidden": 1.0546875, "loss/logits": 0.17244219779968262, "loss/reg": 9.99614640022628e-05, "step": 1904 }, { "epoch": 0.238125, "grad_norm": 2.413318395614624, "grad_norm_var": 5.219507693476547, "learning_rate": 0.0001, "loss": 1.2506, "loss/crossentropy": 2.5161492824554443, "loss/hidden": 1.0703125, "loss/logits": 0.1793210506439209, "loss/reg": 9.99144758679904e-05, "step": 1905 }, { "epoch": 0.23825, "grad_norm": 2.286193370819092, "grad_norm_var": 5.19138671358207, "learning_rate": 0.0001, "loss": 1.3716, "loss/crossentropy": 2.4983043670654297, "loss/hidden": 1.15625, "loss/logits": 0.21438170969486237, "loss/reg": 9.98717951006256e-05, "step": 1906 }, { "epoch": 0.238375, "grad_norm": 2.7304441928863525, "grad_norm_var": 5.151962234088137, "learning_rate": 0.0001, "loss": 1.393, "loss/crossentropy": 2.4288413524627686, "loss/hidden": 1.171875, "loss/logits": 0.22009283304214478, "loss/reg": 9.98244431684725e-05, "step": 1907 }, { "epoch": 0.2385, "grad_norm": 2.2762558460235596, "grad_norm_var": 5.182845672456559, "learning_rate": 0.0001, "loss": 1.4002, "loss/crossentropy": 2.4639878273010254, "loss/hidden": 1.203125, "loss/logits": 0.19606274366378784, "loss/reg": 9.977377339964733e-05, "step": 1908 }, { "epoch": 0.238625, "grad_norm": 4.444680213928223, "grad_norm_var": 5.24018292166406, "learning_rate": 0.0001, "loss": 1.5012, "loss/crossentropy": 2.356444835662842, "loss/hidden": 1.2890625, "loss/logits": 0.2110910415649414, "loss/reg": 9.972224506782368e-05, "step": 1909 }, { "epoch": 0.23875, "grad_norm": 3.8996760845184326, "grad_norm_var": 5.185677252025455, "learning_rate": 0.0001, "loss": 1.442, "loss/crossentropy": 2.0415446758270264, "loss/hidden": 1.265625, "loss/logits": 0.17540983855724335, "loss/reg": 9.96757808024995e-05, "step": 1910 }, { "epoch": 0.238875, "grad_norm": 2.41021990776062, "grad_norm_var": 5.237297169278398, "learning_rate": 0.0001, "loss": 1.1585, "loss/crossentropy": 2.764937400817871, "loss/hidden": 1.0078125, "loss/logits": 0.14972862601280212, "loss/reg": 9.962261538021266e-05, "step": 1911 }, { "epoch": 0.239, "grad_norm": 2.267669200897217, "grad_norm_var": 0.536328014043702, "learning_rate": 0.0001, "loss": 1.4589, "loss/crossentropy": 2.277825355529785, "loss/hidden": 1.234375, "loss/logits": 0.22355429828166962, "loss/reg": 9.956878784578294e-05, "step": 1912 }, { "epoch": 0.239125, "grad_norm": 2.3872971534729004, "grad_norm_var": 0.4993982966828277, "learning_rate": 0.0001, "loss": 1.2157, "loss/crossentropy": 2.4454731941223145, "loss/hidden": 1.0625, "loss/logits": 0.15223410725593567, "loss/reg": 9.95240334304981e-05, "step": 1913 }, { "epoch": 0.23925, "grad_norm": 2.759357213973999, "grad_norm_var": 0.49899432205546435, "learning_rate": 0.0001, "loss": 1.4285, "loss/crossentropy": 2.4319281578063965, "loss/hidden": 1.203125, "loss/logits": 0.22441807389259338, "loss/reg": 9.948140359483659e-05, "step": 1914 }, { "epoch": 0.239375, "grad_norm": 2.7283589839935303, "grad_norm_var": 0.49919550808500784, "learning_rate": 0.0001, "loss": 1.3722, "loss/crossentropy": 2.6263864040374756, "loss/hidden": 1.15625, "loss/logits": 0.2149399369955063, "loss/reg": 9.943256009137258e-05, "step": 1915 }, { "epoch": 0.2395, "grad_norm": 2.5406413078308105, "grad_norm_var": 0.4391835902061975, "learning_rate": 0.0001, "loss": 1.2883, "loss/crossentropy": 2.5064470767974854, "loss/hidden": 1.125, "loss/logits": 0.16234064102172852, "loss/reg": 9.938384027918801e-05, "step": 1916 }, { "epoch": 0.239625, "grad_norm": 2.1613900661468506, "grad_norm_var": 0.41031761483502743, "learning_rate": 0.0001, "loss": 1.1174, "loss/crossentropy": 2.5534422397613525, "loss/hidden": 0.97265625, "loss/logits": 0.14372506737709045, "loss/reg": 9.933317051036283e-05, "step": 1917 }, { "epoch": 0.23975, "grad_norm": 2.1708312034606934, "grad_norm_var": 0.418598072432106, "learning_rate": 0.0001, "loss": 1.2723, "loss/crossentropy": 2.262831211090088, "loss/hidden": 1.09375, "loss/logits": 0.1775897592306137, "loss/reg": 9.929437510436401e-05, "step": 1918 }, { "epoch": 0.239875, "grad_norm": 2.506824254989624, "grad_norm_var": 0.4169287226368842, "learning_rate": 0.0001, "loss": 1.3578, "loss/crossentropy": 2.6624338626861572, "loss/hidden": 1.1640625, "loss/logits": 0.19276925921440125, "loss/reg": 9.925808990374207e-05, "step": 1919 }, { "epoch": 0.24, "grad_norm": 2.518611192703247, "grad_norm_var": 0.3950197851813291, "learning_rate": 0.0001, "loss": 1.2401, "loss/crossentropy": 2.5525996685028076, "loss/hidden": 1.0703125, "loss/logits": 0.16880980134010315, "loss/reg": 9.921249147737399e-05, "step": 1920 }, { "epoch": 0.240125, "grad_norm": 2.865478038787842, "grad_norm_var": 0.39314529356806793, "learning_rate": 0.0001, "loss": 1.5064, "loss/crossentropy": 2.2712812423706055, "loss/hidden": 1.3125, "loss/logits": 0.19293969869613647, "loss/reg": 9.917128772940487e-05, "step": 1921 }, { "epoch": 0.24025, "grad_norm": 2.2972774505615234, "grad_norm_var": 0.39256414568858583, "learning_rate": 0.0001, "loss": 1.2127, "loss/crossentropy": 2.4816088676452637, "loss/hidden": 1.046875, "loss/logits": 0.16483771800994873, "loss/reg": 9.91302149486728e-05, "step": 1922 }, { "epoch": 0.240375, "grad_norm": 3.2930009365081787, "grad_norm_var": 0.4157286870072878, "learning_rate": 0.0001, "loss": 1.5681, "loss/crossentropy": 2.645097017288208, "loss/hidden": 1.3125, "loss/logits": 0.25461071729660034, "loss/reg": 9.90895860013552e-05, "step": 1923 }, { "epoch": 0.2405, "grad_norm": 2.648514747619629, "grad_norm_var": 0.40234122153845725, "learning_rate": 0.0001, "loss": 1.219, "loss/crossentropy": 2.294752597808838, "loss/hidden": 1.0546875, "loss/logits": 0.16332939267158508, "loss/reg": 9.904119360726327e-05, "step": 1924 }, { "epoch": 0.240625, "grad_norm": 4.316061973571777, "grad_norm_var": 0.3742055327296384, "learning_rate": 0.0001, "loss": 1.5356, "loss/crossentropy": 2.6617865562438965, "loss/hidden": 1.328125, "loss/logits": 0.20652295649051666, "loss/reg": 9.89965265034698e-05, "step": 1925 }, { "epoch": 0.24075, "grad_norm": 2.6829357147216797, "grad_norm_var": 0.27789997618658024, "learning_rate": 0.0001, "loss": 1.2734, "loss/crossentropy": 2.5207204818725586, "loss/hidden": 1.0859375, "loss/logits": 0.1864495426416397, "loss/reg": 9.895602852338925e-05, "step": 1926 }, { "epoch": 0.240875, "grad_norm": 2.7026216983795166, "grad_norm_var": 0.27351897524333874, "learning_rate": 0.0001, "loss": 1.432, "loss/crossentropy": 2.707408905029297, "loss/hidden": 1.21875, "loss/logits": 0.2123045027256012, "loss/reg": 9.892123489407822e-05, "step": 1927 }, { "epoch": 0.241, "grad_norm": 3.547189474105835, "grad_norm_var": 0.305850726536327, "learning_rate": 0.0001, "loss": 1.6948, "loss/crossentropy": 2.6288089752197266, "loss/hidden": 1.4140625, "loss/logits": 0.2797739505767822, "loss/reg": 9.888653585221618e-05, "step": 1928 }, { "epoch": 0.241125, "grad_norm": 2.4466805458068848, "grad_norm_var": 0.30313677609404216, "learning_rate": 0.0001, "loss": 1.204, "loss/crossentropy": 2.680166006088257, "loss/hidden": 1.046875, "loss/logits": 0.15614420175552368, "loss/reg": 9.885340114124119e-05, "step": 1929 }, { "epoch": 0.24125, "grad_norm": 2.3889732360839844, "grad_norm_var": 0.3118220927567959, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.444502592086792, "loss/hidden": 1.21875, "loss/logits": 0.21072471141815186, "loss/reg": 9.881625010166317e-05, "step": 1930 }, { "epoch": 0.241375, "grad_norm": 3.3422133922576904, "grad_norm_var": 0.3345462718035035, "learning_rate": 0.0001, "loss": 1.318, "loss/crossentropy": 2.572443962097168, "loss/hidden": 1.1484375, "loss/logits": 0.1685420274734497, "loss/reg": 9.878368291538209e-05, "step": 1931 }, { "epoch": 0.2415, "grad_norm": 2.7276008129119873, "grad_norm_var": 0.3308432458707775, "learning_rate": 0.0001, "loss": 1.5097, "loss/crossentropy": 2.324035882949829, "loss/hidden": 1.2578125, "loss/logits": 0.2509124279022217, "loss/reg": 9.875216346699744e-05, "step": 1932 }, { "epoch": 0.241625, "grad_norm": 4.58004093170166, "grad_norm_var": 0.4942214552928076, "learning_rate": 0.0001, "loss": 1.6693, "loss/crossentropy": 2.759860038757324, "loss/hidden": 1.3828125, "loss/logits": 0.2855346202850342, "loss/reg": 9.871440124697983e-05, "step": 1933 }, { "epoch": 0.24175, "grad_norm": 2.973862409591675, "grad_norm_var": 0.45220403656852187, "learning_rate": 0.0001, "loss": 1.393, "loss/crossentropy": 2.8311591148376465, "loss/hidden": 1.1796875, "loss/logits": 0.2123367190361023, "loss/reg": 9.866865002550185e-05, "step": 1934 }, { "epoch": 0.241875, "grad_norm": 2.732325792312622, "grad_norm_var": 0.44085860848340325, "learning_rate": 0.0001, "loss": 1.1873, "loss/crossentropy": 2.7610654830932617, "loss/hidden": 1.0390625, "loss/logits": 0.14728528261184692, "loss/reg": 9.863568266155198e-05, "step": 1935 }, { "epoch": 0.242, "grad_norm": 3.05810546875, "grad_norm_var": 0.42413697353347646, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.4901087284088135, "loss/hidden": 1.15625, "loss/logits": 0.1731736660003662, "loss/reg": 9.859199053607881e-05, "step": 1936 }, { "epoch": 0.242125, "grad_norm": 4.303008556365967, "grad_norm_var": 0.5202867398233553, "learning_rate": 0.0001, "loss": 1.5263, "loss/crossentropy": 2.95527982711792, "loss/hidden": 1.3125, "loss/logits": 0.2127995491027832, "loss/reg": 9.85479200608097e-05, "step": 1937 }, { "epoch": 0.24225, "grad_norm": 2.5895142555236816, "grad_norm_var": 0.49327383588287693, "learning_rate": 0.0001, "loss": 1.4188, "loss/crossentropy": 2.8629066944122314, "loss/hidden": 1.2109375, "loss/logits": 0.2068871259689331, "loss/reg": 9.85019578365609e-05, "step": 1938 }, { "epoch": 0.242375, "grad_norm": 2.2477469444274902, "grad_norm_var": 0.5410422908913224, "learning_rate": 0.0001, "loss": 1.3007, "loss/crossentropy": 2.614689588546753, "loss/hidden": 1.1171875, "loss/logits": 0.18252143263816833, "loss/reg": 9.845710883382708e-05, "step": 1939 }, { "epoch": 0.2425, "grad_norm": 2.0195326805114746, "grad_norm_var": 0.6019934075879708, "learning_rate": 0.0001, "loss": 1.1967, "loss/crossentropy": 2.3576529026031494, "loss/hidden": 1.03125, "loss/logits": 0.1644943356513977, "loss/reg": 9.84140278887935e-05, "step": 1940 }, { "epoch": 0.242625, "grad_norm": 3.0596840381622314, "grad_norm_var": 0.4870793946777191, "learning_rate": 0.0001, "loss": 1.4552, "loss/crossentropy": 2.9617488384246826, "loss/hidden": 1.234375, "loss/logits": 0.21983496844768524, "loss/reg": 9.837697143666446e-05, "step": 1941 }, { "epoch": 0.24275, "grad_norm": 2.196047306060791, "grad_norm_var": 0.5200528068405975, "learning_rate": 0.0001, "loss": 1.2973, "loss/crossentropy": 2.700929641723633, "loss/hidden": 1.109375, "loss/logits": 0.18692409992218018, "loss/reg": 9.833132935455069e-05, "step": 1942 }, { "epoch": 0.242875, "grad_norm": 2.305645227432251, "grad_norm_var": 0.5420536527419622, "learning_rate": 0.0001, "loss": 1.2274, "loss/crossentropy": 2.506577968597412, "loss/hidden": 1.0625, "loss/logits": 0.16395613551139832, "loss/reg": 9.828194015426561e-05, "step": 1943 }, { "epoch": 0.243, "grad_norm": 2.6418752670288086, "grad_norm_var": 0.516048472263708, "learning_rate": 0.0001, "loss": 1.5679, "loss/crossentropy": 2.3768882751464844, "loss/hidden": 1.3515625, "loss/logits": 0.2153974175453186, "loss/reg": 9.823461004998535e-05, "step": 1944 }, { "epoch": 0.243125, "grad_norm": 3.0772340297698975, "grad_norm_var": 0.5069221721653394, "learning_rate": 0.0001, "loss": 1.722, "loss/crossentropy": 2.538573741912842, "loss/hidden": 1.4296875, "loss/logits": 0.29129740595817566, "loss/reg": 9.818300168262795e-05, "step": 1945 }, { "epoch": 0.24325, "grad_norm": 2.8947367668151855, "grad_norm_var": 0.48910828671360657, "learning_rate": 0.0001, "loss": 1.4674, "loss/crossentropy": 2.627796173095703, "loss/hidden": 1.28125, "loss/logits": 0.18517246842384338, "loss/reg": 9.813852375373244e-05, "step": 1946 }, { "epoch": 0.243375, "grad_norm": 2.9444336891174316, "grad_norm_var": 0.4767012307432566, "learning_rate": 0.0001, "loss": 1.5436, "loss/crossentropy": 2.2501718997955322, "loss/hidden": 1.28125, "loss/logits": 0.2613542675971985, "loss/reg": 9.809021867113188e-05, "step": 1947 }, { "epoch": 0.2435, "grad_norm": 2.3651037216186523, "grad_norm_var": 0.4930997211690735, "learning_rate": 0.0001, "loss": 1.2891, "loss/crossentropy": 2.5500845909118652, "loss/hidden": 1.1015625, "loss/logits": 0.18651536107063293, "loss/reg": 9.80504701146856e-05, "step": 1948 }, { "epoch": 0.243625, "grad_norm": 2.0637013912200928, "grad_norm_var": 0.3165531027156921, "learning_rate": 0.0001, "loss": 1.2313, "loss/crossentropy": 2.4126768112182617, "loss/hidden": 1.0625, "loss/logits": 0.16779418289661407, "loss/reg": 9.800533734960482e-05, "step": 1949 }, { "epoch": 0.24375, "grad_norm": 2.753376007080078, "grad_norm_var": 0.3120412288458546, "learning_rate": 0.0001, "loss": 1.4213, "loss/crossentropy": 2.91988205909729, "loss/hidden": 1.21875, "loss/logits": 0.20155711472034454, "loss/reg": 9.796633821679279e-05, "step": 1950 }, { "epoch": 0.243875, "grad_norm": 2.4517252445220947, "grad_norm_var": 0.31587461248073867, "learning_rate": 0.0001, "loss": 1.2663, "loss/crossentropy": 2.531790256500244, "loss/hidden": 1.0859375, "loss/logits": 0.17936506867408752, "loss/reg": 9.792015043785796e-05, "step": 1951 }, { "epoch": 0.244, "grad_norm": 2.587846279144287, "grad_norm_var": 0.30634687528944726, "learning_rate": 0.0001, "loss": 1.3673, "loss/crossentropy": 2.4702157974243164, "loss/hidden": 1.1796875, "loss/logits": 0.18666133284568787, "loss/reg": 9.788086026674137e-05, "step": 1952 }, { "epoch": 0.244125, "grad_norm": 2.9344563484191895, "grad_norm_var": 0.12292912972666438, "learning_rate": 0.0001, "loss": 1.3717, "loss/crossentropy": 2.6732494831085205, "loss/hidden": 1.1875, "loss/logits": 0.18319424986839294, "loss/reg": 9.783470159163699e-05, "step": 1953 }, { "epoch": 0.24425, "grad_norm": 2.290635347366333, "grad_norm_var": 0.12776604380868928, "learning_rate": 0.0001, "loss": 1.3002, "loss/crossentropy": 2.754128932952881, "loss/hidden": 1.140625, "loss/logits": 0.15861055254936218, "loss/reg": 9.778476669453084e-05, "step": 1954 }, { "epoch": 0.244375, "grad_norm": 2.984539031982422, "grad_norm_var": 0.13179452502650502, "learning_rate": 0.0001, "loss": 1.3927, "loss/crossentropy": 2.405066967010498, "loss/hidden": 1.203125, "loss/logits": 0.18859969079494476, "loss/reg": 9.773512283572927e-05, "step": 1955 }, { "epoch": 0.2445, "grad_norm": 3.0516316890716553, "grad_norm_var": 0.11874443359480603, "learning_rate": 0.0001, "loss": 1.4691, "loss/crossentropy": 2.2891998291015625, "loss/hidden": 1.28125, "loss/logits": 0.18691197037696838, "loss/reg": 9.768154268385842e-05, "step": 1956 }, { "epoch": 0.244625, "grad_norm": 2.979217290878296, "grad_norm_var": 0.11488955831397879, "learning_rate": 0.0001, "loss": 1.3423, "loss/crossentropy": 2.6419332027435303, "loss/hidden": 1.1484375, "loss/logits": 0.19285361468791962, "loss/reg": 9.763892740011215e-05, "step": 1957 }, { "epoch": 0.24475, "grad_norm": 2.4295432567596436, "grad_norm_var": 0.10392647957171354, "learning_rate": 0.0001, "loss": 1.3761, "loss/crossentropy": 2.468214988708496, "loss/hidden": 1.171875, "loss/logits": 0.20325082540512085, "loss/reg": 9.759193926583976e-05, "step": 1958 }, { "epoch": 0.244875, "grad_norm": 3.045116424560547, "grad_norm_var": 0.10195860516381737, "learning_rate": 0.0001, "loss": 1.5143, "loss/crossentropy": 2.26703143119812, "loss/hidden": 1.3046875, "loss/logits": 0.20867738127708435, "loss/reg": 9.755010978551582e-05, "step": 1959 }, { "epoch": 0.245, "grad_norm": 2.5482125282287598, "grad_norm_var": 0.1034631706600083, "learning_rate": 0.0001, "loss": 1.3059, "loss/crossentropy": 2.4883971214294434, "loss/hidden": 1.09375, "loss/logits": 0.2111463099718094, "loss/reg": 9.750707249622792e-05, "step": 1960 }, { "epoch": 0.245125, "grad_norm": 2.682587146759033, "grad_norm_var": 0.09401011557599664, "learning_rate": 0.0001, "loss": 1.7908, "loss/crossentropy": 2.364849805831909, "loss/hidden": 1.46875, "loss/logits": 0.3210746645927429, "loss/reg": 9.745372517500073e-05, "step": 1961 }, { "epoch": 0.24525, "grad_norm": 1.9779616594314575, "grad_norm_var": 0.12126039387345025, "learning_rate": 0.0001, "loss": 1.3284, "loss/crossentropy": 2.6453933715820312, "loss/hidden": 1.125, "loss/logits": 0.2024185210466385, "loss/reg": 9.740971290739253e-05, "step": 1962 }, { "epoch": 0.245375, "grad_norm": 2.801231622695923, "grad_norm_var": 0.11655043438549335, "learning_rate": 0.0001, "loss": 1.3439, "loss/crossentropy": 2.6006577014923096, "loss/hidden": 1.140625, "loss/logits": 0.20230454206466675, "loss/reg": 9.736261563375592e-05, "step": 1963 }, { "epoch": 0.2455, "grad_norm": 2.9748597145080566, "grad_norm_var": 0.11892820075999797, "learning_rate": 0.0001, "loss": 1.6596, "loss/crossentropy": 2.3430681228637695, "loss/hidden": 1.4140625, "loss/logits": 0.24461254477500916, "loss/reg": 9.731513273436576e-05, "step": 1964 }, { "epoch": 0.245625, "grad_norm": 2.9419193267822266, "grad_norm_var": 0.09733293730978539, "learning_rate": 0.0001, "loss": 1.4014, "loss/crossentropy": 2.578578472137451, "loss/hidden": 1.1953125, "loss/logits": 0.20512482523918152, "loss/reg": 9.726876305649057e-05, "step": 1965 }, { "epoch": 0.24575, "grad_norm": 2.64227032661438, "grad_norm_var": 0.09753120114530413, "learning_rate": 0.0001, "loss": 1.3965, "loss/crossentropy": 2.647516965866089, "loss/hidden": 1.203125, "loss/logits": 0.19238628447055817, "loss/reg": 9.721569222165272e-05, "step": 1966 }, { "epoch": 0.245875, "grad_norm": 2.2472825050354004, "grad_norm_var": 0.10712206983190047, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.6264638900756836, "loss/hidden": 1.140625, "loss/logits": 0.2071080356836319, "loss/reg": 9.717119246488437e-05, "step": 1967 }, { "epoch": 0.246, "grad_norm": 4.349966049194336, "grad_norm_var": 0.27602313296046593, "learning_rate": 0.0001, "loss": 1.5271, "loss/crossentropy": 2.577239751815796, "loss/hidden": 1.2890625, "loss/logits": 0.23705226182937622, "loss/reg": 9.712397877592593e-05, "step": 1968 }, { "epoch": 0.246125, "grad_norm": 2.545440196990967, "grad_norm_var": 0.2787713694268428, "learning_rate": 0.0001, "loss": 1.4474, "loss/crossentropy": 2.5919463634490967, "loss/hidden": 1.21875, "loss/logits": 0.22764267027378082, "loss/reg": 9.707472781883553e-05, "step": 1969 }, { "epoch": 0.24625, "grad_norm": 2.2360846996307373, "grad_norm_var": 0.2825223530715068, "learning_rate": 0.0001, "loss": 1.2302, "loss/crossentropy": 2.648597240447998, "loss/hidden": 1.0546875, "loss/logits": 0.17449527978897095, "loss/reg": 9.702756506158039e-05, "step": 1970 }, { "epoch": 0.246375, "grad_norm": 2.0517008304595947, "grad_norm_var": 0.3111412497148435, "learning_rate": 0.0001, "loss": 1.3235, "loss/crossentropy": 2.1619086265563965, "loss/hidden": 1.125, "loss/logits": 0.1975041776895523, "loss/reg": 9.697582572698593e-05, "step": 1971 }, { "epoch": 0.2465, "grad_norm": 2.406048059463501, "grad_norm_var": 0.30856319468698806, "learning_rate": 0.0001, "loss": 1.3312, "loss/crossentropy": 2.449512481689453, "loss/hidden": 1.140625, "loss/logits": 0.18959781527519226, "loss/reg": 9.693184256320819e-05, "step": 1972 }, { "epoch": 0.246625, "grad_norm": 2.684312105178833, "grad_norm_var": 0.3021828076443431, "learning_rate": 0.0001, "loss": 1.2465, "loss/crossentropy": 2.4254841804504395, "loss/hidden": 1.078125, "loss/logits": 0.16741403937339783, "loss/reg": 9.688859427114949e-05, "step": 1973 }, { "epoch": 0.24675, "grad_norm": 2.778441905975342, "grad_norm_var": 0.29905695348517053, "learning_rate": 0.0001, "loss": 1.1735, "loss/crossentropy": 2.5412042140960693, "loss/hidden": 1.0078125, "loss/logits": 0.16476303339004517, "loss/reg": 9.684533142717555e-05, "step": 1974 }, { "epoch": 0.246875, "grad_norm": 2.3991239070892334, "grad_norm_var": 0.2938702590498441, "learning_rate": 0.0001, "loss": 1.2028, "loss/crossentropy": 2.574761390686035, "loss/hidden": 1.03125, "loss/logits": 0.17053796350955963, "loss/reg": 9.680320363258943e-05, "step": 1975 }, { "epoch": 0.247, "grad_norm": 2.523468255996704, "grad_norm_var": 0.2942170137694806, "learning_rate": 0.0001, "loss": 1.3111, "loss/crossentropy": 2.7701334953308105, "loss/hidden": 1.1015625, "loss/logits": 0.20852957665920258, "loss/reg": 9.676381159806624e-05, "step": 1976 }, { "epoch": 0.247125, "grad_norm": 2.7431626319885254, "grad_norm_var": 0.29478895345535955, "learning_rate": 0.0001, "loss": 1.2604, "loss/crossentropy": 2.5848798751831055, "loss/hidden": 1.078125, "loss/logits": 0.18131427466869354, "loss/reg": 9.67183877946809e-05, "step": 1977 }, { "epoch": 0.24725, "grad_norm": 1.8584957122802734, "grad_norm_var": 0.30628942434336653, "learning_rate": 0.0001, "loss": 1.1114, "loss/crossentropy": 2.452209234237671, "loss/hidden": 0.97265625, "loss/logits": 0.13777723908424377, "loss/reg": 9.667713311500847e-05, "step": 1978 }, { "epoch": 0.247375, "grad_norm": 2.7744898796081543, "grad_norm_var": 0.3057467151435766, "learning_rate": 0.0001, "loss": 1.2275, "loss/crossentropy": 2.6266050338745117, "loss/hidden": 1.0546875, "loss/logits": 0.17179882526397705, "loss/reg": 9.663808305049315e-05, "step": 1979 }, { "epoch": 0.2475, "grad_norm": 3.05360746383667, "grad_norm_var": 0.30970464097148137, "learning_rate": 0.0001, "loss": 1.3887, "loss/crossentropy": 2.4099338054656982, "loss/hidden": 1.1875, "loss/logits": 0.20021164417266846, "loss/reg": 9.660720388637856e-05, "step": 1980 }, { "epoch": 0.247625, "grad_norm": 2.4127182960510254, "grad_norm_var": 0.30588606903319426, "learning_rate": 0.0001, "loss": 1.2139, "loss/crossentropy": 2.659632682800293, "loss/hidden": 1.0390625, "loss/logits": 0.1738429218530655, "loss/reg": 9.657906775828451e-05, "step": 1981 }, { "epoch": 0.24775, "grad_norm": 13.248802185058594, "grad_norm_var": 7.387399054090027, "learning_rate": 0.0001, "loss": 1.7002, "loss/crossentropy": 2.2823972702026367, "loss/hidden": 1.4609375, "loss/logits": 0.2383258193731308, "loss/reg": 9.655113535700366e-05, "step": 1982 }, { "epoch": 0.247875, "grad_norm": 4.120009899139404, "grad_norm_var": 7.351330805965733, "learning_rate": 0.0001, "loss": 1.5429, "loss/crossentropy": 2.557643413543701, "loss/hidden": 1.296875, "loss/logits": 0.24506798386573792, "loss/reg": 9.651603613747284e-05, "step": 1983 }, { "epoch": 0.248, "grad_norm": 3.085603713989258, "grad_norm_var": 7.288841096827192, "learning_rate": 0.0001, "loss": 1.3716, "loss/crossentropy": 2.648674249649048, "loss/hidden": 1.15625, "loss/logits": 0.2143906205892563, "loss/reg": 9.648627019487321e-05, "step": 1984 }, { "epoch": 0.248125, "grad_norm": 3.3684277534484863, "grad_norm_var": 7.247540427024668, "learning_rate": 0.0001, "loss": 1.5459, "loss/crossentropy": 2.8058836460113525, "loss/hidden": 1.3203125, "loss/logits": 0.22459742426872253, "loss/reg": 9.645480167819187e-05, "step": 1985 }, { "epoch": 0.24825, "grad_norm": 2.765535831451416, "grad_norm_var": 7.185787635643559, "learning_rate": 0.0001, "loss": 1.4475, "loss/crossentropy": 2.374544382095337, "loss/hidden": 1.2265625, "loss/logits": 0.21997015178203583, "loss/reg": 9.642915392760187e-05, "step": 1986 }, { "epoch": 0.248375, "grad_norm": 2.2456467151641846, "grad_norm_var": 7.15347602335182, "learning_rate": 0.0001, "loss": 1.3095, "loss/crossentropy": 2.334543466567993, "loss/hidden": 1.1328125, "loss/logits": 0.17575593292713165, "loss/reg": 9.63941274676472e-05, "step": 1987 }, { "epoch": 0.2485, "grad_norm": 2.889707088470459, "grad_norm_var": 7.103724910324339, "learning_rate": 0.0001, "loss": 1.4292, "loss/crossentropy": 2.844774007797241, "loss/hidden": 1.21875, "loss/logits": 0.20946523547172546, "loss/reg": 9.636789036449045e-05, "step": 1988 }, { "epoch": 0.248625, "grad_norm": 2.9871268272399902, "grad_norm_var": 7.079168026167649, "learning_rate": 0.0001, "loss": 1.3969, "loss/crossentropy": 2.5316884517669678, "loss/hidden": 1.1953125, "loss/logits": 0.20058360695838928, "loss/reg": 9.63446800597012e-05, "step": 1989 }, { "epoch": 0.24875, "grad_norm": 2.311487913131714, "grad_norm_var": 7.134819029138603, "learning_rate": 0.0001, "loss": 1.2181, "loss/crossentropy": 2.714524507522583, "loss/hidden": 1.046875, "loss/logits": 0.17030149698257446, "loss/reg": 9.630551357986405e-05, "step": 1990 }, { "epoch": 0.248875, "grad_norm": 2.7794313430786133, "grad_norm_var": 7.09187875172761, "learning_rate": 0.0001, "loss": 1.3526, "loss/crossentropy": 2.797808885574341, "loss/hidden": 1.171875, "loss/logits": 0.1797522008419037, "loss/reg": 9.626129758544266e-05, "step": 1991 }, { "epoch": 0.249, "grad_norm": 2.2392332553863525, "grad_norm_var": 7.131965342171799, "learning_rate": 0.0001, "loss": 1.2431, "loss/crossentropy": 2.6418049335479736, "loss/hidden": 1.0625, "loss/logits": 0.1796623319387436, "loss/reg": 9.624339872971177e-05, "step": 1992 }, { "epoch": 0.249125, "grad_norm": 3.665799379348755, "grad_norm_var": 7.10064867677366, "learning_rate": 0.0001, "loss": 2.3084, "loss/crossentropy": 2.812539577484131, "loss/hidden": 1.9140625, "loss/logits": 0.3933258056640625, "loss/reg": 9.620123455533758e-05, "step": 1993 }, { "epoch": 0.24925, "grad_norm": 2.597216844558716, "grad_norm_var": 6.9742671366476445, "learning_rate": 0.0001, "loss": 1.5049, "loss/crossentropy": 2.172825336456299, "loss/hidden": 1.28125, "loss/logits": 0.22264839708805084, "loss/reg": 9.617758769309148e-05, "step": 1994 }, { "epoch": 0.249375, "grad_norm": 2.733091115951538, "grad_norm_var": 6.9785669147176765, "learning_rate": 0.0001, "loss": 1.2205, "loss/crossentropy": 2.9570679664611816, "loss/hidden": 1.0546875, "loss/logits": 0.16483381390571594, "loss/reg": 9.613706060918048e-05, "step": 1995 }, { "epoch": 0.2495, "grad_norm": 2.9629180431365967, "grad_norm_var": 6.9848591710757315, "learning_rate": 0.0001, "loss": 1.4844, "loss/crossentropy": 2.3117270469665527, "loss/hidden": 1.2578125, "loss/logits": 0.22566689550876617, "loss/reg": 9.611460700398311e-05, "step": 1996 }, { "epoch": 0.249625, "grad_norm": 2.7912280559539795, "grad_norm_var": 6.937638689811751, "learning_rate": 0.0001, "loss": 1.3136, "loss/crossentropy": 2.6462905406951904, "loss/hidden": 1.1171875, "loss/logits": 0.19548577070236206, "loss/reg": 9.607260290067643e-05, "step": 1997 }, { "epoch": 0.24975, "grad_norm": 2.7128522396087646, "grad_norm_var": 0.2499493431064894, "learning_rate": 0.0001, "loss": 1.4065, "loss/crossentropy": 2.67246150970459, "loss/hidden": 1.1953125, "loss/logits": 0.21020692586898804, "loss/reg": 9.604328079149127e-05, "step": 1998 }, { "epoch": 0.249875, "grad_norm": 2.6144251823425293, "grad_norm_var": 0.1448977091036414, "learning_rate": 0.0001, "loss": 1.3733, "loss/crossentropy": 2.4188284873962402, "loss/hidden": 1.1796875, "loss/logits": 0.19267773628234863, "loss/reg": 9.600563498679549e-05, "step": 1999 }, { "epoch": 0.25, "grad_norm": 5.68969202041626, "grad_norm_var": 0.6689832933155512, "learning_rate": 0.0001, "loss": 2.1631, "loss/crossentropy": 2.6871252059936523, "loss/hidden": 1.7421875, "loss/logits": 0.4199907183647156, "loss/reg": 9.59687095019035e-05, "step": 2000 }, { "epoch": 0.250125, "grad_norm": 2.4005062580108643, "grad_norm_var": 0.6747778099492042, "learning_rate": 0.0001, "loss": 1.4732, "loss/crossentropy": 2.575969934463501, "loss/hidden": 1.2421875, "loss/logits": 0.23009833693504333, "loss/reg": 9.593054710421711e-05, "step": 2001 }, { "epoch": 0.25025, "grad_norm": 3.366863250732422, "grad_norm_var": 0.6866672097547394, "learning_rate": 0.0001, "loss": 1.8134, "loss/crossentropy": 2.2118849754333496, "loss/hidden": 1.5234375, "loss/logits": 0.28903844952583313, "loss/reg": 9.588948159944266e-05, "step": 2002 }, { "epoch": 0.250375, "grad_norm": 2.4148623943328857, "grad_norm_var": 0.6728651885889491, "learning_rate": 0.0001, "loss": 1.3536, "loss/crossentropy": 2.364380121231079, "loss/hidden": 1.15625, "loss/logits": 0.19643577933311462, "loss/reg": 9.584679355612025e-05, "step": 2003 }, { "epoch": 0.2505, "grad_norm": 2.626894235610962, "grad_norm_var": 0.6791994693487119, "learning_rate": 0.0001, "loss": 1.3165, "loss/crossentropy": 2.4404542446136475, "loss/hidden": 1.140625, "loss/logits": 0.17491121590137482, "loss/reg": 9.580230107530951e-05, "step": 2004 }, { "epoch": 0.250625, "grad_norm": 2.5890696048736572, "grad_norm_var": 0.6861158074318003, "learning_rate": 0.0001, "loss": 1.3482, "loss/crossentropy": 2.569962501525879, "loss/hidden": 1.140625, "loss/logits": 0.20662370324134827, "loss/reg": 9.576054435456172e-05, "step": 2005 }, { "epoch": 0.25075, "grad_norm": 2.831395149230957, "grad_norm_var": 0.6617994849383192, "learning_rate": 0.0001, "loss": 1.4489, "loss/crossentropy": 2.8096015453338623, "loss/hidden": 1.2421875, "loss/logits": 0.20577533543109894, "loss/reg": 9.572070848662406e-05, "step": 2006 }, { "epoch": 0.250875, "grad_norm": 2.8180978298187256, "grad_norm_var": 0.6610730131104712, "learning_rate": 0.0001, "loss": 1.5035, "loss/crossentropy": 2.4672577381134033, "loss/hidden": 1.2890625, "loss/logits": 0.21351200342178345, "loss/reg": 9.568345558363944e-05, "step": 2007 }, { "epoch": 0.251, "grad_norm": 2.169816493988037, "grad_norm_var": 0.66786835784009, "learning_rate": 0.0001, "loss": 1.2189, "loss/crossentropy": 2.5453741550445557, "loss/hidden": 1.046875, "loss/logits": 0.1710432469844818, "loss/reg": 9.56423464231193e-05, "step": 2008 }, { "epoch": 0.251125, "grad_norm": 2.187429666519165, "grad_norm_var": 0.6607193422756019, "learning_rate": 0.0001, "loss": 1.267, "loss/crossentropy": 2.3902041912078857, "loss/hidden": 1.09375, "loss/logits": 0.17229482531547546, "loss/reg": 9.559595491737127e-05, "step": 2009 }, { "epoch": 0.25125, "grad_norm": 3.036590099334717, "grad_norm_var": 0.6583189383137693, "learning_rate": 0.0001, "loss": 1.3753, "loss/crossentropy": 2.529231548309326, "loss/hidden": 1.171875, "loss/logits": 0.20251384377479553, "loss/reg": 9.555405267747119e-05, "step": 2010 }, { "epoch": 0.251375, "grad_norm": 2.424698829650879, "grad_norm_var": 0.6699587321169832, "learning_rate": 0.0001, "loss": 1.3718, "loss/crossentropy": 2.507408618927002, "loss/hidden": 1.1875, "loss/logits": 0.18337589502334595, "loss/reg": 9.551119001116604e-05, "step": 2011 }, { "epoch": 0.2515, "grad_norm": 2.2643308639526367, "grad_norm_var": 0.6901598620323018, "learning_rate": 0.0001, "loss": 1.3311, "loss/crossentropy": 2.4347689151763916, "loss/hidden": 1.1484375, "loss/logits": 0.18174521625041962, "loss/reg": 9.546944056637585e-05, "step": 2012 }, { "epoch": 0.251625, "grad_norm": 2.1031980514526367, "grad_norm_var": 0.7213467043955591, "learning_rate": 0.0001, "loss": 1.1076, "loss/crossentropy": 2.482074022293091, "loss/hidden": 0.9609375, "loss/logits": 0.14575164020061493, "loss/reg": 9.542825864627957e-05, "step": 2013 }, { "epoch": 0.25175, "grad_norm": 2.6249425411224365, "grad_norm_var": 0.7224488056302616, "learning_rate": 0.0001, "loss": 1.3281, "loss/crossentropy": 2.7580642700195312, "loss/hidden": 1.1328125, "loss/logits": 0.19435018301010132, "loss/reg": 9.538805898046121e-05, "step": 2014 }, { "epoch": 0.251875, "grad_norm": 2.9123213291168213, "grad_norm_var": 0.7222060489354438, "learning_rate": 0.0001, "loss": 1.3316, "loss/crossentropy": 2.8712997436523438, "loss/hidden": 1.1328125, "loss/logits": 0.1978493630886078, "loss/reg": 9.534717537462711e-05, "step": 2015 }, { "epoch": 0.252, "grad_norm": 2.515836238861084, "grad_norm_var": 0.11995513549686242, "learning_rate": 0.0001, "loss": 1.2558, "loss/crossentropy": 2.3165299892425537, "loss/hidden": 1.078125, "loss/logits": 0.17673757672309875, "loss/reg": 9.529656381346285e-05, "step": 2016 }, { "epoch": 0.252125, "grad_norm": 2.7671844959259033, "grad_norm_var": 0.11956197721087468, "learning_rate": 0.0001, "loss": 1.3306, "loss/crossentropy": 2.5165247917175293, "loss/hidden": 1.125, "loss/logits": 0.2046728879213333, "loss/reg": 9.525470522930846e-05, "step": 2017 }, { "epoch": 0.25225, "grad_norm": 2.9182581901550293, "grad_norm_var": 0.08647083806884646, "learning_rate": 0.0001, "loss": 1.3245, "loss/crossentropy": 2.787257432937622, "loss/hidden": 1.140625, "loss/logits": 0.18288323283195496, "loss/reg": 9.520564344711602e-05, "step": 2018 }, { "epoch": 0.252375, "grad_norm": 2.4375534057617188, "grad_norm_var": 0.08601759549311865, "learning_rate": 0.0001, "loss": 1.3976, "loss/crossentropy": 2.6237270832061768, "loss/hidden": 1.203125, "loss/logits": 0.19349300861358643, "loss/reg": 9.516306454315782e-05, "step": 2019 }, { "epoch": 0.2525, "grad_norm": 3.8579325675964355, "grad_norm_var": 0.1889680820449371, "learning_rate": 0.0001, "loss": 1.694, "loss/crossentropy": 2.8406319618225098, "loss/hidden": 1.3671875, "loss/logits": 0.32587742805480957, "loss/reg": 9.511532698525116e-05, "step": 2020 }, { "epoch": 0.252625, "grad_norm": 2.3740110397338867, "grad_norm_var": 0.19371098528560324, "learning_rate": 0.0001, "loss": 1.3664, "loss/crossentropy": 2.870011568069458, "loss/hidden": 1.171875, "loss/logits": 0.1935890018939972, "loss/reg": 9.506918286206201e-05, "step": 2021 }, { "epoch": 0.25275, "grad_norm": 2.5380375385284424, "grad_norm_var": 0.19161214966639667, "learning_rate": 0.0001, "loss": 1.3129, "loss/crossentropy": 2.7569057941436768, "loss/hidden": 1.125, "loss/logits": 0.18696236610412598, "loss/reg": 9.502415923634544e-05, "step": 2022 }, { "epoch": 0.252875, "grad_norm": 2.0883727073669434, "grad_norm_var": 0.20580294581652628, "learning_rate": 0.0001, "loss": 1.3738, "loss/crossentropy": 2.451807737350464, "loss/hidden": 1.15625, "loss/logits": 0.21660012006759644, "loss/reg": 9.497259452473372e-05, "step": 2023 }, { "epoch": 0.253, "grad_norm": 2.312168836593628, "grad_norm_var": 0.19935461295169338, "learning_rate": 0.0001, "loss": 1.369, "loss/crossentropy": 2.452712297439575, "loss/hidden": 1.171875, "loss/logits": 0.19618546962738037, "loss/reg": 9.49245149968192e-05, "step": 2024 }, { "epoch": 0.253125, "grad_norm": 2.7186806201934814, "grad_norm_var": 0.1888198641075102, "learning_rate": 0.0001, "loss": 1.3682, "loss/crossentropy": 2.689171552658081, "loss/hidden": 1.140625, "loss/logits": 0.22660332918167114, "loss/reg": 9.487794886808842e-05, "step": 2025 }, { "epoch": 0.25325, "grad_norm": 2.168450117111206, "grad_norm_var": 0.1875156692237489, "learning_rate": 0.0001, "loss": 1.2378, "loss/crossentropy": 2.436767578125, "loss/hidden": 1.078125, "loss/logits": 0.15869760513305664, "loss/reg": 9.483707981416956e-05, "step": 2026 }, { "epoch": 0.253375, "grad_norm": 2.4654626846313477, "grad_norm_var": 0.1868617262269801, "learning_rate": 0.0001, "loss": 1.193, "loss/crossentropy": 2.608302593231201, "loss/hidden": 1.0234375, "loss/logits": 0.1685754507780075, "loss/reg": 9.479909931542352e-05, "step": 2027 }, { "epoch": 0.2535, "grad_norm": 2.1647226810455322, "grad_norm_var": 0.1914972493242497, "learning_rate": 0.0001, "loss": 1.3094, "loss/crossentropy": 2.5616140365600586, "loss/hidden": 1.109375, "loss/logits": 0.19904828071594238, "loss/reg": 9.47593871387653e-05, "step": 2028 }, { "epoch": 0.253625, "grad_norm": 1.969323992729187, "grad_norm_var": 0.20077920599809757, "learning_rate": 0.0001, "loss": 1.3084, "loss/crossentropy": 2.1181600093841553, "loss/hidden": 1.1328125, "loss/logits": 0.17463427782058716, "loss/reg": 9.472283272771165e-05, "step": 2029 }, { "epoch": 0.25375, "grad_norm": 2.179377555847168, "grad_norm_var": 0.20885847145548025, "learning_rate": 0.0001, "loss": 1.3281, "loss/crossentropy": 2.418504238128662, "loss/hidden": 1.125, "loss/logits": 0.20214447379112244, "loss/reg": 9.46901272982359e-05, "step": 2030 }, { "epoch": 0.253875, "grad_norm": 3.243285894393921, "grad_norm_var": 0.23283045971032165, "learning_rate": 0.0001, "loss": 1.4377, "loss/crossentropy": 2.5771806240081787, "loss/hidden": 1.2109375, "loss/logits": 0.22580477595329285, "loss/reg": 9.465317270951346e-05, "step": 2031 }, { "epoch": 0.254, "grad_norm": 4.961824893951416, "grad_norm_var": 0.5972753532540271, "learning_rate": 0.0001, "loss": 1.7128, "loss/crossentropy": 2.7560319900512695, "loss/hidden": 1.453125, "loss/logits": 0.2586979866027832, "loss/reg": 9.4611692475155e-05, "step": 2032 }, { "epoch": 0.254125, "grad_norm": 2.47149658203125, "grad_norm_var": 0.6000039481184207, "learning_rate": 0.0001, "loss": 1.2981, "loss/crossentropy": 2.5317463874816895, "loss/hidden": 1.125, "loss/logits": 0.17215600609779358, "loss/reg": 9.456802945351228e-05, "step": 2033 }, { "epoch": 0.25425, "grad_norm": 2.616236925125122, "grad_norm_var": 0.5960826745367801, "learning_rate": 0.0001, "loss": 1.3749, "loss/crossentropy": 2.575054407119751, "loss/hidden": 1.1875, "loss/logits": 0.18641766905784607, "loss/reg": 9.452815720578656e-05, "step": 2034 }, { "epoch": 0.254375, "grad_norm": 2.585395574569702, "grad_norm_var": 0.5930552768312334, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.2288618087768555, "loss/hidden": 1.1875, "loss/logits": 0.1663748025894165, "loss/reg": 9.44915518630296e-05, "step": 2035 }, { "epoch": 0.2545, "grad_norm": 3.154613494873047, "grad_norm_var": 0.5125413734827541, "learning_rate": 0.0001, "loss": 1.4183, "loss/crossentropy": 2.3625869750976562, "loss/hidden": 1.2109375, "loss/logits": 0.2064605951309204, "loss/reg": 9.445334580959752e-05, "step": 2036 }, { "epoch": 0.254625, "grad_norm": 3.5775365829467773, "grad_norm_var": 0.5626798170629086, "learning_rate": 0.0001, "loss": 1.4634, "loss/crossentropy": 2.7854855060577393, "loss/hidden": 1.203125, "loss/logits": 0.25933700799942017, "loss/reg": 9.441698057344183e-05, "step": 2037 }, { "epoch": 0.25475, "grad_norm": 2.6480889320373535, "grad_norm_var": 0.5610464704009984, "learning_rate": 0.0001, "loss": 1.2315, "loss/crossentropy": 2.412829875946045, "loss/hidden": 1.0546875, "loss/logits": 0.1758311688899994, "loss/reg": 9.437553671887144e-05, "step": 2038 }, { "epoch": 0.254875, "grad_norm": 3.6237802505493164, "grad_norm_var": 0.5815759160979435, "learning_rate": 0.0001, "loss": 1.552, "loss/crossentropy": 2.4818501472473145, "loss/hidden": 1.296875, "loss/logits": 0.2542248070240021, "loss/reg": 9.434214734937996e-05, "step": 2039 }, { "epoch": 0.255, "grad_norm": 2.768970251083374, "grad_norm_var": 0.5646752777678581, "learning_rate": 0.0001, "loss": 1.3853, "loss/crossentropy": 2.354048728942871, "loss/hidden": 1.1796875, "loss/logits": 0.204716756939888, "loss/reg": 9.43112900131382e-05, "step": 2040 }, { "epoch": 0.255125, "grad_norm": 2.5547468662261963, "grad_norm_var": 0.5688390042242981, "learning_rate": 0.0001, "loss": 1.3155, "loss/crossentropy": 2.5512290000915527, "loss/hidden": 1.140625, "loss/logits": 0.17397405207157135, "loss/reg": 9.427510667592287e-05, "step": 2041 }, { "epoch": 0.25525, "grad_norm": 2.1818482875823975, "grad_norm_var": 0.5676825606649141, "learning_rate": 0.0001, "loss": 1.234, "loss/crossentropy": 2.1975560188293457, "loss/hidden": 1.078125, "loss/logits": 0.154951274394989, "loss/reg": 9.423371375305578e-05, "step": 2042 }, { "epoch": 0.255375, "grad_norm": 2.8149912357330322, "grad_norm_var": 0.5586593519025901, "learning_rate": 0.0001, "loss": 1.5511, "loss/crossentropy": 2.1967945098876953, "loss/hidden": 1.3359375, "loss/logits": 0.21418288350105286, "loss/reg": 9.419465641258284e-05, "step": 2043 }, { "epoch": 0.2555, "grad_norm": 2.0753862857818604, "grad_norm_var": 0.5672585011266675, "learning_rate": 0.0001, "loss": 1.1327, "loss/crossentropy": 2.303165912628174, "loss/hidden": 0.99609375, "loss/logits": 0.13568684458732605, "loss/reg": 9.416255488758907e-05, "step": 2044 }, { "epoch": 0.255625, "grad_norm": 2.4382834434509277, "grad_norm_var": 0.5266133015610639, "learning_rate": 0.0001, "loss": 1.2807, "loss/crossentropy": 2.3105971813201904, "loss/hidden": 1.109375, "loss/logits": 0.17040488123893738, "loss/reg": 9.413416410097852e-05, "step": 2045 }, { "epoch": 0.25575, "grad_norm": 2.2979180812835693, "grad_norm_var": 0.5165998196340162, "learning_rate": 0.0001, "loss": 1.4035, "loss/crossentropy": 2.6098291873931885, "loss/hidden": 1.203125, "loss/logits": 0.19942006468772888, "loss/reg": 9.409799531567842e-05, "step": 2046 }, { "epoch": 0.255875, "grad_norm": 2.761948823928833, "grad_norm_var": 0.507501976611443, "learning_rate": 0.0001, "loss": 1.4217, "loss/crossentropy": 2.6243505477905273, "loss/hidden": 1.2265625, "loss/logits": 0.1941855549812317, "loss/reg": 9.405255696037784e-05, "step": 2047 }, { "epoch": 0.256, "grad_norm": 14.743537902832031, "grad_norm_var": 9.2473793532084, "learning_rate": 0.0001, "loss": 1.8086, "loss/crossentropy": 2.3343136310577393, "loss/hidden": 1.5, "loss/logits": 0.30765753984451294, "loss/reg": 9.40203681238927e-05, "step": 2048 }, { "epoch": 0.256125, "grad_norm": 3.8056328296661377, "grad_norm_var": 9.183287310564127, "learning_rate": 0.0001, "loss": 1.2942, "loss/crossentropy": 2.590744733810425, "loss/hidden": 1.1171875, "loss/logits": 0.1760888695716858, "loss/reg": 9.398475231137127e-05, "step": 2049 }, { "epoch": 0.25625, "grad_norm": 2.5367870330810547, "grad_norm_var": 9.193473448247811, "learning_rate": 0.0001, "loss": 1.4497, "loss/crossentropy": 2.584937334060669, "loss/hidden": 1.2421875, "loss/logits": 0.2065267264842987, "loss/reg": 9.395254892297089e-05, "step": 2050 }, { "epoch": 0.256375, "grad_norm": 2.355182647705078, "grad_norm_var": 9.225952144338816, "learning_rate": 0.0001, "loss": 1.6133, "loss/crossentropy": 2.342164993286133, "loss/hidden": 1.3359375, "loss/logits": 0.27642208337783813, "loss/reg": 9.391779894940555e-05, "step": 2051 }, { "epoch": 0.2565, "grad_norm": 2.4915518760681152, "grad_norm_var": 9.285839865173209, "learning_rate": 0.0001, "loss": 1.3137, "loss/crossentropy": 3.034698247909546, "loss/hidden": 1.1328125, "loss/logits": 0.17999693751335144, "loss/reg": 9.388868056703359e-05, "step": 2052 }, { "epoch": 0.256625, "grad_norm": 2.374518871307373, "grad_norm_var": 9.360609810358884, "learning_rate": 0.0001, "loss": 1.2534, "loss/crossentropy": 2.673931837081909, "loss/hidden": 1.0859375, "loss/logits": 0.1665673702955246, "loss/reg": 9.384778240928426e-05, "step": 2053 }, { "epoch": 0.25675, "grad_norm": 2.4672439098358154, "grad_norm_var": 9.38089472686314, "learning_rate": 0.0001, "loss": 1.4087, "loss/crossentropy": 2.1917357444763184, "loss/hidden": 1.1875, "loss/logits": 0.22023721039295197, "loss/reg": 9.380647679790854e-05, "step": 2054 }, { "epoch": 0.256875, "grad_norm": 2.246994972229004, "grad_norm_var": 9.45705084930622, "learning_rate": 0.0001, "loss": 1.3232, "loss/crossentropy": 2.5544562339782715, "loss/hidden": 1.140625, "loss/logits": 0.18164512515068054, "loss/reg": 9.377204696647823e-05, "step": 2055 }, { "epoch": 0.257, "grad_norm": 2.349238872528076, "grad_norm_var": 9.498184540632943, "learning_rate": 0.0001, "loss": 1.2865, "loss/crossentropy": 2.333779811859131, "loss/hidden": 1.1015625, "loss/logits": 0.18400782346725464, "loss/reg": 9.37282748054713e-05, "step": 2056 }, { "epoch": 0.257125, "grad_norm": 2.677232265472412, "grad_norm_var": 9.487261678980488, "learning_rate": 0.0001, "loss": 1.3137, "loss/crossentropy": 2.838472843170166, "loss/hidden": 1.1328125, "loss/logits": 0.17996443808078766, "loss/reg": 9.368820610688999e-05, "step": 2057 }, { "epoch": 0.25725, "grad_norm": 3.318225145339966, "grad_norm_var": 9.40027299356079, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.619826555252075, "loss/hidden": 1.171875, "loss/logits": 0.1820184886455536, "loss/reg": 9.364218567498028e-05, "step": 2058 }, { "epoch": 0.257375, "grad_norm": 2.2141191959381104, "grad_norm_var": 9.46647584673785, "learning_rate": 0.0001, "loss": 1.3381, "loss/crossentropy": 2.513486385345459, "loss/hidden": 1.15625, "loss/logits": 0.18086759746074677, "loss/reg": 9.359866089653224e-05, "step": 2059 }, { "epoch": 0.2575, "grad_norm": 2.7752013206481934, "grad_norm_var": 9.380754285308042, "learning_rate": 0.0001, "loss": 1.4462, "loss/crossentropy": 2.671276330947876, "loss/hidden": 1.234375, "loss/logits": 0.21085430681705475, "loss/reg": 9.35519055929035e-05, "step": 2060 }, { "epoch": 0.257625, "grad_norm": 9.235503196716309, "grad_norm_var": 11.427740755499, "learning_rate": 0.0001, "loss": 2.9761, "loss/crossentropy": 1.8292793035507202, "loss/hidden": 2.4375, "loss/logits": 0.5376511812210083, "loss/reg": 9.350452455691993e-05, "step": 2061 }, { "epoch": 0.25775, "grad_norm": 14.16940689086914, "grad_norm_var": 17.873169569566667, "learning_rate": 0.0001, "loss": 2.6285, "loss/crossentropy": 2.6334726810455322, "loss/hidden": 2.3125, "loss/logits": 0.315062940120697, "loss/reg": 9.346017031930387e-05, "step": 2062 }, { "epoch": 0.257875, "grad_norm": 2.5949931144714355, "grad_norm_var": 17.914328760471214, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.4007623195648193, "loss/hidden": 1.15625, "loss/logits": 0.197630375623703, "loss/reg": 9.341397526441142e-05, "step": 2063 }, { "epoch": 0.258, "grad_norm": 3.1068527698516846, "grad_norm_var": 10.518624030294612, "learning_rate": 0.0001, "loss": 1.6278, "loss/crossentropy": 2.486703634262085, "loss/hidden": 1.3671875, "loss/logits": 0.2596863806247711, "loss/reg": 9.33645642362535e-05, "step": 2064 }, { "epoch": 0.258125, "grad_norm": 2.447516441345215, "grad_norm_var": 10.631963738337769, "learning_rate": 0.0001, "loss": 1.3259, "loss/crossentropy": 2.8385558128356934, "loss/hidden": 1.1171875, "loss/logits": 0.2078067660331726, "loss/reg": 9.332211629953235e-05, "step": 2065 }, { "epoch": 0.25825, "grad_norm": 3.7714943885803223, "grad_norm_var": 10.534095988885657, "learning_rate": 0.0001, "loss": 1.5656, "loss/crossentropy": 2.9268579483032227, "loss/hidden": 1.3046875, "loss/logits": 0.25996047258377075, "loss/reg": 9.327394218416885e-05, "step": 2066 }, { "epoch": 0.258375, "grad_norm": 2.7615771293640137, "grad_norm_var": 10.466822818301093, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.3156068325042725, "loss/hidden": 1.265625, "loss/logits": 0.1974049210548401, "loss/reg": 9.322468395112082e-05, "step": 2067 }, { "epoch": 0.2585, "grad_norm": 3.994533061981201, "grad_norm_var": 10.343271734744203, "learning_rate": 0.0001, "loss": 1.5948, "loss/crossentropy": 2.4510979652404785, "loss/hidden": 1.3515625, "loss/logits": 0.2423471212387085, "loss/reg": 9.318804950453341e-05, "step": 2068 }, { "epoch": 0.258625, "grad_norm": 2.765977144241333, "grad_norm_var": 10.272886191441847, "learning_rate": 0.0001, "loss": 1.3923, "loss/crossentropy": 2.682530403137207, "loss/hidden": 1.1875, "loss/logits": 0.2038971334695816, "loss/reg": 9.314657654613256e-05, "step": 2069 }, { "epoch": 0.25875, "grad_norm": 2.6897993087768555, "grad_norm_var": 10.232546093287668, "learning_rate": 0.0001, "loss": 1.4942, "loss/crossentropy": 2.409651517868042, "loss/hidden": 1.2734375, "loss/logits": 0.21983151137828827, "loss/reg": 9.310444875154644e-05, "step": 2070 }, { "epoch": 0.258875, "grad_norm": 4.442226886749268, "grad_norm_var": 10.036758731592435, "learning_rate": 0.0001, "loss": 1.4886, "loss/crossentropy": 2.468057155609131, "loss/hidden": 1.265625, "loss/logits": 0.22205092012882233, "loss/reg": 9.305962157668546e-05, "step": 2071 }, { "epoch": 0.259, "grad_norm": 2.0204811096191406, "grad_norm_var": 10.119473522825583, "learning_rate": 0.0001, "loss": 1.2719, "loss/crossentropy": 2.5842671394348145, "loss/hidden": 1.0859375, "loss/logits": 0.18500789999961853, "loss/reg": 9.301403042627499e-05, "step": 2072 }, { "epoch": 0.259125, "grad_norm": 3.6547791957855225, "grad_norm_var": 9.998764226373252, "learning_rate": 0.0001, "loss": 1.614, "loss/crossentropy": 2.8583312034606934, "loss/hidden": 1.3671875, "loss/logits": 0.2458512783050537, "loss/reg": 9.296349890064448e-05, "step": 2073 }, { "epoch": 0.25925, "grad_norm": 3.056248664855957, "grad_norm_var": 10.031153050141528, "learning_rate": 0.0001, "loss": 1.5444, "loss/crossentropy": 2.7155492305755615, "loss/hidden": 1.3046875, "loss/logits": 0.23882417380809784, "loss/reg": 9.291690366808325e-05, "step": 2074 }, { "epoch": 0.259375, "grad_norm": 2.170560359954834, "grad_norm_var": 10.042261095608312, "learning_rate": 0.0001, "loss": 1.2079, "loss/crossentropy": 2.5628488063812256, "loss/hidden": 1.0390625, "loss/logits": 0.16793249547481537, "loss/reg": 9.286720887757838e-05, "step": 2075 }, { "epoch": 0.2595, "grad_norm": 2.655763864517212, "grad_norm_var": 10.064306971516096, "learning_rate": 0.0001, "loss": 1.3424, "loss/crossentropy": 2.702951192855835, "loss/hidden": 1.1484375, "loss/logits": 0.19300052523612976, "loss/reg": 9.282527025789022e-05, "step": 2076 }, { "epoch": 0.259625, "grad_norm": 5.085419654846191, "grad_norm_var": 8.296900135978436, "learning_rate": 0.0001, "loss": 1.9834, "loss/crossentropy": 3.033550977706909, "loss/hidden": 1.6640625, "loss/logits": 0.3184581995010376, "loss/reg": 9.278040670324117e-05, "step": 2077 }, { "epoch": 0.25975, "grad_norm": 2.112060308456421, "grad_norm_var": 0.771831670711619, "learning_rate": 0.0001, "loss": 1.1499, "loss/crossentropy": 2.455791473388672, "loss/hidden": 0.9921875, "loss/logits": 0.15683138370513916, "loss/reg": 9.273832984035835e-05, "step": 2078 }, { "epoch": 0.259875, "grad_norm": 2.1645634174346924, "grad_norm_var": 0.8114262396245712, "learning_rate": 0.0001, "loss": 1.1393, "loss/crossentropy": 2.5414977073669434, "loss/hidden": 0.99609375, "loss/logits": 0.14225973188877106, "loss/reg": 9.270187001675367e-05, "step": 2079 }, { "epoch": 0.26, "grad_norm": 2.680417537689209, "grad_norm_var": 0.819913983848757, "learning_rate": 0.0001, "loss": 1.3218, "loss/crossentropy": 2.319683313369751, "loss/hidden": 1.140625, "loss/logits": 0.18022984266281128, "loss/reg": 9.266919369110838e-05, "step": 2080 }, { "epoch": 0.260125, "grad_norm": 2.5630970001220703, "grad_norm_var": 0.8117787487252672, "learning_rate": 0.0001, "loss": 1.3005, "loss/crossentropy": 2.5042214393615723, "loss/hidden": 1.1171875, "loss/logits": 0.1823413074016571, "loss/reg": 9.263054380426183e-05, "step": 2081 }, { "epoch": 0.26025, "grad_norm": 2.1475729942321777, "grad_norm_var": 0.8175233608235788, "learning_rate": 0.0001, "loss": 1.1162, "loss/crossentropy": 2.6608388423919678, "loss/hidden": 0.97265625, "loss/logits": 0.14263775944709778, "loss/reg": 9.259507351089269e-05, "step": 2082 }, { "epoch": 0.260375, "grad_norm": 2.5348618030548096, "grad_norm_var": 0.8259877936207923, "learning_rate": 0.0001, "loss": 1.383, "loss/crossentropy": 2.2249271869659424, "loss/hidden": 1.1875, "loss/logits": 0.1946042776107788, "loss/reg": 9.255597979063168e-05, "step": 2083 }, { "epoch": 0.2605, "grad_norm": 2.438204765319824, "grad_norm_var": 0.7546339742684183, "learning_rate": 0.0001, "loss": 1.2242, "loss/crossentropy": 2.665700912475586, "loss/hidden": 1.046875, "loss/logits": 0.17639219760894775, "loss/reg": 9.251668961951509e-05, "step": 2084 }, { "epoch": 0.260625, "grad_norm": 2.814317464828491, "grad_norm_var": 0.7544068362733507, "learning_rate": 0.0001, "loss": 1.3199, "loss/crossentropy": 2.6239285469055176, "loss/hidden": 1.125, "loss/logits": 0.1939510703086853, "loss/reg": 9.247325215255842e-05, "step": 2085 }, { "epoch": 0.26075, "grad_norm": 2.89892315864563, "grad_norm_var": 0.753317376784944, "learning_rate": 0.0001, "loss": 1.8204, "loss/crossentropy": 2.3342649936676025, "loss/hidden": 1.515625, "loss/logits": 0.3038952350616455, "loss/reg": 9.242909436579794e-05, "step": 2086 }, { "epoch": 0.260875, "grad_norm": 4.04351282119751, "grad_norm_var": 0.6780741299518295, "learning_rate": 0.0001, "loss": 1.3835, "loss/crossentropy": 2.612851619720459, "loss/hidden": 1.234375, "loss/logits": 0.14824256300926208, "loss/reg": 9.237717313226312e-05, "step": 2087 }, { "epoch": 0.261, "grad_norm": 2.387793779373169, "grad_norm_var": 0.6475925615023689, "learning_rate": 0.0001, "loss": 1.387, "loss/crossentropy": 2.4238879680633545, "loss/hidden": 1.1796875, "loss/logits": 0.20634642243385315, "loss/reg": 9.233960008714348e-05, "step": 2088 }, { "epoch": 0.261125, "grad_norm": 2.1520371437072754, "grad_norm_var": 0.6250789189831548, "learning_rate": 0.0001, "loss": 1.1428, "loss/crossentropy": 2.4199275970458984, "loss/hidden": 0.9921875, "loss/logits": 0.14970625936985016, "loss/reg": 9.230035357177258e-05, "step": 2089 }, { "epoch": 0.26125, "grad_norm": 2.5915846824645996, "grad_norm_var": 0.6192332755858082, "learning_rate": 0.0001, "loss": 1.4866, "loss/crossentropy": 2.709597587585449, "loss/hidden": 1.265625, "loss/logits": 0.22002598643302917, "loss/reg": 9.226069232681766e-05, "step": 2090 }, { "epoch": 0.261375, "grad_norm": 2.588500738143921, "grad_norm_var": 0.5998088969038236, "learning_rate": 0.0001, "loss": 1.3099, "loss/crossentropy": 2.4175915718078613, "loss/hidden": 1.0859375, "loss/logits": 0.22300255298614502, "loss/reg": 9.221673099091277e-05, "step": 2091 }, { "epoch": 0.2615, "grad_norm": 2.7310712337493896, "grad_norm_var": 0.5993058411467008, "learning_rate": 0.0001, "loss": 1.2283, "loss/crossentropy": 2.7236084938049316, "loss/hidden": 1.0625, "loss/logits": 0.16487032175064087, "loss/reg": 9.217495971824974e-05, "step": 2092 }, { "epoch": 0.261625, "grad_norm": 3.5325915813446045, "grad_norm_var": 0.26562165191305076, "learning_rate": 0.0001, "loss": 1.6905, "loss/crossentropy": 2.6735928058624268, "loss/hidden": 1.4140625, "loss/logits": 0.27555668354034424, "loss/reg": 9.213597513735294e-05, "step": 2093 }, { "epoch": 0.26175, "grad_norm": 2.2293472290039062, "grad_norm_var": 0.2580874396191111, "learning_rate": 0.0001, "loss": 1.305, "loss/crossentropy": 2.5095393657684326, "loss/hidden": 1.125, "loss/logits": 0.1790449321269989, "loss/reg": 9.209771815221757e-05, "step": 2094 }, { "epoch": 0.261875, "grad_norm": 2.1910505294799805, "grad_norm_var": 0.25639519362901503, "learning_rate": 0.0001, "loss": 1.3524, "loss/crossentropy": 2.4585516452789307, "loss/hidden": 1.1484375, "loss/logits": 0.20302852988243103, "loss/reg": 9.205927926814184e-05, "step": 2095 }, { "epoch": 0.262, "grad_norm": 2.0321285724639893, "grad_norm_var": 0.28070803465058136, "learning_rate": 0.0001, "loss": 1.1495, "loss/crossentropy": 2.278193235397339, "loss/hidden": 0.99609375, "loss/logits": 0.15247491002082825, "loss/reg": 9.201082139043137e-05, "step": 2096 }, { "epoch": 0.262125, "grad_norm": 2.956857204437256, "grad_norm_var": 0.28755341810854795, "learning_rate": 0.0001, "loss": 1.4418, "loss/crossentropy": 2.5081677436828613, "loss/hidden": 1.2265625, "loss/logits": 0.21433480083942413, "loss/reg": 9.197665349347517e-05, "step": 2097 }, { "epoch": 0.26225, "grad_norm": 3.3956503868103027, "grad_norm_var": 0.3026488377333247, "learning_rate": 0.0001, "loss": 1.402, "loss/crossentropy": 2.5740201473236084, "loss/hidden": 1.21875, "loss/logits": 0.18236249685287476, "loss/reg": 9.19240846997127e-05, "step": 2098 }, { "epoch": 0.262375, "grad_norm": 3.2248470783233643, "grad_norm_var": 0.31538047661828444, "learning_rate": 0.0001, "loss": 1.5047, "loss/crossentropy": 2.483081340789795, "loss/hidden": 1.2734375, "loss/logits": 0.23030242323875427, "loss/reg": 9.18762234505266e-05, "step": 2099 }, { "epoch": 0.2625, "grad_norm": 2.6450891494750977, "grad_norm_var": 0.30909548549601124, "learning_rate": 0.0001, "loss": 1.2504, "loss/crossentropy": 2.083801507949829, "loss/hidden": 1.0625, "loss/logits": 0.18701060116291046, "loss/reg": 9.183735528495163e-05, "step": 2100 }, { "epoch": 0.262625, "grad_norm": 2.6805336475372314, "grad_norm_var": 0.3095298391255407, "learning_rate": 0.0001, "loss": 1.4171, "loss/crossentropy": 2.605583429336548, "loss/hidden": 1.1875, "loss/logits": 0.22866840660572052, "loss/reg": 9.179821063298732e-05, "step": 2101 }, { "epoch": 0.26275, "grad_norm": 2.966921806335449, "grad_norm_var": 0.31100951319271697, "learning_rate": 0.0001, "loss": 1.4912, "loss/crossentropy": 2.2555954456329346, "loss/hidden": 1.3125, "loss/logits": 0.17776569724082947, "loss/reg": 9.17647557798773e-05, "step": 2102 }, { "epoch": 0.262875, "grad_norm": 2.4917759895324707, "grad_norm_var": 0.19839659218133246, "learning_rate": 0.0001, "loss": 1.4037, "loss/crossentropy": 2.4854156970977783, "loss/hidden": 1.203125, "loss/logits": 0.19967535138130188, "loss/reg": 9.172321006190032e-05, "step": 2103 }, { "epoch": 0.263, "grad_norm": 3.304420232772827, "grad_norm_var": 0.21582485487947428, "learning_rate": 0.0001, "loss": 1.658, "loss/crossentropy": 2.7377443313598633, "loss/hidden": 1.3828125, "loss/logits": 0.2742348909378052, "loss/reg": 9.168332326225936e-05, "step": 2104 }, { "epoch": 0.263125, "grad_norm": 3.256571054458618, "grad_norm_var": 0.20664057647762396, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.513913631439209, "loss/hidden": 1.109375, "loss/logits": 0.1678800880908966, "loss/reg": 9.164318180410191e-05, "step": 2105 }, { "epoch": 0.26325, "grad_norm": 3.496617078781128, "grad_norm_var": 0.23254076927624373, "learning_rate": 0.0001, "loss": 1.5928, "loss/crossentropy": 2.3363990783691406, "loss/hidden": 1.34375, "loss/logits": 0.24812299013137817, "loss/reg": 9.160180343315005e-05, "step": 2106 }, { "epoch": 0.263375, "grad_norm": 2.95719575881958, "grad_norm_var": 0.22780073684387692, "learning_rate": 0.0001, "loss": 1.3355, "loss/crossentropy": 3.1020922660827637, "loss/hidden": 1.140625, "loss/logits": 0.1939433515071869, "loss/reg": 9.156770101981238e-05, "step": 2107 }, { "epoch": 0.2635, "grad_norm": 2.8837075233459473, "grad_norm_var": 0.22620981309493662, "learning_rate": 0.0001, "loss": 1.2706, "loss/crossentropy": 2.7670750617980957, "loss/hidden": 1.109375, "loss/logits": 0.1602964848279953, "loss/reg": 9.152858547167853e-05, "step": 2108 }, { "epoch": 0.263625, "grad_norm": 2.561954975128174, "grad_norm_var": 0.20197313082893043, "learning_rate": 0.0001, "loss": 1.3867, "loss/crossentropy": 2.1758992671966553, "loss/hidden": 1.21875, "loss/logits": 0.16704291105270386, "loss/reg": 9.148954268312082e-05, "step": 2109 }, { "epoch": 0.26375, "grad_norm": 4.583591938018799, "grad_norm_var": 0.35993751181128647, "learning_rate": 0.0001, "loss": 1.4437, "loss/crossentropy": 2.0622799396514893, "loss/hidden": 1.2421875, "loss/logits": 0.20058636367321014, "loss/reg": 9.14481352083385e-05, "step": 2110 }, { "epoch": 0.263875, "grad_norm": 3.719545602798462, "grad_norm_var": 0.34581942180666353, "learning_rate": 0.0001, "loss": 1.4632, "loss/crossentropy": 2.584282636642456, "loss/hidden": 1.265625, "loss/logits": 0.19665011763572693, "loss/reg": 9.140949259744957e-05, "step": 2111 }, { "epoch": 0.264, "grad_norm": 3.101961135864258, "grad_norm_var": 0.2689732898110113, "learning_rate": 0.0001, "loss": 1.425, "loss/crossentropy": 2.5918591022491455, "loss/hidden": 1.203125, "loss/logits": 0.22097471356391907, "loss/reg": 9.137268352787942e-05, "step": 2112 }, { "epoch": 0.264125, "grad_norm": 4.002130508422852, "grad_norm_var": 0.3118471298055046, "learning_rate": 0.0001, "loss": 1.5599, "loss/crossentropy": 2.2550313472747803, "loss/hidden": 1.359375, "loss/logits": 0.19957178831100464, "loss/reg": 9.133792627835646e-05, "step": 2113 }, { "epoch": 0.26425, "grad_norm": 2.085766077041626, "grad_norm_var": 0.3857053932478263, "learning_rate": 0.0001, "loss": 1.2629, "loss/crossentropy": 2.7203943729400635, "loss/hidden": 1.09375, "loss/logits": 0.16819560527801514, "loss/reg": 9.131157275987789e-05, "step": 2114 }, { "epoch": 0.264375, "grad_norm": 2.4794015884399414, "grad_norm_var": 0.41027973359810704, "learning_rate": 0.0001, "loss": 1.3448, "loss/crossentropy": 2.3258554935455322, "loss/hidden": 1.15625, "loss/logits": 0.18767914175987244, "loss/reg": 9.127124940278009e-05, "step": 2115 }, { "epoch": 0.2645, "grad_norm": 1.8872106075286865, "grad_norm_var": 0.48972969947348566, "learning_rate": 0.0001, "loss": 1.208, "loss/crossentropy": 2.388852596282959, "loss/hidden": 1.046875, "loss/logits": 0.16016846895217896, "loss/reg": 9.123095514951274e-05, "step": 2116 }, { "epoch": 0.264625, "grad_norm": 2.6246016025543213, "grad_norm_var": 0.4925217607404759, "learning_rate": 0.0001, "loss": 1.3418, "loss/crossentropy": 2.3809802532196045, "loss/hidden": 1.15625, "loss/logits": 0.1846769154071808, "loss/reg": 9.119183232542127e-05, "step": 2117 }, { "epoch": 0.26475, "grad_norm": 2.4297525882720947, "grad_norm_var": 0.5147309939223722, "learning_rate": 0.0001, "loss": 1.5291, "loss/crossentropy": 2.7968549728393555, "loss/hidden": 1.28125, "loss/logits": 0.24697484076023102, "loss/reg": 9.114842396229506e-05, "step": 2118 }, { "epoch": 0.264875, "grad_norm": 2.824307680130005, "grad_norm_var": 0.49947942585540633, "learning_rate": 0.0001, "loss": 1.2374, "loss/crossentropy": 2.4023280143737793, "loss/hidden": 1.078125, "loss/logits": 0.15833215415477753, "loss/reg": 9.110840619541705e-05, "step": 2119 }, { "epoch": 0.265, "grad_norm": 12.156834602355957, "grad_norm_var": 5.741960033924378, "learning_rate": 0.0001, "loss": 2.8821, "loss/crossentropy": 3.2864983081817627, "loss/hidden": 2.125, "loss/logits": 0.7561862468719482, "loss/reg": 9.106768266065046e-05, "step": 2120 }, { "epoch": 0.265125, "grad_norm": 3.85833477973938, "grad_norm_var": 5.739789745413069, "learning_rate": 0.0001, "loss": 1.4741, "loss/crossentropy": 2.95538592338562, "loss/hidden": 1.28125, "loss/logits": 0.1919463872909546, "loss/reg": 9.102983312914148e-05, "step": 2121 }, { "epoch": 0.26525, "grad_norm": 3.0685269832611084, "grad_norm_var": 5.757333293142827, "learning_rate": 0.0001, "loss": 1.369, "loss/crossentropy": 2.4340481758117676, "loss/hidden": 1.1875, "loss/logits": 0.18060876429080963, "loss/reg": 9.098959708353505e-05, "step": 2122 }, { "epoch": 0.265375, "grad_norm": 2.691805124282837, "grad_norm_var": 5.783651466596406, "learning_rate": 0.0001, "loss": 1.248, "loss/crossentropy": 2.6185781955718994, "loss/hidden": 1.0859375, "loss/logits": 0.16115383803844452, "loss/reg": 9.095053974306211e-05, "step": 2123 }, { "epoch": 0.2655, "grad_norm": 3.2312166690826416, "grad_norm_var": 5.759865061112721, "learning_rate": 0.0001, "loss": 1.7269, "loss/crossentropy": 2.2541229724884033, "loss/hidden": 1.4609375, "loss/logits": 0.2650327682495117, "loss/reg": 9.090742241824046e-05, "step": 2124 }, { "epoch": 0.265625, "grad_norm": 2.5996127128601074, "grad_norm_var": 5.754833601413936, "learning_rate": 0.0001, "loss": 1.3558, "loss/crossentropy": 2.535979986190796, "loss/hidden": 1.171875, "loss/logits": 0.18304461240768433, "loss/reg": 9.086965292226523e-05, "step": 2125 }, { "epoch": 0.26575, "grad_norm": 2.81913161277771, "grad_norm_var": 5.714259566149964, "learning_rate": 0.0001, "loss": 1.3728, "loss/crossentropy": 2.7871487140655518, "loss/hidden": 1.1953125, "loss/logits": 0.17660707235336304, "loss/reg": 9.083384065888822e-05, "step": 2126 }, { "epoch": 0.265875, "grad_norm": 3.558943271636963, "grad_norm_var": 5.710608443077858, "learning_rate": 0.0001, "loss": 1.7603, "loss/crossentropy": 2.760807752609253, "loss/hidden": 1.46875, "loss/logits": 0.2906295657157898, "loss/reg": 9.079890878638253e-05, "step": 2127 }, { "epoch": 0.266, "grad_norm": 4.221079349517822, "grad_norm_var": 5.7349047534613105, "learning_rate": 0.0001, "loss": 1.7137, "loss/crossentropy": 2.5065906047821045, "loss/hidden": 1.4921875, "loss/logits": 0.22063076496124268, "loss/reg": 9.076666901819408e-05, "step": 2128 }, { "epoch": 0.266125, "grad_norm": 2.4123337268829346, "grad_norm_var": 5.793568830798606, "learning_rate": 0.0001, "loss": 1.4017, "loss/crossentropy": 2.5079715251922607, "loss/hidden": 1.203125, "loss/logits": 0.19762490689754486, "loss/reg": 9.073211549548432e-05, "step": 2129 }, { "epoch": 0.26625, "grad_norm": 2.698634386062622, "grad_norm_var": 5.706847508352249, "learning_rate": 0.0001, "loss": 1.3949, "loss/crossentropy": 2.459258556365967, "loss/hidden": 1.1875, "loss/logits": 0.20652708411216736, "loss/reg": 9.069246880244464e-05, "step": 2130 }, { "epoch": 0.266375, "grad_norm": 19.905359268188477, "grad_norm_var": 22.378171292136773, "learning_rate": 0.0001, "loss": 1.6105, "loss/crossentropy": 2.759188175201416, "loss/hidden": 1.3671875, "loss/logits": 0.24243687093257904, "loss/reg": 9.064989717444405e-05, "step": 2131 }, { "epoch": 0.2665, "grad_norm": 2.2006354331970215, "grad_norm_var": 22.272542871008294, "learning_rate": 0.0001, "loss": 1.3277, "loss/crossentropy": 2.5931265354156494, "loss/hidden": 1.1328125, "loss/logits": 0.1939885914325714, "loss/reg": 9.060993033926934e-05, "step": 2132 }, { "epoch": 0.266625, "grad_norm": 2.5270087718963623, "grad_norm_var": 22.29859969353696, "learning_rate": 0.0001, "loss": 1.5786, "loss/crossentropy": 2.3109354972839355, "loss/hidden": 1.3359375, "loss/logits": 0.24177613854408264, "loss/reg": 9.056490671355277e-05, "step": 2133 }, { "epoch": 0.26675, "grad_norm": 2.2362310886383057, "grad_norm_var": 22.356299558768665, "learning_rate": 0.0001, "loss": 1.5117, "loss/crossentropy": 2.4276862144470215, "loss/hidden": 1.2578125, "loss/logits": 0.2529800534248352, "loss/reg": 9.052040695678443e-05, "step": 2134 }, { "epoch": 0.266875, "grad_norm": 2.3193960189819336, "grad_norm_var": 22.48929291178717, "learning_rate": 0.0001, "loss": 1.3545, "loss/crossentropy": 2.7607345581054688, "loss/hidden": 1.1484375, "loss/logits": 0.2051132768392563, "loss/reg": 9.048045467352495e-05, "step": 2135 }, { "epoch": 0.267, "grad_norm": 2.420997142791748, "grad_norm_var": 18.515003264071165, "learning_rate": 0.0001, "loss": 1.3559, "loss/crossentropy": 2.586461067199707, "loss/hidden": 1.15625, "loss/logits": 0.19874610006809235, "loss/reg": 9.043844329426065e-05, "step": 2136 }, { "epoch": 0.267125, "grad_norm": 5.4822564125061035, "grad_norm_var": 18.66580498957947, "learning_rate": 0.0001, "loss": 1.8507, "loss/crossentropy": 1.905639410018921, "loss/hidden": 1.625, "loss/logits": 0.2248174250125885, "loss/reg": 9.040002623805776e-05, "step": 2137 }, { "epoch": 0.26725, "grad_norm": 3.23170804977417, "grad_norm_var": 18.646668095576448, "learning_rate": 0.0001, "loss": 1.604, "loss/crossentropy": 2.258283853530884, "loss/hidden": 1.3125, "loss/logits": 0.2906220853328705, "loss/reg": 9.036483970703557e-05, "step": 2138 }, { "epoch": 0.267375, "grad_norm": 3.150378942489624, "grad_norm_var": 18.57769796883996, "learning_rate": 0.0001, "loss": 1.3789, "loss/crossentropy": 2.741101026535034, "loss/hidden": 1.171875, "loss/logits": 0.2061539888381958, "loss/reg": 9.032683738041669e-05, "step": 2139 }, { "epoch": 0.2675, "grad_norm": 2.3450660705566406, "grad_norm_var": 18.725106061033443, "learning_rate": 0.0001, "loss": 1.3496, "loss/crossentropy": 2.4336600303649902, "loss/hidden": 1.140625, "loss/logits": 0.20809516310691833, "loss/reg": 9.0291905507911e-05, "step": 2140 }, { "epoch": 0.267625, "grad_norm": 2.1249938011169434, "grad_norm_var": 18.828314358771422, "learning_rate": 0.0001, "loss": 1.1692, "loss/crossentropy": 2.4852468967437744, "loss/hidden": 1.015625, "loss/logits": 0.15262919664382935, "loss/reg": 9.026085899677128e-05, "step": 2141 }, { "epoch": 0.26775, "grad_norm": 2.235658645629883, "grad_norm_var": 18.939777605520636, "learning_rate": 0.0001, "loss": 1.3667, "loss/crossentropy": 2.541858434677124, "loss/hidden": 1.15625, "loss/logits": 0.20956584811210632, "loss/reg": 9.023477468872443e-05, "step": 2142 }, { "epoch": 0.267875, "grad_norm": 2.8265771865844727, "grad_norm_var": 19.01069709117106, "learning_rate": 0.0001, "loss": 1.4534, "loss/crossentropy": 2.6113200187683105, "loss/hidden": 1.203125, "loss/logits": 0.24933210015296936, "loss/reg": 9.021165897138417e-05, "step": 2143 }, { "epoch": 0.268, "grad_norm": 3.0839881896972656, "grad_norm_var": 19.042244059371143, "learning_rate": 0.0001, "loss": 1.7227, "loss/crossentropy": 2.591771125793457, "loss/hidden": 1.4609375, "loss/logits": 0.2608566880226135, "loss/reg": 9.017697448143736e-05, "step": 2144 }, { "epoch": 0.268125, "grad_norm": 3.0496013164520264, "grad_norm_var": 18.94758658019878, "learning_rate": 0.0001, "loss": 1.5843, "loss/crossentropy": 2.2692813873291016, "loss/hidden": 1.328125, "loss/logits": 0.2553032636642456, "loss/reg": 9.013501403387636e-05, "step": 2145 }, { "epoch": 0.26825, "grad_norm": 2.9669201374053955, "grad_norm_var": 18.910365962271175, "learning_rate": 0.0001, "loss": 1.2365, "loss/crossentropy": 2.015852451324463, "loss/hidden": 1.0703125, "loss/logits": 0.16525325179100037, "loss/reg": 9.010936628328636e-05, "step": 2146 }, { "epoch": 0.268375, "grad_norm": 2.614943265914917, "grad_norm_var": 0.6544456670142987, "learning_rate": 0.0001, "loss": 1.4472, "loss/crossentropy": 2.838805913925171, "loss/hidden": 1.2265625, "loss/logits": 0.2197452187538147, "loss/reg": 9.007466724142432e-05, "step": 2147 }, { "epoch": 0.2685, "grad_norm": 3.185328483581543, "grad_norm_var": 0.6362206753821122, "learning_rate": 0.0001, "loss": 1.3221, "loss/crossentropy": 2.5648984909057617, "loss/hidden": 1.125, "loss/logits": 0.1962217092514038, "loss/reg": 9.004329331219196e-05, "step": 2148 }, { "epoch": 0.268625, "grad_norm": 3.2083404064178467, "grad_norm_var": 0.6347505552427255, "learning_rate": 0.0001, "loss": 1.5453, "loss/crossentropy": 2.1921334266662598, "loss/hidden": 1.3125, "loss/logits": 0.2318781316280365, "loss/reg": 9.000660793390125e-05, "step": 2149 }, { "epoch": 0.26875, "grad_norm": 3.028928518295288, "grad_norm_var": 0.6033236889944609, "learning_rate": 0.0001, "loss": 1.4137, "loss/crossentropy": 2.714355707168579, "loss/hidden": 1.21875, "loss/logits": 0.19405603408813477, "loss/reg": 8.997020631795749e-05, "step": 2150 }, { "epoch": 0.268875, "grad_norm": 2.7356183528900146, "grad_norm_var": 0.5788946332629391, "learning_rate": 0.0001, "loss": 1.3096, "loss/crossentropy": 2.6323273181915283, "loss/hidden": 1.125, "loss/logits": 0.18367579579353333, "loss/reg": 8.993390656542033e-05, "step": 2151 }, { "epoch": 0.269, "grad_norm": 3.8078200817108154, "grad_norm_var": 0.5956037856736164, "learning_rate": 0.0001, "loss": 1.4226, "loss/crossentropy": 2.398768901824951, "loss/hidden": 1.234375, "loss/logits": 0.18729771673679352, "loss/reg": 8.989709749585018e-05, "step": 2152 }, { "epoch": 0.269125, "grad_norm": 2.6075973510742188, "grad_norm_var": 0.18649112898183282, "learning_rate": 0.0001, "loss": 1.4588, "loss/crossentropy": 2.4405405521392822, "loss/hidden": 1.2421875, "loss/logits": 0.21567553281784058, "loss/reg": 8.987094042822719e-05, "step": 2153 }, { "epoch": 0.26925, "grad_norm": 2.47141695022583, "grad_norm_var": 0.18774765732907686, "learning_rate": 0.0001, "loss": 1.3047, "loss/crossentropy": 2.2910423278808594, "loss/hidden": 1.109375, "loss/logits": 0.1944679617881775, "loss/reg": 8.984307351056486e-05, "step": 2154 }, { "epoch": 0.269375, "grad_norm": 3.305403470993042, "grad_norm_var": 0.19566110315658894, "learning_rate": 0.0001, "loss": 1.4715, "loss/crossentropy": 2.610257863998413, "loss/hidden": 1.265625, "loss/logits": 0.2049887776374817, "loss/reg": 8.981455175671726e-05, "step": 2155 }, { "epoch": 0.2695, "grad_norm": 2.599703550338745, "grad_norm_var": 0.18257408627172797, "learning_rate": 0.0001, "loss": 1.4432, "loss/crossentropy": 2.154520034790039, "loss/hidden": 1.2421875, "loss/logits": 0.20016363263130188, "loss/reg": 8.97833815542981e-05, "step": 2156 }, { "epoch": 0.269625, "grad_norm": 2.838005781173706, "grad_norm_var": 0.14392082127178488, "learning_rate": 0.0001, "loss": 1.5304, "loss/crossentropy": 2.4893879890441895, "loss/hidden": 1.296875, "loss/logits": 0.2326088845729828, "loss/reg": 8.974419324658811e-05, "step": 2157 }, { "epoch": 0.26975, "grad_norm": 2.557812213897705, "grad_norm_var": 0.1214260145439501, "learning_rate": 0.0001, "loss": 1.3164, "loss/crossentropy": 2.419548273086548, "loss/hidden": 1.1328125, "loss/logits": 0.18268349766731262, "loss/reg": 8.971517672762275e-05, "step": 2158 }, { "epoch": 0.269875, "grad_norm": 4.894566059112549, "grad_norm_var": 0.36005721794501727, "learning_rate": 0.0001, "loss": 1.4661, "loss/crossentropy": 2.518158435821533, "loss/hidden": 1.2578125, "loss/logits": 0.20741476118564606, "loss/reg": 8.968536712927744e-05, "step": 2159 }, { "epoch": 0.27, "grad_norm": 2.621678590774536, "grad_norm_var": 0.3719212576213823, "learning_rate": 0.0001, "loss": 1.29, "loss/crossentropy": 2.6870412826538086, "loss/hidden": 1.109375, "loss/logits": 0.17970490455627441, "loss/reg": 8.965859160525724e-05, "step": 2160 }, { "epoch": 0.270125, "grad_norm": 2.8002030849456787, "grad_norm_var": 0.37518536131472213, "learning_rate": 0.0001, "loss": 1.2354, "loss/crossentropy": 2.7394697666168213, "loss/hidden": 1.0546875, "loss/logits": 0.17978498339653015, "loss/reg": 8.961993444245309e-05, "step": 2161 }, { "epoch": 0.27025, "grad_norm": 2.7785491943359375, "grad_norm_var": 0.378617395402541, "learning_rate": 0.0001, "loss": 1.5087, "loss/crossentropy": 2.556204080581665, "loss/hidden": 1.296875, "loss/logits": 0.2109331339597702, "loss/reg": 8.95902921911329e-05, "step": 2162 }, { "epoch": 0.270375, "grad_norm": 4.290672302246094, "grad_norm_var": 0.4673073749014764, "learning_rate": 0.0001, "loss": 1.4328, "loss/crossentropy": 2.574556350708008, "loss/hidden": 1.21875, "loss/logits": 0.2131568193435669, "loss/reg": 8.955172961577773e-05, "step": 2163 }, { "epoch": 0.2705, "grad_norm": 2.3835153579711914, "grad_norm_var": 0.49924617818857253, "learning_rate": 0.0001, "loss": 1.2862, "loss/crossentropy": 2.6769771575927734, "loss/hidden": 1.1015625, "loss/logits": 0.18374209105968475, "loss/reg": 8.951124618761241e-05, "step": 2164 }, { "epoch": 0.270625, "grad_norm": 3.2428500652313232, "grad_norm_var": 0.5000118433207279, "learning_rate": 0.0001, "loss": 1.5199, "loss/crossentropy": 2.4590914249420166, "loss/hidden": 1.3046875, "loss/logits": 0.21435073018074036, "loss/reg": 8.9471330284141e-05, "step": 2165 }, { "epoch": 0.27075, "grad_norm": 2.5840418338775635, "grad_norm_var": 0.5142413020726221, "learning_rate": 0.0001, "loss": 1.2298, "loss/crossentropy": 2.42362117767334, "loss/hidden": 1.0703125, "loss/logits": 0.15861323475837708, "loss/reg": 8.943559078034014e-05, "step": 2166 }, { "epoch": 0.270875, "grad_norm": 2.706963300704956, "grad_norm_var": 0.5154267791293816, "learning_rate": 0.0001, "loss": 1.674, "loss/crossentropy": 2.3239402770996094, "loss/hidden": 1.4296875, "loss/logits": 0.24342799186706543, "loss/reg": 8.93983306013979e-05, "step": 2167 }, { "epoch": 0.271, "grad_norm": 3.0479815006256104, "grad_norm_var": 0.47277746533202747, "learning_rate": 0.0001, "loss": 1.4264, "loss/crossentropy": 2.976659059524536, "loss/hidden": 1.234375, "loss/logits": 0.19112327694892883, "loss/reg": 8.93661635927856e-05, "step": 2168 }, { "epoch": 0.271125, "grad_norm": 2.422661304473877, "grad_norm_var": 0.4841763427608041, "learning_rate": 0.0001, "loss": 1.3961, "loss/crossentropy": 2.507882833480835, "loss/hidden": 1.1875, "loss/logits": 0.20770679414272308, "loss/reg": 8.932583295973018e-05, "step": 2169 }, { "epoch": 0.27125, "grad_norm": 2.400315284729004, "grad_norm_var": 0.4892344061319532, "learning_rate": 0.0001, "loss": 1.3286, "loss/crossentropy": 2.6506967544555664, "loss/hidden": 1.109375, "loss/logits": 0.2183111608028412, "loss/reg": 8.929073374019936e-05, "step": 2170 }, { "epoch": 0.271375, "grad_norm": 2.1473493576049805, "grad_norm_var": 0.5208287589444315, "learning_rate": 0.0001, "loss": 1.1853, "loss/crossentropy": 2.574639320373535, "loss/hidden": 1.015625, "loss/logits": 0.16880351305007935, "loss/reg": 8.925698784878477e-05, "step": 2171 }, { "epoch": 0.2715, "grad_norm": 3.158116340637207, "grad_norm_var": 0.5183460740627138, "learning_rate": 0.0001, "loss": 1.4407, "loss/crossentropy": 2.07778000831604, "loss/hidden": 1.25, "loss/logits": 0.18976925313472748, "loss/reg": 8.92263269633986e-05, "step": 2172 }, { "epoch": 0.271625, "grad_norm": 3.0437121391296387, "grad_norm_var": 0.5184756838295547, "learning_rate": 0.0001, "loss": 1.3577, "loss/crossentropy": 2.424862861633301, "loss/hidden": 1.15625, "loss/logits": 0.20058290660381317, "loss/reg": 8.918832463677973e-05, "step": 2173 }, { "epoch": 0.27175, "grad_norm": 2.7510032653808594, "grad_norm_var": 0.5108976688484584, "learning_rate": 0.0001, "loss": 1.3476, "loss/crossentropy": 2.714420795440674, "loss/hidden": 1.1640625, "loss/logits": 0.1825968623161316, "loss/reg": 8.914989302866161e-05, "step": 2174 }, { "epoch": 0.271875, "grad_norm": 2.244661569595337, "grad_norm_var": 0.2643550976842351, "learning_rate": 0.0001, "loss": 1.2563, "loss/crossentropy": 2.6799263954162598, "loss/hidden": 1.0859375, "loss/logits": 0.16943755745887756, "loss/reg": 8.911270560929552e-05, "step": 2175 }, { "epoch": 0.272, "grad_norm": 2.0984318256378174, "grad_norm_var": 0.2931413779694838, "learning_rate": 0.0001, "loss": 1.3093, "loss/crossentropy": 2.3841729164123535, "loss/hidden": 1.1171875, "loss/logits": 0.19126509130001068, "loss/reg": 8.90806841198355e-05, "step": 2176 }, { "epoch": 0.272125, "grad_norm": 2.1878421306610107, "grad_norm_var": 0.31299455654281755, "learning_rate": 0.0001, "loss": 1.2463, "loss/crossentropy": 2.6540238857269287, "loss/hidden": 1.0703125, "loss/logits": 0.17509347200393677, "loss/reg": 8.905051799956709e-05, "step": 2177 }, { "epoch": 0.27225, "grad_norm": 2.270481586456299, "grad_norm_var": 0.3250289283995033, "learning_rate": 0.0001, "loss": 1.1772, "loss/crossentropy": 2.292816400527954, "loss/hidden": 1.015625, "loss/logits": 0.16069671511650085, "loss/reg": 8.901660476112738e-05, "step": 2178 }, { "epoch": 0.272375, "grad_norm": 2.660308837890625, "grad_norm_var": 0.14239518259356387, "learning_rate": 0.0001, "loss": 1.5349, "loss/crossentropy": 2.373868703842163, "loss/hidden": 1.3125, "loss/logits": 0.22154484689235687, "loss/reg": 8.898842497728765e-05, "step": 2179 }, { "epoch": 0.2725, "grad_norm": 2.829458236694336, "grad_norm_var": 0.1428804487798286, "learning_rate": 0.0001, "loss": 1.3081, "loss/crossentropy": 2.7010090351104736, "loss/hidden": 1.1171875, "loss/logits": 0.19003155827522278, "loss/reg": 8.895338396541774e-05, "step": 2180 }, { "epoch": 0.272625, "grad_norm": 3.1504218578338623, "grad_norm_var": 0.1356431576911857, "learning_rate": 0.0001, "loss": 1.2885, "loss/crossentropy": 2.6727566719055176, "loss/hidden": 1.1015625, "loss/logits": 0.18604663014411926, "loss/reg": 8.891464676707983e-05, "step": 2181 }, { "epoch": 0.27275, "grad_norm": 3.0626466274261475, "grad_norm_var": 0.14852741778184372, "learning_rate": 0.0001, "loss": 1.384, "loss/crossentropy": 2.4894278049468994, "loss/hidden": 1.1953125, "loss/logits": 0.18780067563056946, "loss/reg": 8.887294097803533e-05, "step": 2182 }, { "epoch": 0.272875, "grad_norm": 2.203096866607666, "grad_norm_var": 0.15965421882931044, "learning_rate": 0.0001, "loss": 1.2007, "loss/crossentropy": 2.4092605113983154, "loss/hidden": 1.03125, "loss/logits": 0.16856923699378967, "loss/reg": 8.882681868271902e-05, "step": 2183 }, { "epoch": 0.273, "grad_norm": 2.4811582565307617, "grad_norm_var": 0.14624865568788792, "learning_rate": 0.0001, "loss": 1.3812, "loss/crossentropy": 2.675706148147583, "loss/hidden": 1.15625, "loss/logits": 0.22407343983650208, "loss/reg": 8.878641529008746e-05, "step": 2184 }, { "epoch": 0.273125, "grad_norm": 3.1112558841705322, "grad_norm_var": 0.16240408719023947, "learning_rate": 0.0001, "loss": 1.6927, "loss/crossentropy": 2.5677549839019775, "loss/hidden": 1.421875, "loss/logits": 0.2698906660079956, "loss/reg": 8.875471394276246e-05, "step": 2185 }, { "epoch": 0.27325, "grad_norm": 2.2244081497192383, "grad_norm_var": 0.16931506664392657, "learning_rate": 0.0001, "loss": 1.622, "loss/crossentropy": 2.28446102142334, "loss/hidden": 1.3671875, "loss/logits": 0.25393012166023254, "loss/reg": 8.872429316397756e-05, "step": 2186 }, { "epoch": 0.273375, "grad_norm": 2.6466481685638428, "grad_norm_var": 0.15466055447114566, "learning_rate": 0.0001, "loss": 1.4142, "loss/crossentropy": 2.5898969173431396, "loss/hidden": 1.21875, "loss/logits": 0.19457581639289856, "loss/reg": 8.869516750564799e-05, "step": 2187 }, { "epoch": 0.2735, "grad_norm": 2.591900110244751, "grad_norm_var": 0.13503366925752497, "learning_rate": 0.0001, "loss": 1.322, "loss/crossentropy": 2.970034122467041, "loss/hidden": 1.125, "loss/logits": 0.19611407816410065, "loss/reg": 8.867220458341762e-05, "step": 2188 }, { "epoch": 0.273625, "grad_norm": 2.3932650089263916, "grad_norm_var": 0.12276403983815551, "learning_rate": 0.0001, "loss": 1.3413, "loss/crossentropy": 2.3595399856567383, "loss/hidden": 1.140625, "loss/logits": 0.1997908055782318, "loss/reg": 8.86523921508342e-05, "step": 2189 }, { "epoch": 0.27375, "grad_norm": 2.6990280151367188, "grad_norm_var": 0.12158625923349646, "learning_rate": 0.0001, "loss": 1.3676, "loss/crossentropy": 2.6477363109588623, "loss/hidden": 1.1875, "loss/logits": 0.17919524013996124, "loss/reg": 8.861719106789678e-05, "step": 2190 }, { "epoch": 0.273875, "grad_norm": 2.3955636024475098, "grad_norm_var": 0.11679680127707855, "learning_rate": 0.0001, "loss": 1.2931, "loss/crossentropy": 2.4216725826263428, "loss/hidden": 1.1171875, "loss/logits": 0.17502285540103912, "loss/reg": 8.859470835886896e-05, "step": 2191 }, { "epoch": 0.274, "grad_norm": 3.660187244415283, "grad_norm_var": 0.17252751872557207, "learning_rate": 0.0001, "loss": 1.8769, "loss/crossentropy": 2.3117990493774414, "loss/hidden": 1.59375, "loss/logits": 0.28231126070022583, "loss/reg": 8.855896885506809e-05, "step": 2192 }, { "epoch": 0.274125, "grad_norm": 2.887098550796509, "grad_norm_var": 0.15902153630969618, "learning_rate": 0.0001, "loss": 1.2607, "loss/crossentropy": 2.4796411991119385, "loss/hidden": 1.1015625, "loss/logits": 0.1582556813955307, "loss/reg": 8.852600149111822e-05, "step": 2193 }, { "epoch": 0.27425, "grad_norm": 2.44486141204834, "grad_norm_var": 0.15083822106689293, "learning_rate": 0.0001, "loss": 1.4067, "loss/crossentropy": 2.8775737285614014, "loss/hidden": 1.1875, "loss/logits": 0.21830376982688904, "loss/reg": 8.848853758536279e-05, "step": 2194 }, { "epoch": 0.274375, "grad_norm": 2.6925208568573, "grad_norm_var": 0.15066782612197946, "learning_rate": 0.0001, "loss": 1.3492, "loss/crossentropy": 2.6197140216827393, "loss/hidden": 1.171875, "loss/logits": 0.17646434903144836, "loss/reg": 8.845413685776293e-05, "step": 2195 }, { "epoch": 0.2745, "grad_norm": 2.7529358863830566, "grad_norm_var": 0.14988736490731375, "learning_rate": 0.0001, "loss": 1.4735, "loss/crossentropy": 2.638287305831909, "loss/hidden": 1.265625, "loss/logits": 0.20699143409729004, "loss/reg": 8.842156967148185e-05, "step": 2196 }, { "epoch": 0.274625, "grad_norm": 2.632119655609131, "grad_norm_var": 0.13640076708652463, "learning_rate": 0.0001, "loss": 1.4439, "loss/crossentropy": 2.552913188934326, "loss/hidden": 1.2109375, "loss/logits": 0.23202946782112122, "loss/reg": 8.839357178658247e-05, "step": 2197 }, { "epoch": 0.27475, "grad_norm": 2.4767160415649414, "grad_norm_var": 0.12795764235743642, "learning_rate": 0.0001, "loss": 1.4848, "loss/crossentropy": 2.5421743392944336, "loss/hidden": 1.2734375, "loss/logits": 0.2104777693748474, "loss/reg": 8.83589600562118e-05, "step": 2198 }, { "epoch": 0.274875, "grad_norm": 4.943397045135498, "grad_norm_var": 0.4364477911770564, "learning_rate": 0.0001, "loss": 1.5386, "loss/crossentropy": 2.61765456199646, "loss/hidden": 1.2890625, "loss/logits": 0.24861164391040802, "loss/reg": 8.833300671540201e-05, "step": 2199 }, { "epoch": 0.275, "grad_norm": 2.4323232173919678, "grad_norm_var": 0.43876777889638346, "learning_rate": 0.0001, "loss": 1.3211, "loss/crossentropy": 2.677091360092163, "loss/hidden": 1.125, "loss/logits": 0.1952204406261444, "loss/reg": 8.830582373775542e-05, "step": 2200 }, { "epoch": 0.275125, "grad_norm": 2.6691641807556152, "grad_norm_var": 0.4333146605469892, "learning_rate": 0.0001, "loss": 1.3629, "loss/crossentropy": 2.563295841217041, "loss/hidden": 1.15625, "loss/logits": 0.20577046275138855, "loss/reg": 8.827573037706316e-05, "step": 2201 }, { "epoch": 0.27525, "grad_norm": 2.5552756786346436, "grad_norm_var": 0.4154751097746799, "learning_rate": 0.0001, "loss": 1.1759, "loss/crossentropy": 2.521275758743286, "loss/hidden": 1.015625, "loss/logits": 0.1593630015850067, "loss/reg": 8.823868120089173e-05, "step": 2202 }, { "epoch": 0.275375, "grad_norm": 2.817121744155884, "grad_norm_var": 0.41370206786959873, "learning_rate": 0.0001, "loss": 1.3641, "loss/crossentropy": 2.81768798828125, "loss/hidden": 1.15625, "loss/logits": 0.20697495341300964, "loss/reg": 8.82075255503878e-05, "step": 2203 }, { "epoch": 0.2755, "grad_norm": 6.086315631866455, "grad_norm_var": 1.0728373582734083, "learning_rate": 0.0001, "loss": 1.8475, "loss/crossentropy": 2.8540616035461426, "loss/hidden": 1.5625, "loss/logits": 0.28407561779022217, "loss/reg": 8.818701462587342e-05, "step": 2204 }, { "epoch": 0.275625, "grad_norm": 4.379019260406494, "grad_norm_var": 1.149744019531357, "learning_rate": 0.0001, "loss": 1.9067, "loss/crossentropy": 2.5547308921813965, "loss/hidden": 1.640625, "loss/logits": 0.26523351669311523, "loss/reg": 8.814973989501595e-05, "step": 2205 }, { "epoch": 0.27575, "grad_norm": 3.1754562854766846, "grad_norm_var": 1.1347921609338831, "learning_rate": 0.0001, "loss": 1.4239, "loss/crossentropy": 2.464599847793579, "loss/hidden": 1.234375, "loss/logits": 0.18866819143295288, "loss/reg": 8.81114392541349e-05, "step": 2206 }, { "epoch": 0.275875, "grad_norm": 3.1562039852142334, "grad_norm_var": 1.0906353653837242, "learning_rate": 0.0001, "loss": 1.4538, "loss/crossentropy": 3.1389553546905518, "loss/hidden": 1.2421875, "loss/logits": 0.21074464917182922, "loss/reg": 8.80814841366373e-05, "step": 2207 }, { "epoch": 0.276, "grad_norm": 2.7403523921966553, "grad_norm_var": 1.0913749291443604, "learning_rate": 0.0001, "loss": 1.4635, "loss/crossentropy": 2.5373611450195312, "loss/hidden": 1.2578125, "loss/logits": 0.20484095811843872, "loss/reg": 8.805259858490899e-05, "step": 2208 }, { "epoch": 0.276125, "grad_norm": 2.379646062850952, "grad_norm_var": 1.127121568284944, "learning_rate": 0.0001, "loss": 1.3275, "loss/crossentropy": 2.6416730880737305, "loss/hidden": 1.1484375, "loss/logits": 0.17820969223976135, "loss/reg": 8.80336228874512e-05, "step": 2209 }, { "epoch": 0.27625, "grad_norm": 3.214848518371582, "grad_norm_var": 1.092210715764092, "learning_rate": 0.0001, "loss": 1.6643, "loss/crossentropy": 2.470205068588257, "loss/hidden": 1.4140625, "loss/logits": 0.24930927157402039, "loss/reg": 8.7996173533611e-05, "step": 2210 }, { "epoch": 0.276375, "grad_norm": 2.2948567867279053, "grad_norm_var": 1.1286816914281759, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.60504150390625, "loss/hidden": 1.171875, "loss/logits": 0.19745083153247833, "loss/reg": 8.795940084382892e-05, "step": 2211 }, { "epoch": 0.2765, "grad_norm": 2.591606378555298, "grad_norm_var": 1.1392605371277207, "learning_rate": 0.0001, "loss": 1.2634, "loss/crossentropy": 2.619102716445923, "loss/hidden": 1.078125, "loss/logits": 0.18434739112854004, "loss/reg": 8.793108281679451e-05, "step": 2212 }, { "epoch": 0.276625, "grad_norm": 3.1959331035614014, "grad_norm_var": 1.1195181040918443, "learning_rate": 0.0001, "loss": 1.6207, "loss/crossentropy": 2.6664299964904785, "loss/hidden": 1.3671875, "loss/logits": 0.25261062383651733, "loss/reg": 8.790126594249159e-05, "step": 2213 }, { "epoch": 0.27675, "grad_norm": 2.6725211143493652, "grad_norm_var": 1.1031810399618118, "learning_rate": 0.0001, "loss": 1.4698, "loss/crossentropy": 2.5011229515075684, "loss/hidden": 1.21875, "loss/logits": 0.2501499652862549, "loss/reg": 8.78764913068153e-05, "step": 2214 }, { "epoch": 0.276875, "grad_norm": 2.8706202507019043, "grad_norm_var": 0.8916803303630625, "learning_rate": 0.0001, "loss": 1.41, "loss/crossentropy": 2.4667508602142334, "loss/hidden": 1.203125, "loss/logits": 0.20603647828102112, "loss/reg": 8.785132376942784e-05, "step": 2215 }, { "epoch": 0.277, "grad_norm": 3.125086545944214, "grad_norm_var": 0.8621318490670536, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.762350559234619, "loss/hidden": 1.1640625, "loss/logits": 0.1912788450717926, "loss/reg": 8.782604709267616e-05, "step": 2216 }, { "epoch": 0.277125, "grad_norm": 2.2561075687408447, "grad_norm_var": 0.897638627829667, "learning_rate": 0.0001, "loss": 1.4231, "loss/crossentropy": 2.377209424972534, "loss/hidden": 1.203125, "loss/logits": 0.21911945939064026, "loss/reg": 8.780218922765926e-05, "step": 2217 }, { "epoch": 0.27725, "grad_norm": 2.32624888420105, "grad_norm_var": 0.9173812364215297, "learning_rate": 0.0001, "loss": 1.245, "loss/crossentropy": 2.7420287132263184, "loss/hidden": 1.0703125, "loss/logits": 0.1738043576478958, "loss/reg": 8.776214235695079e-05, "step": 2218 }, { "epoch": 0.277375, "grad_norm": 2.8332111835479736, "grad_norm_var": 0.9168332132472128, "learning_rate": 0.0001, "loss": 1.3474, "loss/crossentropy": 2.651977777481079, "loss/hidden": 1.15625, "loss/logits": 0.19026127457618713, "loss/reg": 8.772014552960172e-05, "step": 2219 }, { "epoch": 0.2775, "grad_norm": 2.319690465927124, "grad_norm_var": 0.2942939619419339, "learning_rate": 0.0001, "loss": 1.1852, "loss/crossentropy": 2.6636173725128174, "loss/hidden": 1.03125, "loss/logits": 0.15307757258415222, "loss/reg": 8.768399129621685e-05, "step": 2220 }, { "epoch": 0.277625, "grad_norm": 2.954275131225586, "grad_norm_var": 0.12988658185201227, "learning_rate": 0.0001, "loss": 1.4321, "loss/crossentropy": 2.359200954437256, "loss/hidden": 1.25, "loss/logits": 0.18127068877220154, "loss/reg": 8.765376696828753e-05, "step": 2221 }, { "epoch": 0.27775, "grad_norm": 2.6260569095611572, "grad_norm_var": 0.1180738515996192, "learning_rate": 0.0001, "loss": 1.4049, "loss/crossentropy": 2.590933322906494, "loss/hidden": 1.2109375, "loss/logits": 0.1931142359972, "loss/reg": 8.762926154304296e-05, "step": 2222 }, { "epoch": 0.277875, "grad_norm": 2.9411120414733887, "grad_norm_var": 0.10852231939196007, "learning_rate": 0.0001, "loss": 1.7153, "loss/crossentropy": 2.2022244930267334, "loss/hidden": 1.46875, "loss/logits": 0.2456853687763214, "loss/reg": 8.75918340170756e-05, "step": 2223 }, { "epoch": 0.278, "grad_norm": 2.0700292587280273, "grad_norm_var": 0.133793270060058, "learning_rate": 0.0001, "loss": 1.267, "loss/crossentropy": 2.7503654956817627, "loss/hidden": 1.0859375, "loss/logits": 0.1801900565624237, "loss/reg": 8.755311864661053e-05, "step": 2224 }, { "epoch": 0.278125, "grad_norm": 5.334056377410889, "grad_norm_var": 0.5661358691002304, "learning_rate": 0.0001, "loss": 1.7348, "loss/crossentropy": 2.208407402038574, "loss/hidden": 1.46875, "loss/logits": 0.2651841938495636, "loss/reg": 8.752733265282586e-05, "step": 2225 }, { "epoch": 0.27825, "grad_norm": 2.495821714401245, "grad_norm_var": 0.5636275755811226, "learning_rate": 0.0001, "loss": 1.322, "loss/crossentropy": 2.36448073387146, "loss/hidden": 1.1484375, "loss/logits": 0.1726653277873993, "loss/reg": 8.7489235738758e-05, "step": 2226 }, { "epoch": 0.278375, "grad_norm": 2.6611454486846924, "grad_norm_var": 0.5470152853986633, "learning_rate": 0.0001, "loss": 1.2733, "loss/crossentropy": 2.7170708179473877, "loss/hidden": 1.09375, "loss/logits": 0.17867141962051392, "loss/reg": 8.745229570195079e-05, "step": 2227 }, { "epoch": 0.2785, "grad_norm": 2.310807943344116, "grad_norm_var": 0.5608535203702094, "learning_rate": 0.0001, "loss": 1.2892, "loss/crossentropy": 2.402653694152832, "loss/hidden": 1.109375, "loss/logits": 0.1789644956588745, "loss/reg": 8.741153578739613e-05, "step": 2228 }, { "epoch": 0.278625, "grad_norm": 2.439602851867676, "grad_norm_var": 0.5578929360176003, "learning_rate": 0.0001, "loss": 1.3109, "loss/crossentropy": 2.3066911697387695, "loss/hidden": 1.140625, "loss/logits": 0.1693601906299591, "loss/reg": 8.736821473576128e-05, "step": 2229 }, { "epoch": 0.27875, "grad_norm": 2.2621219158172607, "grad_norm_var": 0.5734677560868856, "learning_rate": 0.0001, "loss": 1.3819, "loss/crossentropy": 2.4448463916778564, "loss/hidden": 1.171875, "loss/logits": 0.20913071930408478, "loss/reg": 8.732335845706984e-05, "step": 2230 }, { "epoch": 0.278875, "grad_norm": 2.3285398483276367, "grad_norm_var": 0.5823292957469363, "learning_rate": 0.0001, "loss": 1.2399, "loss/crossentropy": 2.795330286026001, "loss/hidden": 1.0703125, "loss/logits": 0.1687251627445221, "loss/reg": 8.728443935979158e-05, "step": 2231 }, { "epoch": 0.279, "grad_norm": 2.525573492050171, "grad_norm_var": 0.5712326950973562, "learning_rate": 0.0001, "loss": 1.2414, "loss/crossentropy": 2.605377674102783, "loss/hidden": 1.0625, "loss/logits": 0.17799171805381775, "loss/reg": 8.724019426153973e-05, "step": 2232 }, { "epoch": 0.279125, "grad_norm": 2.319056987762451, "grad_norm_var": 0.5680251288052354, "learning_rate": 0.0001, "loss": 1.3215, "loss/crossentropy": 2.4763545989990234, "loss/hidden": 1.140625, "loss/logits": 0.17997178435325623, "loss/reg": 8.719746983842924e-05, "step": 2233 }, { "epoch": 0.27925, "grad_norm": 2.476876974105835, "grad_norm_var": 0.5625050390611683, "learning_rate": 0.0001, "loss": 1.3008, "loss/crossentropy": 2.4937450885772705, "loss/hidden": 1.125, "loss/logits": 0.17488956451416016, "loss/reg": 8.716309821465984e-05, "step": 2234 }, { "epoch": 0.279375, "grad_norm": 3.2335829734802246, "grad_norm_var": 0.5806425111201823, "learning_rate": 0.0001, "loss": 1.4179, "loss/crossentropy": 2.7160301208496094, "loss/hidden": 1.1796875, "loss/logits": 0.23737436532974243, "loss/reg": 8.71291704243049e-05, "step": 2235 }, { "epoch": 0.2795, "grad_norm": 5.2108869552612305, "grad_norm_var": 0.9541048858894781, "learning_rate": 0.0001, "loss": 1.605, "loss/crossentropy": 2.798490285873413, "loss/hidden": 1.3515625, "loss/logits": 0.25261300802230835, "loss/reg": 8.70938747539185e-05, "step": 2236 }, { "epoch": 0.279625, "grad_norm": 2.553628444671631, "grad_norm_var": 0.9605352480911958, "learning_rate": 0.0001, "loss": 1.248, "loss/crossentropy": 2.401689291000366, "loss/hidden": 1.078125, "loss/logits": 0.16896769404411316, "loss/reg": 8.705330401426181e-05, "step": 2237 }, { "epoch": 0.27975, "grad_norm": 2.945718765258789, "grad_norm_var": 0.956873719548085, "learning_rate": 0.0001, "loss": 1.575, "loss/crossentropy": 2.7476418018341064, "loss/hidden": 1.3515625, "loss/logits": 0.222568541765213, "loss/reg": 8.701300976099446e-05, "step": 2238 }, { "epoch": 0.279875, "grad_norm": 1.9718936681747437, "grad_norm_var": 1.0079184704379567, "learning_rate": 0.0001, "loss": 1.3423, "loss/crossentropy": 2.425795078277588, "loss/hidden": 1.140625, "loss/logits": 0.2007797658443451, "loss/reg": 8.697212615516037e-05, "step": 2239 }, { "epoch": 0.28, "grad_norm": 2.094433546066284, "grad_norm_var": 1.0055114260930764, "learning_rate": 0.0001, "loss": 1.2251, "loss/crossentropy": 2.385324239730835, "loss/hidden": 1.046875, "loss/logits": 0.17735999822616577, "loss/reg": 8.693499694345519e-05, "step": 2240 }, { "epoch": 0.280125, "grad_norm": 2.411120653152466, "grad_norm_var": 0.5607590803185976, "learning_rate": 0.0001, "loss": 1.2767, "loss/crossentropy": 2.4005250930786133, "loss/hidden": 1.1015625, "loss/logits": 0.17424529790878296, "loss/reg": 8.690132381161675e-05, "step": 2241 }, { "epoch": 0.28025, "grad_norm": 2.4842140674591064, "grad_norm_var": 0.5609907227245026, "learning_rate": 0.0001, "loss": 1.2804, "loss/crossentropy": 2.61702561378479, "loss/hidden": 1.0859375, "loss/logits": 0.19358505308628082, "loss/reg": 8.686765067977831e-05, "step": 2242 }, { "epoch": 0.280375, "grad_norm": 2.298415184020996, "grad_norm_var": 0.5681587392903878, "learning_rate": 0.0001, "loss": 1.4265, "loss/crossentropy": 2.1398396492004395, "loss/hidden": 1.203125, "loss/logits": 0.2225508987903595, "loss/reg": 8.683041232870892e-05, "step": 2243 }, { "epoch": 0.2805, "grad_norm": 2.6239306926727295, "grad_norm_var": 0.5615175974151649, "learning_rate": 0.0001, "loss": 1.3048, "loss/crossentropy": 2.396775007247925, "loss/hidden": 1.1484375, "loss/logits": 0.15548476576805115, "loss/reg": 8.679150050738826e-05, "step": 2244 }, { "epoch": 0.280625, "grad_norm": 2.245466947555542, "grad_norm_var": 0.5689626618997029, "learning_rate": 0.0001, "loss": 1.1395, "loss/crossentropy": 2.530561685562134, "loss/hidden": 0.9921875, "loss/logits": 0.14642968773841858, "loss/reg": 8.675204298924655e-05, "step": 2245 }, { "epoch": 0.28075, "grad_norm": 2.451308012008667, "grad_norm_var": 0.5620690126242001, "learning_rate": 0.0001, "loss": 1.188, "loss/crossentropy": 2.634861469268799, "loss/hidden": 1.0390625, "loss/logits": 0.14807121455669403, "loss/reg": 8.671054092701524e-05, "step": 2246 }, { "epoch": 0.280875, "grad_norm": 2.434732675552368, "grad_norm_var": 0.5584216753433139, "learning_rate": 0.0001, "loss": 1.2707, "loss/crossentropy": 2.4515867233276367, "loss/hidden": 1.1015625, "loss/logits": 0.16829489171504974, "loss/reg": 8.666779467603192e-05, "step": 2247 }, { "epoch": 0.281, "grad_norm": 2.6720073223114014, "grad_norm_var": 0.5574778965681578, "learning_rate": 0.0001, "loss": 1.2559, "loss/crossentropy": 2.633791923522949, "loss/hidden": 1.0859375, "loss/logits": 0.16914598643779755, "loss/reg": 8.663427433930337e-05, "step": 2248 }, { "epoch": 0.281125, "grad_norm": 2.5431721210479736, "grad_norm_var": 0.5506769387647826, "learning_rate": 0.0001, "loss": 1.2391, "loss/crossentropy": 2.4651074409484863, "loss/hidden": 1.078125, "loss/logits": 0.16012148559093475, "loss/reg": 8.658935985295102e-05, "step": 2249 }, { "epoch": 0.28125, "grad_norm": 4.321831226348877, "grad_norm_var": 0.7169657323671633, "learning_rate": 0.0001, "loss": 1.4956, "loss/crossentropy": 2.444957733154297, "loss/hidden": 1.296875, "loss/logits": 0.19782647490501404, "loss/reg": 8.655684359837323e-05, "step": 2250 }, { "epoch": 0.281375, "grad_norm": 2.2453696727752686, "grad_norm_var": 0.7183707693823578, "learning_rate": 0.0001, "loss": 1.315, "loss/crossentropy": 2.4684507846832275, "loss/hidden": 1.140625, "loss/logits": 0.173477903008461, "loss/reg": 8.652325777802616e-05, "step": 2251 }, { "epoch": 0.2815, "grad_norm": 2.432375192642212, "grad_norm_var": 0.2778094092018221, "learning_rate": 0.0001, "loss": 1.667, "loss/crossentropy": 1.923933982849121, "loss/hidden": 1.421875, "loss/logits": 0.2442706823348999, "loss/reg": 8.648153743706644e-05, "step": 2252 }, { "epoch": 0.281625, "grad_norm": 2.491603136062622, "grad_norm_var": 0.2779834692186786, "learning_rate": 0.0001, "loss": 1.2385, "loss/crossentropy": 2.4863057136535645, "loss/hidden": 1.078125, "loss/logits": 0.15955743193626404, "loss/reg": 8.644390618428588e-05, "step": 2253 }, { "epoch": 0.28175, "grad_norm": 3.215808629989624, "grad_norm_var": 0.29709138486451864, "learning_rate": 0.0001, "loss": 1.3259, "loss/crossentropy": 2.29951548576355, "loss/hidden": 1.140625, "loss/logits": 0.18445566296577454, "loss/reg": 8.640801388537511e-05, "step": 2254 }, { "epoch": 0.281875, "grad_norm": 3.3176157474517822, "grad_norm_var": 0.3050034629285534, "learning_rate": 0.0001, "loss": 1.4357, "loss/crossentropy": 2.6989905834198, "loss/hidden": 1.21875, "loss/logits": 0.2160765528678894, "loss/reg": 8.636836719233543e-05, "step": 2255 }, { "epoch": 0.282, "grad_norm": 4.370736122131348, "grad_norm_var": 0.4624439539423631, "learning_rate": 0.0001, "loss": 1.4501, "loss/crossentropy": 2.2942302227020264, "loss/hidden": 1.25, "loss/logits": 0.1992512345314026, "loss/reg": 8.632920798845589e-05, "step": 2256 }, { "epoch": 0.282125, "grad_norm": 2.149205207824707, "grad_norm_var": 0.47978743334393337, "learning_rate": 0.0001, "loss": 1.3249, "loss/crossentropy": 2.8126156330108643, "loss/hidden": 1.1484375, "loss/logits": 0.1756015121936798, "loss/reg": 8.629413787275553e-05, "step": 2257 }, { "epoch": 0.28225, "grad_norm": 1.982301115989685, "grad_norm_var": 0.5145646221077418, "learning_rate": 0.0001, "loss": 1.3848, "loss/crossentropy": 2.465181827545166, "loss/hidden": 1.1953125, "loss/logits": 0.18864589929580688, "loss/reg": 8.625644113635644e-05, "step": 2258 }, { "epoch": 0.282375, "grad_norm": 2.3129236698150635, "grad_norm_var": 0.5137288822538854, "learning_rate": 0.0001, "loss": 1.2937, "loss/crossentropy": 2.4024274349212646, "loss/hidden": 1.1015625, "loss/logits": 0.19125649333000183, "loss/reg": 8.622586756246164e-05, "step": 2259 }, { "epoch": 0.2825, "grad_norm": 2.9555180072784424, "grad_norm_var": 0.5155509778972671, "learning_rate": 0.0001, "loss": 1.4903, "loss/crossentropy": 2.3760111331939697, "loss/hidden": 1.265625, "loss/logits": 0.2238578498363495, "loss/reg": 8.619026630185544e-05, "step": 2260 }, { "epoch": 0.282625, "grad_norm": 2.6486096382141113, "grad_norm_var": 0.49811192052646897, "learning_rate": 0.0001, "loss": 1.4073, "loss/crossentropy": 2.5902373790740967, "loss/hidden": 1.2109375, "loss/logits": 0.19548428058624268, "loss/reg": 8.615739352535456e-05, "step": 2261 }, { "epoch": 0.28275, "grad_norm": 2.9500203132629395, "grad_norm_var": 0.4915295538173457, "learning_rate": 0.0001, "loss": 1.2801, "loss/crossentropy": 2.4696438312530518, "loss/hidden": 1.109375, "loss/logits": 0.16988299787044525, "loss/reg": 8.611797238700092e-05, "step": 2262 }, { "epoch": 0.282875, "grad_norm": 2.7882165908813477, "grad_norm_var": 0.4814052512528012, "learning_rate": 0.0001, "loss": 1.349, "loss/crossentropy": 2.500150203704834, "loss/hidden": 1.1640625, "loss/logits": 0.18404217064380646, "loss/reg": 8.608463394921273e-05, "step": 2263 }, { "epoch": 0.283, "grad_norm": 2.783068895339966, "grad_norm_var": 0.4797280042272319, "learning_rate": 0.0001, "loss": 1.4279, "loss/crossentropy": 2.4459609985351562, "loss/hidden": 1.2421875, "loss/logits": 0.1848832070827484, "loss/reg": 8.604559843661264e-05, "step": 2264 }, { "epoch": 0.283125, "grad_norm": 2.209566831588745, "grad_norm_var": 0.5000769845083297, "learning_rate": 0.0001, "loss": 1.2843, "loss/crossentropy": 2.5794692039489746, "loss/hidden": 1.125, "loss/logits": 0.15843886137008667, "loss/reg": 8.600354340160266e-05, "step": 2265 }, { "epoch": 0.28325, "grad_norm": 2.478034019470215, "grad_norm_var": 0.3441831536230618, "learning_rate": 0.0001, "loss": 1.305, "loss/crossentropy": 2.290719985961914, "loss/hidden": 1.1328125, "loss/logits": 0.1713644564151764, "loss/reg": 8.596442057751119e-05, "step": 2266 }, { "epoch": 0.283375, "grad_norm": 2.409048080444336, "grad_norm_var": 0.3357571665570953, "learning_rate": 0.0001, "loss": 1.262, "loss/crossentropy": 2.7898764610290527, "loss/hidden": 1.0859375, "loss/logits": 0.17524859309196472, "loss/reg": 8.591546065872535e-05, "step": 2267 }, { "epoch": 0.2835, "grad_norm": 3.767354965209961, "grad_norm_var": 0.3962284147868312, "learning_rate": 0.0001, "loss": 1.3649, "loss/crossentropy": 2.5811054706573486, "loss/hidden": 1.1796875, "loss/logits": 0.1843661516904831, "loss/reg": 8.587965567130595e-05, "step": 2268 }, { "epoch": 0.283625, "grad_norm": 2.533318042755127, "grad_norm_var": 0.39461157316316114, "learning_rate": 0.0001, "loss": 1.5185, "loss/crossentropy": 2.053884744644165, "loss/hidden": 1.2890625, "loss/logits": 0.22860948741436005, "loss/reg": 8.58287894516252e-05, "step": 2269 }, { "epoch": 0.28375, "grad_norm": 2.5956497192382812, "grad_norm_var": 0.38463528156536636, "learning_rate": 0.0001, "loss": 1.3511, "loss/crossentropy": 2.6852545738220215, "loss/hidden": 1.15625, "loss/logits": 0.1939740777015686, "loss/reg": 8.577957487432286e-05, "step": 2270 }, { "epoch": 0.283875, "grad_norm": 2.8728203773498535, "grad_norm_var": 0.3642684732817845, "learning_rate": 0.0001, "loss": 1.4811, "loss/crossentropy": 2.28361439704895, "loss/hidden": 1.28125, "loss/logits": 0.19897811114788055, "loss/reg": 8.573936065658927e-05, "step": 2271 }, { "epoch": 0.284, "grad_norm": 3.8867838382720947, "grad_norm_var": 0.27354458331153536, "learning_rate": 0.0001, "loss": 1.4048, "loss/crossentropy": 2.473785161972046, "loss/hidden": 1.1875, "loss/logits": 0.21640585362911224, "loss/reg": 8.570017962483689e-05, "step": 2272 }, { "epoch": 0.284125, "grad_norm": 2.3237314224243164, "grad_norm_var": 0.2624531378571295, "learning_rate": 0.0001, "loss": 1.1737, "loss/crossentropy": 2.5389249324798584, "loss/hidden": 1.03125, "loss/logits": 0.14158153533935547, "loss/reg": 8.566216274630278e-05, "step": 2273 }, { "epoch": 0.28425, "grad_norm": 19.134498596191406, "grad_norm_var": 16.966025377755948, "learning_rate": 0.0001, "loss": 3.2762, "loss/crossentropy": 3.049570322036743, "loss/hidden": 2.328125, "loss/logits": 0.9472523927688599, "loss/reg": 8.562537550460547e-05, "step": 2274 }, { "epoch": 0.284375, "grad_norm": 3.1106767654418945, "grad_norm_var": 16.848627792388992, "learning_rate": 0.0001, "loss": 1.4035, "loss/crossentropy": 2.652965545654297, "loss/hidden": 1.1953125, "loss/logits": 0.2073637992143631, "loss/reg": 8.558407716918737e-05, "step": 2275 }, { "epoch": 0.2845, "grad_norm": 2.2494730949401855, "grad_norm_var": 16.963089233160968, "learning_rate": 0.0001, "loss": 1.2808, "loss/crossentropy": 2.6527912616729736, "loss/hidden": 1.109375, "loss/logits": 0.17054912447929382, "loss/reg": 8.554279338568449e-05, "step": 2276 }, { "epoch": 0.284625, "grad_norm": 4.926355838775635, "grad_norm_var": 16.93879288504051, "learning_rate": 0.0001, "loss": 1.477, "loss/crossentropy": 2.871070623397827, "loss/hidden": 1.28125, "loss/logits": 0.19494083523750305, "loss/reg": 8.550374332116917e-05, "step": 2277 }, { "epoch": 0.28475, "grad_norm": 2.5606396198272705, "grad_norm_var": 16.999596781613878, "learning_rate": 0.0001, "loss": 1.3061, "loss/crossentropy": 2.476513147354126, "loss/hidden": 1.1171875, "loss/logits": 0.1881042718887329, "loss/reg": 8.546398021280766e-05, "step": 2278 }, { "epoch": 0.284875, "grad_norm": 2.752072811126709, "grad_norm_var": 17.0051053495441, "learning_rate": 0.0001, "loss": 1.4615, "loss/crossentropy": 2.5557305812835693, "loss/hidden": 1.234375, "loss/logits": 0.22624070942401886, "loss/reg": 8.542438445147127e-05, "step": 2279 }, { "epoch": 0.285, "grad_norm": 2.743469476699829, "grad_norm_var": 17.01116438604807, "learning_rate": 0.0001, "loss": 1.3308, "loss/crossentropy": 2.2359375953674316, "loss/hidden": 1.1484375, "loss/logits": 0.18152764439582825, "loss/reg": 8.539095870219171e-05, "step": 2280 }, { "epoch": 0.285125, "grad_norm": 3.6563355922698975, "grad_norm_var": 16.81404625979791, "learning_rate": 0.0001, "loss": 1.4201, "loss/crossentropy": 2.640157699584961, "loss/hidden": 1.21875, "loss/logits": 0.2004491686820984, "loss/reg": 8.535806409781799e-05, "step": 2281 }, { "epoch": 0.28525, "grad_norm": 2.657550096511841, "grad_norm_var": 16.779631012205908, "learning_rate": 0.0001, "loss": 1.379, "loss/crossentropy": 2.561981201171875, "loss/hidden": 1.171875, "loss/logits": 0.20628194510936737, "loss/reg": 8.533070649718866e-05, "step": 2282 }, { "epoch": 0.285375, "grad_norm": 2.475048780441284, "grad_norm_var": 16.76580386346631, "learning_rate": 0.0001, "loss": 1.5697, "loss/crossentropy": 2.450159788131714, "loss/hidden": 1.3046875, "loss/logits": 0.2641494870185852, "loss/reg": 8.529368642484769e-05, "step": 2283 }, { "epoch": 0.2855, "grad_norm": 8.837115287780762, "grad_norm_var": 18.204563939167052, "learning_rate": 0.0001, "loss": 1.403, "loss/crossentropy": 2.627774477005005, "loss/hidden": 1.2109375, "loss/logits": 0.1912110447883606, "loss/reg": 8.525379962520674e-05, "step": 2284 }, { "epoch": 0.285625, "grad_norm": 2.7930781841278076, "grad_norm_var": 18.14647670188542, "learning_rate": 0.0001, "loss": 1.4142, "loss/crossentropy": 2.3626880645751953, "loss/hidden": 1.21875, "loss/logits": 0.19460242986679077, "loss/reg": 8.521330892108381e-05, "step": 2285 }, { "epoch": 0.28575, "grad_norm": 3.4590892791748047, "grad_norm_var": 17.9912798643002, "learning_rate": 0.0001, "loss": 1.356, "loss/crossentropy": 2.4905848503112793, "loss/hidden": 1.171875, "loss/logits": 0.18323737382888794, "loss/reg": 8.516639354638755e-05, "step": 2286 }, { "epoch": 0.285875, "grad_norm": 6.226871013641357, "grad_norm_var": 18.01033553260327, "learning_rate": 0.0001, "loss": 2.03, "loss/crossentropy": 2.6961071491241455, "loss/hidden": 1.7578125, "loss/logits": 0.27132365107536316, "loss/reg": 8.512053318554536e-05, "step": 2287 }, { "epoch": 0.286, "grad_norm": 3.323134422302246, "grad_norm_var": 18.08469788885076, "learning_rate": 0.0001, "loss": 1.4682, "loss/crossentropy": 2.499473810195923, "loss/hidden": 1.234375, "loss/logits": 0.23300108313560486, "loss/reg": 8.507860911777243e-05, "step": 2288 }, { "epoch": 0.286125, "grad_norm": 2.9087767601013184, "grad_norm_var": 17.93033563889149, "learning_rate": 0.0001, "loss": 1.1827, "loss/crossentropy": 2.9820828437805176, "loss/hidden": 1.0078125, "loss/logits": 0.1740124523639679, "loss/reg": 8.504079596605152e-05, "step": 2289 }, { "epoch": 0.28625, "grad_norm": 3.4410202503204346, "grad_norm_var": 2.938263664083083, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.3975584506988525, "loss/hidden": 1.3125, "loss/logits": 0.21553801000118256, "loss/reg": 8.500235708197579e-05, "step": 2290 }, { "epoch": 0.286375, "grad_norm": 2.908521890640259, "grad_norm_var": 2.9548842324142393, "learning_rate": 0.0001, "loss": 1.469, "loss/crossentropy": 2.6120924949645996, "loss/hidden": 1.25, "loss/logits": 0.21816346049308777, "loss/reg": 8.49602947710082e-05, "step": 2291 }, { "epoch": 0.2865, "grad_norm": 2.548259735107422, "grad_norm_var": 2.9058680772388765, "learning_rate": 0.0001, "loss": 1.2443, "loss/crossentropy": 2.5349459648132324, "loss/hidden": 1.0625, "loss/logits": 0.18090641498565674, "loss/reg": 8.491732296533883e-05, "step": 2292 }, { "epoch": 0.286625, "grad_norm": 2.8302152156829834, "grad_norm_var": 2.820567386460182, "learning_rate": 0.0001, "loss": 1.5713, "loss/crossentropy": 2.236919403076172, "loss/hidden": 1.3046875, "loss/logits": 0.265724241733551, "loss/reg": 8.487005106871948e-05, "step": 2293 }, { "epoch": 0.28675, "grad_norm": 2.6853528022766113, "grad_norm_var": 2.8057934309125776, "learning_rate": 0.0001, "loss": 1.3387, "loss/crossentropy": 2.460153102874756, "loss/hidden": 1.15625, "loss/logits": 0.1816484034061432, "loss/reg": 8.48222043714486e-05, "step": 2294 }, { "epoch": 0.286875, "grad_norm": 3.795640707015991, "grad_norm_var": 2.7676511338836822, "learning_rate": 0.0001, "loss": 1.781, "loss/crossentropy": 2.202775716781616, "loss/hidden": 1.4765625, "loss/logits": 0.3036268949508667, "loss/reg": 8.477411029161885e-05, "step": 2295 }, { "epoch": 0.287, "grad_norm": 2.4127814769744873, "grad_norm_var": 2.8113959971420774, "learning_rate": 0.0001, "loss": 1.1872, "loss/crossentropy": 2.563894748687744, "loss/hidden": 1.03125, "loss/logits": 0.1550886332988739, "loss/reg": 8.473470370518044e-05, "step": 2296 }, { "epoch": 0.287125, "grad_norm": 3.7841007709503174, "grad_norm_var": 2.8140586413763815, "learning_rate": 0.0001, "loss": 1.6376, "loss/crossentropy": 2.4153623580932617, "loss/hidden": 1.34375, "loss/logits": 0.29297587275505066, "loss/reg": 8.469449676340446e-05, "step": 2297 }, { "epoch": 0.28725, "grad_norm": 3.038658857345581, "grad_norm_var": 2.7768769102856647, "learning_rate": 0.0001, "loss": 1.3012, "loss/crossentropy": 2.5538923740386963, "loss/hidden": 1.140625, "loss/logits": 0.15968552231788635, "loss/reg": 8.468272426398471e-05, "step": 2298 }, { "epoch": 0.287375, "grad_norm": 2.8749172687530518, "grad_norm_var": 2.727333633494491, "learning_rate": 0.0001, "loss": 1.4193, "loss/crossentropy": 2.6747310161590576, "loss/hidden": 1.203125, "loss/logits": 0.2153615951538086, "loss/reg": 8.464339771308005e-05, "step": 2299 }, { "epoch": 0.2875, "grad_norm": 2.749770402908325, "grad_norm_var": 0.8062069503380286, "learning_rate": 0.0001, "loss": 1.3211, "loss/crossentropy": 2.514087438583374, "loss/hidden": 1.140625, "loss/logits": 0.1796771138906479, "loss/reg": 8.460676326649264e-05, "step": 2300 }, { "epoch": 0.287625, "grad_norm": 3.6792123317718506, "grad_norm_var": 0.802921371801736, "learning_rate": 0.0001, "loss": 1.4019, "loss/crossentropy": 2.648211717605591, "loss/hidden": 1.203125, "loss/logits": 0.19789031147956848, "loss/reg": 8.45683753141202e-05, "step": 2301 }, { "epoch": 0.28775, "grad_norm": 5.7822184562683105, "grad_norm_var": 1.1920953422158846, "learning_rate": 0.0001, "loss": 1.4607, "loss/crossentropy": 2.5735559463500977, "loss/hidden": 1.25, "loss/logits": 0.20983704924583435, "loss/reg": 8.452973997918889e-05, "step": 2302 }, { "epoch": 0.287875, "grad_norm": 2.56965970993042, "grad_norm_var": 0.6675476483651209, "learning_rate": 0.0001, "loss": 1.8661, "loss/crossentropy": 2.262181282043457, "loss/hidden": 1.5625, "loss/logits": 0.30278030037879944, "loss/reg": 8.449088636552915e-05, "step": 2303 }, { "epoch": 0.288, "grad_norm": 3.718374490737915, "grad_norm_var": 0.6833645308045881, "learning_rate": 0.0001, "loss": 1.1729, "loss/crossentropy": 2.6746716499328613, "loss/hidden": 1.0234375, "loss/logits": 0.14865157008171082, "loss/reg": 8.446330321021378e-05, "step": 2304 }, { "epoch": 0.288125, "grad_norm": 4.204628944396973, "grad_norm_var": 0.7323028034013662, "learning_rate": 0.0001, "loss": 1.2314, "loss/crossentropy": 2.6019034385681152, "loss/hidden": 1.0546875, "loss/logits": 0.17582456767559052, "loss/reg": 8.444174454780295e-05, "step": 2305 }, { "epoch": 0.28825, "grad_norm": 3.1385204792022705, "grad_norm_var": 0.7328971085414895, "learning_rate": 0.0001, "loss": 1.3512, "loss/crossentropy": 2.6848347187042236, "loss/hidden": 1.171875, "loss/logits": 0.17843236029148102, "loss/reg": 8.442116813967004e-05, "step": 2306 }, { "epoch": 0.288375, "grad_norm": 2.7352986335754395, "grad_norm_var": 0.7436999715799819, "learning_rate": 0.0001, "loss": 1.714, "loss/crossentropy": 1.904124140739441, "loss/hidden": 1.421875, "loss/logits": 0.29126304388046265, "loss/reg": 8.438222721451893e-05, "step": 2307 }, { "epoch": 0.2885, "grad_norm": 2.567528247833252, "grad_norm_var": 0.7418323805097837, "learning_rate": 0.0001, "loss": 1.6433, "loss/crossentropy": 2.445955276489258, "loss/hidden": 1.3671875, "loss/logits": 0.27528640627861023, "loss/reg": 8.435577910859138e-05, "step": 2308 }, { "epoch": 0.288625, "grad_norm": 2.7973902225494385, "grad_norm_var": 0.7438920456471578, "learning_rate": 0.0001, "loss": 1.5161, "loss/crossentropy": 2.379629373550415, "loss/hidden": 1.2890625, "loss/logits": 0.2262093722820282, "loss/reg": 8.433089533355087e-05, "step": 2309 }, { "epoch": 0.28875, "grad_norm": 2.4576785564422607, "grad_norm_var": 0.7652857707455342, "learning_rate": 0.0001, "loss": 1.3605, "loss/crossentropy": 2.4343888759613037, "loss/hidden": 1.171875, "loss/logits": 0.18775680661201477, "loss/reg": 8.429206354776397e-05, "step": 2310 }, { "epoch": 0.288875, "grad_norm": 3.190225124359131, "grad_norm_var": 0.7456942455854081, "learning_rate": 0.0001, "loss": 1.6091, "loss/crossentropy": 2.207489252090454, "loss/hidden": 1.375, "loss/logits": 0.23329588770866394, "loss/reg": 8.42673543957062e-05, "step": 2311 }, { "epoch": 0.289, "grad_norm": 2.1727216243743896, "grad_norm_var": 0.7754954942154844, "learning_rate": 0.0001, "loss": 1.6326, "loss/crossentropy": 2.422046661376953, "loss/hidden": 1.3359375, "loss/logits": 0.29584741592407227, "loss/reg": 8.424212865065783e-05, "step": 2312 }, { "epoch": 0.289125, "grad_norm": 2.2615480422973633, "grad_norm_var": 0.805114692603132, "learning_rate": 0.0001, "loss": 1.3861, "loss/crossentropy": 2.6602730751037598, "loss/hidden": 1.171875, "loss/logits": 0.2134070247411728, "loss/reg": 8.421846723649651e-05, "step": 2313 }, { "epoch": 0.28925, "grad_norm": 2.768659830093384, "grad_norm_var": 0.8126404708435072, "learning_rate": 0.0001, "loss": 1.3203, "loss/crossentropy": 2.8399715423583984, "loss/hidden": 1.140625, "loss/logits": 0.17885075509548187, "loss/reg": 8.420165977440774e-05, "step": 2314 }, { "epoch": 0.289375, "grad_norm": 2.360706329345703, "grad_norm_var": 0.8448911729558366, "learning_rate": 0.0001, "loss": 1.2458, "loss/crossentropy": 2.4714813232421875, "loss/hidden": 1.078125, "loss/logits": 0.16686497628688812, "loss/reg": 8.416413038503379e-05, "step": 2315 }, { "epoch": 0.2895, "grad_norm": 2.2862935066223145, "grad_norm_var": 0.8782379173622755, "learning_rate": 0.0001, "loss": 1.2081, "loss/crossentropy": 2.698533058166504, "loss/hidden": 1.0546875, "loss/logits": 0.15259894728660583, "loss/reg": 8.412565512116998e-05, "step": 2316 }, { "epoch": 0.289625, "grad_norm": 3.6414597034454346, "grad_norm_var": 0.875125342753324, "learning_rate": 0.0001, "loss": 1.9715, "loss/crossentropy": 2.5302560329437256, "loss/hidden": 1.6640625, "loss/logits": 0.30662602186203003, "loss/reg": 8.41018118080683e-05, "step": 2317 }, { "epoch": 0.28975, "grad_norm": 2.1817312240600586, "grad_norm_var": 0.3692890162582612, "learning_rate": 0.0001, "loss": 1.2865, "loss/crossentropy": 2.589988946914673, "loss/hidden": 1.1015625, "loss/logits": 0.18410810828208923, "loss/reg": 8.407612767769024e-05, "step": 2318 }, { "epoch": 0.289875, "grad_norm": 2.775604724884033, "grad_norm_var": 0.3651816459655149, "learning_rate": 0.0001, "loss": 1.53, "loss/crossentropy": 2.210362672805786, "loss/hidden": 1.3125, "loss/logits": 0.21663346886634827, "loss/reg": 8.403915853705257e-05, "step": 2319 }, { "epoch": 0.29, "grad_norm": 2.5542213916778564, "grad_norm_var": 0.31178122614833775, "learning_rate": 0.0001, "loss": 1.3474, "loss/crossentropy": 2.5236637592315674, "loss/hidden": 1.1484375, "loss/logits": 0.19815346598625183, "loss/reg": 8.400264050578699e-05, "step": 2320 }, { "epoch": 0.290125, "grad_norm": 2.590538263320923, "grad_norm_var": 0.16282531936032285, "learning_rate": 0.0001, "loss": 1.4164, "loss/crossentropy": 2.917722225189209, "loss/hidden": 1.1875, "loss/logits": 0.22808434069156647, "loss/reg": 8.39639687910676e-05, "step": 2321 }, { "epoch": 0.29025, "grad_norm": 2.5557661056518555, "grad_norm_var": 0.146481273835699, "learning_rate": 0.0001, "loss": 1.6344, "loss/crossentropy": 2.4396162033081055, "loss/hidden": 1.3515625, "loss/logits": 0.28196755051612854, "loss/reg": 8.392245217692107e-05, "step": 2322 }, { "epoch": 0.290375, "grad_norm": 2.4461874961853027, "grad_norm_var": 0.14720628487900905, "learning_rate": 0.0001, "loss": 1.3145, "loss/crossentropy": 2.6133203506469727, "loss/hidden": 1.1171875, "loss/logits": 0.19648027420043945, "loss/reg": 8.388679270865396e-05, "step": 2323 }, { "epoch": 0.2905, "grad_norm": 3.692687749862671, "grad_norm_var": 0.22138137337693234, "learning_rate": 0.0001, "loss": 2.0189, "loss/crossentropy": 2.490706205368042, "loss/hidden": 1.671875, "loss/logits": 0.34615498781204224, "loss/reg": 8.385383989661932e-05, "step": 2324 }, { "epoch": 0.290625, "grad_norm": 2.7185771465301514, "grad_norm_var": 0.22043973734070807, "learning_rate": 0.0001, "loss": 1.415, "loss/crossentropy": 2.2793941497802734, "loss/hidden": 1.21875, "loss/logits": 0.19539667665958405, "loss/reg": 8.381954830838367e-05, "step": 2325 }, { "epoch": 0.29075, "grad_norm": 2.6178364753723145, "grad_norm_var": 0.21759617950333995, "learning_rate": 0.0001, "loss": 1.5494, "loss/crossentropy": 2.4063539505004883, "loss/hidden": 1.3046875, "loss/logits": 0.24390634894371033, "loss/reg": 8.379229257116094e-05, "step": 2326 }, { "epoch": 0.290875, "grad_norm": 4.516688346862793, "grad_norm_var": 0.4185256385300929, "learning_rate": 0.0001, "loss": 2.0617, "loss/crossentropy": 2.5488338470458984, "loss/hidden": 1.6953125, "loss/logits": 0.3655855059623718, "loss/reg": 8.376422192668542e-05, "step": 2327 }, { "epoch": 0.291, "grad_norm": 2.19344162940979, "grad_norm_var": 0.41693325746270843, "learning_rate": 0.0001, "loss": 1.1254, "loss/crossentropy": 2.492417097091675, "loss/hidden": 0.98046875, "loss/logits": 0.14407147467136383, "loss/reg": 8.37249172036536e-05, "step": 2328 }, { "epoch": 0.291125, "grad_norm": 2.275452136993408, "grad_norm_var": 0.4160210447346266, "learning_rate": 0.0001, "loss": 1.5586, "loss/crossentropy": 2.214392900466919, "loss/hidden": 1.3203125, "loss/logits": 0.23747023940086365, "loss/reg": 8.368537964997813e-05, "step": 2329 }, { "epoch": 0.29125, "grad_norm": 3.38639760017395, "grad_norm_var": 0.4405027055186885, "learning_rate": 0.0001, "loss": 1.3716, "loss/crossentropy": 2.626892328262329, "loss/hidden": 1.171875, "loss/logits": 0.19889551401138306, "loss/reg": 8.365754183614627e-05, "step": 2330 }, { "epoch": 0.291375, "grad_norm": 4.706772804260254, "grad_norm_var": 0.6472148181483398, "learning_rate": 0.0001, "loss": 1.6759, "loss/crossentropy": 2.802755832672119, "loss/hidden": 1.4140625, "loss/logits": 0.26097774505615234, "loss/reg": 8.36257022456266e-05, "step": 2331 }, { "epoch": 0.2915, "grad_norm": 2.6332998275756836, "grad_norm_var": 0.624207105389926, "learning_rate": 0.0001, "loss": 1.2008, "loss/crossentropy": 2.673055410385132, "loss/hidden": 1.03125, "loss/logits": 0.16869977116584778, "loss/reg": 8.35860482766293e-05, "step": 2332 }, { "epoch": 0.291625, "grad_norm": 2.190122604370117, "grad_norm_var": 0.6255173678569756, "learning_rate": 0.0001, "loss": 1.1779, "loss/crossentropy": 2.3751983642578125, "loss/hidden": 1.015625, "loss/logits": 0.16144566237926483, "loss/reg": 8.354528108611703e-05, "step": 2333 }, { "epoch": 0.29175, "grad_norm": 2.6697237491607666, "grad_norm_var": 0.5951492595422354, "learning_rate": 0.0001, "loss": 1.3098, "loss/crossentropy": 2.333200216293335, "loss/hidden": 1.1484375, "loss/logits": 0.160551056265831, "loss/reg": 8.351143333129585e-05, "step": 2334 }, { "epoch": 0.291875, "grad_norm": 2.4320147037506104, "grad_norm_var": 0.6085795280852143, "learning_rate": 0.0001, "loss": 1.326, "loss/crossentropy": 2.5006771087646484, "loss/hidden": 1.140625, "loss/logits": 0.18452605605125427, "loss/reg": 8.34743696032092e-05, "step": 2335 }, { "epoch": 0.292, "grad_norm": 2.338085174560547, "grad_norm_var": 0.6210671715714339, "learning_rate": 0.0001, "loss": 1.2321, "loss/crossentropy": 2.6039676666259766, "loss/hidden": 1.0546875, "loss/logits": 0.17661318182945251, "loss/reg": 8.343465015059337e-05, "step": 2336 }, { "epoch": 0.292125, "grad_norm": 3.639334201812744, "grad_norm_var": 0.6503547102103757, "learning_rate": 0.0001, "loss": 1.561, "loss/crossentropy": 2.6972835063934326, "loss/hidden": 1.328125, "loss/logits": 0.2320675551891327, "loss/reg": 8.339344640262425e-05, "step": 2337 }, { "epoch": 0.29225, "grad_norm": 2.146465539932251, "grad_norm_var": 0.6816999172514951, "learning_rate": 0.0001, "loss": 1.3082, "loss/crossentropy": 2.3816463947296143, "loss/hidden": 1.1328125, "loss/logits": 0.1745510995388031, "loss/reg": 8.335086749866605e-05, "step": 2338 }, { "epoch": 0.292375, "grad_norm": 3.4470508098602295, "grad_norm_var": 0.6820534522390972, "learning_rate": 0.0001, "loss": 1.5629, "loss/crossentropy": 2.5176424980163574, "loss/hidden": 1.34375, "loss/logits": 0.2182859182357788, "loss/reg": 8.330956916324794e-05, "step": 2339 }, { "epoch": 0.2925, "grad_norm": 2.4095866680145264, "grad_norm_var": 0.6622103433707821, "learning_rate": 0.0001, "loss": 1.3521, "loss/crossentropy": 2.482180118560791, "loss/hidden": 1.15625, "loss/logits": 0.19502758979797363, "loss/reg": 8.326536044478416e-05, "step": 2340 }, { "epoch": 0.292625, "grad_norm": 2.882532835006714, "grad_norm_var": 0.660032537161635, "learning_rate": 0.0001, "loss": 1.5477, "loss/crossentropy": 2.426957607269287, "loss/hidden": 1.3203125, "loss/logits": 0.22653330862522125, "loss/reg": 8.322398934978992e-05, "step": 2341 }, { "epoch": 0.29275, "grad_norm": 4.590033054351807, "grad_norm_var": 0.8275386350463539, "learning_rate": 0.0001, "loss": 1.3019, "loss/crossentropy": 2.319603204727173, "loss/hidden": 1.140625, "loss/logits": 0.16047081351280212, "loss/reg": 8.317573519889265e-05, "step": 2342 }, { "epoch": 0.292875, "grad_norm": 2.201231002807617, "grad_norm_var": 0.7031969976228386, "learning_rate": 0.0001, "loss": 1.108, "loss/crossentropy": 2.5431463718414307, "loss/hidden": 0.96484375, "loss/logits": 0.14232410490512848, "loss/reg": 8.313078433275223e-05, "step": 2343 }, { "epoch": 0.293, "grad_norm": 3.333371877670288, "grad_norm_var": 0.679476935650563, "learning_rate": 0.0001, "loss": 1.601, "loss/crossentropy": 2.435316801071167, "loss/hidden": 1.375, "loss/logits": 0.2252006232738495, "loss/reg": 8.308786345878616e-05, "step": 2344 }, { "epoch": 0.293125, "grad_norm": 2.1505160331726074, "grad_norm_var": 0.6917740435894975, "learning_rate": 0.0001, "loss": 1.3249, "loss/crossentropy": 2.35553240776062, "loss/hidden": 1.140625, "loss/logits": 0.18344663083553314, "loss/reg": 8.304750372190028e-05, "step": 2345 }, { "epoch": 0.29325, "grad_norm": 14.185218811035156, "grad_norm_var": 8.612437829199335, "learning_rate": 0.0001, "loss": 2.0405, "loss/crossentropy": 2.0157363414764404, "loss/hidden": 1.6875, "loss/logits": 0.35220927000045776, "loss/reg": 8.300379704451188e-05, "step": 2346 }, { "epoch": 0.293375, "grad_norm": 2.90643310546875, "grad_norm_var": 8.554670067786583, "learning_rate": 0.0001, "loss": 1.5333, "loss/crossentropy": 2.4888951778411865, "loss/hidden": 1.2734375, "loss/logits": 0.25908029079437256, "loss/reg": 8.296300075016916e-05, "step": 2347 }, { "epoch": 0.2935, "grad_norm": 2.6149332523345947, "grad_norm_var": 8.556837319390462, "learning_rate": 0.0001, "loss": 1.469, "loss/crossentropy": 2.303095817565918, "loss/hidden": 1.25, "loss/logits": 0.21819883584976196, "loss/reg": 8.292507845908403e-05, "step": 2348 }, { "epoch": 0.293625, "grad_norm": 2.5563223361968994, "grad_norm_var": 8.500844789089346, "learning_rate": 0.0001, "loss": 1.2285, "loss/crossentropy": 2.359854221343994, "loss/hidden": 1.0625, "loss/logits": 0.16513678431510925, "loss/reg": 8.289817196782678e-05, "step": 2349 }, { "epoch": 0.29375, "grad_norm": 2.8449950218200684, "grad_norm_var": 8.482627182655108, "learning_rate": 0.0001, "loss": 1.8645, "loss/crossentropy": 2.1149067878723145, "loss/hidden": 1.53125, "loss/logits": 0.3324453830718994, "loss/reg": 8.287306991405785e-05, "step": 2350 }, { "epoch": 0.293875, "grad_norm": 2.2972021102905273, "grad_norm_var": 8.503721964813435, "learning_rate": 0.0001, "loss": 1.1581, "loss/crossentropy": 2.684999465942383, "loss/hidden": 1.0078125, "loss/logits": 0.14944112300872803, "loss/reg": 8.285068906843662e-05, "step": 2351 }, { "epoch": 0.294, "grad_norm": 2.641784906387329, "grad_norm_var": 8.461061766985829, "learning_rate": 0.0001, "loss": 1.3598, "loss/crossentropy": 2.338663101196289, "loss/hidden": 1.1953125, "loss/logits": 0.16368307173252106, "loss/reg": 8.281723421532661e-05, "step": 2352 }, { "epoch": 0.294125, "grad_norm": 4.462296485900879, "grad_norm_var": 8.51287103383754, "learning_rate": 0.0001, "loss": 1.6733, "loss/crossentropy": 3.021125078201294, "loss/hidden": 1.359375, "loss/logits": 0.3131229281425476, "loss/reg": 8.2791579188779e-05, "step": 2353 }, { "epoch": 0.29425, "grad_norm": 2.8096835613250732, "grad_norm_var": 8.41144073535938, "learning_rate": 0.0001, "loss": 1.4161, "loss/crossentropy": 2.7127649784088135, "loss/hidden": 1.21875, "loss/logits": 0.19650010764598846, "loss/reg": 8.275997242890298e-05, "step": 2354 }, { "epoch": 0.294375, "grad_norm": 2.4552905559539795, "grad_norm_var": 8.499199788762366, "learning_rate": 0.0001, "loss": 1.3273, "loss/crossentropy": 2.875708818435669, "loss/hidden": 1.140625, "loss/logits": 0.18581590056419373, "loss/reg": 8.272744162240997e-05, "step": 2355 }, { "epoch": 0.2945, "grad_norm": 2.878843307495117, "grad_norm_var": 8.439492277318159, "learning_rate": 0.0001, "loss": 1.551, "loss/crossentropy": 2.3646795749664307, "loss/hidden": 1.3125, "loss/logits": 0.2376846969127655, "loss/reg": 8.268951205536723e-05, "step": 2356 }, { "epoch": 0.294625, "grad_norm": 4.656099319458008, "grad_norm_var": 8.463311064390464, "learning_rate": 0.0001, "loss": 1.6441, "loss/crossentropy": 3.718486785888672, "loss/hidden": 1.421875, "loss/logits": 0.22139251232147217, "loss/reg": 8.264905045507476e-05, "step": 2357 }, { "epoch": 0.29475, "grad_norm": 2.7575976848602295, "grad_norm_var": 8.461585385838653, "learning_rate": 0.0001, "loss": 1.6643, "loss/crossentropy": 1.94184410572052, "loss/hidden": 1.4375, "loss/logits": 0.22594302892684937, "loss/reg": 8.2615420978982e-05, "step": 2358 }, { "epoch": 0.294875, "grad_norm": 3.8144338130950928, "grad_norm_var": 8.321329470256348, "learning_rate": 0.0001, "loss": 1.4497, "loss/crossentropy": 2.5423104763031006, "loss/hidden": 1.234375, "loss/logits": 0.2144797146320343, "loss/reg": 8.257880836026743e-05, "step": 2359 }, { "epoch": 0.295, "grad_norm": 2.112523317337036, "grad_norm_var": 8.475842468536147, "learning_rate": 0.0001, "loss": 1.2463, "loss/crossentropy": 2.6424152851104736, "loss/hidden": 1.0703125, "loss/logits": 0.17520596086978912, "loss/reg": 8.254631393356249e-05, "step": 2360 }, { "epoch": 0.295125, "grad_norm": 6.772077560424805, "grad_norm_var": 8.896627725892904, "learning_rate": 0.0001, "loss": 1.6515, "loss/crossentropy": 2.5250792503356934, "loss/hidden": 1.3984375, "loss/logits": 0.2522284984588623, "loss/reg": 8.25119495857507e-05, "step": 2361 }, { "epoch": 0.29525, "grad_norm": 2.7832281589508057, "grad_norm_var": 1.4204530606596713, "learning_rate": 0.0001, "loss": 1.3409, "loss/crossentropy": 2.6260013580322266, "loss/hidden": 1.1328125, "loss/logits": 0.2072310745716095, "loss/reg": 8.247976074926555e-05, "step": 2362 }, { "epoch": 0.295375, "grad_norm": 3.44627046585083, "grad_norm_var": 1.416800005503455, "learning_rate": 0.0001, "loss": 1.4298, "loss/crossentropy": 2.540471076965332, "loss/hidden": 1.1953125, "loss/logits": 0.23369266092777252, "loss/reg": 8.244736091000959e-05, "step": 2363 }, { "epoch": 0.2955, "grad_norm": 2.2319746017456055, "grad_norm_var": 1.4580856277892924, "learning_rate": 0.0001, "loss": 1.2873, "loss/crossentropy": 2.497370719909668, "loss/hidden": 1.1015625, "loss/logits": 0.18488933145999908, "loss/reg": 8.240812894655392e-05, "step": 2364 }, { "epoch": 0.295625, "grad_norm": 2.4985222816467285, "grad_norm_var": 1.4634094782179903, "learning_rate": 0.0001, "loss": 1.2324, "loss/crossentropy": 2.624879837036133, "loss/hidden": 1.0546875, "loss/logits": 0.1768735945224762, "loss/reg": 8.23720038169995e-05, "step": 2365 }, { "epoch": 0.29575, "grad_norm": 2.503586530685425, "grad_norm_var": 1.487602442073966, "learning_rate": 0.0001, "loss": 1.444, "loss/crossentropy": 2.2994611263275146, "loss/hidden": 1.2421875, "loss/logits": 0.2010190188884735, "loss/reg": 8.233336120611057e-05, "step": 2366 }, { "epoch": 0.295875, "grad_norm": 2.557996988296509, "grad_norm_var": 1.4606314284787494, "learning_rate": 0.0001, "loss": 1.5496, "loss/crossentropy": 2.262929916381836, "loss/hidden": 1.34375, "loss/logits": 0.20502957701683044, "loss/reg": 8.228925435105339e-05, "step": 2367 }, { "epoch": 0.296, "grad_norm": 4.147806644439697, "grad_norm_var": 1.488009799237716, "learning_rate": 0.0001, "loss": 1.463, "loss/crossentropy": 2.5727896690368652, "loss/hidden": 1.2421875, "loss/logits": 0.2199569046497345, "loss/reg": 8.224901830544695e-05, "step": 2368 }, { "epoch": 0.296125, "grad_norm": 2.622868061065674, "grad_norm_var": 1.4157693241325802, "learning_rate": 0.0001, "loss": 1.321, "loss/crossentropy": 2.392244338989258, "loss/hidden": 1.1484375, "loss/logits": 0.17173121869564056, "loss/reg": 8.220819290727377e-05, "step": 2369 }, { "epoch": 0.29625, "grad_norm": 2.3734002113342285, "grad_norm_var": 1.4498212068648832, "learning_rate": 0.0001, "loss": 1.3977, "loss/crossentropy": 2.63057279586792, "loss/hidden": 1.1796875, "loss/logits": 0.2172287255525589, "loss/reg": 8.216300193453208e-05, "step": 2370 }, { "epoch": 0.296375, "grad_norm": 2.3946311473846436, "grad_norm_var": 1.4557773623669938, "learning_rate": 0.0001, "loss": 1.3124, "loss/crossentropy": 2.61126971244812, "loss/hidden": 1.140625, "loss/logits": 0.17093132436275482, "loss/reg": 8.212071406887844e-05, "step": 2371 }, { "epoch": 0.2965, "grad_norm": 2.7198076248168945, "grad_norm_var": 1.4633092058816068, "learning_rate": 0.0001, "loss": 1.2059, "loss/crossentropy": 2.7492945194244385, "loss/hidden": 1.0390625, "loss/logits": 0.16598236560821533, "loss/reg": 8.207975042751059e-05, "step": 2372 }, { "epoch": 0.296625, "grad_norm": 2.661484718322754, "grad_norm_var": 1.3113003719277967, "learning_rate": 0.0001, "loss": 1.6597, "loss/crossentropy": 2.0693275928497314, "loss/hidden": 1.4140625, "loss/logits": 0.24478456377983093, "loss/reg": 8.204355981433764e-05, "step": 2373 }, { "epoch": 0.29675, "grad_norm": 3.4258320331573486, "grad_norm_var": 1.315393924089945, "learning_rate": 0.0001, "loss": 1.3346, "loss/crossentropy": 2.7914023399353027, "loss/hidden": 1.1484375, "loss/logits": 0.185336634516716, "loss/reg": 8.200707816286013e-05, "step": 2374 }, { "epoch": 0.296875, "grad_norm": 2.4130935668945312, "grad_norm_var": 1.2984091547512813, "learning_rate": 0.0001, "loss": 1.2614, "loss/crossentropy": 2.613556385040283, "loss/hidden": 1.0859375, "loss/logits": 0.17464062571525574, "loss/reg": 8.19725391920656e-05, "step": 2375 }, { "epoch": 0.297, "grad_norm": 2.025839328765869, "grad_norm_var": 1.3088942049454448, "learning_rate": 0.0001, "loss": 1.1734, "loss/crossentropy": 1.9743096828460693, "loss/hidden": 1.0078125, "loss/logits": 0.16481420397758484, "loss/reg": 8.1942169344984e-05, "step": 2376 }, { "epoch": 0.297125, "grad_norm": 3.37015438079834, "grad_norm_var": 0.3092845170847565, "learning_rate": 0.0001, "loss": 1.5628, "loss/crossentropy": 2.518052816390991, "loss/hidden": 1.3515625, "loss/logits": 0.2104596495628357, "loss/reg": 8.190058724721894e-05, "step": 2377 }, { "epoch": 0.29725, "grad_norm": 3.231936454772949, "grad_norm_var": 0.3231962168053959, "learning_rate": 0.0001, "loss": 1.3454, "loss/crossentropy": 2.4978525638580322, "loss/hidden": 1.15625, "loss/logits": 0.18832635879516602, "loss/reg": 8.186182094505057e-05, "step": 2378 }, { "epoch": 0.297375, "grad_norm": 2.2732250690460205, "grad_norm_var": 0.3064091362060895, "learning_rate": 0.0001, "loss": 1.2318, "loss/crossentropy": 2.580953359603882, "loss/hidden": 1.078125, "loss/logits": 0.15289446711540222, "loss/reg": 8.182745659723878e-05, "step": 2379 }, { "epoch": 0.2975, "grad_norm": 3.300450086593628, "grad_norm_var": 0.30883991901253677, "learning_rate": 0.0001, "loss": 1.6173, "loss/crossentropy": 2.3847243785858154, "loss/hidden": 1.4296875, "loss/logits": 0.18677116930484772, "loss/reg": 8.179021824616939e-05, "step": 2380 }, { "epoch": 0.297625, "grad_norm": 3.344837188720703, "grad_norm_var": 0.32155638713834755, "learning_rate": 0.0001, "loss": 1.3585, "loss/crossentropy": 2.1237237453460693, "loss/hidden": 1.203125, "loss/logits": 0.15459546446800232, "loss/reg": 8.175736002158374e-05, "step": 2381 }, { "epoch": 0.29775, "grad_norm": 5.444453239440918, "grad_norm_var": 0.7319772449993079, "learning_rate": 0.0001, "loss": 1.5584, "loss/crossentropy": 2.5035712718963623, "loss/hidden": 1.34375, "loss/logits": 0.21381795406341553, "loss/reg": 8.171830995706841e-05, "step": 2382 }, { "epoch": 0.297875, "grad_norm": 2.633551597595215, "grad_norm_var": 0.7276875027563923, "learning_rate": 0.0001, "loss": 1.2959, "loss/crossentropy": 2.7323615550994873, "loss/hidden": 1.1171875, "loss/logits": 0.17792974412441254, "loss/reg": 8.168231579475105e-05, "step": 2383 }, { "epoch": 0.298, "grad_norm": 2.808367967605591, "grad_norm_var": 0.6391088058901969, "learning_rate": 0.0001, "loss": 1.3288, "loss/crossentropy": 2.608471393585205, "loss/hidden": 1.1484375, "loss/logits": 0.1795274168252945, "loss/reg": 8.164923201547936e-05, "step": 2384 }, { "epoch": 0.298125, "grad_norm": 3.064418077468872, "grad_norm_var": 0.6326091212959142, "learning_rate": 0.0001, "loss": 1.636, "loss/crossentropy": 2.462193489074707, "loss/hidden": 1.3828125, "loss/logits": 0.25233587622642517, "loss/reg": 8.161579899024218e-05, "step": 2385 }, { "epoch": 0.29825, "grad_norm": 2.833275318145752, "grad_norm_var": 0.6093777024607249, "learning_rate": 0.0001, "loss": 1.3246, "loss/crossentropy": 2.2275543212890625, "loss/hidden": 1.1328125, "loss/logits": 0.19094045460224152, "loss/reg": 8.15825114841573e-05, "step": 2386 }, { "epoch": 0.298375, "grad_norm": 2.065215587615967, "grad_norm_var": 0.6425989216506497, "learning_rate": 0.0001, "loss": 1.2745, "loss/crossentropy": 2.4590208530426025, "loss/hidden": 1.09375, "loss/logits": 0.17991051077842712, "loss/reg": 8.155161049216986e-05, "step": 2387 }, { "epoch": 0.2985, "grad_norm": 4.112203121185303, "grad_norm_var": 0.7162096034161226, "learning_rate": 0.0001, "loss": 1.9633, "loss/crossentropy": 2.0627379417419434, "loss/hidden": 1.6640625, "loss/logits": 0.29843249917030334, "loss/reg": 8.152447844622657e-05, "step": 2388 }, { "epoch": 0.298625, "grad_norm": 2.464708089828491, "grad_norm_var": 0.7291647321216781, "learning_rate": 0.0001, "loss": 1.4035, "loss/crossentropy": 2.434690237045288, "loss/hidden": 1.1953125, "loss/logits": 0.207401305437088, "loss/reg": 8.150032226694748e-05, "step": 2389 }, { "epoch": 0.29875, "grad_norm": 3.9490551948547363, "grad_norm_var": 0.7724436815569009, "learning_rate": 0.0001, "loss": 1.4497, "loss/crossentropy": 2.691868305206299, "loss/hidden": 1.2109375, "loss/logits": 0.23795056343078613, "loss/reg": 8.148174674715847e-05, "step": 2390 }, { "epoch": 0.298875, "grad_norm": 2.396487236022949, "grad_norm_var": 0.7739451477619618, "learning_rate": 0.0001, "loss": 1.405, "loss/crossentropy": 2.359788417816162, "loss/hidden": 1.21875, "loss/logits": 0.18543124198913574, "loss/reg": 8.146383333951235e-05, "step": 2391 }, { "epoch": 0.299, "grad_norm": 2.7426295280456543, "grad_norm_var": 0.7050805915082091, "learning_rate": 0.0001, "loss": 1.3386, "loss/crossentropy": 2.348921298980713, "loss/hidden": 1.1484375, "loss/logits": 0.1892995834350586, "loss/reg": 8.142957813106477e-05, "step": 2392 }, { "epoch": 0.299125, "grad_norm": 2.2707359790802, "grad_norm_var": 0.7450090496681734, "learning_rate": 0.0001, "loss": 1.3794, "loss/crossentropy": 2.2874224185943604, "loss/hidden": 1.171875, "loss/logits": 0.20674031972885132, "loss/reg": 8.13929655123502e-05, "step": 2393 }, { "epoch": 0.29925, "grad_norm": 2.2402567863464355, "grad_norm_var": 0.783537159857761, "learning_rate": 0.0001, "loss": 1.221, "loss/crossentropy": 2.6851518154144287, "loss/hidden": 1.0546875, "loss/logits": 0.16545909643173218, "loss/reg": 8.135676762321964e-05, "step": 2394 }, { "epoch": 0.299375, "grad_norm": 4.420713901519775, "grad_norm_var": 0.8646746080066897, "learning_rate": 0.0001, "loss": 1.8166, "loss/crossentropy": 2.720947027206421, "loss/hidden": 1.5078125, "loss/logits": 0.3080087900161743, "loss/reg": 8.13328952062875e-05, "step": 2395 }, { "epoch": 0.2995, "grad_norm": 3.4986205101013184, "grad_norm_var": 0.8716140749330776, "learning_rate": 0.0001, "loss": 1.4107, "loss/crossentropy": 2.7352921962738037, "loss/hidden": 1.234375, "loss/logits": 0.17553000152111053, "loss/reg": 8.129199704853818e-05, "step": 2396 }, { "epoch": 0.299625, "grad_norm": 2.1157727241516113, "grad_norm_var": 0.9329660825347522, "learning_rate": 0.0001, "loss": 1.3404, "loss/crossentropy": 2.3515403270721436, "loss/hidden": 1.15625, "loss/logits": 0.1833697259426117, "loss/reg": 8.12602011137642e-05, "step": 2397 }, { "epoch": 0.29975, "grad_norm": 2.44099497795105, "grad_norm_var": 0.5443974240052168, "learning_rate": 0.0001, "loss": 1.219, "loss/crossentropy": 2.8022847175598145, "loss/hidden": 1.0546875, "loss/logits": 0.16351178288459778, "loss/reg": 8.123033330775797e-05, "step": 2398 }, { "epoch": 0.299875, "grad_norm": 3.5493733882904053, "grad_norm_var": 0.56689979422277, "learning_rate": 0.0001, "loss": 1.3851, "loss/crossentropy": 2.414412021636963, "loss/hidden": 1.1875, "loss/logits": 0.19676430523395538, "loss/reg": 8.119652920868248e-05, "step": 2399 }, { "epoch": 0.3, "grad_norm": 2.6659233570098877, "grad_norm_var": 0.5705882496112348, "learning_rate": 0.0001, "loss": 1.3819, "loss/crossentropy": 2.7876930236816406, "loss/hidden": 1.1953125, "loss/logits": 0.185750812292099, "loss/reg": 8.116715616779402e-05, "step": 2400 }, { "epoch": 0.300125, "grad_norm": 4.032064914703369, "grad_norm_var": 0.6468521798880335, "learning_rate": 0.0001, "loss": 1.4183, "loss/crossentropy": 2.4644906520843506, "loss/hidden": 1.21875, "loss/logits": 0.19872011244297028, "loss/reg": 8.114356751320884e-05, "step": 2401 }, { "epoch": 0.30025, "grad_norm": 2.799051284790039, "grad_norm_var": 0.6476285822516784, "learning_rate": 0.0001, "loss": 1.3163, "loss/crossentropy": 2.67179274559021, "loss/hidden": 1.1171875, "loss/logits": 0.1982773244380951, "loss/reg": 8.111677743727341e-05, "step": 2402 }, { "epoch": 0.300375, "grad_norm": 4.64456844329834, "grad_norm_var": 0.7470366099842009, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.533597469329834, "loss/hidden": 1.2421875, "loss/logits": 0.22491943836212158, "loss/reg": 8.108331530820578e-05, "step": 2403 }, { "epoch": 0.3005, "grad_norm": 2.702794075012207, "grad_norm_var": 0.6897027584030143, "learning_rate": 0.0001, "loss": 1.2662, "loss/crossentropy": 2.3488337993621826, "loss/hidden": 1.09375, "loss/logits": 0.17164413630962372, "loss/reg": 8.10566489235498e-05, "step": 2404 }, { "epoch": 0.300625, "grad_norm": 2.6461870670318604, "grad_norm_var": 0.6773964744364905, "learning_rate": 0.0001, "loss": 1.3246, "loss/crossentropy": 2.856506824493408, "loss/hidden": 1.1328125, "loss/logits": 0.1909858137369156, "loss/reg": 8.102060382952914e-05, "step": 2405 }, { "epoch": 0.30075, "grad_norm": 2.5051991939544678, "grad_norm_var": 0.6384035339220114, "learning_rate": 0.0001, "loss": 1.3491, "loss/crossentropy": 2.7232823371887207, "loss/hidden": 1.15625, "loss/logits": 0.1920720636844635, "loss/reg": 8.098835678538308e-05, "step": 2406 }, { "epoch": 0.300875, "grad_norm": 2.2132256031036377, "grad_norm_var": 0.6547474780574246, "learning_rate": 0.0001, "loss": 1.2727, "loss/crossentropy": 2.8755857944488525, "loss/hidden": 1.0859375, "loss/logits": 0.18594175577163696, "loss/reg": 8.095723751466721e-05, "step": 2407 }, { "epoch": 0.301, "grad_norm": 2.902573823928833, "grad_norm_var": 0.6515399857150896, "learning_rate": 0.0001, "loss": 1.24, "loss/crossentropy": 2.736018180847168, "loss/hidden": 1.078125, "loss/logits": 0.16108819842338562, "loss/reg": 8.092416101135314e-05, "step": 2408 }, { "epoch": 0.301125, "grad_norm": 2.7543139457702637, "grad_norm_var": 0.6205529317535401, "learning_rate": 0.0001, "loss": 1.2162, "loss/crossentropy": 2.6218297481536865, "loss/hidden": 1.0703125, "loss/logits": 0.14507722854614258, "loss/reg": 8.089101902442053e-05, "step": 2409 }, { "epoch": 0.30125, "grad_norm": 3.188429355621338, "grad_norm_var": 0.5796532618807928, "learning_rate": 0.0001, "loss": 1.4844, "loss/crossentropy": 2.3062005043029785, "loss/hidden": 1.2734375, "loss/logits": 0.21019214391708374, "loss/reg": 8.08582772151567e-05, "step": 2410 }, { "epoch": 0.301375, "grad_norm": 3.229008674621582, "grad_norm_var": 0.45339381454546107, "learning_rate": 0.0001, "loss": 1.4533, "loss/crossentropy": 2.6585636138916016, "loss/hidden": 1.203125, "loss/logits": 0.24940580129623413, "loss/reg": 8.082361455308273e-05, "step": 2411 }, { "epoch": 0.3015, "grad_norm": 2.4063098430633545, "grad_norm_var": 0.4543268588577083, "learning_rate": 0.0001, "loss": 1.3526, "loss/crossentropy": 2.577148199081421, "loss/hidden": 1.15625, "loss/logits": 0.19549936056137085, "loss/reg": 8.078337123151869e-05, "step": 2412 }, { "epoch": 0.301625, "grad_norm": 1.8536466360092163, "grad_norm_var": 0.48689465514781977, "learning_rate": 0.0001, "loss": 1.0797, "loss/crossentropy": 2.509326457977295, "loss/hidden": 0.9609375, "loss/logits": 0.11794757843017578, "loss/reg": 8.074936340562999e-05, "step": 2413 }, { "epoch": 0.30175, "grad_norm": 4.986080169677734, "grad_norm_var": 0.7331400038317819, "learning_rate": 0.0001, "loss": 1.536, "loss/crossentropy": 2.462019920349121, "loss/hidden": 1.3125, "loss/logits": 0.22270503640174866, "loss/reg": 8.071649790508673e-05, "step": 2414 }, { "epoch": 0.301875, "grad_norm": 2.4726033210754395, "grad_norm_var": 0.7364111511425837, "learning_rate": 0.0001, "loss": 1.4444, "loss/crossentropy": 2.5543012619018555, "loss/hidden": 1.234375, "loss/logits": 0.2091817855834961, "loss/reg": 8.06844764156267e-05, "step": 2415 }, { "epoch": 0.302, "grad_norm": 3.323230743408203, "grad_norm_var": 0.7341248136717652, "learning_rate": 0.0001, "loss": 1.391, "loss/crossentropy": 2.434366464614868, "loss/hidden": 1.2109375, "loss/logits": 0.17923657596111298, "loss/reg": 8.065514703048393e-05, "step": 2416 }, { "epoch": 0.302125, "grad_norm": 3.4219970703125, "grad_norm_var": 0.6767873701973437, "learning_rate": 0.0001, "loss": 1.689, "loss/crossentropy": 2.5959019660949707, "loss/hidden": 1.3671875, "loss/logits": 0.3209618031978607, "loss/reg": 8.062664710450917e-05, "step": 2417 }, { "epoch": 0.30225, "grad_norm": 2.1136741638183594, "grad_norm_var": 0.7247907673911725, "learning_rate": 0.0001, "loss": 1.3466, "loss/crossentropy": 2.564812421798706, "loss/hidden": 1.140625, "loss/logits": 0.2051236480474472, "loss/reg": 8.05972158559598e-05, "step": 2418 }, { "epoch": 0.302375, "grad_norm": 2.736260414123535, "grad_norm_var": 0.5238309398654706, "learning_rate": 0.0001, "loss": 1.4718, "loss/crossentropy": 2.9307730197906494, "loss/hidden": 1.2421875, "loss/logits": 0.22883135080337524, "loss/reg": 8.056296792346984e-05, "step": 2419 }, { "epoch": 0.3025, "grad_norm": 2.614290475845337, "grad_norm_var": 0.5259510479305802, "learning_rate": 0.0001, "loss": 1.4012, "loss/crossentropy": 2.633868932723999, "loss/hidden": 1.203125, "loss/logits": 0.19730669260025024, "loss/reg": 8.05346280685626e-05, "step": 2420 }, { "epoch": 0.302625, "grad_norm": 2.1952273845672607, "grad_norm_var": 0.5500406942761809, "learning_rate": 0.0001, "loss": 1.1897, "loss/crossentropy": 2.6132874488830566, "loss/hidden": 1.0234375, "loss/logits": 0.16548341512680054, "loss/reg": 8.050599717535079e-05, "step": 2421 }, { "epoch": 0.30275, "grad_norm": 2.9054949283599854, "grad_norm_var": 0.5439339636605973, "learning_rate": 0.0001, "loss": 1.4509, "loss/crossentropy": 2.445962429046631, "loss/hidden": 1.25, "loss/logits": 0.20007222890853882, "loss/reg": 8.048416202655062e-05, "step": 2422 }, { "epoch": 0.302875, "grad_norm": 3.135242462158203, "grad_norm_var": 0.5209632162677261, "learning_rate": 0.0001, "loss": 1.565, "loss/crossentropy": 2.532257556915283, "loss/hidden": 1.34375, "loss/logits": 0.22043195366859436, "loss/reg": 8.045021968428046e-05, "step": 2423 }, { "epoch": 0.303, "grad_norm": 1.9578412771224976, "grad_norm_var": 0.5751491098781645, "learning_rate": 0.0001, "loss": 1.3646, "loss/crossentropy": 2.1764581203460693, "loss/hidden": 1.171875, "loss/logits": 0.19190068542957306, "loss/reg": 8.042194531299174e-05, "step": 2424 }, { "epoch": 0.303125, "grad_norm": 2.0885820388793945, "grad_norm_var": 0.6096429935862411, "learning_rate": 0.0001, "loss": 1.2822, "loss/crossentropy": 2.495537519454956, "loss/hidden": 1.0859375, "loss/logits": 0.1955076903104782, "loss/reg": 8.039774547796696e-05, "step": 2425 }, { "epoch": 0.30325, "grad_norm": 3.954063653945923, "grad_norm_var": 0.6870308071892216, "learning_rate": 0.0001, "loss": 1.4378, "loss/crossentropy": 2.3228790760040283, "loss/hidden": 1.203125, "loss/logits": 0.23384256660938263, "loss/reg": 8.036733925109729e-05, "step": 2426 }, { "epoch": 0.303375, "grad_norm": 2.3300158977508545, "grad_norm_var": 0.6905657987893787, "learning_rate": 0.0001, "loss": 1.618, "loss/crossentropy": 2.4686994552612305, "loss/hidden": 1.3984375, "loss/logits": 0.21880151331424713, "loss/reg": 8.034007623791695e-05, "step": 2427 }, { "epoch": 0.3035, "grad_norm": 2.6129348278045654, "grad_norm_var": 0.6829139321711539, "learning_rate": 0.0001, "loss": 1.1998, "loss/crossentropy": 2.9125680923461914, "loss/hidden": 1.0234375, "loss/logits": 0.17553314566612244, "loss/reg": 8.030523167690262e-05, "step": 2428 }, { "epoch": 0.303625, "grad_norm": 2.7347707748413086, "grad_norm_var": 0.6209825929743952, "learning_rate": 0.0001, "loss": 1.4843, "loss/crossentropy": 2.6327507495880127, "loss/hidden": 1.25, "loss/logits": 0.2334636002779007, "loss/reg": 8.027352305362001e-05, "step": 2429 }, { "epoch": 0.30375, "grad_norm": 2.5896637439727783, "grad_norm_var": 0.29703002964363157, "learning_rate": 0.0001, "loss": 1.2986, "loss/crossentropy": 2.3290977478027344, "loss/hidden": 1.1015625, "loss/logits": 0.19621030986309052, "loss/reg": 8.023594273254275e-05, "step": 2430 }, { "epoch": 0.303875, "grad_norm": 2.334099769592285, "grad_norm_var": 0.3024120660634333, "learning_rate": 0.0001, "loss": 1.491, "loss/crossentropy": 2.4949843883514404, "loss/hidden": 1.2734375, "loss/logits": 0.21672910451889038, "loss/reg": 8.020322275115177e-05, "step": 2431 }, { "epoch": 0.304, "grad_norm": 2.948395013809204, "grad_norm_var": 0.27956884484729105, "learning_rate": 0.0001, "loss": 1.6945, "loss/crossentropy": 2.6133899688720703, "loss/hidden": 1.3671875, "loss/logits": 0.3265211582183838, "loss/reg": 8.017147774808109e-05, "step": 2432 }, { "epoch": 0.304125, "grad_norm": 2.400258779525757, "grad_norm_var": 0.24196579310065017, "learning_rate": 0.0001, "loss": 1.2804, "loss/crossentropy": 2.4311017990112305, "loss/hidden": 1.1015625, "loss/logits": 0.17805978655815125, "loss/reg": 8.013026672415435e-05, "step": 2433 }, { "epoch": 0.30425, "grad_norm": 1.9865649938583374, "grad_norm_var": 0.25127161194688863, "learning_rate": 0.0001, "loss": 1.1196, "loss/crossentropy": 2.655052423477173, "loss/hidden": 0.96875, "loss/logits": 0.1500505805015564, "loss/reg": 8.008729491848499e-05, "step": 2434 }, { "epoch": 0.304375, "grad_norm": 2.453373432159424, "grad_norm_var": 0.25095381712368503, "learning_rate": 0.0001, "loss": 1.2433, "loss/crossentropy": 2.1065428256988525, "loss/hidden": 1.0703125, "loss/logits": 0.172180637717247, "loss/reg": 8.005250128917396e-05, "step": 2435 }, { "epoch": 0.3045, "grad_norm": 2.4197075366973877, "grad_norm_var": 0.25236704687867756, "learning_rate": 0.0001, "loss": 1.191, "loss/crossentropy": 2.511603593826294, "loss/hidden": 1.0234375, "loss/logits": 0.16671322286128998, "loss/reg": 8.002026152098551e-05, "step": 2436 }, { "epoch": 0.304625, "grad_norm": 2.6156063079833984, "grad_norm_var": 0.24266415330503696, "learning_rate": 0.0001, "loss": 1.2006, "loss/crossentropy": 2.5670175552368164, "loss/hidden": 1.0546875, "loss/logits": 0.14510346949100494, "loss/reg": 7.99787521827966e-05, "step": 2437 }, { "epoch": 0.30475, "grad_norm": 2.6225385665893555, "grad_norm_var": 0.23582809100719734, "learning_rate": 0.0001, "loss": 1.3906, "loss/crossentropy": 2.425818920135498, "loss/hidden": 1.1875, "loss/logits": 0.20225399732589722, "loss/reg": 7.994390762178227e-05, "step": 2438 }, { "epoch": 0.304875, "grad_norm": 2.4111196994781494, "grad_norm_var": 0.21441035242140719, "learning_rate": 0.0001, "loss": 1.2291, "loss/crossentropy": 2.2852821350097656, "loss/hidden": 1.0625, "loss/logits": 0.1658465713262558, "loss/reg": 7.989736332092434e-05, "step": 2439 }, { "epoch": 0.305, "grad_norm": 4.657589912414551, "grad_norm_var": 0.46445294668298176, "learning_rate": 0.0001, "loss": 1.9391, "loss/crossentropy": 2.320427417755127, "loss/hidden": 1.65625, "loss/logits": 0.2820591926574707, "loss/reg": 7.985137199284509e-05, "step": 2440 }, { "epoch": 0.305125, "grad_norm": 2.6582369804382324, "grad_norm_var": 0.4384882630711539, "learning_rate": 0.0001, "loss": 1.466, "loss/crossentropy": 2.6424992084503174, "loss/hidden": 1.2265625, "loss/logits": 0.23860402405261993, "loss/reg": 7.981588714756072e-05, "step": 2441 }, { "epoch": 0.30525, "grad_norm": 2.1081135272979736, "grad_norm_var": 0.3509371156615268, "learning_rate": 0.0001, "loss": 1.2827, "loss/crossentropy": 2.44482684135437, "loss/hidden": 1.1015625, "loss/logits": 0.1803685426712036, "loss/reg": 7.97805332695134e-05, "step": 2442 }, { "epoch": 0.305375, "grad_norm": 2.353989601135254, "grad_norm_var": 0.3500534983712223, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.7685375213623047, "loss/hidden": 1.1015625, "loss/logits": 0.17589005827903748, "loss/reg": 7.973861647769809e-05, "step": 2443 }, { "epoch": 0.3055, "grad_norm": 2.796731948852539, "grad_norm_var": 0.3520116609292865, "learning_rate": 0.0001, "loss": 1.4566, "loss/crossentropy": 2.1625680923461914, "loss/hidden": 1.2578125, "loss/logits": 0.19798722863197327, "loss/reg": 7.970331353135407e-05, "step": 2444 }, { "epoch": 0.305625, "grad_norm": 32.056121826171875, "grad_norm_var": 54.49283684236841, "learning_rate": 0.0001, "loss": 1.0669, "loss/crossentropy": 2.79866623878479, "loss/hidden": 0.93359375, "loss/logits": 0.13251978158950806, "loss/reg": 7.966662815306336e-05, "step": 2445 }, { "epoch": 0.30575, "grad_norm": 2.5276083946228027, "grad_norm_var": 54.5085797192035, "learning_rate": 0.0001, "loss": 1.2884, "loss/crossentropy": 2.6210412979125977, "loss/hidden": 1.1171875, "loss/logits": 0.1703847050666809, "loss/reg": 7.962933887029067e-05, "step": 2446 }, { "epoch": 0.305875, "grad_norm": 2.656249523162842, "grad_norm_var": 54.42377826150353, "learning_rate": 0.0001, "loss": 1.4714, "loss/crossentropy": 2.582623243331909, "loss/hidden": 1.2421875, "loss/logits": 0.22845858335494995, "loss/reg": 7.959014328662306e-05, "step": 2447 }, { "epoch": 0.306, "grad_norm": 3.218916416168213, "grad_norm_var": 54.37312543892662, "learning_rate": 0.0001, "loss": 1.2976, "loss/crossentropy": 2.4671380519866943, "loss/hidden": 1.125, "loss/logits": 0.17184722423553467, "loss/reg": 7.954478496685624e-05, "step": 2448 }, { "epoch": 0.306125, "grad_norm": 3.497976779937744, "grad_norm_var": 54.14163773420423, "learning_rate": 0.0001, "loss": 1.2879, "loss/crossentropy": 2.699605703353882, "loss/hidden": 1.109375, "loss/logits": 0.17768073081970215, "loss/reg": 7.950096187414601e-05, "step": 2449 }, { "epoch": 0.30625, "grad_norm": 2.651900053024292, "grad_norm_var": 53.94056575810265, "learning_rate": 0.0001, "loss": 1.4008, "loss/crossentropy": 2.7632486820220947, "loss/hidden": 1.1953125, "loss/logits": 0.2047252058982849, "loss/reg": 7.946318510221317e-05, "step": 2450 }, { "epoch": 0.306375, "grad_norm": 2.191272020339966, "grad_norm_var": 54.02010822069321, "learning_rate": 0.0001, "loss": 1.4103, "loss/crossentropy": 2.237988233566284, "loss/hidden": 1.2265625, "loss/logits": 0.18294823169708252, "loss/reg": 7.942627416923642e-05, "step": 2451 }, { "epoch": 0.3065, "grad_norm": 2.35791277885437, "grad_norm_var": 54.03823047023456, "learning_rate": 0.0001, "loss": 1.2743, "loss/crossentropy": 2.5080182552337646, "loss/hidden": 1.0859375, "loss/logits": 0.18758925795555115, "loss/reg": 7.938811904750764e-05, "step": 2452 }, { "epoch": 0.306625, "grad_norm": 2.439197301864624, "grad_norm_var": 54.08653015495696, "learning_rate": 0.0001, "loss": 1.3396, "loss/crossentropy": 2.64215087890625, "loss/hidden": 1.140625, "loss/logits": 0.19815394282341003, "loss/reg": 7.934834866318852e-05, "step": 2453 }, { "epoch": 0.30675, "grad_norm": 2.5458717346191406, "grad_norm_var": 54.10685955347332, "learning_rate": 0.0001, "loss": 1.4378, "loss/crossentropy": 2.449326515197754, "loss/hidden": 1.21875, "loss/logits": 0.21823540329933167, "loss/reg": 7.930670108180493e-05, "step": 2454 }, { "epoch": 0.306875, "grad_norm": 3.118328809738159, "grad_norm_var": 53.93449604454097, "learning_rate": 0.0001, "loss": 1.2974, "loss/crossentropy": 2.5785820484161377, "loss/hidden": 1.1171875, "loss/logits": 0.17937436699867249, "loss/reg": 7.92567734606564e-05, "step": 2455 }, { "epoch": 0.307, "grad_norm": 2.2959983348846436, "grad_norm_var": 54.269576681257035, "learning_rate": 0.0001, "loss": 1.4215, "loss/crossentropy": 2.1156275272369385, "loss/hidden": 1.234375, "loss/logits": 0.1862892359495163, "loss/reg": 7.920433563413098e-05, "step": 2456 }, { "epoch": 0.307125, "grad_norm": 2.576112747192383, "grad_norm_var": 54.2898056360593, "learning_rate": 0.0001, "loss": 1.1984, "loss/crossentropy": 2.643465995788574, "loss/hidden": 1.015625, "loss/logits": 0.18202939629554749, "loss/reg": 7.91515558375977e-05, "step": 2457 }, { "epoch": 0.30725, "grad_norm": 2.8919601440429688, "grad_norm_var": 54.08219317489983, "learning_rate": 0.0001, "loss": 1.3417, "loss/crossentropy": 2.4917900562286377, "loss/hidden": 1.1640625, "loss/logits": 0.176839679479599, "loss/reg": 7.910109707154334e-05, "step": 2458 }, { "epoch": 0.307375, "grad_norm": 2.717726707458496, "grad_norm_var": 53.98585047158563, "learning_rate": 0.0001, "loss": 1.7403, "loss/crossentropy": 2.4618191719055176, "loss/hidden": 1.4765625, "loss/logits": 0.2629888355731964, "loss/reg": 7.906236714916304e-05, "step": 2459 }, { "epoch": 0.3075, "grad_norm": 2.8151590824127197, "grad_norm_var": 53.98160394313485, "learning_rate": 0.0001, "loss": 1.3012, "loss/crossentropy": 2.52325439453125, "loss/hidden": 1.1171875, "loss/logits": 0.18327173590660095, "loss/reg": 7.902235665824264e-05, "step": 2460 }, { "epoch": 0.307625, "grad_norm": 2.8208816051483154, "grad_norm_var": 0.12168291865947746, "learning_rate": 0.0001, "loss": 1.4857, "loss/crossentropy": 2.494373321533203, "loss/hidden": 1.25, "loss/logits": 0.23495443165302277, "loss/reg": 7.89829864515923e-05, "step": 2461 }, { "epoch": 0.30775, "grad_norm": 2.8707544803619385, "grad_norm_var": 0.12080291344102699, "learning_rate": 0.0001, "loss": 1.4654, "loss/crossentropy": 2.534925699234009, "loss/hidden": 1.2265625, "loss/logits": 0.2380102276802063, "loss/reg": 7.893530710134655e-05, "step": 2462 }, { "epoch": 0.307875, "grad_norm": 3.8814547061920166, "grad_norm_var": 0.2027161778748289, "learning_rate": 0.0001, "loss": 1.7628, "loss/crossentropy": 1.9941784143447876, "loss/hidden": 1.5546875, "loss/logits": 0.2073415219783783, "loss/reg": 7.888730033300817e-05, "step": 2463 }, { "epoch": 0.308, "grad_norm": 2.3772897720336914, "grad_norm_var": 0.2006188504114031, "learning_rate": 0.0001, "loss": 1.3989, "loss/crossentropy": 2.32580828666687, "loss/hidden": 1.21875, "loss/logits": 0.17934134602546692, "loss/reg": 7.885072409408167e-05, "step": 2464 }, { "epoch": 0.308125, "grad_norm": 2.8482069969177246, "grad_norm_var": 0.16247434245738457, "learning_rate": 0.0001, "loss": 1.5811, "loss/crossentropy": 2.2356762886047363, "loss/hidden": 1.3203125, "loss/logits": 0.2599979043006897, "loss/reg": 7.880536577431485e-05, "step": 2465 }, { "epoch": 0.30825, "grad_norm": 3.0257906913757324, "grad_norm_var": 0.16819036185081185, "learning_rate": 0.0001, "loss": 1.5003, "loss/crossentropy": 2.326582431793213, "loss/hidden": 1.265625, "loss/logits": 0.23389410972595215, "loss/reg": 7.876929157646373e-05, "step": 2466 }, { "epoch": 0.308375, "grad_norm": 2.1856589317321777, "grad_norm_var": 0.1685999144880043, "learning_rate": 0.0001, "loss": 1.2533, "loss/crossentropy": 2.577054023742676, "loss/hidden": 1.0703125, "loss/logits": 0.1821662187576294, "loss/reg": 7.873309368733317e-05, "step": 2467 }, { "epoch": 0.3085, "grad_norm": 2.408513307571411, "grad_norm_var": 0.1662123300594421, "learning_rate": 0.0001, "loss": 1.2973, "loss/crossentropy": 2.587730884552002, "loss/hidden": 1.1171875, "loss/logits": 0.17933174967765808, "loss/reg": 7.869712862884626e-05, "step": 2468 }, { "epoch": 0.308625, "grad_norm": 3.1724202632904053, "grad_norm_var": 0.17053482414903462, "learning_rate": 0.0001, "loss": 1.4553, "loss/crossentropy": 2.5552852153778076, "loss/hidden": 1.2578125, "loss/logits": 0.19672027230262756, "loss/reg": 7.86641103331931e-05, "step": 2469 }, { "epoch": 0.30875, "grad_norm": 3.330458641052246, "grad_norm_var": 0.1840442418864304, "learning_rate": 0.0001, "loss": 1.37, "loss/crossentropy": 2.76078200340271, "loss/hidden": 1.1875, "loss/logits": 0.18175700306892395, "loss/reg": 7.86327727837488e-05, "step": 2470 }, { "epoch": 0.308875, "grad_norm": 2.8682022094726562, "grad_norm_var": 0.17845683836727796, "learning_rate": 0.0001, "loss": 1.4204, "loss/crossentropy": 2.2447423934936523, "loss/hidden": 1.234375, "loss/logits": 0.18521176278591156, "loss/reg": 7.859925244702026e-05, "step": 2471 }, { "epoch": 0.309, "grad_norm": 2.136385679244995, "grad_norm_var": 0.19115629984354973, "learning_rate": 0.0001, "loss": 1.0795, "loss/crossentropy": 2.3989346027374268, "loss/hidden": 0.9296875, "loss/logits": 0.14899390935897827, "loss/reg": 7.855860167182982e-05, "step": 2472 }, { "epoch": 0.309125, "grad_norm": 3.3988373279571533, "grad_norm_var": 0.20803080843688804, "learning_rate": 0.0001, "loss": 1.2973, "loss/crossentropy": 2.947479248046875, "loss/hidden": 1.1171875, "loss/logits": 0.179362952709198, "loss/reg": 7.851863483665511e-05, "step": 2473 }, { "epoch": 0.30925, "grad_norm": 2.229759693145752, "grad_norm_var": 0.2325589428295558, "learning_rate": 0.0001, "loss": 1.3593, "loss/crossentropy": 2.448793411254883, "loss/hidden": 1.171875, "loss/logits": 0.18661609292030334, "loss/reg": 7.84845178714022e-05, "step": 2474 }, { "epoch": 0.309375, "grad_norm": 5.719300746917725, "grad_norm_var": 0.7555315050840173, "learning_rate": 0.0001, "loss": 1.7305, "loss/crossentropy": 2.7119076251983643, "loss/hidden": 1.578125, "loss/logits": 0.15157337486743927, "loss/reg": 7.845018262742087e-05, "step": 2475 }, { "epoch": 0.3095, "grad_norm": 2.1197574138641357, "grad_norm_var": 0.803410149473468, "learning_rate": 0.0001, "loss": 1.3099, "loss/crossentropy": 2.3246421813964844, "loss/hidden": 1.109375, "loss/logits": 0.19971305131912231, "loss/reg": 7.841681508580223e-05, "step": 2476 }, { "epoch": 0.309625, "grad_norm": 2.460038661956787, "grad_norm_var": 0.8183426990653475, "learning_rate": 0.0001, "loss": 1.2352, "loss/crossentropy": 2.6255531311035156, "loss/hidden": 1.078125, "loss/logits": 0.1562531590461731, "loss/reg": 7.83847935963422e-05, "step": 2477 }, { "epoch": 0.30975, "grad_norm": 2.6131904125213623, "grad_norm_var": 0.8248515326828527, "learning_rate": 0.0001, "loss": 1.4781, "loss/crossentropy": 2.3561809062957764, "loss/hidden": 1.2578125, "loss/logits": 0.21954363584518433, "loss/reg": 7.835307769710198e-05, "step": 2478 }, { "epoch": 0.309875, "grad_norm": 2.5678606033325195, "grad_norm_var": 0.7649072632127011, "learning_rate": 0.0001, "loss": 1.384, "loss/crossentropy": 2.4529669284820557, "loss/hidden": 1.1875, "loss/logits": 0.19567444920539856, "loss/reg": 7.832497067283839e-05, "step": 2479 }, { "epoch": 0.31, "grad_norm": 2.642906427383423, "grad_norm_var": 0.7528816681666984, "learning_rate": 0.0001, "loss": 1.5825, "loss/crossentropy": 2.6808021068573, "loss/hidden": 1.328125, "loss/logits": 0.2536073923110962, "loss/reg": 7.828955858713016e-05, "step": 2480 }, { "epoch": 0.310125, "grad_norm": 2.2595791816711426, "grad_norm_var": 0.775301935935935, "learning_rate": 0.0001, "loss": 1.4553, "loss/crossentropy": 2.6597342491149902, "loss/hidden": 1.2265625, "loss/logits": 0.22799740731716156, "loss/reg": 7.826214277883992e-05, "step": 2481 }, { "epoch": 0.31025, "grad_norm": 2.248966932296753, "grad_norm_var": 0.7918236005240147, "learning_rate": 0.0001, "loss": 1.3208, "loss/crossentropy": 2.644463539123535, "loss/hidden": 1.125, "loss/logits": 0.19501683115959167, "loss/reg": 7.82259157858789e-05, "step": 2482 }, { "epoch": 0.310375, "grad_norm": 2.3729095458984375, "grad_norm_var": 0.7793606460514326, "learning_rate": 0.0001, "loss": 1.3079, "loss/crossentropy": 2.591496229171753, "loss/hidden": 1.1171875, "loss/logits": 0.1899484097957611, "loss/reg": 7.819272286724299e-05, "step": 2483 }, { "epoch": 0.3105, "grad_norm": 3.0870368480682373, "grad_norm_var": 0.7741363112325378, "learning_rate": 0.0001, "loss": 1.3819, "loss/crossentropy": 2.5221118927001953, "loss/hidden": 1.1796875, "loss/logits": 0.20143738389015198, "loss/reg": 7.815763092366979e-05, "step": 2484 }, { "epoch": 0.310625, "grad_norm": 2.2150983810424805, "grad_norm_var": 0.7872899178219311, "learning_rate": 0.0001, "loss": 1.1351, "loss/crossentropy": 2.440220832824707, "loss/hidden": 0.9921875, "loss/logits": 0.14217165112495422, "loss/reg": 7.812066178303212e-05, "step": 2485 }, { "epoch": 0.31075, "grad_norm": 2.5775105953216553, "grad_norm_var": 0.7661450083142843, "learning_rate": 0.0001, "loss": 1.3615, "loss/crossentropy": 2.776982545852661, "loss/hidden": 1.171875, "loss/logits": 0.18885642290115356, "loss/reg": 7.808783266227692e-05, "step": 2486 }, { "epoch": 0.310875, "grad_norm": 2.2068400382995605, "grad_norm_var": 0.7803991355114164, "learning_rate": 0.0001, "loss": 1.3371, "loss/crossentropy": 2.442225217819214, "loss/hidden": 1.15625, "loss/logits": 0.18001991510391235, "loss/reg": 7.805578206898645e-05, "step": 2487 }, { "epoch": 0.311, "grad_norm": 3.847132921218872, "grad_norm_var": 0.8396593728394806, "learning_rate": 0.0001, "loss": 1.5103, "loss/crossentropy": 2.4761884212493896, "loss/hidden": 1.265625, "loss/logits": 0.2438647449016571, "loss/reg": 7.802238542353734e-05, "step": 2488 }, { "epoch": 0.311125, "grad_norm": 2.677494525909424, "grad_norm_var": 0.8131824822783488, "learning_rate": 0.0001, "loss": 1.4889, "loss/crossentropy": 2.597899913787842, "loss/hidden": 1.2578125, "loss/logits": 0.23030102252960205, "loss/reg": 7.798484875820577e-05, "step": 2489 }, { "epoch": 0.31125, "grad_norm": 2.6941115856170654, "grad_norm_var": 0.7950472630572515, "learning_rate": 0.0001, "loss": 1.2416, "loss/crossentropy": 2.58589506149292, "loss/hidden": 1.0625, "loss/logits": 0.17834025621414185, "loss/reg": 7.795458805048838e-05, "step": 2490 }, { "epoch": 0.311375, "grad_norm": 6.1158223152160645, "grad_norm_var": 0.9608361984031935, "learning_rate": 0.0001, "loss": 2.4396, "loss/crossentropy": 2.9349818229675293, "loss/hidden": 1.8046875, "loss/logits": 0.6341601610183716, "loss/reg": 7.79182228143327e-05, "step": 2491 }, { "epoch": 0.3115, "grad_norm": 2.72094464302063, "grad_norm_var": 0.9293678867942011, "learning_rate": 0.0001, "loss": 1.3399, "loss/crossentropy": 2.679840326309204, "loss/hidden": 1.15625, "loss/logits": 0.18287675082683563, "loss/reg": 7.788587390678003e-05, "step": 2492 }, { "epoch": 0.311625, "grad_norm": 2.2249398231506348, "grad_norm_var": 0.9444731171158727, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.2970497608184814, "loss/hidden": 1.2109375, "loss/logits": 0.21132957935333252, "loss/reg": 7.785234629409388e-05, "step": 2493 }, { "epoch": 0.31175, "grad_norm": 3.4726290702819824, "grad_norm_var": 0.9672804114354143, "learning_rate": 0.0001, "loss": 1.6415, "loss/crossentropy": 2.9574460983276367, "loss/hidden": 1.3984375, "loss/logits": 0.24229782819747925, "loss/reg": 7.782191823935136e-05, "step": 2494 }, { "epoch": 0.311875, "grad_norm": 2.8716678619384766, "grad_norm_var": 0.9607803048925364, "learning_rate": 0.0001, "loss": 1.595, "loss/crossentropy": 2.4191701412200928, "loss/hidden": 1.3359375, "loss/logits": 0.258322149515152, "loss/reg": 7.778344297548756e-05, "step": 2495 }, { "epoch": 0.312, "grad_norm": 4.6444783210754395, "grad_norm_var": 1.1453035722212046, "learning_rate": 0.0001, "loss": 1.805, "loss/crossentropy": 2.354905366897583, "loss/hidden": 1.546875, "loss/logits": 0.2573423981666565, "loss/reg": 7.774861296638846e-05, "step": 2496 }, { "epoch": 0.312125, "grad_norm": 2.929410219192505, "grad_norm_var": 1.1058942702075794, "learning_rate": 0.0001, "loss": 1.4098, "loss/crossentropy": 2.4823200702667236, "loss/hidden": 1.21875, "loss/logits": 0.1902921199798584, "loss/reg": 7.771390664856881e-05, "step": 2497 }, { "epoch": 0.31225, "grad_norm": 23.662355422973633, "grad_norm_var": 27.458082915377982, "learning_rate": 0.0001, "loss": 1.3473, "loss/crossentropy": 2.613009452819824, "loss/hidden": 1.1640625, "loss/logits": 0.18242289125919342, "loss/reg": 7.768178329570219e-05, "step": 2498 }, { "epoch": 0.312375, "grad_norm": 3.0880818367004395, "grad_norm_var": 27.297228525918126, "learning_rate": 0.0001, "loss": 1.4903, "loss/crossentropy": 2.4031972885131836, "loss/hidden": 1.265625, "loss/logits": 0.2238694280385971, "loss/reg": 7.764925976516679e-05, "step": 2499 }, { "epoch": 0.3125, "grad_norm": 3.0730206966400146, "grad_norm_var": 27.299768729804033, "learning_rate": 0.0001, "loss": 1.3276, "loss/crossentropy": 2.364023208618164, "loss/hidden": 1.15625, "loss/logits": 0.17059257626533508, "loss/reg": 7.76156157371588e-05, "step": 2500 }, { "epoch": 0.312625, "grad_norm": 3.005875825881958, "grad_norm_var": 27.10438651403611, "learning_rate": 0.0001, "loss": 1.4113, "loss/crossentropy": 2.8109560012817383, "loss/hidden": 1.1796875, "loss/logits": 0.23086920380592346, "loss/reg": 7.758261926937848e-05, "step": 2501 }, { "epoch": 0.31275, "grad_norm": 2.0928704738616943, "grad_norm_var": 27.242537005632943, "learning_rate": 0.0001, "loss": 1.2635, "loss/crossentropy": 2.7711355686187744, "loss/hidden": 1.0859375, "loss/logits": 0.17674031853675842, "loss/reg": 7.755256228847429e-05, "step": 2502 }, { "epoch": 0.312875, "grad_norm": 3.3940582275390625, "grad_norm_var": 26.97428408078233, "learning_rate": 0.0001, "loss": 1.7018, "loss/crossentropy": 2.825003147125244, "loss/hidden": 1.375, "loss/logits": 0.32600873708724976, "loss/reg": 7.752077362965792e-05, "step": 2503 }, { "epoch": 0.313, "grad_norm": 2.5844719409942627, "grad_norm_var": 27.189259734743835, "learning_rate": 0.0001, "loss": 1.1801, "loss/crossentropy": 2.6539602279663086, "loss/hidden": 1.0234375, "loss/logits": 0.15588229894638062, "loss/reg": 7.7481206972152e-05, "step": 2504 }, { "epoch": 0.313125, "grad_norm": 2.651426315307617, "grad_norm_var": 27.195474359646024, "learning_rate": 0.0001, "loss": 1.5167, "loss/crossentropy": 2.7718629837036133, "loss/hidden": 1.25, "loss/logits": 0.26588618755340576, "loss/reg": 7.743845344521105e-05, "step": 2505 }, { "epoch": 0.31325, "grad_norm": 2.679337978363037, "grad_norm_var": 27.198949996181945, "learning_rate": 0.0001, "loss": 1.0788, "loss/crossentropy": 2.2336440086364746, "loss/hidden": 0.9375, "loss/logits": 0.14055493474006653, "loss/reg": 7.740613364148885e-05, "step": 2506 }, { "epoch": 0.313375, "grad_norm": 2.510984182357788, "grad_norm_var": 27.21080150010058, "learning_rate": 0.0001, "loss": 1.4279, "loss/crossentropy": 2.439392328262329, "loss/hidden": 1.1875, "loss/logits": 0.23965178430080414, "loss/reg": 7.737482519587502e-05, "step": 2507 }, { "epoch": 0.3135, "grad_norm": 2.33518385887146, "grad_norm_var": 27.297484019937396, "learning_rate": 0.0001, "loss": 1.2709, "loss/crossentropy": 2.662557601928711, "loss/hidden": 1.109375, "loss/logits": 0.16071800887584686, "loss/reg": 7.734618702670559e-05, "step": 2508 }, { "epoch": 0.313625, "grad_norm": 2.0442068576812744, "grad_norm_var": 27.347151324685026, "learning_rate": 0.0001, "loss": 1.1277, "loss/crossentropy": 2.493053913116455, "loss/hidden": 0.96875, "loss/logits": 0.1582036316394806, "loss/reg": 7.731816731393337e-05, "step": 2509 }, { "epoch": 0.31375, "grad_norm": 2.0511441230773926, "grad_norm_var": 27.60940500434364, "learning_rate": 0.0001, "loss": 1.1993, "loss/crossentropy": 2.3783648014068604, "loss/hidden": 1.03125, "loss/logits": 0.16732577979564667, "loss/reg": 7.729270873824134e-05, "step": 2510 }, { "epoch": 0.313875, "grad_norm": 4.635650634765625, "grad_norm_var": 27.514708271096243, "learning_rate": 0.0001, "loss": 1.3632, "loss/crossentropy": 2.600620746612549, "loss/hidden": 1.171875, "loss/logits": 0.19058892130851746, "loss/reg": 7.726315379841253e-05, "step": 2511 }, { "epoch": 0.314, "grad_norm": 3.1055922508239746, "grad_norm_var": 27.573859836710863, "learning_rate": 0.0001, "loss": 1.5459, "loss/crossentropy": 2.329669237136841, "loss/hidden": 1.3046875, "loss/logits": 0.24044546484947205, "loss/reg": 7.722721784375608e-05, "step": 2512 }, { "epoch": 0.314125, "grad_norm": 2.1689419746398926, "grad_norm_var": 27.730241380571673, "learning_rate": 0.0001, "loss": 1.187, "loss/crossentropy": 2.0686306953430176, "loss/hidden": 1.0546875, "loss/logits": 0.13158056139945984, "loss/reg": 7.719744462519884e-05, "step": 2513 }, { "epoch": 0.31425, "grad_norm": 3.134702205657959, "grad_norm_var": 0.4358037971547179, "learning_rate": 0.0001, "loss": 1.6648, "loss/crossentropy": 2.5813217163085938, "loss/hidden": 1.390625, "loss/logits": 0.27339380979537964, "loss/reg": 7.716610707575455e-05, "step": 2514 }, { "epoch": 0.314375, "grad_norm": 2.2433063983917236, "grad_norm_var": 0.44623716652064616, "learning_rate": 0.0001, "loss": 1.2591, "loss/crossentropy": 2.625002384185791, "loss/hidden": 1.078125, "loss/logits": 0.1801910400390625, "loss/reg": 7.713311788393185e-05, "step": 2515 }, { "epoch": 0.3145, "grad_norm": 2.7379400730133057, "grad_norm_var": 0.43801525828695276, "learning_rate": 0.0001, "loss": 1.3364, "loss/crossentropy": 2.438453435897827, "loss/hidden": 1.140625, "loss/logits": 0.19505375623703003, "loss/reg": 7.709407509537414e-05, "step": 2516 }, { "epoch": 0.314625, "grad_norm": 2.395873546600342, "grad_norm_var": 0.43728679967384565, "learning_rate": 0.0001, "loss": 1.2902, "loss/crossentropy": 2.8403115272521973, "loss/hidden": 1.1171875, "loss/logits": 0.17226523160934448, "loss/reg": 7.705182360950857e-05, "step": 2517 }, { "epoch": 0.31475, "grad_norm": 2.065492630004883, "grad_norm_var": 0.43945081227840604, "learning_rate": 0.0001, "loss": 1.2315, "loss/crossentropy": 2.6837544441223145, "loss/hidden": 1.0625, "loss/logits": 0.16822361946105957, "loss/reg": 7.701461436226964e-05, "step": 2518 }, { "epoch": 0.314875, "grad_norm": 2.698315143585205, "grad_norm_var": 0.4026428414900314, "learning_rate": 0.0001, "loss": 1.5297, "loss/crossentropy": 2.6952924728393555, "loss/hidden": 1.265625, "loss/logits": 0.26333916187286377, "loss/reg": 7.698121771682054e-05, "step": 2519 }, { "epoch": 0.315, "grad_norm": 2.115609884262085, "grad_norm_var": 0.419082256729241, "learning_rate": 0.0001, "loss": 1.3009, "loss/crossentropy": 2.5197911262512207, "loss/hidden": 1.109375, "loss/logits": 0.1907692402601242, "loss/reg": 7.695573731325567e-05, "step": 2520 }, { "epoch": 0.315125, "grad_norm": 2.4453351497650146, "grad_norm_var": 0.42027856571745187, "learning_rate": 0.0001, "loss": 1.3111, "loss/crossentropy": 2.655104875564575, "loss/hidden": 1.15625, "loss/logits": 0.15410733222961426, "loss/reg": 7.692284270888194e-05, "step": 2521 }, { "epoch": 0.31525, "grad_norm": 2.753488779067993, "grad_norm_var": 0.42155020360769035, "learning_rate": 0.0001, "loss": 1.4152, "loss/crossentropy": 2.637538433074951, "loss/hidden": 1.203125, "loss/logits": 0.2113218605518341, "loss/reg": 7.689151243539527e-05, "step": 2522 }, { "epoch": 0.315375, "grad_norm": 3.8403470516204834, "grad_norm_var": 0.5179755475786768, "learning_rate": 0.0001, "loss": 1.5765, "loss/crossentropy": 1.93824303150177, "loss/hidden": 1.3515625, "loss/logits": 0.22418013215065002, "loss/reg": 7.686335447942838e-05, "step": 2523 }, { "epoch": 0.3155, "grad_norm": 2.992427110671997, "grad_norm_var": 0.5153527941115716, "learning_rate": 0.0001, "loss": 1.4306, "loss/crossentropy": 2.517690420150757, "loss/hidden": 1.2109375, "loss/logits": 0.21891675889492035, "loss/reg": 7.68398167565465e-05, "step": 2524 }, { "epoch": 0.315625, "grad_norm": 2.0606908798217773, "grad_norm_var": 0.5138970579828478, "learning_rate": 0.0001, "loss": 1.2711, "loss/crossentropy": 2.407917022705078, "loss/hidden": 1.1015625, "loss/logits": 0.16877859830856323, "loss/reg": 7.68047320889309e-05, "step": 2525 }, { "epoch": 0.31575, "grad_norm": 3.082048177719116, "grad_norm_var": 0.48902846104957975, "learning_rate": 0.0001, "loss": 1.2697, "loss/crossentropy": 2.8766891956329346, "loss/hidden": 1.09375, "loss/logits": 0.17522041499614716, "loss/reg": 7.677039684494957e-05, "step": 2526 }, { "epoch": 0.315875, "grad_norm": 2.4649159908294678, "grad_norm_var": 0.24637400253066416, "learning_rate": 0.0001, "loss": 1.411, "loss/crossentropy": 2.6688413619995117, "loss/hidden": 1.1953125, "loss/logits": 0.21490904688835144, "loss/reg": 7.673462823731825e-05, "step": 2527 }, { "epoch": 0.316, "grad_norm": 4.994858741760254, "grad_norm_var": 0.5857169247163974, "learning_rate": 0.0001, "loss": 1.9482, "loss/crossentropy": 2.3100836277008057, "loss/hidden": 1.71875, "loss/logits": 0.22866591811180115, "loss/reg": 7.669805199839175e-05, "step": 2528 }, { "epoch": 0.316125, "grad_norm": 2.1626980304718018, "grad_norm_var": 0.5862132169033951, "learning_rate": 0.0001, "loss": 1.2321, "loss/crossentropy": 2.3412845134735107, "loss/hidden": 1.0703125, "loss/logits": 0.1609715223312378, "loss/reg": 7.666328019695356e-05, "step": 2529 }, { "epoch": 0.31625, "grad_norm": 2.3346352577209473, "grad_norm_var": 0.5864353462390066, "learning_rate": 0.0001, "loss": 1.2292, "loss/crossentropy": 2.5466227531433105, "loss/hidden": 1.0625, "loss/logits": 0.1659517139196396, "loss/reg": 7.663120049983263e-05, "step": 2530 }, { "epoch": 0.316375, "grad_norm": 2.128408908843994, "grad_norm_var": 0.5944368185587012, "learning_rate": 0.0001, "loss": 1.3069, "loss/crossentropy": 2.440068244934082, "loss/hidden": 1.1328125, "loss/logits": 0.1732938438653946, "loss/reg": 7.660074334125966e-05, "step": 2531 }, { "epoch": 0.3165, "grad_norm": 2.5309929847717285, "grad_norm_var": 0.5961926738032882, "learning_rate": 0.0001, "loss": 1.3724, "loss/crossentropy": 2.693559408187866, "loss/hidden": 1.171875, "loss/logits": 0.1998092085123062, "loss/reg": 7.656928210053593e-05, "step": 2532 }, { "epoch": 0.316625, "grad_norm": 2.777916193008423, "grad_norm_var": 0.5902492310240971, "learning_rate": 0.0001, "loss": 1.4687, "loss/crossentropy": 2.3920562267303467, "loss/hidden": 1.265625, "loss/logits": 0.20234429836273193, "loss/reg": 7.65336153563112e-05, "step": 2533 }, { "epoch": 0.31675, "grad_norm": 2.9262938499450684, "grad_norm_var": 0.5619554862571038, "learning_rate": 0.0001, "loss": 1.3635, "loss/crossentropy": 2.517133951187134, "loss/hidden": 1.171875, "loss/logits": 0.1908150613307953, "loss/reg": 7.64992946642451e-05, "step": 2534 }, { "epoch": 0.316875, "grad_norm": 2.3383100032806396, "grad_norm_var": 0.5734635857409397, "learning_rate": 0.0001, "loss": 1.2703, "loss/crossentropy": 2.6534667015075684, "loss/hidden": 1.1015625, "loss/logits": 0.16795837879180908, "loss/reg": 7.646476296940818e-05, "step": 2535 }, { "epoch": 0.317, "grad_norm": 2.1641428470611572, "grad_norm_var": 0.5695262594998023, "learning_rate": 0.0001, "loss": 1.2516, "loss/crossentropy": 2.3692572116851807, "loss/hidden": 1.0859375, "loss/logits": 0.1649346649646759, "loss/reg": 7.643085700692609e-05, "step": 2536 }, { "epoch": 0.317125, "grad_norm": 2.0314042568206787, "grad_norm_var": 0.5970410367591322, "learning_rate": 0.0001, "loss": 1.1793, "loss/crossentropy": 2.4533584117889404, "loss/hidden": 1.0078125, "loss/logits": 0.17075516283512115, "loss/reg": 7.639366958756e-05, "step": 2537 }, { "epoch": 0.31725, "grad_norm": 2.232072591781616, "grad_norm_var": 0.6119812616890755, "learning_rate": 0.0001, "loss": 1.3368, "loss/crossentropy": 2.5370779037475586, "loss/hidden": 1.15625, "loss/logits": 0.1797570288181305, "loss/reg": 7.635747897438705e-05, "step": 2538 }, { "epoch": 0.317375, "grad_norm": 2.278836727142334, "grad_norm_var": 0.5251604741504969, "learning_rate": 0.0001, "loss": 1.3516, "loss/crossentropy": 2.5069193840026855, "loss/hidden": 1.15625, "loss/logits": 0.19454717636108398, "loss/reg": 7.632358028786257e-05, "step": 2539 }, { "epoch": 0.3175, "grad_norm": 2.36574125289917, "grad_norm_var": 0.5163971221852525, "learning_rate": 0.0001, "loss": 1.2673, "loss/crossentropy": 2.767737627029419, "loss/hidden": 1.1015625, "loss/logits": 0.1649796962738037, "loss/reg": 7.629060564795509e-05, "step": 2540 }, { "epoch": 0.317625, "grad_norm": 2.684457778930664, "grad_norm_var": 0.4996351495711972, "learning_rate": 0.0001, "loss": 1.5724, "loss/crossentropy": 2.370954751968384, "loss/hidden": 1.328125, "loss/logits": 0.24355752766132355, "loss/reg": 7.624427234986797e-05, "step": 2541 }, { "epoch": 0.31775, "grad_norm": 2.6527912616729736, "grad_norm_var": 0.4831960034116757, "learning_rate": 0.0001, "loss": 1.4755, "loss/crossentropy": 2.3622536659240723, "loss/hidden": 1.2578125, "loss/logits": 0.21697068214416504, "loss/reg": 7.620074757141992e-05, "step": 2542 }, { "epoch": 0.317875, "grad_norm": 2.3055875301361084, "grad_norm_var": 0.48694657450286816, "learning_rate": 0.0001, "loss": 1.3776, "loss/crossentropy": 2.4542124271392822, "loss/hidden": 1.171875, "loss/logits": 0.20496246218681335, "loss/reg": 7.61565170250833e-05, "step": 2543 }, { "epoch": 0.318, "grad_norm": 2.282367706298828, "grad_norm_var": 0.06504325757903946, "learning_rate": 0.0001, "loss": 1.3809, "loss/crossentropy": 2.434579610824585, "loss/hidden": 1.171875, "loss/logits": 0.2082221508026123, "loss/reg": 7.611144974362105e-05, "step": 2544 }, { "epoch": 0.318125, "grad_norm": 2.8612184524536133, "grad_norm_var": 0.07462122100450089, "learning_rate": 0.0001, "loss": 1.3344, "loss/crossentropy": 2.4470958709716797, "loss/hidden": 1.140625, "loss/logits": 0.19298559427261353, "loss/reg": 7.607592851854861e-05, "step": 2545 }, { "epoch": 0.31825, "grad_norm": 3.735902786254883, "grad_norm_var": 0.17934837099987605, "learning_rate": 0.0001, "loss": 1.8881, "loss/crossentropy": 2.5656769275665283, "loss/hidden": 1.5234375, "loss/logits": 0.3638624846935272, "loss/reg": 7.60443726903759e-05, "step": 2546 }, { "epoch": 0.318375, "grad_norm": 2.4728028774261475, "grad_norm_var": 0.16884737464624683, "learning_rate": 0.0001, "loss": 1.1898, "loss/crossentropy": 2.3154425621032715, "loss/hidden": 1.03125, "loss/logits": 0.15780413150787354, "loss/reg": 7.601500692544505e-05, "step": 2547 }, { "epoch": 0.3185, "grad_norm": 2.392200231552124, "grad_norm_var": 0.17021899055165832, "learning_rate": 0.0001, "loss": 1.3846, "loss/crossentropy": 2.4135842323303223, "loss/hidden": 1.203125, "loss/logits": 0.18073639273643494, "loss/reg": 7.598802039865404e-05, "step": 2548 }, { "epoch": 0.318625, "grad_norm": 2.771801710128784, "grad_norm_var": 0.17002033334067596, "learning_rate": 0.0001, "loss": 1.4705, "loss/crossentropy": 2.329624652862549, "loss/hidden": 1.265625, "loss/logits": 0.2041211575269699, "loss/reg": 7.595864735776559e-05, "step": 2549 }, { "epoch": 0.31875, "grad_norm": 2.5758466720581055, "grad_norm_var": 0.1592253456667888, "learning_rate": 0.0001, "loss": 1.2931, "loss/crossentropy": 2.533447265625, "loss/hidden": 1.109375, "loss/logits": 0.18296372890472412, "loss/reg": 7.592555630253628e-05, "step": 2550 }, { "epoch": 0.318875, "grad_norm": 3.436654567718506, "grad_norm_var": 0.20961244807037835, "learning_rate": 0.0001, "loss": 1.6246, "loss/crossentropy": 2.54769229888916, "loss/hidden": 1.3671875, "loss/logits": 0.25662529468536377, "loss/reg": 7.589293090859428e-05, "step": 2551 }, { "epoch": 0.319, "grad_norm": 2.2696189880371094, "grad_norm_var": 0.20449116599060227, "learning_rate": 0.0001, "loss": 1.3376, "loss/crossentropy": 2.450474739074707, "loss/hidden": 1.140625, "loss/logits": 0.19620829820632935, "loss/reg": 7.586183346575126e-05, "step": 2552 }, { "epoch": 0.319125, "grad_norm": 2.8527655601501465, "grad_norm_var": 0.18610206706658042, "learning_rate": 0.0001, "loss": 1.3273, "loss/crossentropy": 2.4960756301879883, "loss/hidden": 1.15625, "loss/logits": 0.1702694296836853, "loss/reg": 7.583201659144834e-05, "step": 2553 }, { "epoch": 0.31925, "grad_norm": 2.830066680908203, "grad_norm_var": 0.1762722922665354, "learning_rate": 0.0001, "loss": 1.5032, "loss/crossentropy": 2.6321537494659424, "loss/hidden": 1.234375, "loss/logits": 0.2680504024028778, "loss/reg": 7.580281817354262e-05, "step": 2554 }, { "epoch": 0.319375, "grad_norm": 2.365748405456543, "grad_norm_var": 0.17217626396401114, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.520787000656128, "loss/hidden": 1.1875, "loss/logits": 0.21935530006885529, "loss/reg": 7.577749056508765e-05, "step": 2555 }, { "epoch": 0.3195, "grad_norm": 2.3638808727264404, "grad_norm_var": 0.17225405367214536, "learning_rate": 0.0001, "loss": 1.3894, "loss/crossentropy": 2.526634693145752, "loss/hidden": 1.1796875, "loss/logits": 0.20891103148460388, "loss/reg": 7.574522169306874e-05, "step": 2556 }, { "epoch": 0.319625, "grad_norm": 2.056288480758667, "grad_norm_var": 0.19640536952956394, "learning_rate": 0.0001, "loss": 1.2171, "loss/crossentropy": 2.667778491973877, "loss/hidden": 1.0546875, "loss/logits": 0.16170328855514526, "loss/reg": 7.57065718062222e-05, "step": 2557 }, { "epoch": 0.31975, "grad_norm": 4.0994672775268555, "grad_norm_var": 0.3298514370543283, "learning_rate": 0.0001, "loss": 1.4393, "loss/crossentropy": 2.5001070499420166, "loss/hidden": 1.25, "loss/logits": 0.18855181336402893, "loss/reg": 7.567323336843401e-05, "step": 2558 }, { "epoch": 0.319875, "grad_norm": 2.060448169708252, "grad_norm_var": 0.3474633998831732, "learning_rate": 0.0001, "loss": 1.224, "loss/crossentropy": 2.439638376235962, "loss/hidden": 1.0546875, "loss/logits": 0.16858191788196564, "loss/reg": 7.563850522274151e-05, "step": 2559 }, { "epoch": 0.32, "grad_norm": 2.489480495452881, "grad_norm_var": 0.3382195242390156, "learning_rate": 0.0001, "loss": 1.4595, "loss/crossentropy": 2.420964479446411, "loss/hidden": 1.234375, "loss/logits": 0.22441241145133972, "loss/reg": 7.560083759017289e-05, "step": 2560 }, { "epoch": 0.320125, "grad_norm": 2.301086664199829, "grad_norm_var": 0.3478149804034198, "learning_rate": 0.0001, "loss": 1.5204, "loss/crossentropy": 2.511054515838623, "loss/hidden": 1.3046875, "loss/logits": 0.21491993963718414, "loss/reg": 7.55578075768426e-05, "step": 2561 }, { "epoch": 0.32025, "grad_norm": 1.9328948259353638, "grad_norm_var": 0.3000679574240564, "learning_rate": 0.0001, "loss": 1.2419, "loss/crossentropy": 2.4478394985198975, "loss/hidden": 1.0703125, "loss/logits": 0.17079448699951172, "loss/reg": 7.552719762315974e-05, "step": 2562 }, { "epoch": 0.320375, "grad_norm": 9.564568519592285, "grad_norm_var": 3.3425557341687395, "learning_rate": 0.0001, "loss": 1.8778, "loss/crossentropy": 2.2141997814178467, "loss/hidden": 1.5625, "loss/logits": 0.31457704305648804, "loss/reg": 7.548509893240407e-05, "step": 2563 }, { "epoch": 0.3205, "grad_norm": 2.9880619049072266, "grad_norm_var": 3.3146562399735857, "learning_rate": 0.0001, "loss": 1.6645, "loss/crossentropy": 2.4745194911956787, "loss/hidden": 1.4140625, "loss/logits": 0.24967148900032043, "loss/reg": 7.544483378296718e-05, "step": 2564 }, { "epoch": 0.320625, "grad_norm": 4.685522556304932, "grad_norm_var": 3.470035284798161, "learning_rate": 0.0001, "loss": 1.6573, "loss/crossentropy": 2.171630382537842, "loss/hidden": 1.4375, "loss/logits": 0.21900489926338196, "loss/reg": 7.540897786384448e-05, "step": 2565 }, { "epoch": 0.32075, "grad_norm": 2.8717610836029053, "grad_norm_var": 3.451689834611354, "learning_rate": 0.0001, "loss": 1.5832, "loss/crossentropy": 2.5051934719085693, "loss/hidden": 1.3203125, "loss/logits": 0.2621336579322815, "loss/reg": 7.537499914178625e-05, "step": 2566 }, { "epoch": 0.320875, "grad_norm": 3.7210745811462402, "grad_norm_var": 3.465795427432967, "learning_rate": 0.0001, "loss": 1.5391, "loss/crossentropy": 2.2826690673828125, "loss/hidden": 1.3125, "loss/logits": 0.2258358895778656, "loss/reg": 7.533721509389579e-05, "step": 2567 }, { "epoch": 0.321, "grad_norm": 3.346745252609253, "grad_norm_var": 3.4024210496080483, "learning_rate": 0.0001, "loss": 1.852, "loss/crossentropy": 2.997385025024414, "loss/hidden": 1.5234375, "loss/logits": 0.3277701437473297, "loss/reg": 7.52885898691602e-05, "step": 2568 }, { "epoch": 0.321125, "grad_norm": 3.8939995765686035, "grad_norm_var": 3.4104354517335556, "learning_rate": 0.0001, "loss": 1.6452, "loss/crossentropy": 2.1996195316314697, "loss/hidden": 1.421875, "loss/logits": 0.22260203957557678, "loss/reg": 7.525219552917406e-05, "step": 2569 }, { "epoch": 0.32125, "grad_norm": 3.1398026943206787, "grad_norm_var": 3.395033806908272, "learning_rate": 0.0001, "loss": 1.4317, "loss/crossentropy": 2.4148857593536377, "loss/hidden": 1.234375, "loss/logits": 0.1965729296207428, "loss/reg": 7.521462248405442e-05, "step": 2570 }, { "epoch": 0.321375, "grad_norm": 2.679417371749878, "grad_norm_var": 3.3592851126310888, "learning_rate": 0.0001, "loss": 1.2655, "loss/crossentropy": 2.5166618824005127, "loss/hidden": 1.0859375, "loss/logits": 0.17883551120758057, "loss/reg": 7.517539779655635e-05, "step": 2571 }, { "epoch": 0.3215, "grad_norm": 2.4026601314544678, "grad_norm_var": 3.3540881872097237, "learning_rate": 0.0001, "loss": 1.4753, "loss/crossentropy": 2.40878963470459, "loss/hidden": 1.2578125, "loss/logits": 0.21670792996883392, "loss/reg": 7.513214222854003e-05, "step": 2572 }, { "epoch": 0.321625, "grad_norm": 2.426900863647461, "grad_norm_var": 3.296788205031549, "learning_rate": 0.0001, "loss": 1.2617, "loss/crossentropy": 2.5726068019866943, "loss/hidden": 1.1015625, "loss/logits": 0.15938934683799744, "loss/reg": 7.50991894165054e-05, "step": 2573 }, { "epoch": 0.32175, "grad_norm": 2.5598957538604736, "grad_norm_var": 3.3039626334242236, "learning_rate": 0.0001, "loss": 1.3017, "loss/crossentropy": 2.3262577056884766, "loss/hidden": 1.1171875, "loss/logits": 0.18375766277313232, "loss/reg": 7.506419206038117e-05, "step": 2574 }, { "epoch": 0.321875, "grad_norm": 3.0619027614593506, "grad_norm_var": 3.1989247323167156, "learning_rate": 0.0001, "loss": 1.4713, "loss/crossentropy": 2.386106491088867, "loss/hidden": 1.2890625, "loss/logits": 0.18150153756141663, "loss/reg": 7.503407687181607e-05, "step": 2575 }, { "epoch": 0.322, "grad_norm": 2.771030902862549, "grad_norm_var": 3.170482371604559, "learning_rate": 0.0001, "loss": 1.3755, "loss/crossentropy": 3.002222776412964, "loss/hidden": 1.1796875, "loss/logits": 0.19507187604904175, "loss/reg": 7.499873026972637e-05, "step": 2576 }, { "epoch": 0.322125, "grad_norm": 2.8079395294189453, "grad_norm_var": 3.1124961131531563, "learning_rate": 0.0001, "loss": 1.3269, "loss/crossentropy": 2.722019672393799, "loss/hidden": 1.1484375, "loss/logits": 0.17767326533794403, "loss/reg": 7.496637408621609e-05, "step": 2577 }, { "epoch": 0.32225, "grad_norm": 3.0110485553741455, "grad_norm_var": 2.970164474034569, "learning_rate": 0.0001, "loss": 1.2453, "loss/crossentropy": 2.545199155807495, "loss/hidden": 1.0625, "loss/logits": 0.18207918107509613, "loss/reg": 7.493065641028807e-05, "step": 2578 }, { "epoch": 0.322375, "grad_norm": 2.4774599075317383, "grad_norm_var": 0.3746615645796699, "learning_rate": 0.0001, "loss": 1.2026, "loss/crossentropy": 2.3781564235687256, "loss/hidden": 1.0625, "loss/logits": 0.13939476013183594, "loss/reg": 7.489934068871662e-05, "step": 2579 }, { "epoch": 0.3225, "grad_norm": 2.676772356033325, "grad_norm_var": 0.38340595925709875, "learning_rate": 0.0001, "loss": 1.3681, "loss/crossentropy": 2.563326120376587, "loss/hidden": 1.171875, "loss/logits": 0.19551552832126617, "loss/reg": 7.486381946364418e-05, "step": 2580 }, { "epoch": 0.322625, "grad_norm": 2.0783538818359375, "grad_norm_var": 0.23391390648012375, "learning_rate": 0.0001, "loss": 1.1737, "loss/crossentropy": 2.385472297668457, "loss/hidden": 1.015625, "loss/logits": 0.157316192984581, "loss/reg": 7.482981163775548e-05, "step": 2581 }, { "epoch": 0.32275, "grad_norm": 2.730417013168335, "grad_norm_var": 0.23513731996889245, "learning_rate": 0.0001, "loss": 1.2903, "loss/crossentropy": 2.510589122772217, "loss/hidden": 1.1171875, "loss/logits": 0.17236675322055817, "loss/reg": 7.479616033378989e-05, "step": 2582 }, { "epoch": 0.322875, "grad_norm": 2.2580180168151855, "grad_norm_var": 0.201257222390696, "learning_rate": 0.0001, "loss": 1.2982, "loss/crossentropy": 2.373061180114746, "loss/hidden": 1.1171875, "loss/logits": 0.18028365075588226, "loss/reg": 7.477033068425953e-05, "step": 2583 }, { "epoch": 0.323, "grad_norm": 2.567807674407959, "grad_norm_var": 0.1792942488659103, "learning_rate": 0.0001, "loss": 1.4173, "loss/crossentropy": 2.6175060272216797, "loss/hidden": 1.2109375, "loss/logits": 0.20564085245132446, "loss/reg": 7.473398727597669e-05, "step": 2584 }, { "epoch": 0.323125, "grad_norm": 3.716187000274658, "grad_norm_var": 0.15347145909305007, "learning_rate": 0.0001, "loss": 1.2134, "loss/crossentropy": 2.6610400676727295, "loss/hidden": 1.03125, "loss/logits": 0.18142950534820557, "loss/reg": 7.470348646165803e-05, "step": 2585 }, { "epoch": 0.32325, "grad_norm": 2.785700798034668, "grad_norm_var": 0.14103225939693, "learning_rate": 0.0001, "loss": 1.3992, "loss/crossentropy": 2.7831954956054688, "loss/hidden": 1.203125, "loss/logits": 0.19531890749931335, "loss/reg": 7.4672483606264e-05, "step": 2586 }, { "epoch": 0.323375, "grad_norm": 2.9606080055236816, "grad_norm_var": 0.1456440088297607, "learning_rate": 0.0001, "loss": 1.3471, "loss/crossentropy": 2.6884801387786865, "loss/hidden": 1.15625, "loss/logits": 0.190125972032547, "loss/reg": 7.464631198672578e-05, "step": 2587 }, { "epoch": 0.3235, "grad_norm": 2.6545186042785645, "grad_norm_var": 0.13942897599692947, "learning_rate": 0.0001, "loss": 1.6278, "loss/crossentropy": 2.5967419147491455, "loss/hidden": 1.328125, "loss/logits": 0.2989652156829834, "loss/reg": 7.461464701918885e-05, "step": 2588 }, { "epoch": 0.323625, "grad_norm": 2.0617735385894775, "grad_norm_var": 0.16210521686921217, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.352571725845337, "loss/hidden": 1.140625, "loss/logits": 0.17704597115516663, "loss/reg": 7.458699838025495e-05, "step": 2589 }, { "epoch": 0.32375, "grad_norm": 5.987517833709717, "grad_norm_var": 0.8329497947481909, "learning_rate": 0.0001, "loss": 1.6515, "loss/crossentropy": 3.145853281021118, "loss/hidden": 1.4375, "loss/logits": 0.21323521435260773, "loss/reg": 7.454834849340841e-05, "step": 2590 }, { "epoch": 0.323875, "grad_norm": 2.2021641731262207, "grad_norm_var": 0.8620709433809367, "learning_rate": 0.0001, "loss": 1.1807, "loss/crossentropy": 2.4597527980804443, "loss/hidden": 1.015625, "loss/logits": 0.16435301303863525, "loss/reg": 7.45088400435634e-05, "step": 2591 }, { "epoch": 0.324, "grad_norm": 2.6404035091400146, "grad_norm_var": 0.8646731812685609, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.601716995239258, "loss/hidden": 1.140625, "loss/logits": 0.17480887472629547, "loss/reg": 7.447485404554754e-05, "step": 2592 }, { "epoch": 0.324125, "grad_norm": 2.452908515930176, "grad_norm_var": 0.8745915363241311, "learning_rate": 0.0001, "loss": 1.3155, "loss/crossentropy": 2.3160481452941895, "loss/hidden": 1.1328125, "loss/logits": 0.18195372819900513, "loss/reg": 7.443180220434442e-05, "step": 2593 }, { "epoch": 0.32425, "grad_norm": 2.7842917442321777, "grad_norm_var": 0.8722966791290371, "learning_rate": 0.0001, "loss": 1.6166, "loss/crossentropy": 2.6415045261383057, "loss/hidden": 1.328125, "loss/logits": 0.28768253326416016, "loss/reg": 7.439417822752148e-05, "step": 2594 }, { "epoch": 0.324375, "grad_norm": 2.425776243209839, "grad_norm_var": 0.8747874749315957, "learning_rate": 0.0001, "loss": 1.3893, "loss/crossentropy": 2.53092622756958, "loss/hidden": 1.171875, "loss/logits": 0.21672961115837097, "loss/reg": 7.43583295843564e-05, "step": 2595 }, { "epoch": 0.3245, "grad_norm": 2.3943357467651367, "grad_norm_var": 0.884844891440009, "learning_rate": 0.0001, "loss": 1.3466, "loss/crossentropy": 2.5136220455169678, "loss/hidden": 1.15625, "loss/logits": 0.18961352109909058, "loss/reg": 7.432932761730626e-05, "step": 2596 }, { "epoch": 0.324625, "grad_norm": 2.763503313064575, "grad_norm_var": 0.8488260179916, "learning_rate": 0.0001, "loss": 1.5256, "loss/crossentropy": 2.559614896774292, "loss/hidden": 1.28125, "loss/logits": 0.24360765516757965, "loss/reg": 7.429746619891375e-05, "step": 2597 }, { "epoch": 0.32475, "grad_norm": 2.7811567783355713, "grad_norm_var": 0.8482684254856638, "learning_rate": 0.0001, "loss": 1.3938, "loss/crossentropy": 2.3340201377868652, "loss/hidden": 1.203125, "loss/logits": 0.18994054198265076, "loss/reg": 7.426081720041111e-05, "step": 2598 }, { "epoch": 0.324875, "grad_norm": 2.529146194458008, "grad_norm_var": 0.8318314561887138, "learning_rate": 0.0001, "loss": 1.4675, "loss/crossentropy": 2.5285966396331787, "loss/hidden": 1.21875, "loss/logits": 0.2480500191450119, "loss/reg": 7.423215720336884e-05, "step": 2599 }, { "epoch": 0.325, "grad_norm": 2.3141870498657227, "grad_norm_var": 0.8456221443644132, "learning_rate": 0.0001, "loss": 1.2813, "loss/crossentropy": 2.6163501739501953, "loss/hidden": 1.1015625, "loss/logits": 0.1789630651473999, "loss/reg": 7.419602479785681e-05, "step": 2600 }, { "epoch": 0.325125, "grad_norm": 2.7949411869049072, "grad_norm_var": 0.7911498823833767, "learning_rate": 0.0001, "loss": 1.5112, "loss/crossentropy": 2.373425006866455, "loss/hidden": 1.28125, "loss/logits": 0.22922664880752563, "loss/reg": 7.415535219479352e-05, "step": 2601 }, { "epoch": 0.32525, "grad_norm": 2.4660699367523193, "grad_norm_var": 0.7974331643492517, "learning_rate": 0.0001, "loss": 1.5056, "loss/crossentropy": 2.658745527267456, "loss/hidden": 1.2734375, "loss/logits": 0.23140352964401245, "loss/reg": 7.411878323182464e-05, "step": 2602 }, { "epoch": 0.325375, "grad_norm": 2.4811019897460938, "grad_norm_var": 0.7991908312267296, "learning_rate": 0.0001, "loss": 1.353, "loss/crossentropy": 2.767068386077881, "loss/hidden": 1.1640625, "loss/logits": 0.18819120526313782, "loss/reg": 7.407829980365932e-05, "step": 2603 }, { "epoch": 0.3255, "grad_norm": 2.0228796005249023, "grad_norm_var": 0.8307664187978955, "learning_rate": 0.0001, "loss": 1.2341, "loss/crossentropy": 2.592275857925415, "loss/hidden": 1.0546875, "loss/logits": 0.17869332432746887, "loss/reg": 7.403740164591e-05, "step": 2604 }, { "epoch": 0.325625, "grad_norm": 2.3055174350738525, "grad_norm_var": 0.8139365090315965, "learning_rate": 0.0001, "loss": 1.3499, "loss/crossentropy": 2.5113227367401123, "loss/hidden": 1.140625, "loss/logits": 0.20857837796211243, "loss/reg": 7.400261529255658e-05, "step": 2605 }, { "epoch": 0.32575, "grad_norm": 2.6348824501037598, "grad_norm_var": 0.05094322565770592, "learning_rate": 0.0001, "loss": 1.634, "loss/crossentropy": 2.3635506629943848, "loss/hidden": 1.390625, "loss/logits": 0.24260619282722473, "loss/reg": 7.396981527563184e-05, "step": 2606 }, { "epoch": 0.325875, "grad_norm": 2.076352596282959, "grad_norm_var": 0.05692160928234647, "learning_rate": 0.0001, "loss": 1.1695, "loss/crossentropy": 2.7818753719329834, "loss/hidden": 0.99609375, "loss/logits": 0.1726517379283905, "loss/reg": 7.394200656563044e-05, "step": 2607 }, { "epoch": 0.326, "grad_norm": 4.001671314239502, "grad_norm_var": 0.19972439189604357, "learning_rate": 0.0001, "loss": 1.3293, "loss/crossentropy": 2.7362420558929443, "loss/hidden": 1.15625, "loss/logits": 0.17233188450336456, "loss/reg": 7.39042807254009e-05, "step": 2608 }, { "epoch": 0.326125, "grad_norm": 2.896078109741211, "grad_norm_var": 0.20467897666896134, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.583622455596924, "loss/hidden": 1.203125, "loss/logits": 0.19628393650054932, "loss/reg": 7.38763774279505e-05, "step": 2609 }, { "epoch": 0.32625, "grad_norm": 2.5033376216888428, "grad_norm_var": 0.20287707670319127, "learning_rate": 0.0001, "loss": 1.3267, "loss/crossentropy": 2.5374817848205566, "loss/hidden": 1.1484375, "loss/logits": 0.17753323912620544, "loss/reg": 7.384094351436943e-05, "step": 2610 }, { "epoch": 0.326375, "grad_norm": 2.6469624042510986, "grad_norm_var": 0.20118201385296264, "learning_rate": 0.0001, "loss": 1.5003, "loss/crossentropy": 2.54118013381958, "loss/hidden": 1.28125, "loss/logits": 0.21827156841754913, "loss/reg": 7.381234900094569e-05, "step": 2611 }, { "epoch": 0.3265, "grad_norm": 3.752530336380005, "grad_norm_var": 0.279093801158038, "learning_rate": 0.0001, "loss": 1.7812, "loss/crossentropy": 2.364124298095703, "loss/hidden": 1.515625, "loss/logits": 0.26482686400413513, "loss/reg": 7.378402369795367e-05, "step": 2612 }, { "epoch": 0.326625, "grad_norm": 2.9372332096099854, "grad_norm_var": 0.28278369229346084, "learning_rate": 0.0001, "loss": 1.3573, "loss/crossentropy": 2.3372104167938232, "loss/hidden": 1.1875, "loss/logits": 0.16907039284706116, "loss/reg": 7.375441055046394e-05, "step": 2613 }, { "epoch": 0.32675, "grad_norm": 2.2539608478546143, "grad_norm_var": 0.2942041147280624, "learning_rate": 0.0001, "loss": 1.3511, "loss/crossentropy": 2.453409433364868, "loss/hidden": 1.15625, "loss/logits": 0.19415083527565002, "loss/reg": 7.372931577265263e-05, "step": 2614 }, { "epoch": 0.326875, "grad_norm": 2.6904680728912354, "grad_norm_var": 0.2929396213025008, "learning_rate": 0.0001, "loss": 1.4253, "loss/crossentropy": 2.8936641216278076, "loss/hidden": 1.203125, "loss/logits": 0.22141653299331665, "loss/reg": 7.369727245531976e-05, "step": 2615 }, { "epoch": 0.327, "grad_norm": 10.803741455078125, "grad_norm_var": 4.39059827054028, "learning_rate": 0.0001, "loss": 2.9419, "loss/crossentropy": 2.887760639190674, "loss/hidden": 2.4375, "loss/logits": 0.5037049055099487, "loss/reg": 7.366092904703692e-05, "step": 2616 }, { "epoch": 0.327125, "grad_norm": 2.8554131984710693, "grad_norm_var": 4.387526731031859, "learning_rate": 0.0001, "loss": 1.2681, "loss/crossentropy": 2.3685312271118164, "loss/hidden": 1.09375, "loss/logits": 0.17364603281021118, "loss/reg": 7.362548058154061e-05, "step": 2617 }, { "epoch": 0.32725, "grad_norm": 2.890333890914917, "grad_norm_var": 4.356806124824989, "learning_rate": 0.0001, "loss": 1.5132, "loss/crossentropy": 2.796135663986206, "loss/hidden": 1.2578125, "loss/logits": 0.25466370582580566, "loss/reg": 7.35948487999849e-05, "step": 2618 }, { "epoch": 0.327375, "grad_norm": 2.574460744857788, "grad_norm_var": 4.347972331116709, "learning_rate": 0.0001, "loss": 1.429, "loss/crossentropy": 2.645780086517334, "loss/hidden": 1.234375, "loss/logits": 0.19388152658939362, "loss/reg": 7.356864080065861e-05, "step": 2619 }, { "epoch": 0.3275, "grad_norm": 2.3976995944976807, "grad_norm_var": 4.295907960713602, "learning_rate": 0.0001, "loss": 1.3874, "loss/crossentropy": 2.595097780227661, "loss/hidden": 1.1796875, "loss/logits": 0.20700842142105103, "loss/reg": 7.353505498031154e-05, "step": 2620 }, { "epoch": 0.327625, "grad_norm": 8.532444953918457, "grad_norm_var": 5.923709428373779, "learning_rate": 0.0001, "loss": 1.6096, "loss/crossentropy": 2.6661887168884277, "loss/hidden": 1.3359375, "loss/logits": 0.2728992700576782, "loss/reg": 7.350248779403046e-05, "step": 2621 }, { "epoch": 0.32775, "grad_norm": 2.546067476272583, "grad_norm_var": 5.936258661409819, "learning_rate": 0.0001, "loss": 1.3495, "loss/crossentropy": 2.7812271118164062, "loss/hidden": 1.140625, "loss/logits": 0.20813050866127014, "loss/reg": 7.347230712184682e-05, "step": 2622 }, { "epoch": 0.327875, "grad_norm": 2.8623673915863037, "grad_norm_var": 5.810221167225726, "learning_rate": 0.0001, "loss": 1.3715, "loss/crossentropy": 2.5098493099212646, "loss/hidden": 1.1875, "loss/logits": 0.18325331807136536, "loss/reg": 7.344169716816396e-05, "step": 2623 }, { "epoch": 0.328, "grad_norm": 2.872593402862549, "grad_norm_var": 5.843962850773729, "learning_rate": 0.0001, "loss": 1.5187, "loss/crossentropy": 2.4206650257110596, "loss/hidden": 1.2734375, "loss/logits": 0.24450047314167023, "loss/reg": 7.340486627072096e-05, "step": 2624 }, { "epoch": 0.328125, "grad_norm": 2.2695746421813965, "grad_norm_var": 5.929466054677905, "learning_rate": 0.0001, "loss": 1.387, "loss/crossentropy": 2.56137752532959, "loss/hidden": 1.1875, "loss/logits": 0.19876404106616974, "loss/reg": 7.336409908020869e-05, "step": 2625 }, { "epoch": 0.32825, "grad_norm": 3.025198459625244, "grad_norm_var": 5.871096654857538, "learning_rate": 0.0001, "loss": 1.5002, "loss/crossentropy": 2.5404484272003174, "loss/hidden": 1.28125, "loss/logits": 0.2182619571685791, "loss/reg": 7.333447138080373e-05, "step": 2626 }, { "epoch": 0.328375, "grad_norm": 4.807386875152588, "grad_norm_var": 5.882682505729113, "learning_rate": 0.0001, "loss": 1.8265, "loss/crossentropy": 2.2216336727142334, "loss/hidden": 1.6796875, "loss/logits": 0.14607112109661102, "loss/reg": 7.330234802793711e-05, "step": 2627 }, { "epoch": 0.3285, "grad_norm": 3.1921229362487793, "grad_norm_var": 5.902455755447009, "learning_rate": 0.0001, "loss": 1.457, "loss/crossentropy": 2.6298129558563232, "loss/hidden": 1.2421875, "loss/logits": 0.2140374481678009, "loss/reg": 7.326525519602001e-05, "step": 2628 }, { "epoch": 0.328625, "grad_norm": 26.00888442993164, "grad_norm_var": 36.765028362101425, "learning_rate": 0.0001, "loss": 1.296, "loss/crossentropy": 2.7615840435028076, "loss/hidden": 1.140625, "loss/logits": 0.15467199683189392, "loss/reg": 7.323660247493535e-05, "step": 2629 }, { "epoch": 0.32875, "grad_norm": 2.545583963394165, "grad_norm_var": 36.657292645818075, "learning_rate": 0.0001, "loss": 1.2151, "loss/crossentropy": 2.7347218990325928, "loss/hidden": 1.046875, "loss/logits": 0.16748276352882385, "loss/reg": 7.319488940993324e-05, "step": 2630 }, { "epoch": 0.328875, "grad_norm": 2.398716688156128, "grad_norm_var": 36.75944206951129, "learning_rate": 0.0001, "loss": 1.3189, "loss/crossentropy": 2.6329691410064697, "loss/hidden": 1.1484375, "loss/logits": 0.16975712776184082, "loss/reg": 7.315434777410701e-05, "step": 2631 }, { "epoch": 0.329, "grad_norm": 2.4865922927856445, "grad_norm_var": 34.82579814802279, "learning_rate": 0.0001, "loss": 1.3778, "loss/crossentropy": 2.616021156311035, "loss/hidden": 1.1484375, "loss/logits": 0.22862255573272705, "loss/reg": 7.312367233680561e-05, "step": 2632 }, { "epoch": 0.329125, "grad_norm": 2.8417603969573975, "grad_norm_var": 34.82906130704527, "learning_rate": 0.0001, "loss": 1.3815, "loss/crossentropy": 2.719162940979004, "loss/hidden": 1.2109375, "loss/logits": 0.16987726092338562, "loss/reg": 7.309381908271462e-05, "step": 2633 }, { "epoch": 0.32925, "grad_norm": 2.6292576789855957, "grad_norm_var": 34.894253162999, "learning_rate": 0.0001, "loss": 1.3092, "loss/crossentropy": 2.7661075592041016, "loss/hidden": 1.109375, "loss/logits": 0.1991180181503296, "loss/reg": 7.305471081053838e-05, "step": 2634 }, { "epoch": 0.329375, "grad_norm": 2.9685418605804443, "grad_norm_var": 34.7962460708097, "learning_rate": 0.0001, "loss": 1.4606, "loss/crossentropy": 2.4889883995056152, "loss/hidden": 1.21875, "loss/logits": 0.24108874797821045, "loss/reg": 7.301606092369184e-05, "step": 2635 }, { "epoch": 0.3295, "grad_norm": 2.2368271350860596, "grad_norm_var": 34.846154261809694, "learning_rate": 0.0001, "loss": 1.3791, "loss/crossentropy": 2.6473779678344727, "loss/hidden": 1.1640625, "loss/logits": 0.21428149938583374, "loss/reg": 7.297995034605265e-05, "step": 2636 }, { "epoch": 0.329625, "grad_norm": 4.779677391052246, "grad_norm_var": 33.77819661124311, "learning_rate": 0.0001, "loss": 2.0549, "loss/crossentropy": 2.500034809112549, "loss/hidden": 1.7421875, "loss/logits": 0.31196582317352295, "loss/reg": 7.294522947631776e-05, "step": 2637 }, { "epoch": 0.32975, "grad_norm": 2.489835023880005, "grad_norm_var": 33.79232774067808, "learning_rate": 0.0001, "loss": 1.4798, "loss/crossentropy": 2.3142828941345215, "loss/hidden": 1.28125, "loss/logits": 0.19785639643669128, "loss/reg": 7.291339716175571e-05, "step": 2638 }, { "epoch": 0.329875, "grad_norm": 2.4587931632995605, "grad_norm_var": 33.88529728262141, "learning_rate": 0.0001, "loss": 1.4783, "loss/crossentropy": 2.574721574783325, "loss/hidden": 1.234375, "loss/logits": 0.24323952198028564, "loss/reg": 7.287390326382592e-05, "step": 2639 }, { "epoch": 0.33, "grad_norm": 2.055978775024414, "grad_norm_var": 34.09063817205711, "learning_rate": 0.0001, "loss": 1.3986, "loss/crossentropy": 2.5819077491760254, "loss/hidden": 1.1875, "loss/logits": 0.21034713089466095, "loss/reg": 7.284063030965626e-05, "step": 2640 }, { "epoch": 0.330125, "grad_norm": 3.0851657390594482, "grad_norm_var": 33.90873006450847, "learning_rate": 0.0001, "loss": 1.426, "loss/crossentropy": 2.6252501010894775, "loss/hidden": 1.21875, "loss/logits": 0.20647302269935608, "loss/reg": 7.28144368622452e-05, "step": 2641 }, { "epoch": 0.33025, "grad_norm": 2.595111846923828, "grad_norm_var": 33.997732177569645, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.593153715133667, "loss/hidden": 1.1953125, "loss/logits": 0.20868709683418274, "loss/reg": 7.278018165379763e-05, "step": 2642 }, { "epoch": 0.330375, "grad_norm": 2.8504176139831543, "grad_norm_var": 34.11742230365832, "learning_rate": 0.0001, "loss": 1.3145, "loss/crossentropy": 2.6996572017669678, "loss/hidden": 1.140625, "loss/logits": 0.17311415076255798, "loss/reg": 7.275219104485586e-05, "step": 2643 }, { "epoch": 0.3305, "grad_norm": 2.674687623977661, "grad_norm_var": 34.20551594497635, "learning_rate": 0.0001, "loss": 1.8293, "loss/crossentropy": 2.039944648742676, "loss/hidden": 1.5625, "loss/logits": 0.26605886220932007, "loss/reg": 7.272609218489379e-05, "step": 2644 }, { "epoch": 0.330625, "grad_norm": 2.789642095565796, "grad_norm_var": 0.36501785388367103, "learning_rate": 0.0001, "loss": 1.5895, "loss/crossentropy": 2.3664987087249756, "loss/hidden": 1.3359375, "loss/logits": 0.2528393864631653, "loss/reg": 7.269273191923276e-05, "step": 2645 }, { "epoch": 0.33075, "grad_norm": 6.480691432952881, "grad_norm_var": 1.2293005968313528, "learning_rate": 0.0001, "loss": 2.3789, "loss/crossentropy": 3.0479769706726074, "loss/hidden": 1.8828125, "loss/logits": 0.4953523278236389, "loss/reg": 7.265769818332046e-05, "step": 2646 }, { "epoch": 0.330875, "grad_norm": 2.6697468757629395, "grad_norm_var": 1.2125656044937783, "learning_rate": 0.0001, "loss": 1.5474, "loss/crossentropy": 2.496976137161255, "loss/hidden": 1.3359375, "loss/logits": 0.21069106459617615, "loss/reg": 7.262485451065004e-05, "step": 2647 }, { "epoch": 0.331, "grad_norm": 2.4358112812042236, "grad_norm_var": 1.2162421953709317, "learning_rate": 0.0001, "loss": 1.4221, "loss/crossentropy": 2.348769426345825, "loss/hidden": 1.21875, "loss/logits": 0.20261234045028687, "loss/reg": 7.259786798385903e-05, "step": 2648 }, { "epoch": 0.331125, "grad_norm": 3.1406824588775635, "grad_norm_var": 1.2154155161492477, "learning_rate": 0.0001, "loss": 1.4765, "loss/crossentropy": 2.7714340686798096, "loss/hidden": 1.2109375, "loss/logits": 0.2647970914840698, "loss/reg": 7.256865501403809e-05, "step": 2649 }, { "epoch": 0.33125, "grad_norm": 2.377107620239258, "grad_norm_var": 1.232569853187502, "learning_rate": 0.0001, "loss": 1.3749, "loss/crossentropy": 2.1616692543029785, "loss/hidden": 1.1875, "loss/logits": 0.1866443157196045, "loss/reg": 7.253717922139913e-05, "step": 2650 }, { "epoch": 0.331375, "grad_norm": 3.0397589206695557, "grad_norm_var": 1.2325354789574097, "learning_rate": 0.0001, "loss": 1.5078, "loss/crossentropy": 2.674020528793335, "loss/hidden": 1.28125, "loss/logits": 0.22584572434425354, "loss/reg": 7.251067290781066e-05, "step": 2651 }, { "epoch": 0.3315, "grad_norm": 4.079615116119385, "grad_norm_var": 1.254805710248511, "learning_rate": 0.0001, "loss": 1.242, "loss/crossentropy": 2.7842562198638916, "loss/hidden": 1.0859375, "loss/logits": 0.15536388754844666, "loss/reg": 7.247751636896282e-05, "step": 2652 }, { "epoch": 0.331625, "grad_norm": 3.6742312908172607, "grad_norm_var": 1.0873189311204252, "learning_rate": 0.0001, "loss": 1.4066, "loss/crossentropy": 2.6340880393981934, "loss/hidden": 1.203125, "loss/logits": 0.20272301137447357, "loss/reg": 7.244678272400051e-05, "step": 2653 }, { "epoch": 0.33175, "grad_norm": 2.605694532394409, "grad_norm_var": 1.0794105829571654, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.5274760723114014, "loss/hidden": 1.2578125, "loss/logits": 0.21860189735889435, "loss/reg": 7.240991544676945e-05, "step": 2654 }, { "epoch": 0.331875, "grad_norm": 2.6233229637145996, "grad_norm_var": 1.0678407483025836, "learning_rate": 0.0001, "loss": 1.2808, "loss/crossentropy": 2.5258123874664307, "loss/hidden": 1.0859375, "loss/logits": 0.19412940740585327, "loss/reg": 7.23749544704333e-05, "step": 2655 }, { "epoch": 0.332, "grad_norm": 1.947811245918274, "grad_norm_var": 1.0832485478234781, "learning_rate": 0.0001, "loss": 1.417, "loss/crossentropy": 2.5325958728790283, "loss/hidden": 1.21875, "loss/logits": 0.1974962055683136, "loss/reg": 7.234352960949764e-05, "step": 2656 }, { "epoch": 0.332125, "grad_norm": 3.227940797805786, "grad_norm_var": 1.0848713839993744, "learning_rate": 0.0001, "loss": 1.7119, "loss/crossentropy": 2.20202898979187, "loss/hidden": 1.4375, "loss/logits": 0.2736613154411316, "loss/reg": 7.231249765027314e-05, "step": 2657 }, { "epoch": 0.33225, "grad_norm": 2.817164421081543, "grad_norm_var": 1.0737223280582393, "learning_rate": 0.0001, "loss": 1.2285, "loss/crossentropy": 2.670656204223633, "loss/hidden": 1.015625, "loss/logits": 0.212121844291687, "loss/reg": 7.228198955999687e-05, "step": 2658 }, { "epoch": 0.332375, "grad_norm": 2.5586390495300293, "grad_norm_var": 1.0883501204580635, "learning_rate": 0.0001, "loss": 1.4859, "loss/crossentropy": 2.1712629795074463, "loss/hidden": 1.2578125, "loss/logits": 0.22732074558734894, "loss/reg": 7.224716682685539e-05, "step": 2659 }, { "epoch": 0.3325, "grad_norm": 3.487541913986206, "grad_norm_var": 1.0866489616053632, "learning_rate": 0.0001, "loss": 1.5998, "loss/crossentropy": 2.822496175765991, "loss/hidden": 1.34375, "loss/logits": 0.25531402230262756, "loss/reg": 7.221529085654765e-05, "step": 2660 }, { "epoch": 0.332625, "grad_norm": 3.7866756916046143, "grad_norm_var": 1.104567512157549, "learning_rate": 0.0001, "loss": 1.4157, "loss/crossentropy": 2.975682020187378, "loss/hidden": 1.21875, "loss/logits": 0.196226567029953, "loss/reg": 7.219216058729216e-05, "step": 2661 }, { "epoch": 0.33275, "grad_norm": 3.04699969291687, "grad_norm_var": 0.3323892059164511, "learning_rate": 0.0001, "loss": 1.6929, "loss/crossentropy": 2.593344211578369, "loss/hidden": 1.453125, "loss/logits": 0.23901696503162384, "loss/reg": 7.217303937068209e-05, "step": 2662 }, { "epoch": 0.332875, "grad_norm": 2.703519582748413, "grad_norm_var": 0.3311087985686872, "learning_rate": 0.0001, "loss": 1.316, "loss/crossentropy": 2.523214817047119, "loss/hidden": 1.1484375, "loss/logits": 0.16683579981327057, "loss/reg": 7.214156357804313e-05, "step": 2663 }, { "epoch": 0.333, "grad_norm": 2.5519561767578125, "grad_norm_var": 0.3236479898501201, "learning_rate": 0.0001, "loss": 1.2636, "loss/crossentropy": 2.6752853393554688, "loss/hidden": 1.078125, "loss/logits": 0.18479138612747192, "loss/reg": 7.210994226625189e-05, "step": 2664 }, { "epoch": 0.333125, "grad_norm": 3.0975301265716553, "grad_norm_var": 0.32283578550683284, "learning_rate": 0.0001, "loss": 1.4624, "loss/crossentropy": 2.690812110900879, "loss/hidden": 1.203125, "loss/logits": 0.2585052251815796, "loss/reg": 7.208192255347967e-05, "step": 2665 }, { "epoch": 0.33325, "grad_norm": 2.0224759578704834, "grad_norm_var": 0.35904227355880536, "learning_rate": 0.0001, "loss": 1.2404, "loss/crossentropy": 2.7171380519866943, "loss/hidden": 1.0703125, "loss/logits": 0.16940629482269287, "loss/reg": 7.204522989923134e-05, "step": 2666 }, { "epoch": 0.333375, "grad_norm": 2.2607853412628174, "grad_norm_var": 0.3881047170738928, "learning_rate": 0.0001, "loss": 1.4006, "loss/crossentropy": 2.5300893783569336, "loss/hidden": 1.1796875, "loss/logits": 0.2202264964580536, "loss/reg": 7.20098105375655e-05, "step": 2667 }, { "epoch": 0.3335, "grad_norm": 3.206886053085327, "grad_norm_var": 0.29911202554032357, "learning_rate": 0.0001, "loss": 1.7498, "loss/crossentropy": 2.4054222106933594, "loss/hidden": 1.484375, "loss/logits": 0.26472705602645874, "loss/reg": 7.197881495812908e-05, "step": 2668 }, { "epoch": 0.333625, "grad_norm": 2.933565616607666, "grad_norm_var": 0.25211966934050495, "learning_rate": 0.0001, "loss": 1.3186, "loss/crossentropy": 2.2882280349731445, "loss/hidden": 1.140625, "loss/logits": 0.17721626162528992, "loss/reg": 7.194957288447767e-05, "step": 2669 }, { "epoch": 0.33375, "grad_norm": 3.1336753368377686, "grad_norm_var": 0.2555183670818341, "learning_rate": 0.0001, "loss": 1.4794, "loss/crossentropy": 3.0474095344543457, "loss/hidden": 1.2578125, "loss/logits": 0.22089585661888123, "loss/reg": 7.191635813796893e-05, "step": 2670 }, { "epoch": 0.333875, "grad_norm": 2.8498127460479736, "grad_norm_var": 0.2522443644525646, "learning_rate": 0.0001, "loss": 1.3148, "loss/crossentropy": 2.779575824737549, "loss/hidden": 1.1328125, "loss/logits": 0.1812690794467926, "loss/reg": 7.188418385339901e-05, "step": 2671 }, { "epoch": 0.334, "grad_norm": 2.0503108501434326, "grad_norm_var": 0.24054296454784588, "learning_rate": 0.0001, "loss": 1.256, "loss/crossentropy": 2.433976650238037, "loss/hidden": 1.078125, "loss/logits": 0.17717084288597107, "loss/reg": 7.184949208749458e-05, "step": 2672 }, { "epoch": 0.334125, "grad_norm": 2.141296625137329, "grad_norm_var": 0.2608112136115286, "learning_rate": 0.0001, "loss": 1.3257, "loss/crossentropy": 2.1956300735473633, "loss/hidden": 1.140625, "loss/logits": 0.18440282344818115, "loss/reg": 7.181593537097797e-05, "step": 2673 }, { "epoch": 0.33425, "grad_norm": 3.549621820449829, "grad_norm_var": 0.2969410546042958, "learning_rate": 0.0001, "loss": 1.6578, "loss/crossentropy": 2.542529582977295, "loss/hidden": 1.359375, "loss/logits": 0.29773572087287903, "loss/reg": 7.178921077866107e-05, "step": 2674 }, { "epoch": 0.334375, "grad_norm": 2.541468858718872, "grad_norm_var": 0.29759521658445465, "learning_rate": 0.0001, "loss": 1.3729, "loss/crossentropy": 2.444370746612549, "loss/hidden": 1.1875, "loss/logits": 0.1846354603767395, "loss/reg": 7.175756036303937e-05, "step": 2675 }, { "epoch": 0.3345, "grad_norm": 2.3823206424713135, "grad_norm_var": 0.2778173860363637, "learning_rate": 0.0001, "loss": 1.3246, "loss/crossentropy": 2.4075539112091064, "loss/hidden": 1.125, "loss/logits": 0.1988658607006073, "loss/reg": 7.1736030804459e-05, "step": 2676 }, { "epoch": 0.334625, "grad_norm": 2.6612138748168945, "grad_norm_var": 0.20384691157957088, "learning_rate": 0.0001, "loss": 1.3357, "loss/crossentropy": 2.462655544281006, "loss/hidden": 1.15625, "loss/logits": 0.17871293425559998, "loss/reg": 7.170351454988122e-05, "step": 2677 }, { "epoch": 0.33475, "grad_norm": 2.339268445968628, "grad_norm_var": 0.20201523568860177, "learning_rate": 0.0001, "loss": 1.4525, "loss/crossentropy": 2.563009738922119, "loss/hidden": 1.25, "loss/logits": 0.20182086527347565, "loss/reg": 7.167438161559403e-05, "step": 2678 }, { "epoch": 0.334875, "grad_norm": 2.3910586833953857, "grad_norm_var": 0.20595446013016552, "learning_rate": 0.0001, "loss": 1.3581, "loss/crossentropy": 2.6830689907073975, "loss/hidden": 1.15625, "loss/logits": 0.20109181106090546, "loss/reg": 7.16439462848939e-05, "step": 2679 }, { "epoch": 0.335, "grad_norm": 2.092654228210449, "grad_norm_var": 0.22404603066800988, "learning_rate": 0.0001, "loss": 1.2441, "loss/crossentropy": 2.5296478271484375, "loss/hidden": 1.078125, "loss/logits": 0.1652946174144745, "loss/reg": 7.162148540373892e-05, "step": 2680 }, { "epoch": 0.335125, "grad_norm": 2.6573119163513184, "grad_norm_var": 0.20715302281558176, "learning_rate": 0.0001, "loss": 1.4378, "loss/crossentropy": 2.560281991958618, "loss/hidden": 1.234375, "loss/logits": 0.2027072161436081, "loss/reg": 7.16000868123956e-05, "step": 2681 }, { "epoch": 0.33525, "grad_norm": 2.2862586975097656, "grad_norm_var": 0.1920388408924185, "learning_rate": 0.0001, "loss": 1.5295, "loss/crossentropy": 2.635943651199341, "loss/hidden": 1.2734375, "loss/logits": 0.2553861737251282, "loss/reg": 7.157657819334418e-05, "step": 2682 }, { "epoch": 0.335375, "grad_norm": 2.4769909381866455, "grad_norm_var": 0.1854024059511154, "learning_rate": 0.0001, "loss": 1.5216, "loss/crossentropy": 2.187251329421997, "loss/hidden": 1.28125, "loss/logits": 0.23961946368217468, "loss/reg": 7.154743798309937e-05, "step": 2683 }, { "epoch": 0.3355, "grad_norm": 3.9141600131988525, "grad_norm_var": 0.2733461245194915, "learning_rate": 0.0001, "loss": 1.6739, "loss/crossentropy": 2.893380641937256, "loss/hidden": 1.265625, "loss/logits": 0.4075168967247009, "loss/reg": 7.151874888222665e-05, "step": 2684 }, { "epoch": 0.335625, "grad_norm": 2.488043785095215, "grad_norm_var": 0.2689107808021122, "learning_rate": 0.0001, "loss": 1.2223, "loss/crossentropy": 2.5493669509887695, "loss/hidden": 1.0546875, "loss/logits": 0.16691246628761292, "loss/reg": 7.149224984459579e-05, "step": 2685 }, { "epoch": 0.33575, "grad_norm": 3.925008773803711, "grad_norm_var": 0.36201339322687903, "learning_rate": 0.0001, "loss": 1.4721, "loss/crossentropy": 2.680081844329834, "loss/hidden": 1.2578125, "loss/logits": 0.21358150243759155, "loss/reg": 7.147232099669054e-05, "step": 2686 }, { "epoch": 0.335875, "grad_norm": 3.4808578491210938, "grad_norm_var": 0.4018904022708168, "learning_rate": 0.0001, "loss": 1.211, "loss/crossentropy": 2.669466733932495, "loss/hidden": 1.0625, "loss/logits": 0.14776265621185303, "loss/reg": 7.145044219214469e-05, "step": 2687 }, { "epoch": 0.336, "grad_norm": 2.8102097511291504, "grad_norm_var": 0.371028180859624, "learning_rate": 0.0001, "loss": 1.5771, "loss/crossentropy": 2.2958567142486572, "loss/hidden": 1.3359375, "loss/logits": 0.24044862389564514, "loss/reg": 7.143166294554248e-05, "step": 2688 }, { "epoch": 0.336125, "grad_norm": 2.404911518096924, "grad_norm_var": 0.3536737815286223, "learning_rate": 0.0001, "loss": 1.3994, "loss/crossentropy": 2.4484593868255615, "loss/hidden": 1.2109375, "loss/logits": 0.18775638937950134, "loss/reg": 7.141553214751184e-05, "step": 2689 }, { "epoch": 0.33625, "grad_norm": 2.792910099029541, "grad_norm_var": 0.31131525748481387, "learning_rate": 0.0001, "loss": 1.2908, "loss/crossentropy": 2.746401786804199, "loss/hidden": 1.1015625, "loss/logits": 0.1885615438222885, "loss/reg": 7.139724039006978e-05, "step": 2690 }, { "epoch": 0.336375, "grad_norm": 2.931743860244751, "grad_norm_var": 0.3111393611135166, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.264178514480591, "loss/hidden": 1.390625, "loss/logits": 0.24865320324897766, "loss/reg": 7.136804924812168e-05, "step": 2691 }, { "epoch": 0.3365, "grad_norm": 7.642730236053467, "grad_norm_var": 1.7812168960239108, "learning_rate": 0.0001, "loss": 1.5206, "loss/crossentropy": 2.5741586685180664, "loss/hidden": 1.296875, "loss/logits": 0.22304213047027588, "loss/reg": 7.134369661798701e-05, "step": 2692 }, { "epoch": 0.336625, "grad_norm": 2.8801169395446777, "grad_norm_var": 1.7719606936015393, "learning_rate": 0.0001, "loss": 1.5124, "loss/crossentropy": 2.711637020111084, "loss/hidden": 1.2734375, "loss/logits": 0.23829227685928345, "loss/reg": 7.131786696845666e-05, "step": 2693 }, { "epoch": 0.33675, "grad_norm": 2.4461138248443604, "grad_norm_var": 1.7619131320180919, "learning_rate": 0.0001, "loss": 1.4902, "loss/crossentropy": 2.613938331604004, "loss/hidden": 1.2421875, "loss/logits": 0.2473267912864685, "loss/reg": 7.129961159080267e-05, "step": 2694 }, { "epoch": 0.336875, "grad_norm": 2.128518581390381, "grad_norm_var": 1.7910839473156623, "learning_rate": 0.0001, "loss": 1.1915, "loss/crossentropy": 2.580587148666382, "loss/hidden": 1.0078125, "loss/logits": 0.18293863534927368, "loss/reg": 7.127159187803045e-05, "step": 2695 }, { "epoch": 0.337, "grad_norm": 2.285552501678467, "grad_norm_var": 1.7678889968041707, "learning_rate": 0.0001, "loss": 1.273, "loss/crossentropy": 2.6301279067993164, "loss/hidden": 1.09375, "loss/logits": 0.1785697489976883, "loss/reg": 7.124173134798184e-05, "step": 2696 }, { "epoch": 0.337125, "grad_norm": 2.3979592323303223, "grad_norm_var": 1.787296344649754, "learning_rate": 0.0001, "loss": 1.3789, "loss/crossentropy": 3.027653217315674, "loss/hidden": 1.1953125, "loss/logits": 0.1828649342060089, "loss/reg": 7.121470116544515e-05, "step": 2697 }, { "epoch": 0.33725, "grad_norm": 2.589761972427368, "grad_norm_var": 1.7609025038596557, "learning_rate": 0.0001, "loss": 1.3466, "loss/crossentropy": 2.821155548095703, "loss/hidden": 1.171875, "loss/logits": 0.174003005027771, "loss/reg": 7.118434587027878e-05, "step": 2698 }, { "epoch": 0.337375, "grad_norm": 2.2340927124023438, "grad_norm_var": 1.7847580882897243, "learning_rate": 0.0001, "loss": 1.3215, "loss/crossentropy": 2.625575304031372, "loss/hidden": 1.125, "loss/logits": 0.1958332359790802, "loss/reg": 7.116003689588979e-05, "step": 2699 }, { "epoch": 0.3375, "grad_norm": 2.156465530395508, "grad_norm_var": 1.7834228272632304, "learning_rate": 0.0001, "loss": 1.2476, "loss/crossentropy": 2.620509147644043, "loss/hidden": 1.078125, "loss/logits": 0.1687980592250824, "loss/reg": 7.112960156518966e-05, "step": 2700 }, { "epoch": 0.337625, "grad_norm": 2.0559799671173096, "grad_norm_var": 1.8231250823512861, "learning_rate": 0.0001, "loss": 1.2596, "loss/crossentropy": 2.6217739582061768, "loss/hidden": 1.0859375, "loss/logits": 0.17294803261756897, "loss/reg": 7.109998841769993e-05, "step": 2701 }, { "epoch": 0.33775, "grad_norm": 2.1868693828582764, "grad_norm_var": 1.785448570370342, "learning_rate": 0.0001, "loss": 1.2795, "loss/crossentropy": 2.7546658515930176, "loss/hidden": 1.0859375, "loss/logits": 0.1928234100341797, "loss/reg": 7.10659078322351e-05, "step": 2702 }, { "epoch": 0.337875, "grad_norm": 3.0767412185668945, "grad_norm_var": 1.7610734106584534, "learning_rate": 0.0001, "loss": 1.4077, "loss/crossentropy": 2.5087742805480957, "loss/hidden": 1.1875, "loss/logits": 0.21953627467155457, "loss/reg": 7.103780808392912e-05, "step": 2703 }, { "epoch": 0.338, "grad_norm": 2.297394275665283, "grad_norm_var": 1.777754603107856, "learning_rate": 0.0001, "loss": 1.1556, "loss/crossentropy": 2.4931986331939697, "loss/hidden": 1.0, "loss/logits": 0.1548832654953003, "loss/reg": 7.101027586031705e-05, "step": 2704 }, { "epoch": 0.338125, "grad_norm": 2.2321646213531494, "grad_norm_var": 1.7882991878256982, "learning_rate": 0.0001, "loss": 1.3134, "loss/crossentropy": 2.5433144569396973, "loss/hidden": 1.125, "loss/logits": 0.18769437074661255, "loss/reg": 7.098243804648519e-05, "step": 2705 }, { "epoch": 0.33825, "grad_norm": 2.710052013397217, "grad_norm_var": 1.7884856109324867, "learning_rate": 0.0001, "loss": 1.5118, "loss/crossentropy": 2.509829521179199, "loss/hidden": 1.296875, "loss/logits": 0.21419623494148254, "loss/reg": 7.095153705449775e-05, "step": 2706 }, { "epoch": 0.338375, "grad_norm": 2.084449052810669, "grad_norm_var": 1.8146039405284067, "learning_rate": 0.0001, "loss": 1.2339, "loss/crossentropy": 2.7254478931427, "loss/hidden": 1.0625, "loss/logits": 0.1707044541835785, "loss/reg": 7.091824227245525e-05, "step": 2707 }, { "epoch": 0.3385, "grad_norm": 2.282498359680176, "grad_norm_var": 0.08695731356670346, "learning_rate": 0.0001, "loss": 1.3515, "loss/crossentropy": 2.455270528793335, "loss/hidden": 1.140625, "loss/logits": 0.21012592315673828, "loss/reg": 7.088806160027161e-05, "step": 2708 }, { "epoch": 0.338625, "grad_norm": 2.2300703525543213, "grad_norm_var": 0.06982971575975677, "learning_rate": 0.0001, "loss": 1.3161, "loss/crossentropy": 2.804197072982788, "loss/hidden": 1.140625, "loss/logits": 0.17476443946361542, "loss/reg": 7.085863762767985e-05, "step": 2709 }, { "epoch": 0.33875, "grad_norm": 2.681544065475464, "grad_norm_var": 0.07671382234880918, "learning_rate": 0.0001, "loss": 1.2846, "loss/crossentropy": 2.8267271518707275, "loss/hidden": 1.109375, "loss/logits": 0.1745147705078125, "loss/reg": 7.083117816364393e-05, "step": 2710 }, { "epoch": 0.338875, "grad_norm": 5.462092399597168, "grad_norm_var": 0.6719786287577977, "learning_rate": 0.0001, "loss": 1.6471, "loss/crossentropy": 2.059929370880127, "loss/hidden": 1.4140625, "loss/logits": 0.23232778906822205, "loss/reg": 7.080292562022805e-05, "step": 2711 }, { "epoch": 0.339, "grad_norm": 3.698168992996216, "grad_norm_var": 0.7449611778873826, "learning_rate": 0.0001, "loss": 1.3937, "loss/crossentropy": 2.650939702987671, "loss/hidden": 1.1796875, "loss/logits": 0.21325981616973877, "loss/reg": 7.077681948430836e-05, "step": 2712 }, { "epoch": 0.339125, "grad_norm": 2.4998281002044678, "grad_norm_var": 0.7422065249181758, "learning_rate": 0.0001, "loss": 1.5236, "loss/crossentropy": 2.476491689682007, "loss/hidden": 1.2578125, "loss/logits": 0.2651017904281616, "loss/reg": 7.074617315083742e-05, "step": 2713 }, { "epoch": 0.33925, "grad_norm": 2.5457189083099365, "grad_norm_var": 0.7427101956883384, "learning_rate": 0.0001, "loss": 1.3318, "loss/crossentropy": 2.443406105041504, "loss/hidden": 1.140625, "loss/logits": 0.19049173593521118, "loss/reg": 7.071983418427408e-05, "step": 2714 }, { "epoch": 0.339375, "grad_norm": 4.167260646820068, "grad_norm_var": 0.868529028399003, "learning_rate": 0.0001, "loss": 1.3177, "loss/crossentropy": 2.494140863418579, "loss/hidden": 1.15625, "loss/logits": 0.1607470065355301, "loss/reg": 7.069118873914704e-05, "step": 2715 }, { "epoch": 0.3395, "grad_norm": 14.130678176879883, "grad_norm_var": 8.845624307799422, "learning_rate": 0.0001, "loss": 1.7811, "loss/crossentropy": 2.4222302436828613, "loss/hidden": 1.5390625, "loss/logits": 0.24136415123939514, "loss/reg": 7.066246325848624e-05, "step": 2716 }, { "epoch": 0.339625, "grad_norm": 2.451385259628296, "grad_norm_var": 8.778140844478491, "learning_rate": 0.0001, "loss": 1.4284, "loss/crossentropy": 2.615372896194458, "loss/hidden": 1.1953125, "loss/logits": 0.23241561651229858, "loss/reg": 7.059909694362432e-05, "step": 2717 }, { "epoch": 0.33975, "grad_norm": 2.34149432182312, "grad_norm_var": 8.751613237052899, "learning_rate": 0.0001, "loss": 1.3153, "loss/crossentropy": 2.4568495750427246, "loss/hidden": 1.1328125, "loss/logits": 0.18180054426193237, "loss/reg": 7.054670277284458e-05, "step": 2718 }, { "epoch": 0.339875, "grad_norm": 2.269932985305786, "grad_norm_var": 8.843822966920431, "learning_rate": 0.0001, "loss": 1.3598, "loss/crossentropy": 2.2863399982452393, "loss/hidden": 1.171875, "loss/logits": 0.18724440038204193, "loss/reg": 7.049988198559731e-05, "step": 2719 }, { "epoch": 0.34, "grad_norm": 4.682697296142578, "grad_norm_var": 8.815265891198313, "learning_rate": 0.0001, "loss": 1.7096, "loss/crossentropy": 2.8884828090667725, "loss/hidden": 1.4140625, "loss/logits": 0.29487526416778564, "loss/reg": 7.045908569125459e-05, "step": 2720 }, { "epoch": 0.340125, "grad_norm": 2.3777050971984863, "grad_norm_var": 8.788991168258773, "learning_rate": 0.0001, "loss": 1.3111, "loss/crossentropy": 2.610217332839966, "loss/hidden": 1.1171875, "loss/logits": 0.19325056672096252, "loss/reg": 7.04180565662682e-05, "step": 2721 }, { "epoch": 0.34025, "grad_norm": 3.0930371284484863, "grad_norm_var": 8.749472353031852, "learning_rate": 0.0001, "loss": 1.692, "loss/crossentropy": 2.3694515228271484, "loss/hidden": 1.4453125, "loss/logits": 0.2459462285041809, "loss/reg": 7.038575131446123e-05, "step": 2722 }, { "epoch": 0.340375, "grad_norm": 2.4214425086975098, "grad_norm_var": 8.684545156752112, "learning_rate": 0.0001, "loss": 1.4705, "loss/crossentropy": 2.5897274017333984, "loss/hidden": 1.2265625, "loss/logits": 0.243260458111763, "loss/reg": 7.035133603494614e-05, "step": 2723 }, { "epoch": 0.3405, "grad_norm": 2.5752546787261963, "grad_norm_var": 8.63424008593975, "learning_rate": 0.0001, "loss": 1.437, "loss/crossentropy": 2.4833950996398926, "loss/hidden": 1.2265625, "loss/logits": 0.20969432592391968, "loss/reg": 7.03274054103531e-05, "step": 2724 }, { "epoch": 0.340625, "grad_norm": 2.305251121520996, "grad_norm_var": 8.619590280159626, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.7865090370178223, "loss/hidden": 1.109375, "loss/logits": 0.2083156704902649, "loss/reg": 7.029481639619917e-05, "step": 2725 }, { "epoch": 0.34075, "grad_norm": 2.91369891166687, "grad_norm_var": 8.590459441125676, "learning_rate": 0.0001, "loss": 1.286, "loss/crossentropy": 2.540625810623169, "loss/hidden": 1.109375, "loss/logits": 0.1759338229894638, "loss/reg": 7.027312676655129e-05, "step": 2726 }, { "epoch": 0.340875, "grad_norm": 2.6863317489624023, "grad_norm_var": 8.436875980246516, "learning_rate": 0.0001, "loss": 1.3088, "loss/crossentropy": 2.6296451091766357, "loss/hidden": 1.1171875, "loss/logits": 0.1909254491329193, "loss/reg": 7.023104262771085e-05, "step": 2727 }, { "epoch": 0.341, "grad_norm": 2.3252112865448, "grad_norm_var": 8.53168288359722, "learning_rate": 0.0001, "loss": 1.2915, "loss/crossentropy": 2.244152069091797, "loss/hidden": 1.125, "loss/logits": 0.16579145193099976, "loss/reg": 7.019232725724578e-05, "step": 2728 }, { "epoch": 0.341125, "grad_norm": 2.367126226425171, "grad_norm_var": 8.550244494746176, "learning_rate": 0.0001, "loss": 1.2896, "loss/crossentropy": 2.6276116371154785, "loss/hidden": 1.1171875, "loss/logits": 0.17175135016441345, "loss/reg": 7.016301969997585e-05, "step": 2729 }, { "epoch": 0.34125, "grad_norm": 2.1562862396240234, "grad_norm_var": 8.608151408643407, "learning_rate": 0.0001, "loss": 1.2397, "loss/crossentropy": 2.8231451511383057, "loss/hidden": 1.0625, "loss/logits": 0.17645099759101868, "loss/reg": 7.01390381436795e-05, "step": 2730 }, { "epoch": 0.341375, "grad_norm": 2.313457489013672, "grad_norm_var": 8.646650991314708, "learning_rate": 0.0001, "loss": 1.3999, "loss/crossentropy": 2.6845054626464844, "loss/hidden": 1.1875, "loss/logits": 0.21169158816337585, "loss/reg": 7.010796252870932e-05, "step": 2731 }, { "epoch": 0.3415, "grad_norm": 2.49526047706604, "grad_norm_var": 0.3647325272348212, "learning_rate": 0.0001, "loss": 1.3909, "loss/crossentropy": 2.6574440002441406, "loss/hidden": 1.1875, "loss/logits": 0.20272809267044067, "loss/reg": 7.007573003647849e-05, "step": 2732 }, { "epoch": 0.341625, "grad_norm": 2.6853439807891846, "grad_norm_var": 0.3631753014687916, "learning_rate": 0.0001, "loss": 1.4628, "loss/crossentropy": 2.441225290298462, "loss/hidden": 1.234375, "loss/logits": 0.22772786021232605, "loss/reg": 7.003988139331341e-05, "step": 2733 }, { "epoch": 0.34175, "grad_norm": 2.4216859340667725, "grad_norm_var": 0.36053954667926574, "learning_rate": 0.0001, "loss": 1.3701, "loss/crossentropy": 2.5594747066497803, "loss/hidden": 1.1640625, "loss/logits": 0.20531296730041504, "loss/reg": 7.001181074883789e-05, "step": 2734 }, { "epoch": 0.341875, "grad_norm": 2.6835761070251465, "grad_norm_var": 0.35134125450974024, "learning_rate": 0.0001, "loss": 1.574, "loss/crossentropy": 2.483502149581909, "loss/hidden": 1.296875, "loss/logits": 0.2763804793357849, "loss/reg": 6.997953460086137e-05, "step": 2735 }, { "epoch": 0.342, "grad_norm": 2.427279233932495, "grad_norm_var": 0.059938326876059024, "learning_rate": 0.0001, "loss": 1.3284, "loss/crossentropy": 2.6998090744018555, "loss/hidden": 1.140625, "loss/logits": 0.18709218502044678, "loss/reg": 6.995311559876427e-05, "step": 2736 }, { "epoch": 0.342125, "grad_norm": 2.025721788406372, "grad_norm_var": 0.0741483078950464, "learning_rate": 0.0001, "loss": 1.3876, "loss/crossentropy": 2.481682777404785, "loss/hidden": 1.1953125, "loss/logits": 0.19154274463653564, "loss/reg": 6.993224815232679e-05, "step": 2737 }, { "epoch": 0.34225, "grad_norm": 2.1336188316345215, "grad_norm_var": 0.054983991632013604, "learning_rate": 0.0001, "loss": 1.3078, "loss/crossentropy": 2.584963321685791, "loss/hidden": 1.125, "loss/logits": 0.18211859464645386, "loss/reg": 6.989917164901271e-05, "step": 2738 }, { "epoch": 0.342375, "grad_norm": 3.238229751586914, "grad_norm_var": 0.09536348824269278, "learning_rate": 0.0001, "loss": 1.7829, "loss/crossentropy": 2.620985984802246, "loss/hidden": 1.5, "loss/logits": 0.28219300508499146, "loss/reg": 6.986671360209584e-05, "step": 2739 }, { "epoch": 0.3425, "grad_norm": 2.596540927886963, "grad_norm_var": 0.09564914756385103, "learning_rate": 0.0001, "loss": 1.3657, "loss/crossentropy": 2.791778802871704, "loss/hidden": 1.1640625, "loss/logits": 0.2009693831205368, "loss/reg": 6.983203638810664e-05, "step": 2740 }, { "epoch": 0.342625, "grad_norm": 2.030055046081543, "grad_norm_var": 0.10701147220017143, "learning_rate": 0.0001, "loss": 1.2921, "loss/crossentropy": 2.5915794372558594, "loss/hidden": 1.109375, "loss/logits": 0.1820673942565918, "loss/reg": 6.979393947403878e-05, "step": 2741 }, { "epoch": 0.34275, "grad_norm": 2.4897282123565674, "grad_norm_var": 0.09309117735820772, "learning_rate": 0.0001, "loss": 1.3338, "loss/crossentropy": 2.614738702774048, "loss/hidden": 1.140625, "loss/logits": 0.19251662492752075, "loss/reg": 6.975556607358158e-05, "step": 2742 }, { "epoch": 0.342875, "grad_norm": 2.6820151805877686, "grad_norm_var": 0.09295184283184478, "learning_rate": 0.0001, "loss": 1.3713, "loss/crossentropy": 2.750995397567749, "loss/hidden": 1.1875, "loss/logits": 0.1831102967262268, "loss/reg": 6.972347910050303e-05, "step": 2743 }, { "epoch": 0.343, "grad_norm": 2.32694411277771, "grad_norm_var": 0.09292505969296841, "learning_rate": 0.0001, "loss": 1.3128, "loss/crossentropy": 2.3548641204833984, "loss/hidden": 1.140625, "loss/logits": 0.17146719992160797, "loss/reg": 6.969535752432421e-05, "step": 2744 }, { "epoch": 0.343125, "grad_norm": 2.0112392902374268, "grad_norm_var": 0.10439648768326393, "learning_rate": 0.0001, "loss": 1.2114, "loss/crossentropy": 2.7490274906158447, "loss/hidden": 1.0390625, "loss/logits": 0.17163513600826263, "loss/reg": 6.96668794262223e-05, "step": 2745 }, { "epoch": 0.34325, "grad_norm": 1.8716583251953125, "grad_norm_var": 0.11946068600907604, "learning_rate": 0.0001, "loss": 1.2862, "loss/crossentropy": 2.2508699893951416, "loss/hidden": 1.078125, "loss/logits": 0.20737269520759583, "loss/reg": 6.964090425753966e-05, "step": 2746 }, { "epoch": 0.343375, "grad_norm": 2.391902208328247, "grad_norm_var": 0.11891896019835713, "learning_rate": 0.0001, "loss": 1.5218, "loss/crossentropy": 2.4311559200286865, "loss/hidden": 1.3046875, "loss/logits": 0.21639177203178406, "loss/reg": 6.96114293532446e-05, "step": 2747 }, { "epoch": 0.3435, "grad_norm": 2.7639143466949463, "grad_norm_var": 0.12659411524321057, "learning_rate": 0.0001, "loss": 1.2973, "loss/crossentropy": 2.345238447189331, "loss/hidden": 1.1171875, "loss/logits": 0.17946398258209229, "loss/reg": 6.958260200917721e-05, "step": 2748 }, { "epoch": 0.343625, "grad_norm": 2.74672532081604, "grad_norm_var": 0.1289708060821378, "learning_rate": 0.0001, "loss": 1.4154, "loss/crossentropy": 2.6919877529144287, "loss/hidden": 1.203125, "loss/logits": 0.21157383918762207, "loss/reg": 6.955744174774736e-05, "step": 2749 }, { "epoch": 0.34375, "grad_norm": 2.626401424407959, "grad_norm_var": 0.1314299620071682, "learning_rate": 0.0001, "loss": 1.3281, "loss/crossentropy": 2.6598777770996094, "loss/hidden": 1.1328125, "loss/logits": 0.1945762038230896, "loss/reg": 6.95273483870551e-05, "step": 2750 }, { "epoch": 0.343875, "grad_norm": 2.7835395336151123, "grad_norm_var": 0.13529637516389054, "learning_rate": 0.0001, "loss": 1.2705, "loss/crossentropy": 2.641899824142456, "loss/hidden": 1.09375, "loss/logits": 0.17602193355560303, "loss/reg": 6.950116221560165e-05, "step": 2751 }, { "epoch": 0.344, "grad_norm": 2.3936314582824707, "grad_norm_var": 0.1354537918264628, "learning_rate": 0.0001, "loss": 1.2548, "loss/crossentropy": 2.694915533065796, "loss/hidden": 1.078125, "loss/logits": 0.17597508430480957, "loss/reg": 6.947560177650303e-05, "step": 2752 }, { "epoch": 0.344125, "grad_norm": 2.68308162689209, "grad_norm_var": 0.12575708585615358, "learning_rate": 0.0001, "loss": 1.5649, "loss/crossentropy": 2.426647424697876, "loss/hidden": 1.3203125, "loss/logits": 0.24390573799610138, "loss/reg": 6.947021029191092e-05, "step": 2753 }, { "epoch": 0.34425, "grad_norm": 2.274909734725952, "grad_norm_var": 0.120374323356274, "learning_rate": 0.0001, "loss": 1.2042, "loss/crossentropy": 2.3825721740722656, "loss/hidden": 1.0390625, "loss/logits": 0.16446425020694733, "loss/reg": 6.943699554540217e-05, "step": 2754 }, { "epoch": 0.344375, "grad_norm": 3.3865203857421875, "grad_norm_var": 0.13645562614351903, "learning_rate": 0.0001, "loss": 1.3694, "loss/crossentropy": 2.3459036350250244, "loss/hidden": 1.1640625, "loss/logits": 0.20460620522499084, "loss/reg": 6.943032349226996e-05, "step": 2755 }, { "epoch": 0.3445, "grad_norm": 2.4622132778167725, "grad_norm_var": 0.1359201173963849, "learning_rate": 0.0001, "loss": 1.2444, "loss/crossentropy": 2.578232526779175, "loss/hidden": 1.078125, "loss/logits": 0.16553151607513428, "loss/reg": 6.939908053027466e-05, "step": 2756 }, { "epoch": 0.344625, "grad_norm": 2.331897020339966, "grad_norm_var": 0.12289114897324467, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.642990827560425, "loss/hidden": 1.2109375, "loss/logits": 0.2152491956949234, "loss/reg": 6.938540900591761e-05, "step": 2757 }, { "epoch": 0.34475, "grad_norm": 2.465266466140747, "grad_norm_var": 0.12300818480323202, "learning_rate": 0.0001, "loss": 1.3213, "loss/crossentropy": 2.603715181350708, "loss/hidden": 1.1328125, "loss/logits": 0.18774735927581787, "loss/reg": 6.937325815670192e-05, "step": 2758 }, { "epoch": 0.344875, "grad_norm": 2.6068384647369385, "grad_norm_var": 0.12166342508336901, "learning_rate": 0.0001, "loss": 1.6485, "loss/crossentropy": 1.7965141534805298, "loss/hidden": 1.390625, "loss/logits": 0.2572143077850342, "loss/reg": 6.934582052053884e-05, "step": 2759 }, { "epoch": 0.345, "grad_norm": 2.023719549179077, "grad_norm_var": 0.13472674716899533, "learning_rate": 0.0001, "loss": 1.3294, "loss/crossentropy": 2.455815076828003, "loss/hidden": 1.1484375, "loss/logits": 0.18031853437423706, "loss/reg": 6.932826363481581e-05, "step": 2760 }, { "epoch": 0.345125, "grad_norm": 1.9998779296875, "grad_norm_var": 0.13545849831731688, "learning_rate": 0.0001, "loss": 1.2802, "loss/crossentropy": 2.373211145401001, "loss/hidden": 1.109375, "loss/logits": 0.17013278603553772, "loss/reg": 6.931165989954025e-05, "step": 2761 }, { "epoch": 0.34525, "grad_norm": 2.5633294582366943, "grad_norm_var": 0.10849467692088197, "learning_rate": 0.0001, "loss": 1.4001, "loss/crossentropy": 2.423799514770508, "loss/hidden": 1.1796875, "loss/logits": 0.21973925828933716, "loss/reg": 6.929846131242812e-05, "step": 2762 }, { "epoch": 0.345375, "grad_norm": 2.180727005004883, "grad_norm_var": 0.1152120666888474, "learning_rate": 0.0001, "loss": 1.215, "loss/crossentropy": 2.500042676925659, "loss/hidden": 1.046875, "loss/logits": 0.1673997938632965, "loss/reg": 6.928755465196446e-05, "step": 2763 }, { "epoch": 0.3455, "grad_norm": 2.678558826446533, "grad_norm_var": 0.11287199520884669, "learning_rate": 0.0001, "loss": 1.4258, "loss/crossentropy": 2.3704588413238525, "loss/hidden": 1.203125, "loss/logits": 0.2219996303319931, "loss/reg": 6.92736211931333e-05, "step": 2764 }, { "epoch": 0.345625, "grad_norm": 2.577589511871338, "grad_norm_var": 0.10938801025555733, "learning_rate": 0.0001, "loss": 1.2156, "loss/crossentropy": 2.7088146209716797, "loss/hidden": 1.0546875, "loss/logits": 0.16021832823753357, "loss/reg": 6.925949128344655e-05, "step": 2765 }, { "epoch": 0.34575, "grad_norm": 2.6380178928375244, "grad_norm_var": 0.10958853418185086, "learning_rate": 0.0001, "loss": 1.4243, "loss/crossentropy": 2.7258501052856445, "loss/hidden": 1.234375, "loss/logits": 0.1892695426940918, "loss/reg": 6.925070192664862e-05, "step": 2766 }, { "epoch": 0.345875, "grad_norm": 3.1618165969848633, "grad_norm_var": 0.13267602023564962, "learning_rate": 0.0001, "loss": 1.4018, "loss/crossentropy": 2.4474806785583496, "loss/hidden": 1.2421875, "loss/logits": 0.1588926911354065, "loss/reg": 6.924259650986642e-05, "step": 2767 }, { "epoch": 0.346, "grad_norm": 2.756082057952881, "grad_norm_var": 0.1344535010075172, "learning_rate": 0.0001, "loss": 1.2726, "loss/crossentropy": 2.5201210975646973, "loss/hidden": 1.109375, "loss/logits": 0.1625133752822876, "loss/reg": 6.921265594428405e-05, "step": 2768 }, { "epoch": 0.346125, "grad_norm": 2.8123815059661865, "grad_norm_var": 0.13780302441585732, "learning_rate": 0.0001, "loss": 1.3267, "loss/crossentropy": 2.6401655673980713, "loss/hidden": 1.140625, "loss/logits": 0.18534722924232483, "loss/reg": 6.920135638210922e-05, "step": 2769 }, { "epoch": 0.34625, "grad_norm": 3.0944714546203613, "grad_norm_var": 0.14890482776557595, "learning_rate": 0.0001, "loss": 1.4391, "loss/crossentropy": 2.717965841293335, "loss/hidden": 1.21875, "loss/logits": 0.2196798324584961, "loss/reg": 6.917041901033372e-05, "step": 2770 }, { "epoch": 0.346375, "grad_norm": 2.781829595565796, "grad_norm_var": 0.10904644364192086, "learning_rate": 0.0001, "loss": 1.2831, "loss/crossentropy": 2.550140857696533, "loss/hidden": 1.125, "loss/logits": 0.15736284852027893, "loss/reg": 6.913873221492395e-05, "step": 2771 }, { "epoch": 0.3465, "grad_norm": 4.064074993133545, "grad_norm_var": 0.24620263189301472, "learning_rate": 0.0001, "loss": 1.6298, "loss/crossentropy": 2.34102463722229, "loss/hidden": 1.4140625, "loss/logits": 0.21502913534641266, "loss/reg": 6.910576485097408e-05, "step": 2772 }, { "epoch": 0.346625, "grad_norm": 3.10402250289917, "grad_norm_var": 0.2485499906216089, "learning_rate": 0.0001, "loss": 2.1297, "loss/crossentropy": 2.588038444519043, "loss/hidden": 1.6328125, "loss/logits": 0.4962162673473358, "loss/reg": 6.907415081514046e-05, "step": 2773 }, { "epoch": 0.34675, "grad_norm": 2.5266637802124023, "grad_norm_var": 0.2467060959979932, "learning_rate": 0.0001, "loss": 1.2293, "loss/crossentropy": 2.6339211463928223, "loss/hidden": 1.0703125, "loss/logits": 0.1583467721939087, "loss/reg": 6.904391193529591e-05, "step": 2774 }, { "epoch": 0.346875, "grad_norm": 2.5121307373046875, "grad_norm_var": 0.24873512511978574, "learning_rate": 0.0001, "loss": 1.2688, "loss/crossentropy": 2.718985080718994, "loss/hidden": 1.09375, "loss/logits": 0.17439186573028564, "loss/reg": 6.902075983816758e-05, "step": 2775 }, { "epoch": 0.347, "grad_norm": 2.451662540435791, "grad_norm_var": 0.22061139581334524, "learning_rate": 0.0001, "loss": 1.3664, "loss/crossentropy": 2.2248246669769287, "loss/hidden": 1.1796875, "loss/logits": 0.1860385686159134, "loss/reg": 6.899244908709079e-05, "step": 2776 }, { "epoch": 0.347125, "grad_norm": 3.207049608230591, "grad_norm_var": 0.19192696541244322, "learning_rate": 0.0001, "loss": 1.3519, "loss/crossentropy": 2.348137378692627, "loss/hidden": 1.15625, "loss/logits": 0.19493235647678375, "loss/reg": 6.896415288792923e-05, "step": 2777 }, { "epoch": 0.34725, "grad_norm": 3.2719638347625732, "grad_norm_var": 0.19911736675043865, "learning_rate": 0.0001, "loss": 1.3958, "loss/crossentropy": 2.800987720489502, "loss/hidden": 1.203125, "loss/logits": 0.1919705867767334, "loss/reg": 6.893116369610652e-05, "step": 2778 }, { "epoch": 0.347375, "grad_norm": 2.2053325176239014, "grad_norm_var": 0.19691458509023885, "learning_rate": 0.0001, "loss": 1.3008, "loss/crossentropy": 2.6421287059783936, "loss/hidden": 1.109375, "loss/logits": 0.19070783257484436, "loss/reg": 6.890611257404089e-05, "step": 2779 }, { "epoch": 0.3475, "grad_norm": 3.3738839626312256, "grad_norm_var": 0.20982579818374514, "learning_rate": 0.0001, "loss": 1.3672, "loss/crossentropy": 2.586411952972412, "loss/hidden": 1.1796875, "loss/logits": 0.18678493797779083, "loss/reg": 6.88726722728461e-05, "step": 2780 }, { "epoch": 0.347625, "grad_norm": 3.961165189743042, "grad_norm_var": 0.2683888288248975, "learning_rate": 0.0001, "loss": 1.5685, "loss/crossentropy": 2.4606070518493652, "loss/hidden": 1.3515625, "loss/logits": 0.21623259782791138, "loss/reg": 6.884294271003455e-05, "step": 2781 }, { "epoch": 0.34775, "grad_norm": 2.7618627548217773, "grad_norm_var": 0.2634500752827324, "learning_rate": 0.0001, "loss": 1.2443, "loss/crossentropy": 2.288517475128174, "loss/hidden": 1.09375, "loss/logits": 0.14989668130874634, "loss/reg": 6.882195157231763e-05, "step": 2782 }, { "epoch": 0.347875, "grad_norm": 2.108581066131592, "grad_norm_var": 0.3104647062630039, "learning_rate": 0.0001, "loss": 1.3306, "loss/crossentropy": 2.5457396507263184, "loss/hidden": 1.15625, "loss/logits": 0.17370593547821045, "loss/reg": 6.878824933664873e-05, "step": 2783 }, { "epoch": 0.348, "grad_norm": 2.7377817630767822, "grad_norm_var": 0.3109272610676565, "learning_rate": 0.0001, "loss": 1.485, "loss/crossentropy": 2.4026122093200684, "loss/hidden": 1.265625, "loss/logits": 0.21866238117218018, "loss/reg": 6.876230327179655e-05, "step": 2784 }, { "epoch": 0.348125, "grad_norm": 2.658975124359131, "grad_norm_var": 0.31492516122285275, "learning_rate": 0.0001, "loss": 1.6063, "loss/crossentropy": 2.5550155639648438, "loss/hidden": 1.328125, "loss/logits": 0.27747243642807007, "loss/reg": 6.872782250866294e-05, "step": 2785 }, { "epoch": 0.34825, "grad_norm": 3.494255304336548, "grad_norm_var": 0.33387648392232827, "learning_rate": 0.0001, "loss": 1.2896, "loss/crossentropy": 2.6681206226348877, "loss/hidden": 1.1015625, "loss/logits": 0.18730252981185913, "loss/reg": 6.86992279952392e-05, "step": 2786 }, { "epoch": 0.348375, "grad_norm": 2.978543996810913, "grad_norm_var": 0.33184933589168397, "learning_rate": 0.0001, "loss": 1.269, "loss/crossentropy": 2.4882562160491943, "loss/hidden": 1.109375, "loss/logits": 0.15893155336380005, "loss/reg": 6.866105104563758e-05, "step": 2787 }, { "epoch": 0.3485, "grad_norm": 3.4650771617889404, "grad_norm_var": 0.26638503023138826, "learning_rate": 0.0001, "loss": 1.66, "loss/crossentropy": 2.2834298610687256, "loss/hidden": 1.421875, "loss/logits": 0.23742827773094177, "loss/reg": 6.863172893645242e-05, "step": 2788 }, { "epoch": 0.348625, "grad_norm": 3.9796042442321777, "grad_norm_var": 0.33506180407133357, "learning_rate": 0.0001, "loss": 2.1899, "loss/crossentropy": 2.1577036380767822, "loss/hidden": 1.859375, "loss/logits": 0.32988205552101135, "loss/reg": 6.859521818114445e-05, "step": 2789 }, { "epoch": 0.34875, "grad_norm": 2.48178768157959, "grad_norm_var": 0.33790563379619487, "learning_rate": 0.0001, "loss": 1.312, "loss/crossentropy": 2.7563791275024414, "loss/hidden": 1.1171875, "loss/logits": 0.19408820569515228, "loss/reg": 6.855613901279867e-05, "step": 2790 }, { "epoch": 0.348875, "grad_norm": 3.1244864463806152, "grad_norm_var": 0.32329636832130454, "learning_rate": 0.0001, "loss": 1.6008, "loss/crossentropy": 2.3507707118988037, "loss/hidden": 1.3671875, "loss/logits": 0.23294275999069214, "loss/reg": 6.851572834420949e-05, "step": 2791 }, { "epoch": 0.349, "grad_norm": 3.0077781677246094, "grad_norm_var": 0.3007526209364715, "learning_rate": 0.0001, "loss": 1.3785, "loss/crossentropy": 2.4785544872283936, "loss/hidden": 1.1875, "loss/logits": 0.19034968316555023, "loss/reg": 6.84761835145764e-05, "step": 2792 }, { "epoch": 0.349125, "grad_norm": 2.692842721939087, "grad_norm_var": 0.30658838376912717, "learning_rate": 0.0001, "loss": 1.4812, "loss/crossentropy": 2.5305016040802, "loss/hidden": 1.25, "loss/logits": 0.2305147647857666, "loss/reg": 6.844254676252604e-05, "step": 2793 }, { "epoch": 0.34925, "grad_norm": 2.596148729324341, "grad_norm_var": 0.3123390852643522, "learning_rate": 0.0001, "loss": 1.4747, "loss/crossentropy": 2.5908849239349365, "loss/hidden": 1.25, "loss/logits": 0.2240356057882309, "loss/reg": 6.840889545856044e-05, "step": 2794 }, { "epoch": 0.349375, "grad_norm": 2.2404229640960693, "grad_norm_var": 0.30880676155588255, "learning_rate": 0.0001, "loss": 1.4304, "loss/crossentropy": 2.502960681915283, "loss/hidden": 1.21875, "loss/logits": 0.21100296080112457, "loss/reg": 6.837843829998747e-05, "step": 2795 }, { "epoch": 0.3495, "grad_norm": 2.275763750076294, "grad_norm_var": 0.32634882446684493, "learning_rate": 0.0001, "loss": 1.4204, "loss/crossentropy": 2.5115342140197754, "loss/hidden": 1.21875, "loss/logits": 0.2009662240743637, "loss/reg": 6.834463420091197e-05, "step": 2796 }, { "epoch": 0.349625, "grad_norm": 2.820992946624756, "grad_norm_var": 0.24784536074689262, "learning_rate": 0.0001, "loss": 1.3582, "loss/crossentropy": 2.279062509536743, "loss/hidden": 1.171875, "loss/logits": 0.18562422692775726, "loss/reg": 6.830701750004664e-05, "step": 2797 }, { "epoch": 0.34975, "grad_norm": 2.8533802032470703, "grad_norm_var": 0.24742688205031602, "learning_rate": 0.0001, "loss": 1.5734, "loss/crossentropy": 2.296684980392456, "loss/hidden": 1.3671875, "loss/logits": 0.20556217432022095, "loss/reg": 6.827009929111227e-05, "step": 2798 }, { "epoch": 0.349875, "grad_norm": 3.064408302307129, "grad_norm_var": 0.21070383282622723, "learning_rate": 0.0001, "loss": 1.5041, "loss/crossentropy": 2.618752956390381, "loss/hidden": 1.25, "loss/logits": 0.25342682003974915, "loss/reg": 6.823569856351241e-05, "step": 2799 }, { "epoch": 0.35, "grad_norm": 2.3550548553466797, "grad_norm_var": 0.22836729551056745, "learning_rate": 0.0001, "loss": 1.3286, "loss/crossentropy": 2.303563117980957, "loss/hidden": 1.15625, "loss/logits": 0.17167334258556366, "loss/reg": 6.819918053224683e-05, "step": 2800 }, { "epoch": 0.350125, "grad_norm": 4.140681743621826, "grad_norm_var": 0.32179975177640635, "learning_rate": 0.0001, "loss": 1.873, "loss/crossentropy": 2.8083484172821045, "loss/hidden": 1.5703125, "loss/logits": 0.30196449160575867, "loss/reg": 6.816044333390892e-05, "step": 2801 }, { "epoch": 0.35025, "grad_norm": 2.517822742462158, "grad_norm_var": 0.3135520583506443, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.6682581901550293, "loss/hidden": 1.1484375, "loss/logits": 0.18184085190296173, "loss/reg": 6.813100480940193e-05, "step": 2802 }, { "epoch": 0.350375, "grad_norm": 1.9914594888687134, "grad_norm_var": 0.3657131121624961, "learning_rate": 0.0001, "loss": 1.2701, "loss/crossentropy": 2.637448310852051, "loss/hidden": 1.109375, "loss/logits": 0.16006684303283691, "loss/reg": 6.809920887462795e-05, "step": 2803 }, { "epoch": 0.3505, "grad_norm": 2.5701520442962646, "grad_norm_var": 0.3424332456650209, "learning_rate": 0.0001, "loss": 1.3494, "loss/crossentropy": 2.4441747665405273, "loss/hidden": 1.1484375, "loss/logits": 0.2003229558467865, "loss/reg": 6.806760211475194e-05, "step": 2804 }, { "epoch": 0.350625, "grad_norm": 2.6488845348358154, "grad_norm_var": 0.24284569732051053, "learning_rate": 0.0001, "loss": 1.3645, "loss/crossentropy": 2.4233691692352295, "loss/hidden": 1.15625, "loss/logits": 0.2075205147266388, "loss/reg": 6.803763244533911e-05, "step": 2805 }, { "epoch": 0.35075, "grad_norm": 2.6337294578552246, "grad_norm_var": 0.23963731870934069, "learning_rate": 0.0001, "loss": 1.3539, "loss/crossentropy": 2.580638885498047, "loss/hidden": 1.140625, "loss/logits": 0.21262648701667786, "loss/reg": 6.800799019401893e-05, "step": 2806 }, { "epoch": 0.350875, "grad_norm": 2.5253376960754395, "grad_norm_var": 0.22983045987413575, "learning_rate": 0.0001, "loss": 1.4643, "loss/crossentropy": 2.3763837814331055, "loss/hidden": 1.265625, "loss/logits": 0.19799299538135529, "loss/reg": 6.797980313422158e-05, "step": 2807 }, { "epoch": 0.351, "grad_norm": 2.721991777420044, "grad_norm_var": 0.22257579043794012, "learning_rate": 0.0001, "loss": 1.2064, "loss/crossentropy": 2.733295440673828, "loss/hidden": 1.0546875, "loss/logits": 0.15100139379501343, "loss/reg": 6.795458466513082e-05, "step": 2808 }, { "epoch": 0.351125, "grad_norm": 1.9947876930236816, "grad_norm_var": 0.2504921926433937, "learning_rate": 0.0001, "loss": 1.2922, "loss/crossentropy": 2.3609142303466797, "loss/hidden": 1.1328125, "loss/logits": 0.15870727598667145, "loss/reg": 6.7932101956103e-05, "step": 2809 }, { "epoch": 0.35125, "grad_norm": 2.0725464820861816, "grad_norm_var": 0.2694276400920102, "learning_rate": 0.0001, "loss": 1.2667, "loss/crossentropy": 2.704880952835083, "loss/hidden": 1.078125, "loss/logits": 0.1879434883594513, "loss/reg": 6.790794577682391e-05, "step": 2810 }, { "epoch": 0.351375, "grad_norm": 3.0752944946289062, "grad_norm_var": 0.274164751403623, "learning_rate": 0.0001, "loss": 1.3974, "loss/crossentropy": 2.4474947452545166, "loss/hidden": 1.21875, "loss/logits": 0.17795707285404205, "loss/reg": 6.787958409404382e-05, "step": 2811 }, { "epoch": 0.3515, "grad_norm": 3.467475175857544, "grad_norm_var": 0.3048291496388022, "learning_rate": 0.0001, "loss": 1.3974, "loss/crossentropy": 2.9465856552124023, "loss/hidden": 1.171875, "loss/logits": 0.22482453286647797, "loss/reg": 6.786370067857206e-05, "step": 2812 }, { "epoch": 0.351625, "grad_norm": 3.2134616374969482, "grad_norm_var": 0.319956864122975, "learning_rate": 0.0001, "loss": 1.4233, "loss/crossentropy": 2.5486841201782227, "loss/hidden": 1.21875, "loss/logits": 0.20389090478420258, "loss/reg": 6.784807919757441e-05, "step": 2813 }, { "epoch": 0.35175, "grad_norm": 3.026484966278076, "grad_norm_var": 0.3244372490829325, "learning_rate": 0.0001, "loss": 1.6637, "loss/crossentropy": 2.2621026039123535, "loss/hidden": 1.4375, "loss/logits": 0.2255287915468216, "loss/reg": 6.78321230225265e-05, "step": 2814 }, { "epoch": 0.351875, "grad_norm": 4.798422813415527, "grad_norm_var": 0.5847716186410242, "learning_rate": 0.0001, "loss": 1.9453, "loss/crossentropy": 2.8846938610076904, "loss/hidden": 1.59375, "loss/logits": 0.35091060400009155, "loss/reg": 6.781575211789459e-05, "step": 2815 }, { "epoch": 0.352, "grad_norm": 2.297264575958252, "grad_norm_var": 0.5888680522620465, "learning_rate": 0.0001, "loss": 1.4168, "loss/crossentropy": 2.379504919052124, "loss/hidden": 1.2109375, "loss/logits": 0.20521070063114166, "loss/reg": 6.780129479011521e-05, "step": 2816 }, { "epoch": 0.352125, "grad_norm": 3.1725497245788574, "grad_norm_var": 0.48161418847694265, "learning_rate": 0.0001, "loss": 1.3314, "loss/crossentropy": 2.7281925678253174, "loss/hidden": 1.140625, "loss/logits": 0.19005241990089417, "loss/reg": 6.776546069886535e-05, "step": 2817 }, { "epoch": 0.35225, "grad_norm": 3.820321559906006, "grad_norm_var": 0.5394260294544149, "learning_rate": 0.0001, "loss": 1.5679, "loss/crossentropy": 2.409947156906128, "loss/hidden": 1.34375, "loss/logits": 0.22350502014160156, "loss/reg": 6.774192297598347e-05, "step": 2818 }, { "epoch": 0.352375, "grad_norm": 3.221635103225708, "grad_norm_var": 0.48877872354372964, "learning_rate": 0.0001, "loss": 1.5593, "loss/crossentropy": 2.552091360092163, "loss/hidden": 1.3203125, "loss/logits": 0.23830285668373108, "loss/reg": 6.771621701773256e-05, "step": 2819 }, { "epoch": 0.3525, "grad_norm": 2.619992256164551, "grad_norm_var": 0.4863846882410615, "learning_rate": 0.0001, "loss": 1.4336, "loss/crossentropy": 2.540283679962158, "loss/hidden": 1.2421875, "loss/logits": 0.19072511792182922, "loss/reg": 6.769593892386183e-05, "step": 2820 }, { "epoch": 0.352625, "grad_norm": 2.3992021083831787, "grad_norm_var": 0.5005347023436681, "learning_rate": 0.0001, "loss": 1.2324, "loss/crossentropy": 3.032970666885376, "loss/hidden": 1.046875, "loss/logits": 0.1848786473274231, "loss/reg": 6.76728377584368e-05, "step": 2821 }, { "epoch": 0.35275, "grad_norm": 2.466495990753174, "grad_norm_var": 0.509140365425069, "learning_rate": 0.0001, "loss": 1.5163, "loss/crossentropy": 2.2834033966064453, "loss/hidden": 1.28125, "loss/logits": 0.23440220952033997, "loss/reg": 6.764130375813693e-05, "step": 2822 }, { "epoch": 0.352875, "grad_norm": 2.198427677154541, "grad_norm_var": 0.5334943065834501, "learning_rate": 0.0001, "loss": 1.3101, "loss/crossentropy": 2.5580222606658936, "loss/hidden": 1.109375, "loss/logits": 0.20000645518302917, "loss/reg": 6.761877739336342e-05, "step": 2823 }, { "epoch": 0.353, "grad_norm": 2.4496827125549316, "grad_norm_var": 0.5449694187100543, "learning_rate": 0.0001, "loss": 1.2909, "loss/crossentropy": 2.4816415309906006, "loss/hidden": 1.1015625, "loss/logits": 0.1886357069015503, "loss/reg": 6.758720701327547e-05, "step": 2824 }, { "epoch": 0.353125, "grad_norm": 2.394752264022827, "grad_norm_var": 0.5070470858815116, "learning_rate": 0.0001, "loss": 1.259, "loss/crossentropy": 2.624701976776123, "loss/hidden": 1.0703125, "loss/logits": 0.18798145651817322, "loss/reg": 6.756201764801517e-05, "step": 2825 }, { "epoch": 0.35325, "grad_norm": 2.681774377822876, "grad_norm_var": 0.4615374746613336, "learning_rate": 0.0001, "loss": 1.4411, "loss/crossentropy": 2.4292092323303223, "loss/hidden": 1.234375, "loss/logits": 0.20602205395698547, "loss/reg": 6.753990601282567e-05, "step": 2826 }, { "epoch": 0.353375, "grad_norm": 3.7352190017700195, "grad_norm_var": 0.4992131602427046, "learning_rate": 0.0001, "loss": 2.1947, "loss/crossentropy": 2.6210153102874756, "loss/hidden": 1.734375, "loss/logits": 0.45969486236572266, "loss/reg": 6.752082117600366e-05, "step": 2827 }, { "epoch": 0.3535, "grad_norm": 4.135535717010498, "grad_norm_var": 0.5689525286086581, "learning_rate": 0.0001, "loss": 1.4325, "loss/crossentropy": 2.720987319946289, "loss/hidden": 1.203125, "loss/logits": 0.22866541147232056, "loss/reg": 6.749387830495834e-05, "step": 2828 }, { "epoch": 0.353625, "grad_norm": 2.7346181869506836, "grad_norm_var": 0.5721733979132106, "learning_rate": 0.0001, "loss": 1.5019, "loss/crossentropy": 2.691770553588867, "loss/hidden": 1.2734375, "loss/logits": 0.2277478277683258, "loss/reg": 6.747142469976097e-05, "step": 2829 }, { "epoch": 0.35375, "grad_norm": 2.614046096801758, "grad_norm_var": 0.5818722797668205, "learning_rate": 0.0001, "loss": 1.3415, "loss/crossentropy": 2.576838731765747, "loss/hidden": 1.140625, "loss/logits": 0.20017409324645996, "loss/reg": 6.745552673237398e-05, "step": 2830 }, { "epoch": 0.353875, "grad_norm": 2.420051336288452, "grad_norm_var": 0.3599496327818805, "learning_rate": 0.0001, "loss": 1.4122, "loss/crossentropy": 2.4475927352905273, "loss/hidden": 1.203125, "loss/logits": 0.20838941633701324, "loss/reg": 6.742506957380101e-05, "step": 2831 }, { "epoch": 0.354, "grad_norm": 2.8087050914764404, "grad_norm_var": 0.3396218685803348, "learning_rate": 0.0001, "loss": 1.77, "loss/crossentropy": 2.3641583919525146, "loss/hidden": 1.453125, "loss/logits": 0.31621527671813965, "loss/reg": 6.739589298376814e-05, "step": 2832 }, { "epoch": 0.354125, "grad_norm": 2.3022303581237793, "grad_norm_var": 0.35151339644936097, "learning_rate": 0.0001, "loss": 1.2551, "loss/crossentropy": 2.7439277172088623, "loss/hidden": 1.078125, "loss/logits": 0.17633652687072754, "loss/reg": 6.736774230375886e-05, "step": 2833 }, { "epoch": 0.35425, "grad_norm": 2.22556471824646, "grad_norm_var": 0.29620485289138, "learning_rate": 0.0001, "loss": 1.3853, "loss/crossentropy": 2.702989339828491, "loss/hidden": 1.1875, "loss/logits": 0.19713810086250305, "loss/reg": 6.733623740728945e-05, "step": 2834 }, { "epoch": 0.354375, "grad_norm": 2.3994693756103516, "grad_norm_var": 0.2826940274244246, "learning_rate": 0.0001, "loss": 1.3565, "loss/crossentropy": 2.445793867111206, "loss/hidden": 1.15625, "loss/logits": 0.1995435655117035, "loss/reg": 6.730860332027078e-05, "step": 2835 }, { "epoch": 0.3545, "grad_norm": 2.3337342739105225, "grad_norm_var": 0.28940397664025547, "learning_rate": 0.0001, "loss": 1.4005, "loss/crossentropy": 2.4074504375457764, "loss/hidden": 1.1796875, "loss/logits": 0.22011837363243103, "loss/reg": 6.728374864906073e-05, "step": 2836 }, { "epoch": 0.354625, "grad_norm": 2.2426347732543945, "grad_norm_var": 0.2960405144386177, "learning_rate": 0.0001, "loss": 1.3883, "loss/crossentropy": 2.5592281818389893, "loss/hidden": 1.1953125, "loss/logits": 0.19235700368881226, "loss/reg": 6.725241837557405e-05, "step": 2837 }, { "epoch": 0.35475, "grad_norm": 3.9499289989471436, "grad_norm_var": 0.40045864023900296, "learning_rate": 0.0001, "loss": 1.512, "loss/crossentropy": 2.7666711807250977, "loss/hidden": 1.265625, "loss/logits": 0.24572348594665527, "loss/reg": 6.722537364112213e-05, "step": 2838 }, { "epoch": 0.354875, "grad_norm": 2.274170398712158, "grad_norm_var": 0.39548268319318797, "learning_rate": 0.0001, "loss": 1.3913, "loss/crossentropy": 2.4172439575195312, "loss/hidden": 1.203125, "loss/logits": 0.18749752640724182, "loss/reg": 6.719759403495118e-05, "step": 2839 }, { "epoch": 0.355, "grad_norm": 2.9068026542663574, "grad_norm_var": 0.3913731950389033, "learning_rate": 0.0001, "loss": 1.3192, "loss/crossentropy": 2.1830313205718994, "loss/hidden": 1.125, "loss/logits": 0.193521186709404, "loss/reg": 6.716432108078152e-05, "step": 2840 }, { "epoch": 0.355125, "grad_norm": 2.4124507904052734, "grad_norm_var": 0.3905309719526978, "learning_rate": 0.0001, "loss": 1.2498, "loss/crossentropy": 2.42842698097229, "loss/hidden": 1.078125, "loss/logits": 0.17097532749176025, "loss/reg": 6.713035691063851e-05, "step": 2841 }, { "epoch": 0.35525, "grad_norm": 2.933353900909424, "grad_norm_var": 0.39182723611359216, "learning_rate": 0.0001, "loss": 1.5475, "loss/crossentropy": 2.3967390060424805, "loss/hidden": 1.328125, "loss/logits": 0.21867001056671143, "loss/reg": 6.710235174978152e-05, "step": 2842 }, { "epoch": 0.355375, "grad_norm": 2.2656667232513428, "grad_norm_var": 0.33900483749411403, "learning_rate": 0.0001, "loss": 1.3755, "loss/crossentropy": 2.669989824295044, "loss/hidden": 1.171875, "loss/logits": 0.2029278576374054, "loss/reg": 6.708097498631105e-05, "step": 2843 }, { "epoch": 0.3555, "grad_norm": 2.1119883060455322, "grad_norm_var": 0.203545159139266, "learning_rate": 0.0001, "loss": 1.4129, "loss/crossentropy": 2.411691188812256, "loss/hidden": 1.1875, "loss/logits": 0.22474730014801025, "loss/reg": 6.706031854264438e-05, "step": 2844 }, { "epoch": 0.355625, "grad_norm": 3.1419448852539062, "grad_norm_var": 0.2234818500800389, "learning_rate": 0.0001, "loss": 1.5321, "loss/crossentropy": 3.023282051086426, "loss/hidden": 1.296875, "loss/logits": 0.23452182114124298, "loss/reg": 6.703209510305896e-05, "step": 2845 }, { "epoch": 0.35575, "grad_norm": 2.1441967487335205, "grad_norm_var": 0.2353920425793414, "learning_rate": 0.0001, "loss": 1.5231, "loss/crossentropy": 2.518749237060547, "loss/hidden": 1.28125, "loss/logits": 0.24114753305912018, "loss/reg": 6.700139056192711e-05, "step": 2846 }, { "epoch": 0.355875, "grad_norm": 6.0156779289245605, "grad_norm_var": 0.9789414784936277, "learning_rate": 0.0001, "loss": 1.6078, "loss/crossentropy": 2.8790016174316406, "loss/hidden": 1.3671875, "loss/logits": 0.23992542922496796, "loss/reg": 6.69754299451597e-05, "step": 2847 }, { "epoch": 0.356, "grad_norm": 3.1996521949768066, "grad_norm_var": 0.9900276463482963, "learning_rate": 0.0001, "loss": 1.6249, "loss/crossentropy": 2.361599922180176, "loss/hidden": 1.375, "loss/logits": 0.24925637245178223, "loss/reg": 6.695299816783518e-05, "step": 2848 }, { "epoch": 0.356125, "grad_norm": 4.900071620941162, "grad_norm_var": 1.2381220968239168, "learning_rate": 0.0001, "loss": 2.0593, "loss/crossentropy": 2.5535991191864014, "loss/hidden": 1.5078125, "loss/logits": 0.5508327484130859, "loss/reg": 6.692892930004746e-05, "step": 2849 }, { "epoch": 0.35625, "grad_norm": 2.6361498832702637, "grad_norm_var": 1.208118982574035, "learning_rate": 0.0001, "loss": 1.4051, "loss/crossentropy": 2.6826581954956055, "loss/hidden": 1.203125, "loss/logits": 0.2012781798839569, "loss/reg": 6.689882138743997e-05, "step": 2850 }, { "epoch": 0.356375, "grad_norm": 2.6103179454803467, "grad_norm_var": 1.1942468700585815, "learning_rate": 0.0001, "loss": 1.3794, "loss/crossentropy": 2.6092865467071533, "loss/hidden": 1.1875, "loss/logits": 0.1912710964679718, "loss/reg": 6.687327550025657e-05, "step": 2851 }, { "epoch": 0.3565, "grad_norm": 2.9672787189483643, "grad_norm_var": 1.1626361155575968, "learning_rate": 0.0001, "loss": 1.3675, "loss/crossentropy": 2.6412172317504883, "loss/hidden": 1.1328125, "loss/logits": 0.2340364158153534, "loss/reg": 6.684495019726455e-05, "step": 2852 }, { "epoch": 0.356625, "grad_norm": 2.467639446258545, "grad_norm_var": 1.1417433159221408, "learning_rate": 0.0001, "loss": 1.3742, "loss/crossentropy": 2.668386936187744, "loss/hidden": 1.1796875, "loss/logits": 0.19385293126106262, "loss/reg": 6.68190696160309e-05, "step": 2853 }, { "epoch": 0.35675, "grad_norm": 2.4985833168029785, "grad_norm_var": 1.1009063159998105, "learning_rate": 0.0001, "loss": 1.339, "loss/crossentropy": 2.666644334793091, "loss/hidden": 1.1484375, "loss/logits": 0.18993687629699707, "loss/reg": 6.679257785435766e-05, "step": 2854 }, { "epoch": 0.356875, "grad_norm": 3.185540199279785, "grad_norm_var": 1.0685227223960905, "learning_rate": 0.0001, "loss": 1.3322, "loss/crossentropy": 2.6477112770080566, "loss/hidden": 1.140625, "loss/logits": 0.190871924161911, "loss/reg": 6.676786870229989e-05, "step": 2855 }, { "epoch": 0.357, "grad_norm": 2.697371244430542, "grad_norm_var": 1.0745599464434983, "learning_rate": 0.0001, "loss": 1.6309, "loss/crossentropy": 2.0749671459198, "loss/hidden": 1.375, "loss/logits": 0.255240797996521, "loss/reg": 6.673672032775357e-05, "step": 2856 }, { "epoch": 0.357125, "grad_norm": 3.795292377471924, "grad_norm_var": 1.083578847289895, "learning_rate": 0.0001, "loss": 1.4791, "loss/crossentropy": 2.552164316177368, "loss/hidden": 1.25, "loss/logits": 0.22844061255455017, "loss/reg": 6.6710876126308e-05, "step": 2857 }, { "epoch": 0.35725, "grad_norm": 2.5413174629211426, "grad_norm_var": 1.1017998394645663, "learning_rate": 0.0001, "loss": 1.2029, "loss/crossentropy": 2.4185352325439453, "loss/hidden": 1.03125, "loss/logits": 0.17101925611495972, "loss/reg": 6.668044079560786e-05, "step": 2858 }, { "epoch": 0.357375, "grad_norm": 2.955279588699341, "grad_norm_var": 1.0572282926133303, "learning_rate": 0.0001, "loss": 1.4082, "loss/crossentropy": 2.6675634384155273, "loss/hidden": 1.1875, "loss/logits": 0.22004657983779907, "loss/reg": 6.665255932603031e-05, "step": 2859 }, { "epoch": 0.3575, "grad_norm": 2.7467193603515625, "grad_norm_var": 0.9973731221150006, "learning_rate": 0.0001, "loss": 1.3126, "loss/crossentropy": 2.774935483932495, "loss/hidden": 1.1484375, "loss/logits": 0.1634751856327057, "loss/reg": 6.662240775767714e-05, "step": 2860 }, { "epoch": 0.357625, "grad_norm": 2.411285877227783, "grad_norm_var": 1.032151622136653, "learning_rate": 0.0001, "loss": 1.3945, "loss/crossentropy": 2.739293336868286, "loss/hidden": 1.1875, "loss/logits": 0.2062903195619583, "loss/reg": 6.65895058773458e-05, "step": 2861 }, { "epoch": 0.35775, "grad_norm": 2.3558337688446045, "grad_norm_var": 1.007675891969044, "learning_rate": 0.0001, "loss": 1.4595, "loss/crossentropy": 2.3067879676818848, "loss/hidden": 1.2421875, "loss/logits": 0.21668913960456848, "loss/reg": 6.655535253230482e-05, "step": 2862 }, { "epoch": 0.357875, "grad_norm": 2.870522975921631, "grad_norm_var": 0.4132894703805768, "learning_rate": 0.0001, "loss": 1.5445, "loss/crossentropy": 2.849651575088501, "loss/hidden": 1.328125, "loss/logits": 0.2156791090965271, "loss/reg": 6.652622687397525e-05, "step": 2863 }, { "epoch": 0.358, "grad_norm": 2.40203595161438, "grad_norm_var": 0.4241007776139554, "learning_rate": 0.0001, "loss": 1.3977, "loss/crossentropy": 2.4904584884643555, "loss/hidden": 1.203125, "loss/logits": 0.19392900168895721, "loss/reg": 6.649015995208174e-05, "step": 2864 }, { "epoch": 0.358125, "grad_norm": 2.70290207862854, "grad_norm_var": 0.13332123340767693, "learning_rate": 0.0001, "loss": 1.3279, "loss/crossentropy": 2.409106731414795, "loss/hidden": 1.15625, "loss/logits": 0.17098590731620789, "loss/reg": 6.645932444371283e-05, "step": 2865 }, { "epoch": 0.35825, "grad_norm": 2.7962725162506104, "grad_norm_var": 0.13270108870269628, "learning_rate": 0.0001, "loss": 1.5272, "loss/crossentropy": 2.6904842853546143, "loss/hidden": 1.2890625, "loss/logits": 0.23746003210544586, "loss/reg": 6.642317748628557e-05, "step": 2866 }, { "epoch": 0.358375, "grad_norm": 2.4272494316101074, "grad_norm_var": 0.13821163336054737, "learning_rate": 0.0001, "loss": 1.3192, "loss/crossentropy": 2.415907144546509, "loss/hidden": 1.140625, "loss/logits": 0.17792995274066925, "loss/reg": 6.638477498199791e-05, "step": 2867 }, { "epoch": 0.3585, "grad_norm": 2.6064398288726807, "grad_norm_var": 0.13535786829698182, "learning_rate": 0.0001, "loss": 1.2002, "loss/crossentropy": 2.2071571350097656, "loss/hidden": 1.0625, "loss/logits": 0.1370571255683899, "loss/reg": 6.634975579800084e-05, "step": 2868 }, { "epoch": 0.358625, "grad_norm": 3.9557816982269287, "grad_norm_var": 0.22443573957611135, "learning_rate": 0.0001, "loss": 1.4593, "loss/crossentropy": 2.2344391345977783, "loss/hidden": 1.28125, "loss/logits": 0.17740419507026672, "loss/reg": 6.632066651945934e-05, "step": 2869 }, { "epoch": 0.35875, "grad_norm": 2.6339030265808105, "grad_norm_var": 0.21997447790305671, "learning_rate": 0.0001, "loss": 1.4127, "loss/crossentropy": 2.3311352729797363, "loss/hidden": 1.21875, "loss/logits": 0.19323912262916565, "loss/reg": 6.628712435485795e-05, "step": 2870 }, { "epoch": 0.358875, "grad_norm": 2.9609336853027344, "grad_norm_var": 0.21211260052642256, "learning_rate": 0.0001, "loss": 1.3852, "loss/crossentropy": 2.8042683601379395, "loss/hidden": 1.15625, "loss/logits": 0.22828146815299988, "loss/reg": 6.625511741731316e-05, "step": 2871 }, { "epoch": 0.359, "grad_norm": 3.2218947410583496, "grad_norm_var": 0.22187190770784065, "learning_rate": 0.0001, "loss": 1.461, "loss/crossentropy": 2.6548359394073486, "loss/hidden": 1.265625, "loss/logits": 0.19473612308502197, "loss/reg": 6.622418732149526e-05, "step": 2872 }, { "epoch": 0.359125, "grad_norm": 2.2799055576324463, "grad_norm_var": 0.17166698424640572, "learning_rate": 0.0001, "loss": 1.4035, "loss/crossentropy": 2.395106077194214, "loss/hidden": 1.1953125, "loss/logits": 0.2074800282716751, "loss/reg": 6.619467603741214e-05, "step": 2873 }, { "epoch": 0.35925, "grad_norm": 2.712918281555176, "grad_norm_var": 0.16892109658230367, "learning_rate": 0.0001, "loss": 1.6398, "loss/crossentropy": 2.600799322128296, "loss/hidden": 1.359375, "loss/logits": 0.27974215149879456, "loss/reg": 6.61626472719945e-05, "step": 2874 }, { "epoch": 0.359375, "grad_norm": 2.9160635471343994, "grad_norm_var": 0.1679568804156048, "learning_rate": 0.0001, "loss": 1.3283, "loss/crossentropy": 2.5367517471313477, "loss/hidden": 1.140625, "loss/logits": 0.18704748153686523, "loss/reg": 6.613548612222075e-05, "step": 2875 }, { "epoch": 0.3595, "grad_norm": 2.0632197856903076, "grad_norm_var": 0.19745785764210358, "learning_rate": 0.0001, "loss": 1.3098, "loss/crossentropy": 2.2827794551849365, "loss/hidden": 1.125, "loss/logits": 0.184128999710083, "loss/reg": 6.610977288801223e-05, "step": 2876 }, { "epoch": 0.359625, "grad_norm": 2.400362968444824, "grad_norm_var": 0.19789645890056112, "learning_rate": 0.0001, "loss": 1.2123, "loss/crossentropy": 2.437070846557617, "loss/hidden": 1.0390625, "loss/logits": 0.17252948880195618, "loss/reg": 6.608012336073443e-05, "step": 2877 }, { "epoch": 0.35975, "grad_norm": 2.434685230255127, "grad_norm_var": 0.19459684486409118, "learning_rate": 0.0001, "loss": 1.4182, "loss/crossentropy": 2.5659799575805664, "loss/hidden": 1.2109375, "loss/logits": 0.20656916499137878, "loss/reg": 6.605240923818201e-05, "step": 2878 }, { "epoch": 0.359875, "grad_norm": 3.237940549850464, "grad_norm_var": 0.21082111122973188, "learning_rate": 0.0001, "loss": 1.7456, "loss/crossentropy": 2.2938594818115234, "loss/hidden": 1.46875, "loss/logits": 0.27616679668426514, "loss/reg": 6.602724170079455e-05, "step": 2879 }, { "epoch": 0.36, "grad_norm": 2.369903326034546, "grad_norm_var": 0.2123101714662101, "learning_rate": 0.0001, "loss": 1.2874, "loss/crossentropy": 2.4591121673583984, "loss/hidden": 1.1015625, "loss/logits": 0.18521052598953247, "loss/reg": 6.600555207114667e-05, "step": 2880 }, { "epoch": 0.360125, "grad_norm": 2.6738107204437256, "grad_norm_var": 0.21247796270651495, "learning_rate": 0.0001, "loss": 1.3395, "loss/crossentropy": 2.5000176429748535, "loss/hidden": 1.140625, "loss/logits": 0.1981954425573349, "loss/reg": 6.59818688291125e-05, "step": 2881 }, { "epoch": 0.36025, "grad_norm": 4.03118896484375, "grad_norm_var": 0.31858763092883746, "learning_rate": 0.0001, "loss": 1.6813, "loss/crossentropy": 2.400888204574585, "loss/hidden": 1.421875, "loss/logits": 0.2587233781814575, "loss/reg": 6.596469029318541e-05, "step": 2882 }, { "epoch": 0.360375, "grad_norm": 4.290897846221924, "grad_norm_var": 0.44107829403406623, "learning_rate": 0.0001, "loss": 1.5092, "loss/crossentropy": 3.0093789100646973, "loss/hidden": 1.3046875, "loss/logits": 0.20380906760692596, "loss/reg": 6.594695150852203e-05, "step": 2883 }, { "epoch": 0.3605, "grad_norm": 2.3901498317718506, "grad_norm_var": 0.4531706847123701, "learning_rate": 0.0001, "loss": 1.2522, "loss/crossentropy": 2.292250871658325, "loss/hidden": 1.078125, "loss/logits": 0.17342528700828552, "loss/reg": 6.591899000341073e-05, "step": 2884 }, { "epoch": 0.360625, "grad_norm": 3.3279929161071777, "grad_norm_var": 0.39033670995362946, "learning_rate": 0.0001, "loss": 1.8751, "loss/crossentropy": 2.091630458831787, "loss/hidden": 1.515625, "loss/logits": 0.3588463068008423, "loss/reg": 6.590135308215395e-05, "step": 2885 }, { "epoch": 0.36075, "grad_norm": 2.7473952770233154, "grad_norm_var": 0.3875446770041132, "learning_rate": 0.0001, "loss": 1.3277, "loss/crossentropy": 2.3529860973358154, "loss/hidden": 1.1171875, "loss/logits": 0.20985886454582214, "loss/reg": 6.588315591216087e-05, "step": 2886 }, { "epoch": 0.360875, "grad_norm": 2.36098051071167, "grad_norm_var": 0.40346329995023306, "learning_rate": 0.0001, "loss": 1.2752, "loss/crossentropy": 2.1176373958587646, "loss/hidden": 1.109375, "loss/logits": 0.16521263122558594, "loss/reg": 6.585413939319551e-05, "step": 2887 }, { "epoch": 0.361, "grad_norm": 3.114490509033203, "grad_norm_var": 0.398732614262614, "learning_rate": 0.0001, "loss": 1.3778, "loss/crossentropy": 2.851780891418457, "loss/hidden": 1.1484375, "loss/logits": 0.22873148322105408, "loss/reg": 6.58245844533667e-05, "step": 2888 }, { "epoch": 0.361125, "grad_norm": 2.5575528144836426, "grad_norm_var": 0.3830199487036071, "learning_rate": 0.0001, "loss": 1.221, "loss/crossentropy": 2.7010254859924316, "loss/hidden": 1.0390625, "loss/logits": 0.18131256103515625, "loss/reg": 6.580140325240791e-05, "step": 2889 }, { "epoch": 0.36125, "grad_norm": 2.2799127101898193, "grad_norm_var": 0.40275923786588996, "learning_rate": 0.0001, "loss": 1.2769, "loss/crossentropy": 2.6765801906585693, "loss/hidden": 1.09375, "loss/logits": 0.18249261379241943, "loss/reg": 6.577758176717907e-05, "step": 2890 }, { "epoch": 0.361375, "grad_norm": 2.441188335418701, "grad_norm_var": 0.4110738866817994, "learning_rate": 0.0001, "loss": 1.3849, "loss/crossentropy": 2.724580764770508, "loss/hidden": 1.1875, "loss/logits": 0.19671741127967834, "loss/reg": 6.575572479050606e-05, "step": 2891 }, { "epoch": 0.3615, "grad_norm": 2.4259145259857178, "grad_norm_var": 0.38390217143274274, "learning_rate": 0.0001, "loss": 1.405, "loss/crossentropy": 2.5444352626800537, "loss/hidden": 1.203125, "loss/logits": 0.20123980939388275, "loss/reg": 6.573774589924142e-05, "step": 2892 }, { "epoch": 0.361625, "grad_norm": 3.236527681350708, "grad_norm_var": 0.38106392044528453, "learning_rate": 0.0001, "loss": 1.2563, "loss/crossentropy": 2.5958704948425293, "loss/hidden": 1.046875, "loss/logits": 0.20878221094608307, "loss/reg": 6.570994446519762e-05, "step": 2893 }, { "epoch": 0.36175, "grad_norm": 2.31217622756958, "grad_norm_var": 0.3891131555694746, "learning_rate": 0.0001, "loss": 1.3846, "loss/crossentropy": 2.3817813396453857, "loss/hidden": 1.1875, "loss/logits": 0.19646961987018585, "loss/reg": 6.569163815584034e-05, "step": 2894 }, { "epoch": 0.361875, "grad_norm": 2.7066457271575928, "grad_norm_var": 0.38015058877413044, "learning_rate": 0.0001, "loss": 1.5049, "loss/crossentropy": 2.347402811050415, "loss/hidden": 1.2734375, "loss/logits": 0.23083017766475677, "loss/reg": 6.567584205185995e-05, "step": 2895 }, { "epoch": 0.362, "grad_norm": 2.378221273422241, "grad_norm_var": 0.37964555835292896, "learning_rate": 0.0001, "loss": 1.3066, "loss/crossentropy": 2.552408456802368, "loss/hidden": 1.125, "loss/logits": 0.18097081780433655, "loss/reg": 6.564728391822428e-05, "step": 2896 }, { "epoch": 0.362125, "grad_norm": 2.5330119132995605, "grad_norm_var": 0.38381093313086745, "learning_rate": 0.0001, "loss": 1.3451, "loss/crossentropy": 2.72941517829895, "loss/hidden": 1.1484375, "loss/logits": 0.1960519254207611, "loss/reg": 6.561660848092288e-05, "step": 2897 }, { "epoch": 0.36225, "grad_norm": 2.0341978073120117, "grad_norm_var": 0.3107985617989487, "learning_rate": 0.0001, "loss": 1.3703, "loss/crossentropy": 2.2636125087738037, "loss/hidden": 1.171875, "loss/logits": 0.19781389832496643, "loss/reg": 6.559077883139253e-05, "step": 2898 }, { "epoch": 0.362375, "grad_norm": 2.68984055519104, "grad_norm_var": 0.1305571363455139, "learning_rate": 0.0001, "loss": 1.5038, "loss/crossentropy": 2.206495523452759, "loss/hidden": 1.28125, "loss/logits": 0.2219083160161972, "loss/reg": 6.55630137771368e-05, "step": 2899 }, { "epoch": 0.3625, "grad_norm": 3.193049907684326, "grad_norm_var": 0.14880939192281797, "learning_rate": 0.0001, "loss": 1.462, "loss/crossentropy": 2.7352616786956787, "loss/hidden": 1.2578125, "loss/logits": 0.20353274047374725, "loss/reg": 6.553861749125645e-05, "step": 2900 }, { "epoch": 0.362625, "grad_norm": 2.1279335021972656, "grad_norm_var": 0.1297250234576398, "learning_rate": 0.0001, "loss": 1.4292, "loss/crossentropy": 2.5871870517730713, "loss/hidden": 1.21875, "loss/logits": 0.20983844995498657, "loss/reg": 6.550995021825656e-05, "step": 2901 }, { "epoch": 0.36275, "grad_norm": 2.939866542816162, "grad_norm_var": 0.13656227590311118, "learning_rate": 0.0001, "loss": 1.6103, "loss/crossentropy": 2.2399134635925293, "loss/hidden": 1.3984375, "loss/logits": 0.21121850609779358, "loss/reg": 6.547536759171635e-05, "step": 2902 }, { "epoch": 0.362875, "grad_norm": 2.2871530055999756, "grad_norm_var": 0.13909057797152755, "learning_rate": 0.0001, "loss": 1.4771, "loss/crossentropy": 2.5272586345672607, "loss/hidden": 1.265625, "loss/logits": 0.21080952882766724, "loss/reg": 6.544730422319844e-05, "step": 2903 }, { "epoch": 0.363, "grad_norm": 2.607384443283081, "grad_norm_var": 0.1189294369756946, "learning_rate": 0.0001, "loss": 1.3151, "loss/crossentropy": 2.6112048625946045, "loss/hidden": 1.1328125, "loss/logits": 0.18160119652748108, "loss/reg": 6.542013579746708e-05, "step": 2904 }, { "epoch": 0.363125, "grad_norm": 2.2937662601470947, "grad_norm_var": 0.12290410924508871, "learning_rate": 0.0001, "loss": 1.2877, "loss/crossentropy": 2.72279953956604, "loss/hidden": 1.109375, "loss/logits": 0.17763081192970276, "loss/reg": 6.538940942846239e-05, "step": 2905 }, { "epoch": 0.36325, "grad_norm": 2.1892101764678955, "grad_norm_var": 0.12644789960967248, "learning_rate": 0.0001, "loss": 1.2121, "loss/crossentropy": 2.7363266944885254, "loss/hidden": 1.0390625, "loss/logits": 0.172386035323143, "loss/reg": 6.536050932481885e-05, "step": 2906 }, { "epoch": 0.363375, "grad_norm": 2.1407053470611572, "grad_norm_var": 0.1354390941436615, "learning_rate": 0.0001, "loss": 1.3088, "loss/crossentropy": 2.508946418762207, "loss/hidden": 1.125, "loss/logits": 0.183100625872612, "loss/reg": 6.53297029202804e-05, "step": 2907 }, { "epoch": 0.3635, "grad_norm": 4.542610168457031, "grad_norm_var": 0.3928688812540616, "learning_rate": 0.0001, "loss": 1.5488, "loss/crossentropy": 3.6265997886657715, "loss/hidden": 1.34375, "loss/logits": 0.20438866317272186, "loss/reg": 6.53074894216843e-05, "step": 2908 }, { "epoch": 0.363625, "grad_norm": 3.09771466255188, "grad_norm_var": 0.3830003806597818, "learning_rate": 0.0001, "loss": 1.8165, "loss/crossentropy": 2.948300361633301, "loss/hidden": 1.4921875, "loss/logits": 0.32364073395729065, "loss/reg": 6.529095844598487e-05, "step": 2909 }, { "epoch": 0.36375, "grad_norm": 2.482038736343384, "grad_norm_var": 0.3776147495929346, "learning_rate": 0.0001, "loss": 1.3566, "loss/crossentropy": 2.509207010269165, "loss/hidden": 1.171875, "loss/logits": 0.18408863246440887, "loss/reg": 6.526459037559107e-05, "step": 2910 }, { "epoch": 0.363875, "grad_norm": 2.4392056465148926, "grad_norm_var": 0.3797159795029489, "learning_rate": 0.0001, "loss": 1.2225, "loss/crossentropy": 2.644902229309082, "loss/hidden": 1.0546875, "loss/logits": 0.16720272600650787, "loss/reg": 6.523758929688483e-05, "step": 2911 }, { "epoch": 0.364, "grad_norm": 2.3244552612304688, "grad_norm_var": 0.38165496769329366, "learning_rate": 0.0001, "loss": 1.2943, "loss/crossentropy": 2.61859393119812, "loss/hidden": 1.1171875, "loss/logits": 0.17647767066955566, "loss/reg": 6.520942406496033e-05, "step": 2912 }, { "epoch": 0.364125, "grad_norm": 3.469456434249878, "grad_norm_var": 0.4255849893933693, "learning_rate": 0.0001, "loss": 1.3447, "loss/crossentropy": 2.7970690727233887, "loss/hidden": 1.1328125, "loss/logits": 0.21126726269721985, "loss/reg": 6.518140435218811e-05, "step": 2913 }, { "epoch": 0.36425, "grad_norm": 2.6655640602111816, "grad_norm_var": 0.39624657478645775, "learning_rate": 0.0001, "loss": 1.3357, "loss/crossentropy": 2.5724189281463623, "loss/hidden": 1.140625, "loss/logits": 0.19445669651031494, "loss/reg": 6.515940185636282e-05, "step": 2914 }, { "epoch": 0.364375, "grad_norm": 5.499610900878906, "grad_norm_var": 0.8790768498905124, "learning_rate": 0.0001, "loss": 1.8064, "loss/crossentropy": 2.5568346977233887, "loss/hidden": 1.4921875, "loss/logits": 0.31354475021362305, "loss/reg": 6.51329246466048e-05, "step": 2915 }, { "epoch": 0.3645, "grad_norm": 3.1492106914520264, "grad_norm_var": 0.8774473903014759, "learning_rate": 0.0001, "loss": 1.5538, "loss/crossentropy": 3.2795183658599854, "loss/hidden": 1.265625, "loss/logits": 0.28754186630249023, "loss/reg": 6.511082756333053e-05, "step": 2916 }, { "epoch": 0.364625, "grad_norm": 2.8067615032196045, "grad_norm_var": 0.837183047985232, "learning_rate": 0.0001, "loss": 1.3712, "loss/crossentropy": 2.4544730186462402, "loss/hidden": 1.203125, "loss/logits": 0.16740016639232635, "loss/reg": 6.508747901534662e-05, "step": 2917 }, { "epoch": 0.36475, "grad_norm": 2.479581117630005, "grad_norm_var": 0.8500288081055485, "learning_rate": 0.0001, "loss": 1.417, "loss/crossentropy": 2.6068055629730225, "loss/hidden": 1.2109375, "loss/logits": 0.20538827776908875, "loss/reg": 6.506066711153835e-05, "step": 2918 }, { "epoch": 0.364875, "grad_norm": 2.488429307937622, "grad_norm_var": 0.8359891015805069, "learning_rate": 0.0001, "loss": 1.3726, "loss/crossentropy": 2.5931124687194824, "loss/hidden": 1.1796875, "loss/logits": 0.19230414927005768, "loss/reg": 6.502954784082249e-05, "step": 2919 }, { "epoch": 0.365, "grad_norm": 2.8037757873535156, "grad_norm_var": 0.8302861939176523, "learning_rate": 0.0001, "loss": 1.4463, "loss/crossentropy": 2.3422367572784424, "loss/hidden": 1.2109375, "loss/logits": 0.23468342423439026, "loss/reg": 6.499757728306577e-05, "step": 2920 }, { "epoch": 0.365125, "grad_norm": 2.9610519409179688, "grad_norm_var": 0.8015529097961895, "learning_rate": 0.0001, "loss": 1.6706, "loss/crossentropy": 2.106898546218872, "loss/hidden": 1.421875, "loss/logits": 0.24808484315872192, "loss/reg": 6.496626883745193e-05, "step": 2921 }, { "epoch": 0.36525, "grad_norm": 3.5240538120269775, "grad_norm_var": 0.7737359736581423, "learning_rate": 0.0001, "loss": 1.4912, "loss/crossentropy": 2.611156940460205, "loss/hidden": 1.2578125, "loss/logits": 0.2327847182750702, "loss/reg": 6.493443652288988e-05, "step": 2922 }, { "epoch": 0.365375, "grad_norm": 2.619961977005005, "grad_norm_var": 0.729690232155305, "learning_rate": 0.0001, "loss": 1.3054, "loss/crossentropy": 2.597060203552246, "loss/hidden": 1.125, "loss/logits": 0.1797330379486084, "loss/reg": 6.49024368613027e-05, "step": 2923 }, { "epoch": 0.3655, "grad_norm": 2.331735849380493, "grad_norm_var": 0.6053889215787156, "learning_rate": 0.0001, "loss": 1.4076, "loss/crossentropy": 2.6112592220306396, "loss/hidden": 1.1875, "loss/logits": 0.21940529346466064, "loss/reg": 6.486666825367138e-05, "step": 2924 }, { "epoch": 0.365625, "grad_norm": 2.42877197265625, "grad_norm_var": 0.6198617378412182, "learning_rate": 0.0001, "loss": 1.2516, "loss/crossentropy": 2.728579521179199, "loss/hidden": 1.078125, "loss/logits": 0.17278045415878296, "loss/reg": 6.483263132395223e-05, "step": 2925 }, { "epoch": 0.36575, "grad_norm": 2.681575059890747, "grad_norm_var": 0.6111078751501412, "learning_rate": 0.0001, "loss": 1.3428, "loss/crossentropy": 2.717459201812744, "loss/hidden": 1.1328125, "loss/logits": 0.20929189026355743, "loss/reg": 6.479666626546532e-05, "step": 2926 }, { "epoch": 0.365875, "grad_norm": 2.213071346282959, "grad_norm_var": 0.6287122763316231, "learning_rate": 0.0001, "loss": 1.242, "loss/crossentropy": 2.692227840423584, "loss/hidden": 1.0625, "loss/logits": 0.17887921631336212, "loss/reg": 6.477061833720654e-05, "step": 2927 }, { "epoch": 0.366, "grad_norm": 3.146198034286499, "grad_norm_var": 0.6075338282325513, "learning_rate": 0.0001, "loss": 1.8779, "loss/crossentropy": 2.3151705265045166, "loss/hidden": 1.546875, "loss/logits": 0.33034849166870117, "loss/reg": 6.474341353168711e-05, "step": 2928 }, { "epoch": 0.366125, "grad_norm": 2.878556489944458, "grad_norm_var": 0.588769093536977, "learning_rate": 0.0001, "loss": 1.6249, "loss/crossentropy": 2.5788540840148926, "loss/hidden": 1.359375, "loss/logits": 0.26489296555519104, "loss/reg": 6.471773667726666e-05, "step": 2929 }, { "epoch": 0.36625, "grad_norm": 2.301500082015991, "grad_norm_var": 0.6092761036332242, "learning_rate": 0.0001, "loss": 1.2054, "loss/crossentropy": 2.3017895221710205, "loss/hidden": 1.046875, "loss/logits": 0.1578664481639862, "loss/reg": 6.468418723670766e-05, "step": 2930 }, { "epoch": 0.366375, "grad_norm": 2.709411859512329, "grad_norm_var": 0.12672429962059653, "learning_rate": 0.0001, "loss": 1.4291, "loss/crossentropy": 2.6337268352508545, "loss/hidden": 1.21875, "loss/logits": 0.20970849692821503, "loss/reg": 6.465777551056817e-05, "step": 2931 }, { "epoch": 0.3665, "grad_norm": 2.51567006111145, "grad_norm_var": 0.11557308962846946, "learning_rate": 0.0001, "loss": 1.7329, "loss/crossentropy": 2.104097604751587, "loss/hidden": 1.4375, "loss/logits": 0.2947903871536255, "loss/reg": 6.46313710603863e-05, "step": 2932 }, { "epoch": 0.366625, "grad_norm": 2.1178691387176514, "grad_norm_var": 0.13364856256136287, "learning_rate": 0.0001, "loss": 1.2434, "loss/crossentropy": 2.5307583808898926, "loss/hidden": 1.0546875, "loss/logits": 0.1880609691143036, "loss/reg": 6.460479198722169e-05, "step": 2933 }, { "epoch": 0.36675, "grad_norm": 2.8522732257843018, "grad_norm_var": 0.134478656142619, "learning_rate": 0.0001, "loss": 1.4075, "loss/crossentropy": 2.4590976238250732, "loss/hidden": 1.2109375, "loss/logits": 0.19594907760620117, "loss/reg": 6.457702693296596e-05, "step": 2934 }, { "epoch": 0.366875, "grad_norm": 3.4035093784332275, "grad_norm_var": 0.1657748788665193, "learning_rate": 0.0001, "loss": 1.3277, "loss/crossentropy": 2.9511990547180176, "loss/hidden": 1.1484375, "loss/logits": 0.17863625288009644, "loss/reg": 6.455090624513105e-05, "step": 2935 }, { "epoch": 0.367, "grad_norm": 2.4783926010131836, "grad_norm_var": 0.1686733578953105, "learning_rate": 0.0001, "loss": 1.3804, "loss/crossentropy": 2.5576980113983154, "loss/hidden": 1.15625, "loss/logits": 0.22354255616664886, "loss/reg": 6.452522939071059e-05, "step": 2936 }, { "epoch": 0.367125, "grad_norm": 2.5526530742645264, "grad_norm_var": 0.1647587297686203, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.3957858085632324, "loss/hidden": 1.234375, "loss/logits": 0.21893680095672607, "loss/reg": 6.449998909374699e-05, "step": 2937 }, { "epoch": 0.36725, "grad_norm": 2.4520318508148193, "grad_norm_var": 0.11482490876733564, "learning_rate": 0.0001, "loss": 1.4263, "loss/crossentropy": 2.534863233566284, "loss/hidden": 1.21875, "loss/logits": 0.20690473914146423, "loss/reg": 6.44735264359042e-05, "step": 2938 }, { "epoch": 0.367375, "grad_norm": 2.361769199371338, "grad_norm_var": 0.11848314799603088, "learning_rate": 0.0001, "loss": 1.4358, "loss/crossentropy": 2.2010152339935303, "loss/hidden": 1.2109375, "loss/logits": 0.2241779863834381, "loss/reg": 6.444072641897947e-05, "step": 2939 }, { "epoch": 0.3675, "grad_norm": 2.2792482376098633, "grad_norm_var": 0.12045618913071295, "learning_rate": 0.0001, "loss": 1.2021, "loss/crossentropy": 2.3537559509277344, "loss/hidden": 1.03125, "loss/logits": 0.17024698853492737, "loss/reg": 6.441138248192146e-05, "step": 2940 }, { "epoch": 0.367625, "grad_norm": 2.4321234226226807, "grad_norm_var": 0.12038672993428369, "learning_rate": 0.0001, "loss": 1.2241, "loss/crossentropy": 2.907867670059204, "loss/hidden": 1.0546875, "loss/logits": 0.16881850361824036, "loss/reg": 6.438203854486346e-05, "step": 2941 }, { "epoch": 0.36775, "grad_norm": 2.1593282222747803, "grad_norm_var": 0.13077728070843098, "learning_rate": 0.0001, "loss": 1.2503, "loss/crossentropy": 2.5794570446014404, "loss/hidden": 1.0625, "loss/logits": 0.18717467784881592, "loss/reg": 6.43569728708826e-05, "step": 2942 }, { "epoch": 0.367875, "grad_norm": 2.642245054244995, "grad_norm_var": 0.12281731771629793, "learning_rate": 0.0001, "loss": 1.5198, "loss/crossentropy": 2.153944969177246, "loss/hidden": 1.28125, "loss/logits": 0.237893208861351, "loss/reg": 6.43261446384713e-05, "step": 2943 }, { "epoch": 0.368, "grad_norm": 2.512718915939331, "grad_norm_var": 0.1000896958340924, "learning_rate": 0.0001, "loss": 1.4968, "loss/crossentropy": 2.4160914421081543, "loss/hidden": 1.265625, "loss/logits": 0.23052245378494263, "loss/reg": 6.429522909456864e-05, "step": 2944 }, { "epoch": 0.368125, "grad_norm": 2.720484733581543, "grad_norm_var": 0.09452811911120307, "learning_rate": 0.0001, "loss": 1.256, "loss/crossentropy": 2.7665116786956787, "loss/hidden": 1.0859375, "loss/logits": 0.16939538717269897, "loss/reg": 6.426584150176495e-05, "step": 2945 }, { "epoch": 0.36825, "grad_norm": 3.0608813762664795, "grad_norm_var": 0.10736249806186607, "learning_rate": 0.0001, "loss": 1.4468, "loss/crossentropy": 2.585686206817627, "loss/hidden": 1.1875, "loss/logits": 0.25865137577056885, "loss/reg": 6.42323648207821e-05, "step": 2946 }, { "epoch": 0.368375, "grad_norm": 2.5621252059936523, "grad_norm_var": 0.10614084180276465, "learning_rate": 0.0001, "loss": 1.2997, "loss/crossentropy": 2.606428861618042, "loss/hidden": 1.109375, "loss/logits": 0.1896384358406067, "loss/reg": 6.41996884951368e-05, "step": 2947 }, { "epoch": 0.3685, "grad_norm": 3.0044288635253906, "grad_norm_var": 0.11759852236558857, "learning_rate": 0.0001, "loss": 1.3202, "loss/crossentropy": 2.6592135429382324, "loss/hidden": 1.15625, "loss/logits": 0.16327540576457977, "loss/reg": 6.417390977730975e-05, "step": 2948 }, { "epoch": 0.368625, "grad_norm": 3.2617268562316895, "grad_norm_var": 0.12591776713938785, "learning_rate": 0.0001, "loss": 1.6209, "loss/crossentropy": 2.3959450721740723, "loss/hidden": 1.4140625, "loss/logits": 0.20618778467178345, "loss/reg": 6.413725350284949e-05, "step": 2949 }, { "epoch": 0.36875, "grad_norm": 3.1133804321289062, "grad_norm_var": 0.1364898574533072, "learning_rate": 0.0001, "loss": 1.8499, "loss/crossentropy": 2.5552399158477783, "loss/hidden": 1.546875, "loss/logits": 0.30234551429748535, "loss/reg": 6.410996138583869e-05, "step": 2950 }, { "epoch": 0.368875, "grad_norm": 2.5129234790802, "grad_norm_var": 0.10101701669400048, "learning_rate": 0.0001, "loss": 1.5391, "loss/crossentropy": 2.3352909088134766, "loss/hidden": 1.28125, "loss/logits": 0.25725799798965454, "loss/reg": 6.408064655261114e-05, "step": 2951 }, { "epoch": 0.369, "grad_norm": 2.4306375980377197, "grad_norm_var": 0.10213541595364835, "learning_rate": 0.0001, "loss": 1.4159, "loss/crossentropy": 2.300166606903076, "loss/hidden": 1.1953125, "loss/logits": 0.2199755162000656, "loss/reg": 6.405232124961913e-05, "step": 2952 }, { "epoch": 0.369125, "grad_norm": 2.2368743419647217, "grad_norm_var": 0.11156824726448325, "learning_rate": 0.0001, "loss": 1.1942, "loss/crossentropy": 2.515896797180176, "loss/hidden": 1.03125, "loss/logits": 0.16235141456127167, "loss/reg": 6.401749124052003e-05, "step": 2953 }, { "epoch": 0.36925, "grad_norm": 2.396397352218628, "grad_norm_var": 0.11292557924181755, "learning_rate": 0.0001, "loss": 1.4261, "loss/crossentropy": 2.4774622917175293, "loss/hidden": 1.2265625, "loss/logits": 0.19886139035224915, "loss/reg": 6.39851568848826e-05, "step": 2954 }, { "epoch": 0.369375, "grad_norm": 2.4500210285186768, "grad_norm_var": 0.11054491453233671, "learning_rate": 0.0001, "loss": 1.2786, "loss/crossentropy": 2.3753795623779297, "loss/hidden": 1.109375, "loss/logits": 0.16861461102962494, "loss/reg": 6.395346281351522e-05, "step": 2955 }, { "epoch": 0.3695, "grad_norm": 3.323993682861328, "grad_norm_var": 0.1325543711196777, "learning_rate": 0.0001, "loss": 1.6486, "loss/crossentropy": 2.4095823764801025, "loss/hidden": 1.3828125, "loss/logits": 0.2651233673095703, "loss/reg": 6.391818169504404e-05, "step": 2956 }, { "epoch": 0.369625, "grad_norm": 2.2081778049468994, "grad_norm_var": 0.14297886781158695, "learning_rate": 0.0001, "loss": 1.2044, "loss/crossentropy": 2.687476396560669, "loss/hidden": 1.0390625, "loss/logits": 0.16471447050571442, "loss/reg": 6.388223846442997e-05, "step": 2957 }, { "epoch": 0.36975, "grad_norm": 4.037971496582031, "grad_norm_var": 0.23758000333857368, "learning_rate": 0.0001, "loss": 1.8637, "loss/crossentropy": 2.4336042404174805, "loss/hidden": 1.546875, "loss/logits": 0.31620413064956665, "loss/reg": 6.384445441653952e-05, "step": 2958 }, { "epoch": 0.369875, "grad_norm": 2.3745579719543457, "grad_norm_var": 0.24696404274687894, "learning_rate": 0.0001, "loss": 1.3079, "loss/crossentropy": 2.601874589920044, "loss/hidden": 1.1171875, "loss/logits": 0.19002635776996613, "loss/reg": 6.381676939781755e-05, "step": 2959 }, { "epoch": 0.37, "grad_norm": 2.18459415435791, "grad_norm_var": 0.2646410374716062, "learning_rate": 0.0001, "loss": 1.3879, "loss/crossentropy": 2.633523941040039, "loss/hidden": 1.1953125, "loss/logits": 0.1919637769460678, "loss/reg": 6.378323450917378e-05, "step": 2960 }, { "epoch": 0.370125, "grad_norm": 1.9295908212661743, "grad_norm_var": 0.30605174830174814, "learning_rate": 0.0001, "loss": 1.2114, "loss/crossentropy": 2.4141247272491455, "loss/hidden": 1.046875, "loss/logits": 0.16386404633522034, "loss/reg": 6.375854718498886e-05, "step": 2961 }, { "epoch": 0.37025, "grad_norm": 3.6826703548431396, "grad_norm_var": 0.3607134085756503, "learning_rate": 0.0001, "loss": 1.3078, "loss/crossentropy": 2.778897523880005, "loss/hidden": 1.140625, "loss/logits": 0.16655871272087097, "loss/reg": 6.373201904352754e-05, "step": 2962 }, { "epoch": 0.370375, "grad_norm": 3.1197850704193115, "grad_norm_var": 0.3675279230540501, "learning_rate": 0.0001, "loss": 1.4544, "loss/crossentropy": 2.3683950901031494, "loss/hidden": 1.2421875, "loss/logits": 0.21159294247627258, "loss/reg": 6.37023476883769e-05, "step": 2963 }, { "epoch": 0.3705, "grad_norm": 2.2838263511657715, "grad_norm_var": 0.3771442935342889, "learning_rate": 0.0001, "loss": 1.3183, "loss/crossentropy": 2.6524925231933594, "loss/hidden": 1.1328125, "loss/logits": 0.18488985300064087, "loss/reg": 6.367031164700165e-05, "step": 2964 }, { "epoch": 0.370625, "grad_norm": 1.984851360321045, "grad_norm_var": 0.38710461740785523, "learning_rate": 0.0001, "loss": 1.3471, "loss/crossentropy": 2.3246843814849854, "loss/hidden": 1.140625, "loss/logits": 0.20586876571178436, "loss/reg": 6.363928696373478e-05, "step": 2965 }, { "epoch": 0.37075, "grad_norm": 3.0741093158721924, "grad_norm_var": 0.3847322164698335, "learning_rate": 0.0001, "loss": 1.369, "loss/crossentropy": 2.6442673206329346, "loss/hidden": 1.1640625, "loss/logits": 0.20431852340698242, "loss/reg": 6.36146214674227e-05, "step": 2966 }, { "epoch": 0.370875, "grad_norm": 1.9716213941574097, "grad_norm_var": 0.4121761065755076, "learning_rate": 0.0001, "loss": 1.2097, "loss/crossentropy": 2.603672504425049, "loss/hidden": 1.0390625, "loss/logits": 0.16999706625938416, "loss/reg": 6.357901293085888e-05, "step": 2967 }, { "epoch": 0.371, "grad_norm": 3.9049766063690186, "grad_norm_var": 0.5136359893678986, "learning_rate": 0.0001, "loss": 1.8595, "loss/crossentropy": 2.4243624210357666, "loss/hidden": 1.59375, "loss/logits": 0.26515963673591614, "loss/reg": 6.354250217555091e-05, "step": 2968 }, { "epoch": 0.371125, "grad_norm": 2.5098819732666016, "grad_norm_var": 0.5015179259215583, "learning_rate": 0.0001, "loss": 1.4665, "loss/crossentropy": 2.565091133117676, "loss/hidden": 1.2421875, "loss/logits": 0.2236735224723816, "loss/reg": 6.350933108478785e-05, "step": 2969 }, { "epoch": 0.37125, "grad_norm": 2.3258323669433594, "grad_norm_var": 0.5048250165789019, "learning_rate": 0.0001, "loss": 1.3085, "loss/crossentropy": 2.5719261169433594, "loss/hidden": 1.125, "loss/logits": 0.1828286200761795, "loss/reg": 6.347247108351439e-05, "step": 2970 }, { "epoch": 0.371375, "grad_norm": 3.0875070095062256, "grad_norm_var": 0.5080922361409355, "learning_rate": 0.0001, "loss": 1.235, "loss/crossentropy": 2.4915106296539307, "loss/hidden": 1.0625, "loss/logits": 0.17190435528755188, "loss/reg": 6.344519351841882e-05, "step": 2971 }, { "epoch": 0.3715, "grad_norm": 2.6113147735595703, "grad_norm_var": 0.4853170411910857, "learning_rate": 0.0001, "loss": 1.2257, "loss/crossentropy": 2.8890388011932373, "loss/hidden": 1.0546875, "loss/logits": 0.1704152524471283, "loss/reg": 6.341288826661184e-05, "step": 2972 }, { "epoch": 0.371625, "grad_norm": 2.454434871673584, "grad_norm_var": 0.47277127803199975, "learning_rate": 0.0001, "loss": 1.2156, "loss/crossentropy": 2.5898449420928955, "loss/hidden": 1.0625, "loss/logits": 0.15244591236114502, "loss/reg": 6.338267849059775e-05, "step": 2973 }, { "epoch": 0.37175, "grad_norm": 2.9413554668426514, "grad_norm_var": 0.35538403495893456, "learning_rate": 0.0001, "loss": 1.4277, "loss/crossentropy": 2.5474841594696045, "loss/hidden": 1.234375, "loss/logits": 0.19271042943000793, "loss/reg": 6.335430953186005e-05, "step": 2974 }, { "epoch": 0.371875, "grad_norm": 4.01010799407959, "grad_norm_var": 0.46194888074208884, "learning_rate": 0.0001, "loss": 1.8542, "loss/crossentropy": 2.3551039695739746, "loss/hidden": 1.6328125, "loss/logits": 0.22074884176254272, "loss/reg": 6.332369957817718e-05, "step": 2975 }, { "epoch": 0.372, "grad_norm": 2.8702478408813477, "grad_norm_var": 0.43920488651629247, "learning_rate": 0.0001, "loss": 1.6717, "loss/crossentropy": 2.0603673458099365, "loss/hidden": 1.421875, "loss/logits": 0.24919116497039795, "loss/reg": 6.329182360786945e-05, "step": 2976 }, { "epoch": 0.372125, "grad_norm": 2.2678921222686768, "grad_norm_var": 0.4072032730495564, "learning_rate": 0.0001, "loss": 1.3887, "loss/crossentropy": 2.4731359481811523, "loss/hidden": 1.1953125, "loss/logits": 0.19277814030647278, "loss/reg": 6.325924914563075e-05, "step": 2977 }, { "epoch": 0.37225, "grad_norm": 2.5826265811920166, "grad_norm_var": 0.35612473422247976, "learning_rate": 0.0001, "loss": 1.3969, "loss/crossentropy": 2.521217107772827, "loss/hidden": 1.1953125, "loss/logits": 0.20098784565925598, "loss/reg": 6.322353146970272e-05, "step": 2978 }, { "epoch": 0.372375, "grad_norm": 5.689695358276367, "grad_norm_var": 0.8956028115192789, "learning_rate": 0.0001, "loss": 1.4563, "loss/crossentropy": 2.0184884071350098, "loss/hidden": 1.296875, "loss/logits": 0.1588059663772583, "loss/reg": 6.318974919850007e-05, "step": 2979 }, { "epoch": 0.3725, "grad_norm": 2.6561596393585205, "grad_norm_var": 0.8731493763561559, "learning_rate": 0.0001, "loss": 1.4846, "loss/crossentropy": 2.3659873008728027, "loss/hidden": 1.28125, "loss/logits": 0.20269934833049774, "loss/reg": 6.3152881921269e-05, "step": 2980 }, { "epoch": 0.372625, "grad_norm": 2.4568676948547363, "grad_norm_var": 0.8273446343350703, "learning_rate": 0.0001, "loss": 1.3066, "loss/crossentropy": 2.3906993865966797, "loss/hidden": 1.1171875, "loss/logits": 0.18873481452465057, "loss/reg": 6.311244942480698e-05, "step": 2981 }, { "epoch": 0.37275, "grad_norm": 2.5507161617279053, "grad_norm_var": 0.8367409834490762, "learning_rate": 0.0001, "loss": 1.5806, "loss/crossentropy": 2.4396488666534424, "loss/hidden": 1.3046875, "loss/logits": 0.27529165148735046, "loss/reg": 6.307926378212869e-05, "step": 2982 }, { "epoch": 0.372875, "grad_norm": 3.0622200965881348, "grad_norm_var": 0.7716158339451868, "learning_rate": 0.0001, "loss": 1.6269, "loss/crossentropy": 2.34615159034729, "loss/hidden": 1.3671875, "loss/logits": 0.25909924507141113, "loss/reg": 6.304262933554128e-05, "step": 2983 }, { "epoch": 0.373, "grad_norm": 2.0640971660614014, "grad_norm_var": 0.7610124705039385, "learning_rate": 0.0001, "loss": 1.1928, "loss/crossentropy": 2.589111804962158, "loss/hidden": 1.0390625, "loss/logits": 0.15308743715286255, "loss/reg": 6.300045788520947e-05, "step": 2984 }, { "epoch": 0.373125, "grad_norm": 2.7912564277648926, "grad_norm_var": 0.7519321953023694, "learning_rate": 0.0001, "loss": 1.5325, "loss/crossentropy": 3.286241292953491, "loss/hidden": 1.296875, "loss/logits": 0.23503965139389038, "loss/reg": 6.295337516348809e-05, "step": 2985 }, { "epoch": 0.37325, "grad_norm": 2.864032506942749, "grad_norm_var": 0.7287334700303508, "learning_rate": 0.0001, "loss": 1.265, "loss/crossentropy": 2.9111034870147705, "loss/hidden": 1.0859375, "loss/logits": 0.17841294407844543, "loss/reg": 6.291209137998521e-05, "step": 2986 }, { "epoch": 0.373375, "grad_norm": 4.3819193840026855, "grad_norm_var": 0.8597676248233671, "learning_rate": 0.0001, "loss": 1.9805, "loss/crossentropy": 2.9310858249664307, "loss/hidden": 1.6015625, "loss/logits": 0.3783401846885681, "loss/reg": 6.288255826802924e-05, "step": 2987 }, { "epoch": 0.3735, "grad_norm": 3.335242509841919, "grad_norm_var": 0.8534667406769197, "learning_rate": 0.0001, "loss": 1.4359, "loss/crossentropy": 2.506587505340576, "loss/hidden": 1.234375, "loss/logits": 0.20085568726062775, "loss/reg": 6.285084964474663e-05, "step": 2988 }, { "epoch": 0.373625, "grad_norm": 2.04323148727417, "grad_norm_var": 0.8973008133115693, "learning_rate": 0.0001, "loss": 1.1698, "loss/crossentropy": 2.6092071533203125, "loss/hidden": 1.015625, "loss/logits": 0.1535772979259491, "loss/reg": 6.282192771323025e-05, "step": 2989 }, { "epoch": 0.37375, "grad_norm": 2.7088623046875, "grad_norm_var": 0.9035968825271783, "learning_rate": 0.0001, "loss": 1.3185, "loss/crossentropy": 2.68137526512146, "loss/hidden": 1.1484375, "loss/logits": 0.1694534420967102, "loss/reg": 6.279157969402149e-05, "step": 2990 }, { "epoch": 0.373875, "grad_norm": 3.004310369491577, "grad_norm_var": 0.8341711110443678, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.8618662357330322, "loss/hidden": 1.2578125, "loss/logits": 0.22318598628044128, "loss/reg": 6.275823398027569e-05, "step": 2991 }, { "epoch": 0.374, "grad_norm": 2.966421604156494, "grad_norm_var": 0.8336228332578647, "learning_rate": 0.0001, "loss": 1.4496, "loss/crossentropy": 2.4050283432006836, "loss/hidden": 1.265625, "loss/logits": 0.183384507894516, "loss/reg": 6.272875907598063e-05, "step": 2992 }, { "epoch": 0.374125, "grad_norm": 2.1810312271118164, "grad_norm_var": 0.8421574476305729, "learning_rate": 0.0001, "loss": 1.3373, "loss/crossentropy": 2.4977712631225586, "loss/hidden": 1.140625, "loss/logits": 0.1960592269897461, "loss/reg": 6.270047015277669e-05, "step": 2993 }, { "epoch": 0.37425, "grad_norm": 4.047881603240967, "grad_norm_var": 0.9028769740249345, "learning_rate": 0.0001, "loss": 1.7562, "loss/crossentropy": 2.2375099658966064, "loss/hidden": 1.515625, "loss/logits": 0.23994037508964539, "loss/reg": 6.266956916078925e-05, "step": 2994 }, { "epoch": 0.374375, "grad_norm": 2.715088367462158, "grad_norm_var": 0.4090518639991302, "learning_rate": 0.0001, "loss": 1.5205, "loss/crossentropy": 2.34190034866333, "loss/hidden": 1.2890625, "loss/logits": 0.2308216094970703, "loss/reg": 6.263991963351145e-05, "step": 2995 }, { "epoch": 0.3745, "grad_norm": 2.4802074432373047, "grad_norm_var": 0.4158706359237591, "learning_rate": 0.0001, "loss": 1.172, "loss/crossentropy": 2.6636102199554443, "loss/hidden": 1.0234375, "loss/logits": 0.1479628086090088, "loss/reg": 6.261339149205014e-05, "step": 2996 }, { "epoch": 0.374625, "grad_norm": 2.091684579849243, "grad_norm_var": 0.4435100574086732, "learning_rate": 0.0001, "loss": 1.3242, "loss/crossentropy": 2.3042514324188232, "loss/hidden": 1.140625, "loss/logits": 0.18293757736682892, "loss/reg": 6.25854154350236e-05, "step": 2997 }, { "epoch": 0.37475, "grad_norm": 2.131197214126587, "grad_norm_var": 0.4701604766626275, "learning_rate": 0.0001, "loss": 1.2949, "loss/crossentropy": 2.532275915145874, "loss/hidden": 1.125, "loss/logits": 0.1692412942647934, "loss/reg": 6.256090273382142e-05, "step": 2998 }, { "epoch": 0.374875, "grad_norm": 2.26434063911438, "grad_norm_var": 0.4825093812124462, "learning_rate": 0.0001, "loss": 1.2331, "loss/crossentropy": 2.5872960090637207, "loss/hidden": 1.0625, "loss/logits": 0.16993172466754913, "loss/reg": 6.25383690930903e-05, "step": 2999 }, { "epoch": 0.375, "grad_norm": 2.891429901123047, "grad_norm_var": 0.449138538463193, "learning_rate": 0.0001, "loss": 1.2241, "loss/crossentropy": 2.4563827514648438, "loss/hidden": 1.0625, "loss/logits": 0.16098375618457794, "loss/reg": 6.250730803003535e-05, "step": 3000 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.93217584693248e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }