{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0625, "eval_steps": 250, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 4.097814559936523, "learning_rate": 1.0000000000000002e-06, "loss": 1.1655, "loss/crossentropy": 2.343535900115967, "loss/hidden": 0.9296875, "loss/logits": 0.17379230260849, "loss/reg": 0.006198255345225334, "step": 1 }, { "epoch": 0.00025, "grad_norm": 3.662576913833618, "learning_rate": 2.0000000000000003e-06, "loss": 1.4973, "loss/crossentropy": 2.318769931793213, "loss/hidden": 1.1875, "loss/logits": 0.24786217510700226, "loss/reg": 0.006198255345225334, "step": 2 }, { "epoch": 0.000375, "grad_norm": 2.8296749591827393, "learning_rate": 3e-06, "loss": 1.2258, "loss/crossentropy": 2.4907937049865723, "loss/hidden": 0.97265625, "loss/logits": 0.19112952053546906, "loss/reg": 0.006198245566338301, "step": 3 }, { "epoch": 0.0005, "grad_norm": 3.057624578475952, "learning_rate": 4.000000000000001e-06, "loss": 1.1136, "loss/crossentropy": 2.744520902633667, "loss/hidden": 0.890625, "loss/logits": 0.16101403534412384, "loss/reg": 0.006198232993483543, "step": 4 }, { "epoch": 0.000625, "grad_norm": 2.7055587768554688, "learning_rate": 5e-06, "loss": 1.1943, "loss/crossentropy": 2.5722062587738037, "loss/hidden": 0.94921875, "loss/logits": 0.18310005962848663, "loss/reg": 0.0061982134357094765, "step": 5 }, { "epoch": 0.00075, "grad_norm": 3.789276361465454, "learning_rate": 6e-06, "loss": 1.247, "loss/crossentropy": 2.613312005996704, "loss/hidden": 1.0078125, "loss/logits": 0.17725251615047455, "loss/reg": 0.006198191549628973, "step": 6 }, { "epoch": 0.000875, "grad_norm": 3.997910499572754, "learning_rate": 7.000000000000001e-06, "loss": 1.4206, "loss/crossentropy": 2.4207534790039062, "loss/hidden": 1.125, "loss/logits": 0.2336406409740448, "loss/reg": 0.006198164541274309, "step": 7 }, { "epoch": 0.001, "grad_norm": 2.5986244678497314, "learning_rate": 8.000000000000001e-06, "loss": 1.0878, "loss/crossentropy": 2.536424160003662, "loss/hidden": 0.8671875, "loss/logits": 0.1585812270641327, "loss/reg": 0.006198132876306772, "step": 8 }, { "epoch": 0.001125, "grad_norm": 2.2757976055145264, "learning_rate": 9e-06, "loss": 1.1175, "loss/crossentropy": 2.745281219482422, "loss/hidden": 0.89453125, "loss/logits": 0.16094230115413666, "loss/reg": 0.006198094692081213, "step": 9 }, { "epoch": 0.00125, "grad_norm": 2.261094808578491, "learning_rate": 1e-05, "loss": 1.0803, "loss/crossentropy": 2.3173577785491943, "loss/hidden": 0.8671875, "loss/logits": 0.15108685195446014, "loss/reg": 0.0061980499885976315, "step": 10 }, { "epoch": 0.001375, "grad_norm": 21.777265548706055, "learning_rate": 1.1000000000000001e-05, "loss": 2.0501, "loss/crossentropy": 3.2122714519500732, "loss/hidden": 1.7109375, "loss/logits": 0.27713608741760254, "loss/reg": 0.006198008079081774, "step": 11 }, { "epoch": 0.0015, "grad_norm": 2.5655505657196045, "learning_rate": 1.2e-05, "loss": 1.151, "loss/crossentropy": 2.706430196762085, "loss/hidden": 0.8984375, "loss/logits": 0.19056561589241028, "loss/reg": 0.0061979577876627445, "step": 12 }, { "epoch": 0.001625, "grad_norm": 2.403053045272827, "learning_rate": 1.3000000000000001e-05, "loss": 1.0719, "loss/crossentropy": 2.0466296672821045, "loss/hidden": 0.88671875, "loss/logits": 0.12316589802503586, "loss/reg": 0.0061978911980986595, "step": 13 }, { "epoch": 0.00175, "grad_norm": 3.840881586074829, "learning_rate": 1.4000000000000001e-05, "loss": 1.5441, "loss/crossentropy": 2.3191423416137695, "loss/hidden": 1.234375, "loss/logits": 0.24779079854488373, "loss/reg": 0.00619781669229269, "step": 14 }, { "epoch": 0.001875, "grad_norm": 2.557331085205078, "learning_rate": 1.5e-05, "loss": 0.9444, "loss/crossentropy": 2.6370084285736084, "loss/hidden": 0.76953125, "loss/logits": 0.11287336051464081, "loss/reg": 0.006197733338922262, "step": 15 }, { "epoch": 0.002, "grad_norm": 3.1850404739379883, "grad_norm_var": 22.31061335402559, "learning_rate": 1.6000000000000003e-05, "loss": 1.3213, "loss/crossentropy": 2.676577091217041, "loss/hidden": 1.0546875, "loss/logits": 0.2046227753162384, "loss/reg": 0.006197639741003513, "step": 16 }, { "epoch": 0.002125, "grad_norm": 2.2587289810180664, "grad_norm_var": 22.553268201402446, "learning_rate": 1.7000000000000003e-05, "loss": 1.0312, "loss/crossentropy": 2.4961040019989014, "loss/hidden": 0.8203125, "loss/logits": 0.148894801735878, "loss/reg": 0.006197560112923384, "step": 17 }, { "epoch": 0.00225, "grad_norm": 3.3259811401367188, "grad_norm_var": 22.58044614452358, "learning_rate": 1.8e-05, "loss": 1.3626, "loss/crossentropy": 2.5914387702941895, "loss/hidden": 1.046875, "loss/logits": 0.25370728969573975, "loss/reg": 0.006197475362569094, "step": 18 }, { "epoch": 0.002375, "grad_norm": 2.468914747238159, "grad_norm_var": 22.649171856957494, "learning_rate": 1.9e-05, "loss": 1.1683, "loss/crossentropy": 2.6096584796905518, "loss/hidden": 0.921875, "loss/logits": 0.18447336554527283, "loss/reg": 0.00619738781824708, "step": 19 }, { "epoch": 0.0025, "grad_norm": 2.3097646236419678, "grad_norm_var": 22.784756315801523, "learning_rate": 2e-05, "loss": 1.1605, "loss/crossentropy": 2.299048662185669, "loss/hidden": 0.9375, "loss/logits": 0.16106057167053223, "loss/reg": 0.006197274662554264, "step": 20 }, { "epoch": 0.002625, "grad_norm": 2.1111207008361816, "grad_norm_var": 22.911025462198744, "learning_rate": 2.1e-05, "loss": 0.939, "loss/crossentropy": 2.547258138656616, "loss/hidden": 0.75, "loss/logits": 0.12698382139205933, "loss/reg": 0.006197154987603426, "step": 21 }, { "epoch": 0.00275, "grad_norm": 2.4918222427368164, "grad_norm_var": 23.049732177187614, "learning_rate": 2.2000000000000003e-05, "loss": 1.2047, "loss/crossentropy": 2.2802374362945557, "loss/hidden": 0.953125, "loss/logits": 0.18965375423431396, "loss/reg": 0.006197045091539621, "step": 22 }, { "epoch": 0.002875, "grad_norm": 3.3273494243621826, "grad_norm_var": 23.069242834486193, "learning_rate": 2.3000000000000003e-05, "loss": 1.2554, "loss/crossentropy": 2.3062734603881836, "loss/hidden": 1.0078125, "loss/logits": 0.18566077947616577, "loss/reg": 0.006196921691298485, "step": 23 }, { "epoch": 0.003, "grad_norm": 2.5644068717956543, "grad_norm_var": 23.075070365271714, "learning_rate": 2.4e-05, "loss": 1.2266, "loss/crossentropy": 2.460878372192383, "loss/hidden": 0.98046875, "loss/logits": 0.18418912589550018, "loss/reg": 0.006196786183863878, "step": 24 }, { "epoch": 0.003125, "grad_norm": 2.3506264686584473, "grad_norm_var": 23.059636834121356, "learning_rate": 2.5e-05, "loss": 1.0205, "loss/crossentropy": 2.4281811714172363, "loss/hidden": 0.82421875, "loss/logits": 0.13434948027133942, "loss/reg": 0.0061966474168002605, "step": 25 }, { "epoch": 0.00325, "grad_norm": 2.25004506111145, "grad_norm_var": 23.062003716592635, "learning_rate": 2.6000000000000002e-05, "loss": 1.1133, "loss/crossentropy": 2.326843500137329, "loss/hidden": 0.9140625, "loss/logits": 0.13725802302360535, "loss/reg": 0.006196498870849609, "step": 26 }, { "epoch": 0.003375, "grad_norm": 2.283770799636841, "grad_norm_var": 0.2469546323472817, "learning_rate": 2.7000000000000002e-05, "loss": 1.1459, "loss/crossentropy": 2.3002493381500244, "loss/hidden": 0.9140625, "loss/logits": 0.16987068951129913, "loss/reg": 0.006196335889399052, "step": 27 }, { "epoch": 0.0035, "grad_norm": 2.805088758468628, "grad_norm_var": 0.24805442740468303, "learning_rate": 2.8000000000000003e-05, "loss": 1.0272, "loss/crossentropy": 2.510472536087036, "loss/hidden": 0.8359375, "loss/logits": 0.12927240133285522, "loss/reg": 0.006196176633238792, "step": 28 }, { "epoch": 0.003625, "grad_norm": 2.0331132411956787, "grad_norm_var": 0.2692014993258605, "learning_rate": 2.9e-05, "loss": 1.0913, "loss/crossentropy": 2.51584529876709, "loss/hidden": 0.87109375, "loss/logits": 0.15820594131946564, "loss/reg": 0.006195997819304466, "step": 29 }, { "epoch": 0.00375, "grad_norm": 2.1523566246032715, "grad_norm_var": 0.17596421900176604, "learning_rate": 3e-05, "loss": 1.0026, "loss/crossentropy": 2.704220771789551, "loss/hidden": 0.796875, "loss/logits": 0.14372289180755615, "loss/reg": 0.0061958180740475655, "step": 30 }, { "epoch": 0.003875, "grad_norm": 2.6658694744110107, "grad_norm_var": 0.1771001402109505, "learning_rate": 3.1e-05, "loss": 1.122, "loss/crossentropy": 2.4840426445007324, "loss/hidden": 0.89453125, "loss/logits": 0.1655040979385376, "loss/reg": 0.006195634603500366, "step": 31 }, { "epoch": 0.004, "grad_norm": 2.813079595565796, "grad_norm_var": 0.153583095436327, "learning_rate": 3.2000000000000005e-05, "loss": 1.0653, "loss/crossentropy": 2.442962646484375, "loss/hidden": 0.859375, "loss/logits": 0.14400474727153778, "loss/reg": 0.00619542459025979, "step": 32 }, { "epoch": 0.004125, "grad_norm": 2.4273953437805176, "grad_norm_var": 0.1496371777315666, "learning_rate": 3.3e-05, "loss": 1.1025, "loss/crossentropy": 2.515721559524536, "loss/hidden": 0.89453125, "loss/logits": 0.1460331827402115, "loss/reg": 0.006195210851728916, "step": 33 }, { "epoch": 0.00425, "grad_norm": 2.0594100952148438, "grad_norm_var": 0.11442956053255457, "learning_rate": 3.4000000000000007e-05, "loss": 1.118, "loss/crossentropy": 2.5347506999969482, "loss/hidden": 0.8984375, "loss/logits": 0.15760375559329987, "loss/reg": 0.006195001769810915, "step": 34 }, { "epoch": 0.004375, "grad_norm": 2.497893810272217, "grad_norm_var": 0.11457586733464495, "learning_rate": 3.5e-05, "loss": 1.2359, "loss/crossentropy": 1.7681002616882324, "loss/hidden": 1.0390625, "loss/logits": 0.13490143418312073, "loss/reg": 0.006194803398102522, "step": 35 }, { "epoch": 0.0045, "grad_norm": 3.3231709003448486, "grad_norm_var": 0.16029457606237638, "learning_rate": 3.6e-05, "loss": 1.3588, "loss/crossentropy": 2.729518175125122, "loss/hidden": 1.09375, "loss/logits": 0.20313453674316406, "loss/reg": 0.00619460316374898, "step": 36 }, { "epoch": 0.004625, "grad_norm": 2.5542962551116943, "grad_norm_var": 0.14901290879942408, "learning_rate": 3.7e-05, "loss": 1.1671, "loss/crossentropy": 2.3359429836273193, "loss/hidden": 0.9296875, "loss/logits": 0.17546769976615906, "loss/reg": 0.006194361485540867, "step": 37 }, { "epoch": 0.00475, "grad_norm": 3.5138309001922607, "grad_norm_var": 0.2080724542279834, "learning_rate": 3.8e-05, "loss": 1.2044, "loss/crossentropy": 2.447890520095825, "loss/hidden": 0.96484375, "loss/logits": 0.17756858468055725, "loss/reg": 0.0061941081658005714, "step": 38 }, { "epoch": 0.004875, "grad_norm": 3.813410758972168, "grad_norm_var": 0.2698887106917669, "learning_rate": 3.9000000000000006e-05, "loss": 1.0819, "loss/crossentropy": 2.766765832901001, "loss/hidden": 0.88671875, "loss/logits": 0.13325469195842743, "loss/reg": 0.006193886045366526, "step": 39 }, { "epoch": 0.005, "grad_norm": 3.1502718925476074, "grad_norm_var": 0.2860816910243668, "learning_rate": 4e-05, "loss": 1.3622, "loss/crossentropy": 2.3325388431549072, "loss/hidden": 1.109375, "loss/logits": 0.19087004661560059, "loss/reg": 0.006193609442561865, "step": 40 }, { "epoch": 0.005125, "grad_norm": 2.422366142272949, "grad_norm_var": 0.28336421674108553, "learning_rate": 4.1e-05, "loss": 1.2212, "loss/crossentropy": 2.3002498149871826, "loss/hidden": 0.96875, "loss/logits": 0.19054222106933594, "loss/reg": 0.00619333703070879, "step": 41 }, { "epoch": 0.00525, "grad_norm": 2.7353622913360596, "grad_norm_var": 0.2707266796228128, "learning_rate": 4.2e-05, "loss": 1.0549, "loss/crossentropy": 2.0319221019744873, "loss/hidden": 0.87890625, "loss/logits": 0.1140664741396904, "loss/reg": 0.006193041335791349, "step": 42 }, { "epoch": 0.005375, "grad_norm": 1.9425387382507324, "grad_norm_var": 0.2970857034274398, "learning_rate": 4.3e-05, "loss": 1.0366, "loss/crossentropy": 2.431666374206543, "loss/hidden": 0.83203125, "loss/logits": 0.1426728069782257, "loss/reg": 0.006192733999341726, "step": 43 }, { "epoch": 0.0055, "grad_norm": 2.7009642124176025, "grad_norm_var": 0.2960522402202514, "learning_rate": 4.4000000000000006e-05, "loss": 0.9824, "loss/crossentropy": 2.391608476638794, "loss/hidden": 0.78515625, "loss/logits": 0.13533324003219604, "loss/reg": 0.006192411296069622, "step": 44 }, { "epoch": 0.005625, "grad_norm": 2.6632983684539795, "grad_norm_var": 0.2669107471214488, "learning_rate": 4.5e-05, "loss": 1.1067, "loss/crossentropy": 2.7733116149902344, "loss/hidden": 0.87109375, "loss/logits": 0.1736893653869629, "loss/reg": 0.006192059256136417, "step": 45 }, { "epoch": 0.00575, "grad_norm": 2.1037468910217285, "grad_norm_var": 0.2707032714108967, "learning_rate": 4.600000000000001e-05, "loss": 0.9831, "loss/crossentropy": 2.4606895446777344, "loss/hidden": 0.7890625, "loss/logits": 0.13213258981704712, "loss/reg": 0.006191718857735395, "step": 46 }, { "epoch": 0.005875, "grad_norm": 2.1911983489990234, "grad_norm_var": 0.28768473978113296, "learning_rate": 4.7e-05, "loss": 0.9509, "loss/crossentropy": 2.6825270652770996, "loss/hidden": 0.76953125, "loss/logits": 0.11942489445209503, "loss/reg": 0.006191306747496128, "step": 47 }, { "epoch": 0.006, "grad_norm": 3.2640700340270996, "grad_norm_var": 0.30827796768009724, "learning_rate": 4.8e-05, "loss": 1.0346, "loss/crossentropy": 2.3665199279785156, "loss/hidden": 0.83203125, "loss/logits": 0.14068934321403503, "loss/reg": 0.0061909533105790615, "step": 48 }, { "epoch": 0.006125, "grad_norm": 2.259894847869873, "grad_norm_var": 0.3163475179157634, "learning_rate": 4.9e-05, "loss": 0.9647, "loss/crossentropy": 2.4414587020874023, "loss/hidden": 0.79296875, "loss/logits": 0.10987477004528046, "loss/reg": 0.0061905342154204845, "step": 49 }, { "epoch": 0.00625, "grad_norm": 2.7616565227508545, "grad_norm_var": 0.28721415330329, "learning_rate": 5e-05, "loss": 1.019, "loss/crossentropy": 2.0829460620880127, "loss/hidden": 0.83984375, "loss/logits": 0.11724002659320831, "loss/reg": 0.0061900559812784195, "step": 50 }, { "epoch": 0.006375, "grad_norm": 2.7897861003875732, "grad_norm_var": 0.28297568806904866, "learning_rate": 5.1000000000000006e-05, "loss": 0.853, "loss/crossentropy": 2.5636909008026123, "loss/hidden": 0.6953125, "loss/logits": 0.09577471762895584, "loss/reg": 0.00618965458124876, "step": 51 }, { "epoch": 0.0065, "grad_norm": 2.3134403228759766, "grad_norm_var": 0.2711290924819705, "learning_rate": 5.2000000000000004e-05, "loss": 1.0497, "loss/crossentropy": 2.440258026123047, "loss/hidden": 0.83984375, "loss/logits": 0.14791719615459442, "loss/reg": 0.006189141888171434, "step": 52 }, { "epoch": 0.006625, "grad_norm": 2.2032997608184814, "grad_norm_var": 0.2855897568404882, "learning_rate": 5.300000000000001e-05, "loss": 0.9934, "loss/crossentropy": 2.4747955799102783, "loss/hidden": 0.796875, "loss/logits": 0.13461169600486755, "loss/reg": 0.006188610102981329, "step": 53 }, { "epoch": 0.00675, "grad_norm": 2.267400026321411, "grad_norm_var": 0.24358579758792467, "learning_rate": 5.4000000000000005e-05, "loss": 1.1149, "loss/crossentropy": 2.705127477645874, "loss/hidden": 0.89453125, "loss/logits": 0.1585235595703125, "loss/reg": 0.0061880191788077354, "step": 54 }, { "epoch": 0.006875, "grad_norm": 2.281036853790283, "grad_norm_var": 0.14220569464836952, "learning_rate": 5.500000000000001e-05, "loss": 0.9642, "loss/crossentropy": 2.545010805130005, "loss/hidden": 0.78515625, "loss/logits": 0.11717304587364197, "loss/reg": 0.006187579594552517, "step": 55 }, { "epoch": 0.007, "grad_norm": 4.942420959472656, "grad_norm_var": 0.4975759650139497, "learning_rate": 5.6000000000000006e-05, "loss": 1.1237, "loss/crossentropy": 2.7698795795440674, "loss/hidden": 0.91796875, "loss/logits": 0.14385326206684113, "loss/reg": 0.006187067367136478, "step": 56 }, { "epoch": 0.007125, "grad_norm": 2.4213955402374268, "grad_norm_var": 0.4976009733976563, "learning_rate": 5.6999999999999996e-05, "loss": 1.0386, "loss/crossentropy": 2.572023868560791, "loss/hidden": 0.84765625, "loss/logits": 0.12909512221813202, "loss/reg": 0.006186594720929861, "step": 57 }, { "epoch": 0.00725, "grad_norm": 2.15891695022583, "grad_norm_var": 0.5091253321428854, "learning_rate": 5.8e-05, "loss": 0.961, "loss/crossentropy": 2.283557415008545, "loss/hidden": 0.7734375, "loss/logits": 0.12568500638008118, "loss/reg": 0.006185955833643675, "step": 58 }, { "epoch": 0.007375, "grad_norm": 2.36811900138855, "grad_norm_var": 0.48432608682591366, "learning_rate": 5.9e-05, "loss": 0.8386, "loss/crossentropy": 2.453810453414917, "loss/hidden": 0.6796875, "loss/logits": 0.09709502756595612, "loss/reg": 0.0061853062361478806, "step": 59 }, { "epoch": 0.0075, "grad_norm": 2.591327667236328, "grad_norm_var": 0.4836842483889178, "learning_rate": 6e-05, "loss": 1.033, "loss/crossentropy": 2.8110511302948, "loss/hidden": 0.81640625, "loss/logits": 0.1547423005104065, "loss/reg": 0.006184632424265146, "step": 60 }, { "epoch": 0.007625, "grad_norm": 2.0103816986083984, "grad_norm_var": 0.5047142009615214, "learning_rate": 6.1e-05, "loss": 0.9296, "loss/crossentropy": 2.15134334564209, "loss/hidden": 0.7578125, "loss/logits": 0.1099701076745987, "loss/reg": 0.0061841062270104885, "step": 61 }, { "epoch": 0.00775, "grad_norm": 1.80124831199646, "grad_norm_var": 0.5287549745746596, "learning_rate": 6.2e-05, "loss": 0.9266, "loss/crossentropy": 2.7054479122161865, "loss/hidden": 0.7421875, "loss/logits": 0.12253857403993607, "loss/reg": 0.0061835781671106815, "step": 62 }, { "epoch": 0.007875, "grad_norm": 2.277440309524536, "grad_norm_var": 0.5252193383179133, "learning_rate": 6.3e-05, "loss": 0.914, "loss/crossentropy": 2.6631381511688232, "loss/hidden": 0.734375, "loss/logits": 0.1177992895245552, "loss/reg": 0.0061830319464206696, "step": 63 }, { "epoch": 0.008, "grad_norm": 3.3314151763916016, "grad_norm_var": 0.531964164332922, "learning_rate": 6.400000000000001e-05, "loss": 1.29, "loss/crossentropy": 2.1269633769989014, "loss/hidden": 1.0625, "loss/logits": 0.16565865278244019, "loss/reg": 0.006182366982102394, "step": 64 }, { "epoch": 0.008125, "grad_norm": 4.333358287811279, "grad_norm_var": 0.7208240839518936, "learning_rate": 6.500000000000001e-05, "loss": 1.1615, "loss/crossentropy": 2.714442491531372, "loss/hidden": 0.94140625, "loss/logits": 0.15825161337852478, "loss/reg": 0.006181675940752029, "step": 65 }, { "epoch": 0.00825, "grad_norm": 2.853740930557251, "grad_norm_var": 0.7223776199927481, "learning_rate": 6.6e-05, "loss": 1.062, "loss/crossentropy": 2.2147135734558105, "loss/hidden": 0.8515625, "loss/logits": 0.14859826862812042, "loss/reg": 0.006180979777127504, "step": 66 }, { "epoch": 0.008375, "grad_norm": 2.8853657245635986, "grad_norm_var": 0.7242961395218184, "learning_rate": 6.7e-05, "loss": 0.9533, "loss/crossentropy": 2.619598388671875, "loss/hidden": 0.7734375, "loss/logits": 0.11804014444351196, "loss/reg": 0.006180332973599434, "step": 67 }, { "epoch": 0.0085, "grad_norm": 2.725229501724243, "grad_norm_var": 0.7142181363616674, "learning_rate": 6.800000000000001e-05, "loss": 1.1308, "loss/crossentropy": 2.4091367721557617, "loss/hidden": 0.90234375, "loss/logits": 0.16662752628326416, "loss/reg": 0.006179714575409889, "step": 68 }, { "epoch": 0.008625, "grad_norm": 2.93643856048584, "grad_norm_var": 0.6977178730278022, "learning_rate": 6.9e-05, "loss": 1.1414, "loss/crossentropy": 2.509793281555176, "loss/hidden": 0.90234375, "loss/logits": 0.17730477452278137, "loss/reg": 0.0061789220198988914, "step": 69 }, { "epoch": 0.00875, "grad_norm": 2.4086973667144775, "grad_norm_var": 0.6896555586144653, "learning_rate": 7e-05, "loss": 0.9852, "loss/crossentropy": 2.7080371379852295, "loss/hidden": 0.7890625, "loss/logits": 0.1343374401330948, "loss/reg": 0.0061781019903719425, "step": 70 }, { "epoch": 0.008875, "grad_norm": 1.9355547428131104, "grad_norm_var": 0.7196579708330165, "learning_rate": 7.1e-05, "loss": 0.9176, "loss/crossentropy": 2.451488494873047, "loss/hidden": 0.7421875, "loss/logits": 0.11365102231502533, "loss/reg": 0.006177456583827734, "step": 71 }, { "epoch": 0.009, "grad_norm": 2.273902654647827, "grad_norm_var": 0.38422972669649574, "learning_rate": 7.2e-05, "loss": 1.0112, "loss/crossentropy": 2.4479947090148926, "loss/hidden": 0.8125, "loss/logits": 0.13690924644470215, "loss/reg": 0.006176764145493507, "step": 72 }, { "epoch": 0.009125, "grad_norm": 3.385849952697754, "grad_norm_var": 0.4217084598233742, "learning_rate": 7.3e-05, "loss": 1.3992, "loss/crossentropy": 2.3916804790496826, "loss/hidden": 1.1484375, "loss/logits": 0.18896484375, "loss/reg": 0.006176079623401165, "step": 73 }, { "epoch": 0.00925, "grad_norm": 1.893932580947876, "grad_norm_var": 0.44317594415441114, "learning_rate": 7.4e-05, "loss": 0.9357, "loss/crossentropy": 2.3809518814086914, "loss/hidden": 0.74609375, "loss/logits": 0.12787015736103058, "loss/reg": 0.00617539556697011, "step": 74 }, { "epoch": 0.009375, "grad_norm": 2.431032657623291, "grad_norm_var": 0.4412621914582907, "learning_rate": 7.500000000000001e-05, "loss": 1.0796, "loss/crossentropy": 2.5346295833587646, "loss/hidden": 0.86328125, "loss/logits": 0.1545613557100296, "loss/reg": 0.006174764130264521, "step": 75 }, { "epoch": 0.0095, "grad_norm": 2.2421321868896484, "grad_norm_var": 0.45066905079875685, "learning_rate": 7.6e-05, "loss": 0.9869, "loss/crossentropy": 2.756843090057373, "loss/hidden": 0.796875, "loss/logits": 0.1282375454902649, "loss/reg": 0.006174163427203894, "step": 76 }, { "epoch": 0.009625, "grad_norm": 2.7022979259490967, "grad_norm_var": 0.4254703741989109, "learning_rate": 7.7e-05, "loss": 1.2503, "loss/crossentropy": 2.0696699619293213, "loss/hidden": 1.015625, "loss/logits": 0.1729813814163208, "loss/reg": 0.006173421163111925, "step": 77 }, { "epoch": 0.00975, "grad_norm": 2.501106023788452, "grad_norm_var": 0.37677934250983375, "learning_rate": 7.800000000000001e-05, "loss": 1.0516, "loss/crossentropy": 2.629380941390991, "loss/hidden": 0.83984375, "loss/logits": 0.15003597736358643, "loss/reg": 0.006172672379761934, "step": 78 }, { "epoch": 0.009875, "grad_norm": 2.137601137161255, "grad_norm_var": 0.3857841035513881, "learning_rate": 7.900000000000001e-05, "loss": 0.9388, "loss/crossentropy": 2.6841280460357666, "loss/hidden": 0.75, "loss/logits": 0.12706515192985535, "loss/reg": 0.006171974819153547, "step": 79 }, { "epoch": 0.01, "grad_norm": 4.655951976776123, "grad_norm_var": 0.6093991769416703, "learning_rate": 8e-05, "loss": 1.2659, "loss/crossentropy": 2.4634439945220947, "loss/hidden": 1.0390625, "loss/logits": 0.16511483490467072, "loss/reg": 0.006171175744384527, "step": 80 }, { "epoch": 0.010125, "grad_norm": 2.2418179512023926, "grad_norm_var": 0.44652068466097317, "learning_rate": 8.1e-05, "loss": 1.0773, "loss/crossentropy": 2.479743480682373, "loss/hidden": 0.87890625, "loss/logits": 0.1366729438304901, "loss/reg": 0.006170437205582857, "step": 81 }, { "epoch": 0.01025, "grad_norm": 2.0470192432403564, "grad_norm_var": 0.4640077865797357, "learning_rate": 8.2e-05, "loss": 0.8599, "loss/crossentropy": 2.440803050994873, "loss/hidden": 0.68359375, "loss/logits": 0.11458206921815872, "loss/reg": 0.0061697582714259624, "step": 82 }, { "epoch": 0.010375, "grad_norm": 2.0131125450134277, "grad_norm_var": 0.47694604476552793, "learning_rate": 8.3e-05, "loss": 0.8585, "loss/crossentropy": 2.480877637863159, "loss/hidden": 0.6875, "loss/logits": 0.10927767306566238, "loss/reg": 0.006169027183204889, "step": 83 }, { "epoch": 0.0105, "grad_norm": 2.2644267082214355, "grad_norm_var": 0.47842071328175656, "learning_rate": 8.4e-05, "loss": 0.8351, "loss/crossentropy": 2.693246841430664, "loss/hidden": 0.67578125, "loss/logits": 0.09764716029167175, "loss/reg": 0.006168315652757883, "step": 84 }, { "epoch": 0.010625, "grad_norm": 3.1729207038879395, "grad_norm_var": 0.4955376038232837, "learning_rate": 8.5e-05, "loss": 1.2314, "loss/crossentropy": 2.3339309692382812, "loss/hidden": 1.015625, "loss/logits": 0.15408015251159668, "loss/reg": 0.006167604587972164, "step": 85 }, { "epoch": 0.01075, "grad_norm": 2.281872510910034, "grad_norm_var": 0.4984116504809473, "learning_rate": 8.6e-05, "loss": 1.1113, "loss/crossentropy": 2.410794258117676, "loss/hidden": 0.8828125, "loss/logits": 0.16686803102493286, "loss/reg": 0.0061669000424444675, "step": 86 }, { "epoch": 0.010875, "grad_norm": 2.701244354248047, "grad_norm_var": 0.4762769450482454, "learning_rate": 8.7e-05, "loss": 0.9115, "loss/crossentropy": 2.5270962715148926, "loss/hidden": 0.73046875, "loss/logits": 0.11935658752918243, "loss/reg": 0.0061660343781113625, "step": 87 }, { "epoch": 0.011, "grad_norm": 2.0738677978515625, "grad_norm_var": 0.4863854399313406, "learning_rate": 8.800000000000001e-05, "loss": 0.9634, "loss/crossentropy": 2.625903844833374, "loss/hidden": 0.7734375, "loss/logits": 0.12826378643512726, "loss/reg": 0.006165289785712957, "step": 88 }, { "epoch": 0.011125, "grad_norm": 2.827744245529175, "grad_norm_var": 0.44340376520124375, "learning_rate": 8.900000000000001e-05, "loss": 1.0134, "loss/crossentropy": 2.2436654567718506, "loss/hidden": 0.80078125, "loss/logits": 0.15097512304782867, "loss/reg": 0.006164397578686476, "step": 89 }, { "epoch": 0.01125, "grad_norm": 2.412203788757324, "grad_norm_var": 0.4174983019540292, "learning_rate": 9e-05, "loss": 0.9541, "loss/crossentropy": 2.4847052097320557, "loss/hidden": 0.78515625, "loss/logits": 0.10735376924276352, "loss/reg": 0.006163434591144323, "step": 90 }, { "epoch": 0.011375, "grad_norm": 2.385309934616089, "grad_norm_var": 0.41831854842319344, "learning_rate": 9.1e-05, "loss": 1.0455, "loss/crossentropy": 2.1011688709259033, "loss/hidden": 0.828125, "loss/logits": 0.15577414631843567, "loss/reg": 0.0061626131646335125, "step": 91 }, { "epoch": 0.0115, "grad_norm": 2.779266595840454, "grad_norm_var": 0.4149256226543306, "learning_rate": 9.200000000000001e-05, "loss": 0.9782, "loss/crossentropy": 2.770954132080078, "loss/hidden": 0.78125, "loss/logits": 0.13530117273330688, "loss/reg": 0.006161784287542105, "step": 92 }, { "epoch": 0.011625, "grad_norm": 2.816206216812134, "grad_norm_var": 0.41767206123470924, "learning_rate": 9.300000000000001e-05, "loss": 1.2584, "loss/crossentropy": 2.4919488430023193, "loss/hidden": 1.0234375, "loss/logits": 0.17335021495819092, "loss/reg": 0.006160792429000139, "step": 93 }, { "epoch": 0.01175, "grad_norm": 2.1000349521636963, "grad_norm_var": 0.4320504871954351, "learning_rate": 9.4e-05, "loss": 0.9293, "loss/crossentropy": 2.6951355934143066, "loss/hidden": 0.7421875, "loss/logits": 0.12551091611385345, "loss/reg": 0.006159830838441849, "step": 94 }, { "epoch": 0.011875, "grad_norm": 2.6696228981018066, "grad_norm_var": 0.4199965621062515, "learning_rate": 9.5e-05, "loss": 1.0491, "loss/crossentropy": 2.6532485485076904, "loss/hidden": 0.83984375, "loss/logits": 0.14771661162376404, "loss/reg": 0.006158801261335611, "step": 95 }, { "epoch": 0.012, "grad_norm": 2.308758020401001, "grad_norm_var": 0.11782165750081125, "learning_rate": 9.6e-05, "loss": 1.1178, "loss/crossentropy": 2.38185977935791, "loss/hidden": 0.90625, "loss/logits": 0.1499352604150772, "loss/reg": 0.006157839670777321, "step": 96 }, { "epoch": 0.012125, "grad_norm": 2.4204304218292236, "grad_norm_var": 0.11501335190634426, "learning_rate": 9.7e-05, "loss": 1.092, "loss/crossentropy": 2.4358534812927246, "loss/hidden": 0.86328125, "loss/logits": 0.16712763905525208, "loss/reg": 0.006156752817332745, "step": 97 }, { "epoch": 0.01225, "grad_norm": 3.7184524536132812, "grad_norm_var": 0.198780236272727, "learning_rate": 9.8e-05, "loss": 1.4311, "loss/crossentropy": 2.1283679008483887, "loss/hidden": 1.171875, "loss/logits": 0.1976230889558792, "loss/reg": 0.006155804730951786, "step": 98 }, { "epoch": 0.012375, "grad_norm": 3.2656571865081787, "grad_norm_var": 0.20565265002658914, "learning_rate": 9.900000000000001e-05, "loss": 1.017, "loss/crossentropy": 2.6715664863586426, "loss/hidden": 0.80078125, "loss/logits": 0.15465494990348816, "loss/reg": 0.006154791917651892, "step": 99 }, { "epoch": 0.0125, "grad_norm": 2.915663719177246, "grad_norm_var": 0.19977570339779593, "learning_rate": 0.0001, "loss": 0.98, "loss/crossentropy": 2.5455305576324463, "loss/hidden": 0.77734375, "loss/logits": 0.1410846710205078, "loss/reg": 0.0061536673456430435, "step": 100 }, { "epoch": 0.012625, "grad_norm": 3.3153059482574463, "grad_norm_var": 0.2104372314148539, "learning_rate": 0.0001, "loss": 1.1039, "loss/crossentropy": 2.455479621887207, "loss/hidden": 0.90625, "loss/logits": 0.13615351915359497, "loss/reg": 0.0061526307836174965, "step": 101 }, { "epoch": 0.01275, "grad_norm": 2.40315318107605, "grad_norm_var": 0.20480568897691, "learning_rate": 0.0001, "loss": 0.9588, "loss/crossentropy": 2.6359853744506836, "loss/hidden": 0.76953125, "loss/logits": 0.1277719885110855, "loss/reg": 0.006151493173092604, "step": 102 }, { "epoch": 0.012875, "grad_norm": 3.625624895095825, "grad_norm_var": 0.25903479701245613, "learning_rate": 0.0001, "loss": 1.2481, "loss/crossentropy": 2.0148656368255615, "loss/hidden": 1.046875, "loss/logits": 0.13969773054122925, "loss/reg": 0.006150420755147934, "step": 103 }, { "epoch": 0.013, "grad_norm": 2.497906446456909, "grad_norm_var": 0.23191354079432358, "learning_rate": 0.0001, "loss": 1.0603, "loss/crossentropy": 2.3493525981903076, "loss/hidden": 0.86328125, "loss/logits": 0.13548779487609863, "loss/reg": 0.006149281747639179, "step": 104 }, { "epoch": 0.013125, "grad_norm": 3.258059501647949, "grad_norm_var": 0.24629299643454275, "learning_rate": 0.0001, "loss": 0.9497, "loss/crossentropy": 2.6988418102264404, "loss/hidden": 0.7734375, "loss/logits": 0.11473990976810455, "loss/reg": 0.006148339249193668, "step": 105 }, { "epoch": 0.01325, "grad_norm": 3.1279666423797607, "grad_norm_var": 0.24075672502018505, "learning_rate": 0.0001, "loss": 1.1195, "loss/crossentropy": 2.578716278076172, "loss/hidden": 0.875, "loss/logits": 0.18304204940795898, "loss/reg": 0.006147205363959074, "step": 106 }, { "epoch": 0.013375, "grad_norm": 2.760901927947998, "grad_norm_var": 0.22627915570051277, "learning_rate": 0.0001, "loss": 0.9369, "loss/crossentropy": 2.5835328102111816, "loss/hidden": 0.75, "loss/logits": 0.12544697523117065, "loss/reg": 0.006146106868982315, "step": 107 }, { "epoch": 0.0135, "grad_norm": 3.2917559146881104, "grad_norm_var": 0.23622539643692994, "learning_rate": 0.0001, "loss": 1.1437, "loss/crossentropy": 2.6001460552215576, "loss/hidden": 0.91796875, "loss/logits": 0.16428819298744202, "loss/reg": 0.006144997663795948, "step": 108 }, { "epoch": 0.013625, "grad_norm": 3.3908517360687256, "grad_norm_var": 0.2499864352593607, "learning_rate": 0.0001, "loss": 1.0747, "loss/crossentropy": 2.6003377437591553, "loss/hidden": 0.87109375, "loss/logits": 0.14213082194328308, "loss/reg": 0.00614393362775445, "step": 109 }, { "epoch": 0.01375, "grad_norm": 2.7455620765686035, "grad_norm_var": 0.2035723185991922, "learning_rate": 0.0001, "loss": 1.1844, "loss/crossentropy": 2.446432113647461, "loss/hidden": 0.94921875, "loss/logits": 0.17372827231884003, "loss/reg": 0.00614282488822937, "step": 110 }, { "epoch": 0.013875, "grad_norm": 2.899392604827881, "grad_norm_var": 0.1972949454934593, "learning_rate": 0.0001, "loss": 1.0314, "loss/crossentropy": 2.4233920574188232, "loss/hidden": 0.83984375, "loss/logits": 0.13018067181110382, "loss/reg": 0.00614172825589776, "step": 111 }, { "epoch": 0.014, "grad_norm": 2.204866647720337, "grad_norm_var": 0.20749751086427656, "learning_rate": 0.0001, "loss": 0.9867, "loss/crossentropy": 2.4006736278533936, "loss/hidden": 0.79296875, "loss/logits": 0.13233302533626556, "loss/reg": 0.006140332669019699, "step": 112 }, { "epoch": 0.014125, "grad_norm": 2.5094263553619385, "grad_norm_var": 0.20123279411857975, "learning_rate": 0.0001, "loss": 1.2429, "loss/crossentropy": 2.2730560302734375, "loss/hidden": 1.0078125, "loss/logits": 0.1737476885318756, "loss/reg": 0.006138913799077272, "step": 113 }, { "epoch": 0.01425, "grad_norm": 2.590543031692505, "grad_norm_var": 0.17204464736018749, "learning_rate": 0.0001, "loss": 1.0086, "loss/crossentropy": 2.5709896087646484, "loss/hidden": 0.79296875, "loss/logits": 0.1542350947856903, "loss/reg": 0.0061377594247460365, "step": 114 }, { "epoch": 0.014375, "grad_norm": 2.5024876594543457, "grad_norm_var": 0.17379926494707643, "learning_rate": 0.0001, "loss": 1.0309, "loss/crossentropy": 2.539165496826172, "loss/hidden": 0.828125, "loss/logits": 0.14142319560050964, "loss/reg": 0.006136584095656872, "step": 115 }, { "epoch": 0.0145, "grad_norm": 3.2216732501983643, "grad_norm_var": 0.18121036366206128, "learning_rate": 0.0001, "loss": 0.9404, "loss/crossentropy": 2.7685325145721436, "loss/hidden": 0.765625, "loss/logits": 0.1133967787027359, "loss/reg": 0.006135319825261831, "step": 116 }, { "epoch": 0.014625, "grad_norm": 2.3834009170532227, "grad_norm_var": 0.18346146088524526, "learning_rate": 0.0001, "loss": 1.1432, "loss/crossentropy": 2.4507999420166016, "loss/hidden": 0.92578125, "loss/logits": 0.1561031937599182, "loss/reg": 0.006133983377367258, "step": 117 }, { "epoch": 0.01475, "grad_norm": 2.4703636169433594, "grad_norm_var": 0.17984383474256424, "learning_rate": 0.0001, "loss": 1.0541, "loss/crossentropy": 2.3506076335906982, "loss/hidden": 0.84765625, "loss/logits": 0.14511807262897491, "loss/reg": 0.006132753100246191, "step": 118 }, { "epoch": 0.014875, "grad_norm": 2.5960817337036133, "grad_norm_var": 0.13859654880591943, "learning_rate": 0.0001, "loss": 1.2156, "loss/crossentropy": 2.427006244659424, "loss/hidden": 0.96875, "loss/logits": 0.1855170726776123, "loss/reg": 0.006131566129624844, "step": 119 }, { "epoch": 0.015, "grad_norm": 2.908734083175659, "grad_norm_var": 0.13379147574996655, "learning_rate": 0.0001, "loss": 1.0136, "loss/crossentropy": 2.4075210094451904, "loss/hidden": 0.81640625, "loss/logits": 0.13592825829982758, "loss/reg": 0.006130332592874765, "step": 120 }, { "epoch": 0.015125, "grad_norm": 3.450002670288086, "grad_norm_var": 0.147717685364636, "learning_rate": 0.0001, "loss": 1.1584, "loss/crossentropy": 2.446925640106201, "loss/hidden": 0.92578125, "loss/logits": 0.17129938304424286, "loss/reg": 0.0061291721649467945, "step": 121 }, { "epoch": 0.01525, "grad_norm": 2.941195011138916, "grad_norm_var": 0.14212594790061886, "learning_rate": 0.0001, "loss": 1.0996, "loss/crossentropy": 2.5499086380004883, "loss/hidden": 0.87109375, "loss/logits": 0.1672220528125763, "loss/reg": 0.006127914879471064, "step": 122 }, { "epoch": 0.015375, "grad_norm": 2.951799154281616, "grad_norm_var": 0.14330143067309015, "learning_rate": 0.0001, "loss": 1.0862, "loss/crossentropy": 2.654383420944214, "loss/hidden": 0.87109375, "loss/logits": 0.15379250049591064, "loss/reg": 0.006126696243882179, "step": 123 }, { "epoch": 0.0155, "grad_norm": 2.5093131065368652, "grad_norm_var": 0.13194533540905293, "learning_rate": 0.0001, "loss": 1.0905, "loss/crossentropy": 2.4646618366241455, "loss/hidden": 0.87890625, "loss/logits": 0.15029752254486084, "loss/reg": 0.006125394720584154, "step": 124 }, { "epoch": 0.015625, "grad_norm": 2.357142448425293, "grad_norm_var": 0.11277765633995311, "learning_rate": 0.0001, "loss": 1.0794, "loss/crossentropy": 2.4590322971343994, "loss/hidden": 0.87109375, "loss/logits": 0.1471107453107834, "loss/reg": 0.0061240773648023605, "step": 125 }, { "epoch": 0.01575, "grad_norm": 2.0443954467773438, "grad_norm_var": 0.13949059079901172, "learning_rate": 0.0001, "loss": 1.0064, "loss/crossentropy": 2.6105568408966064, "loss/hidden": 0.80859375, "loss/logits": 0.13658249378204346, "loss/reg": 0.006122750695794821, "step": 126 }, { "epoch": 0.015875, "grad_norm": 2.334003448486328, "grad_norm_var": 0.1413326038540049, "learning_rate": 0.0001, "loss": 1.128, "loss/crossentropy": 2.3226428031921387, "loss/hidden": 0.8984375, "loss/logits": 0.16836631298065186, "loss/reg": 0.006121381651610136, "step": 127 }, { "epoch": 0.016, "grad_norm": 2.6693766117095947, "grad_norm_var": 0.12889249481462456, "learning_rate": 0.0001, "loss": 1.0478, "loss/crossentropy": 2.5844597816467285, "loss/hidden": 0.84765625, "loss/logits": 0.1388963758945465, "loss/reg": 0.006120136007666588, "step": 128 }, { "epoch": 0.016125, "grad_norm": 3.935439348220825, "grad_norm_var": 0.22878447427120438, "learning_rate": 0.0001, "loss": 1.1726, "loss/crossentropy": 2.7213780879974365, "loss/hidden": 0.9375, "loss/logits": 0.1738772690296173, "loss/reg": 0.006118897348642349, "step": 129 }, { "epoch": 0.01625, "grad_norm": 3.463432788848877, "grad_norm_var": 0.25882213944617144, "learning_rate": 0.0001, "loss": 1.0898, "loss/crossentropy": 2.3635873794555664, "loss/hidden": 0.8828125, "loss/logits": 0.1457763910293579, "loss/reg": 0.006117486394941807, "step": 130 }, { "epoch": 0.016375, "grad_norm": 3.779526948928833, "grad_norm_var": 0.31074183113488135, "learning_rate": 0.0001, "loss": 1.2078, "loss/crossentropy": 2.316762924194336, "loss/hidden": 0.98046875, "loss/logits": 0.16614478826522827, "loss/reg": 0.006116243079304695, "step": 131 }, { "epoch": 0.0165, "grad_norm": 2.7554008960723877, "grad_norm_var": 0.3028391023812749, "learning_rate": 0.0001, "loss": 0.9769, "loss/crossentropy": 2.458954095840454, "loss/hidden": 0.7890625, "loss/logits": 0.12667913734912872, "loss/reg": 0.006114880088716745, "step": 132 }, { "epoch": 0.016625, "grad_norm": 2.342526435852051, "grad_norm_var": 0.30546929082944035, "learning_rate": 0.0001, "loss": 1.1137, "loss/crossentropy": 2.6329517364501953, "loss/hidden": 0.890625, "loss/logits": 0.161947563290596, "loss/reg": 0.0061136274598538876, "step": 133 }, { "epoch": 0.01675, "grad_norm": 2.2754058837890625, "grad_norm_var": 0.31756495416411024, "learning_rate": 0.0001, "loss": 1.1703, "loss/crossentropy": 2.2747550010681152, "loss/hidden": 0.94921875, "loss/logits": 0.15994513034820557, "loss/reg": 0.006112351547926664, "step": 134 }, { "epoch": 0.016875, "grad_norm": 3.1313912868499756, "grad_norm_var": 0.3186282278045513, "learning_rate": 0.0001, "loss": 1.2333, "loss/crossentropy": 2.4932894706726074, "loss/hidden": 0.99609375, "loss/logits": 0.17612434923648834, "loss/reg": 0.006111042574048042, "step": 135 }, { "epoch": 0.017, "grad_norm": 3.960482358932495, "grad_norm_var": 0.39381746513703864, "learning_rate": 0.0001, "loss": 1.3101, "loss/crossentropy": 2.581660747528076, "loss/hidden": 1.0625, "loss/logits": 0.18646802008152008, "loss/reg": 0.006109676789492369, "step": 136 }, { "epoch": 0.017125, "grad_norm": 2.7605810165405273, "grad_norm_var": 0.37584340109069647, "learning_rate": 0.0001, "loss": 0.8792, "loss/crossentropy": 2.6490936279296875, "loss/hidden": 0.703125, "loss/logits": 0.1150316372513771, "loss/reg": 0.006108277477324009, "step": 137 }, { "epoch": 0.01725, "grad_norm": 2.6196203231811523, "grad_norm_var": 0.38003486499210315, "learning_rate": 0.0001, "loss": 0.955, "loss/crossentropy": 2.633441209793091, "loss/hidden": 0.76953125, "loss/logits": 0.1244344562292099, "loss/reg": 0.006106934975832701, "step": 138 }, { "epoch": 0.017375, "grad_norm": 4.534512519836426, "grad_norm_var": 0.554255985026353, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.2204151153564453, "loss/hidden": 1.1796875, "loss/logits": 0.1696874350309372, "loss/reg": 0.0061056241393089294, "step": 139 }, { "epoch": 0.0175, "grad_norm": 2.192370653152466, "grad_norm_var": 0.5798771099829023, "learning_rate": 0.0001, "loss": 1.1299, "loss/crossentropy": 2.375506639480591, "loss/hidden": 0.921875, "loss/logits": 0.14694982767105103, "loss/reg": 0.0061043244786560535, "step": 140 }, { "epoch": 0.017625, "grad_norm": 4.368403911590576, "grad_norm_var": 0.6744588881998081, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.3692545890808105, "loss/hidden": 1.03125, "loss/logits": 0.18568292260169983, "loss/reg": 0.006102937273681164, "step": 141 }, { "epoch": 0.01775, "grad_norm": 2.2753779888153076, "grad_norm_var": 0.6461169960118004, "learning_rate": 0.0001, "loss": 1.0276, "loss/crossentropy": 2.470676898956299, "loss/hidden": 0.82421875, "loss/logits": 0.14231771230697632, "loss/reg": 0.006101653911173344, "step": 142 }, { "epoch": 0.017875, "grad_norm": 2.6550562381744385, "grad_norm_var": 0.6203099666067883, "learning_rate": 0.0001, "loss": 0.8712, "loss/crossentropy": 2.8198063373565674, "loss/hidden": 0.69921875, "loss/logits": 0.11099085956811905, "loss/reg": 0.006100376136600971, "step": 143 }, { "epoch": 0.018, "grad_norm": 2.8701858520507812, "grad_norm_var": 0.6111015072729884, "learning_rate": 0.0001, "loss": 1.1794, "loss/crossentropy": 2.413463830947876, "loss/hidden": 0.96484375, "loss/logits": 0.15351834893226624, "loss/reg": 0.006099053658545017, "step": 144 }, { "epoch": 0.018125, "grad_norm": 2.2347958087921143, "grad_norm_var": 0.6069563505613275, "learning_rate": 0.0001, "loss": 1.0832, "loss/crossentropy": 2.446056604385376, "loss/hidden": 0.8671875, "loss/logits": 0.1550455242395401, "loss/reg": 0.006097796373069286, "step": 145 }, { "epoch": 0.01825, "grad_norm": 2.60143780708313, "grad_norm_var": 0.6017061449507364, "learning_rate": 0.0001, "loss": 1.1216, "loss/crossentropy": 2.2890260219573975, "loss/hidden": 0.8984375, "loss/logits": 0.16223573684692383, "loss/reg": 0.006096460856497288, "step": 146 }, { "epoch": 0.018375, "grad_norm": 3.656100273132324, "grad_norm_var": 0.5891684064627459, "learning_rate": 0.0001, "loss": 1.2759, "loss/crossentropy": 2.2077646255493164, "loss/hidden": 1.0546875, "loss/logits": 0.16024138033390045, "loss/reg": 0.006095105782151222, "step": 147 }, { "epoch": 0.0185, "grad_norm": 2.8190999031066895, "grad_norm_var": 0.5877513730221795, "learning_rate": 0.0001, "loss": 1.1416, "loss/crossentropy": 2.4892842769622803, "loss/hidden": 0.9140625, "loss/logits": 0.1665700376033783, "loss/reg": 0.0060938019305467606, "step": 148 }, { "epoch": 0.018625, "grad_norm": 2.6578848361968994, "grad_norm_var": 0.568168306773175, "learning_rate": 0.0001, "loss": 1.1443, "loss/crossentropy": 2.3138527870178223, "loss/hidden": 0.93359375, "loss/logits": 0.14977282285690308, "loss/reg": 0.006092346739023924, "step": 149 }, { "epoch": 0.01875, "grad_norm": 2.656559944152832, "grad_norm_var": 0.5416540961853636, "learning_rate": 0.0001, "loss": 0.9868, "loss/crossentropy": 2.7701377868652344, "loss/hidden": 0.796875, "loss/logits": 0.12901648879051208, "loss/reg": 0.006090943701565266, "step": 150 }, { "epoch": 0.018875, "grad_norm": 1.9359983205795288, "grad_norm_var": 0.6099613145708634, "learning_rate": 0.0001, "loss": 0.9127, "loss/crossentropy": 2.55560040473938, "loss/hidden": 0.73828125, "loss/logits": 0.11351295560598373, "loss/reg": 0.00608965614810586, "step": 151 }, { "epoch": 0.019, "grad_norm": 3.7978732585906982, "grad_norm_var": 0.5891613317586338, "learning_rate": 0.0001, "loss": 1.2275, "loss/crossentropy": 2.4227731227874756, "loss/hidden": 0.98828125, "loss/logits": 0.17836451530456543, "loss/reg": 0.006088252179324627, "step": 152 }, { "epoch": 0.019125, "grad_norm": 2.8193647861480713, "grad_norm_var": 0.588169020521083, "learning_rate": 0.0001, "loss": 0.9739, "loss/crossentropy": 2.474368095397949, "loss/hidden": 0.80078125, "loss/logits": 0.11225409805774689, "loss/reg": 0.006086937617510557, "step": 153 }, { "epoch": 0.01925, "grad_norm": 2.2882325649261475, "grad_norm_var": 0.6082348956957436, "learning_rate": 0.0001, "loss": 1.0395, "loss/crossentropy": 2.3776350021362305, "loss/hidden": 0.82421875, "loss/logits": 0.15443992614746094, "loss/reg": 0.0060854703187942505, "step": 154 }, { "epoch": 0.019375, "grad_norm": 2.006150245666504, "grad_norm_var": 0.4559805309993303, "learning_rate": 0.0001, "loss": 0.9762, "loss/crossentropy": 2.7556076049804688, "loss/hidden": 0.78515625, "loss/logits": 0.13019207119941711, "loss/reg": 0.006084186024963856, "step": 155 }, { "epoch": 0.0195, "grad_norm": 2.8143231868743896, "grad_norm_var": 0.43477030174237014, "learning_rate": 0.0001, "loss": 1.1927, "loss/crossentropy": 2.652045249938965, "loss/hidden": 0.94140625, "loss/logits": 0.19042611122131348, "loss/reg": 0.00608274107798934, "step": 156 }, { "epoch": 0.019625, "grad_norm": 2.957540988922119, "grad_norm_var": 0.2601037584282233, "learning_rate": 0.0001, "loss": 1.0641, "loss/crossentropy": 2.546213150024414, "loss/hidden": 0.86328125, "loss/logits": 0.14000022411346436, "loss/reg": 0.006081291940063238, "step": 157 }, { "epoch": 0.01975, "grad_norm": 2.625493288040161, "grad_norm_var": 0.24839219907499052, "learning_rate": 0.0001, "loss": 1.012, "loss/crossentropy": 2.5120432376861572, "loss/hidden": 0.81640625, "loss/logits": 0.13474689424037933, "loss/reg": 0.006079958751797676, "step": 158 }, { "epoch": 0.019875, "grad_norm": 2.6614878177642822, "grad_norm_var": 0.2483457330217589, "learning_rate": 0.0001, "loss": 0.9873, "loss/crossentropy": 2.312061071395874, "loss/hidden": 0.80859375, "loss/logits": 0.11790065467357635, "loss/reg": 0.006078665144741535, "step": 159 }, { "epoch": 0.02, "grad_norm": 2.6204919815063477, "grad_norm_var": 0.24699792562249925, "learning_rate": 0.0001, "loss": 1.0488, "loss/crossentropy": 2.505072593688965, "loss/hidden": 0.84375, "loss/logits": 0.14428117871284485, "loss/reg": 0.006077310536056757, "step": 160 }, { "epoch": 0.020125, "grad_norm": 3.107072591781616, "grad_norm_var": 0.24079003208151678, "learning_rate": 0.0001, "loss": 1.1736, "loss/crossentropy": 2.6514599323272705, "loss/hidden": 0.96484375, "loss/logits": 0.1480400413274765, "loss/reg": 0.006076075602322817, "step": 161 }, { "epoch": 0.02025, "grad_norm": 2.669001817703247, "grad_norm_var": 0.23972287159530806, "learning_rate": 0.0001, "loss": 1.1966, "loss/crossentropy": 2.4616479873657227, "loss/hidden": 0.9765625, "loss/logits": 0.15933012962341309, "loss/reg": 0.006074720993638039, "step": 162 }, { "epoch": 0.020375, "grad_norm": 2.5872421264648438, "grad_norm_var": 0.1828196031273113, "learning_rate": 0.0001, "loss": 1.0551, "loss/crossentropy": 2.5483999252319336, "loss/hidden": 0.83984375, "loss/logits": 0.1544739305973053, "loss/reg": 0.006073469761759043, "step": 163 }, { "epoch": 0.0205, "grad_norm": 2.3342509269714355, "grad_norm_var": 0.1891007671877621, "learning_rate": 0.0001, "loss": 1.1418, "loss/crossentropy": 2.610344171524048, "loss/hidden": 0.90234375, "loss/logits": 0.17876723408699036, "loss/reg": 0.006072178483009338, "step": 164 }, { "epoch": 0.020625, "grad_norm": 2.548274278640747, "grad_norm_var": 0.18986337395058156, "learning_rate": 0.0001, "loss": 0.9512, "loss/crossentropy": 2.747725009918213, "loss/hidden": 0.7734375, "loss/logits": 0.11706214398145676, "loss/reg": 0.00607073912397027, "step": 165 }, { "epoch": 0.02075, "grad_norm": 2.666066884994507, "grad_norm_var": 0.18987501227134793, "learning_rate": 0.0001, "loss": 1.0557, "loss/crossentropy": 2.3086578845977783, "loss/hidden": 0.83984375, "loss/logits": 0.1551416665315628, "loss/reg": 0.006069260183721781, "step": 166 }, { "epoch": 0.020875, "grad_norm": 3.363084554672241, "grad_norm_var": 0.18083982986582872, "learning_rate": 0.0001, "loss": 0.9886, "loss/crossentropy": 2.7422661781311035, "loss/hidden": 0.79296875, "loss/logits": 0.13497118651866913, "loss/reg": 0.006067754700779915, "step": 167 }, { "epoch": 0.021, "grad_norm": 2.717400550842285, "grad_norm_var": 0.10163689874761227, "learning_rate": 0.0001, "loss": 1.2413, "loss/crossentropy": 2.341296672821045, "loss/hidden": 1.0078125, "loss/logits": 0.17277640104293823, "loss/reg": 0.006066245958209038, "step": 168 }, { "epoch": 0.021125, "grad_norm": 2.2773897647857666, "grad_norm_var": 0.10949759007257095, "learning_rate": 0.0001, "loss": 0.9531, "loss/crossentropy": 2.492532968521118, "loss/hidden": 0.76953125, "loss/logits": 0.12295819818973541, "loss/reg": 0.006064848508685827, "step": 169 }, { "epoch": 0.02125, "grad_norm": 2.7625067234039307, "grad_norm_var": 0.1012976809853086, "learning_rate": 0.0001, "loss": 1.0102, "loss/crossentropy": 2.3799381256103516, "loss/hidden": 0.80859375, "loss/logits": 0.140989288687706, "loss/reg": 0.0060633583925664425, "step": 170 }, { "epoch": 0.021375, "grad_norm": 3.713162899017334, "grad_norm_var": 0.1323542313667114, "learning_rate": 0.0001, "loss": 1.0173, "loss/crossentropy": 2.7296385765075684, "loss/hidden": 0.80078125, "loss/logits": 0.1559314727783203, "loss/reg": 0.006062004715204239, "step": 171 }, { "epoch": 0.0215, "grad_norm": 2.8448026180267334, "grad_norm_var": 0.13256580340874963, "learning_rate": 0.0001, "loss": 1.0945, "loss/crossentropy": 2.211848497390747, "loss/hidden": 0.87890625, "loss/logits": 0.15503031015396118, "loss/reg": 0.006060663145035505, "step": 172 }, { "epoch": 0.021625, "grad_norm": 2.951566696166992, "grad_norm_var": 0.13242537871232402, "learning_rate": 0.0001, "loss": 1.243, "loss/crossentropy": 2.6379833221435547, "loss/hidden": 0.96484375, "loss/logits": 0.21754613518714905, "loss/reg": 0.00605935649946332, "step": 173 }, { "epoch": 0.02175, "grad_norm": 2.6862404346466064, "grad_norm_var": 0.13142011502921586, "learning_rate": 0.0001, "loss": 1.0053, "loss/crossentropy": 2.3807766437530518, "loss/hidden": 0.80078125, "loss/logits": 0.14393460750579834, "loss/reg": 0.006058130878955126, "step": 174 }, { "epoch": 0.021875, "grad_norm": 2.5145609378814697, "grad_norm_var": 0.13512780159794507, "learning_rate": 0.0001, "loss": 1.0609, "loss/crossentropy": 2.4608380794525146, "loss/hidden": 0.85546875, "loss/logits": 0.14485566318035126, "loss/reg": 0.006056922487914562, "step": 175 }, { "epoch": 0.022, "grad_norm": 3.23178768157959, "grad_norm_var": 0.14607750168249728, "learning_rate": 0.0001, "loss": 1.1294, "loss/crossentropy": 2.9791719913482666, "loss/hidden": 0.91796875, "loss/logits": 0.1508345603942871, "loss/reg": 0.006055623292922974, "step": 176 }, { "epoch": 0.022125, "grad_norm": 2.7397234439849854, "grad_norm_var": 0.14000512423072375, "learning_rate": 0.0001, "loss": 1.0578, "loss/crossentropy": 2.4559919834136963, "loss/hidden": 0.86328125, "loss/logits": 0.1340080350637436, "loss/reg": 0.0060544307343661785, "step": 177 }, { "epoch": 0.02225, "grad_norm": 2.6637048721313477, "grad_norm_var": 0.14009088002925954, "learning_rate": 0.0001, "loss": 1.076, "loss/crossentropy": 2.3794586658477783, "loss/hidden": 0.86328125, "loss/logits": 0.15214313566684723, "loss/reg": 0.0060530174523591995, "step": 178 }, { "epoch": 0.022375, "grad_norm": 2.0105221271514893, "grad_norm_var": 0.17628626628935157, "learning_rate": 0.0001, "loss": 0.9703, "loss/crossentropy": 2.3926336765289307, "loss/hidden": 0.77734375, "loss/logits": 0.13244566321372986, "loss/reg": 0.0060517978854477406, "step": 179 }, { "epoch": 0.0225, "grad_norm": 2.571902275085449, "grad_norm_var": 0.16659277386996318, "learning_rate": 0.0001, "loss": 1.0739, "loss/crossentropy": 2.7502923011779785, "loss/hidden": 0.8515625, "loss/logits": 0.16181406378746033, "loss/reg": 0.006050686351954937, "step": 180 }, { "epoch": 0.022625, "grad_norm": 2.700366973876953, "grad_norm_var": 0.1636147823311904, "learning_rate": 0.0001, "loss": 1.0113, "loss/crossentropy": 2.502389669418335, "loss/hidden": 0.8125, "loss/logits": 0.138347327709198, "loss/reg": 0.006049246061593294, "step": 181 }, { "epoch": 0.02275, "grad_norm": 2.7259435653686523, "grad_norm_var": 0.1629618050893432, "learning_rate": 0.0001, "loss": 1.0192, "loss/crossentropy": 2.2493560314178467, "loss/hidden": 0.82421875, "loss/logits": 0.1344609260559082, "loss/reg": 0.006048021838068962, "step": 182 }, { "epoch": 0.022875, "grad_norm": 4.930091857910156, "grad_norm_var": 0.43832731745023895, "learning_rate": 0.0001, "loss": 1.1874, "loss/crossentropy": 2.649231433868408, "loss/hidden": 0.94140625, "loss/logits": 0.1855432242155075, "loss/reg": 0.006046844646334648, "step": 183 }, { "epoch": 0.023, "grad_norm": 2.288604259490967, "grad_norm_var": 0.4589782783160859, "learning_rate": 0.0001, "loss": 1.0354, "loss/crossentropy": 3.0482568740844727, "loss/hidden": 0.8203125, "loss/logits": 0.15461647510528564, "loss/reg": 0.006045445334166288, "step": 184 }, { "epoch": 0.023125, "grad_norm": 2.7902991771698, "grad_norm_var": 0.4362058684835667, "learning_rate": 0.0001, "loss": 1.0744, "loss/crossentropy": 2.726069211959839, "loss/hidden": 0.8359375, "loss/logits": 0.17799492180347443, "loss/reg": 0.006044231820851564, "step": 185 }, { "epoch": 0.02325, "grad_norm": 3.597017526626587, "grad_norm_var": 0.46633972017124825, "learning_rate": 0.0001, "loss": 1.0985, "loss/crossentropy": 2.200692892074585, "loss/hidden": 0.8984375, "loss/logits": 0.13961729407310486, "loss/reg": 0.006042772904038429, "step": 186 }, { "epoch": 0.023375, "grad_norm": 2.969062566757202, "grad_norm_var": 0.42374272593361867, "learning_rate": 0.0001, "loss": 1.2314, "loss/crossentropy": 2.3744540214538574, "loss/hidden": 0.96875, "loss/logits": 0.20225511491298676, "loss/reg": 0.006041594315320253, "step": 187 }, { "epoch": 0.0235, "grad_norm": 3.2257020473480225, "grad_norm_var": 0.4305906329857976, "learning_rate": 0.0001, "loss": 1.0982, "loss/crossentropy": 2.442505121231079, "loss/hidden": 0.875, "loss/logits": 0.16284233331680298, "loss/reg": 0.006040407810360193, "step": 188 }, { "epoch": 0.023625, "grad_norm": 3.670443058013916, "grad_norm_var": 0.4666515285365591, "learning_rate": 0.0001, "loss": 1.2391, "loss/crossentropy": 2.533158540725708, "loss/hidden": 0.98046875, "loss/logits": 0.19827201962471008, "loss/reg": 0.0060392809100449085, "step": 189 }, { "epoch": 0.02375, "grad_norm": 7.53206729888916, "grad_norm_var": 1.7591779439754056, "learning_rate": 0.0001, "loss": 1.1689, "loss/crossentropy": 2.3104734420776367, "loss/hidden": 0.96875, "loss/logits": 0.13976144790649414, "loss/reg": 0.006038178689777851, "step": 190 }, { "epoch": 0.023875, "grad_norm": 4.658889293670654, "grad_norm_var": 1.833400975261701, "learning_rate": 0.0001, "loss": 1.3266, "loss/crossentropy": 2.286229133605957, "loss/hidden": 1.1015625, "loss/logits": 0.16465552151203156, "loss/reg": 0.006036726757884026, "step": 191 }, { "epoch": 0.024, "grad_norm": 3.2109904289245605, "grad_norm_var": 1.8338781863373583, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.5849151611328125, "loss/hidden": 1.0078125, "loss/logits": 0.20983844995498657, "loss/reg": 0.006035543512552977, "step": 192 }, { "epoch": 0.024125, "grad_norm": 2.556408643722534, "grad_norm_var": 1.8519417466969637, "learning_rate": 0.0001, "loss": 1.0335, "loss/crossentropy": 2.635669231414795, "loss/hidden": 0.8359375, "loss/logits": 0.13721294701099396, "loss/reg": 0.006034051068127155, "step": 193 }, { "epoch": 0.02425, "grad_norm": 3.4185855388641357, "grad_norm_var": 1.8153229069184569, "learning_rate": 0.0001, "loss": 1.0115, "loss/crossentropy": 2.3127341270446777, "loss/hidden": 0.828125, "loss/logits": 0.12303752452135086, "loss/reg": 0.00603274954482913, "step": 194 }, { "epoch": 0.024375, "grad_norm": 3.639681816101074, "grad_norm_var": 1.6731808292397734, "learning_rate": 0.0001, "loss": 1.2374, "loss/crossentropy": 2.4363749027252197, "loss/hidden": 0.98046875, "loss/logits": 0.19659578800201416, "loss/reg": 0.006031363736838102, "step": 195 }, { "epoch": 0.0245, "grad_norm": 3.266385078430176, "grad_norm_var": 1.614572274352353, "learning_rate": 0.0001, "loss": 1.19, "loss/crossentropy": 2.2824337482452393, "loss/hidden": 0.9609375, "loss/logits": 0.16878634691238403, "loss/reg": 0.006029782351106405, "step": 196 }, { "epoch": 0.024625, "grad_norm": 3.0692105293273926, "grad_norm_var": 1.5801212385016838, "learning_rate": 0.0001, "loss": 1.1495, "loss/crossentropy": 2.518056631088257, "loss/hidden": 0.921875, "loss/logits": 0.16731634736061096, "loss/reg": 0.006028252653777599, "step": 197 }, { "epoch": 0.02475, "grad_norm": 3.390202283859253, "grad_norm_var": 1.530565626963321, "learning_rate": 0.0001, "loss": 1.1783, "loss/crossentropy": 2.3565316200256348, "loss/hidden": 0.9375, "loss/logits": 0.18055224418640137, "loss/reg": 0.006026738323271275, "step": 198 }, { "epoch": 0.024875, "grad_norm": 2.524461030960083, "grad_norm_var": 1.4779304822181976, "learning_rate": 0.0001, "loss": 1.095, "loss/crossentropy": 2.3489255905151367, "loss/hidden": 0.88671875, "loss/logits": 0.1480264812707901, "loss/reg": 0.006025230046361685, "step": 199 }, { "epoch": 0.025, "grad_norm": 2.8753433227539062, "grad_norm_var": 1.4056158732497617, "learning_rate": 0.0001, "loss": 1.1396, "loss/crossentropy": 2.379971504211426, "loss/hidden": 0.90625, "loss/logits": 0.17312359809875488, "loss/reg": 0.0060236188583076, "step": 200 }, { "epoch": 0.025125, "grad_norm": 2.2297983169555664, "grad_norm_var": 1.4801331513155804, "learning_rate": 0.0001, "loss": 1.1642, "loss/crossentropy": 2.401499032974243, "loss/hidden": 0.9296875, "loss/logits": 0.1743072271347046, "loss/reg": 0.006021994166076183, "step": 201 }, { "epoch": 0.02525, "grad_norm": 2.7430193424224854, "grad_norm_var": 1.5134885749372204, "learning_rate": 0.0001, "loss": 1.3503, "loss/crossentropy": 2.3397345542907715, "loss/hidden": 1.09375, "loss/logits": 0.1963859498500824, "loss/reg": 0.006020485423505306, "step": 202 }, { "epoch": 0.025375, "grad_norm": 3.3862688541412354, "grad_norm_var": 1.4983780502999742, "learning_rate": 0.0001, "loss": 1.3154, "loss/crossentropy": 2.3259048461914062, "loss/hidden": 1.09375, "loss/logits": 0.1614416241645813, "loss/reg": 0.0060190120711922646, "step": 203 }, { "epoch": 0.0255, "grad_norm": 2.554938316345215, "grad_norm_var": 1.547662147741073, "learning_rate": 0.0001, "loss": 1.1147, "loss/crossentropy": 2.559544801712036, "loss/hidden": 0.890625, "loss/logits": 0.16388913989067078, "loss/reg": 0.006017730105668306, "step": 204 }, { "epoch": 0.025625, "grad_norm": 2.6290361881256104, "grad_norm_var": 1.5807281675134672, "learning_rate": 0.0001, "loss": 1.049, "loss/crossentropy": 2.7080090045928955, "loss/hidden": 0.828125, "loss/logits": 0.16068041324615479, "loss/reg": 0.006016433704644442, "step": 205 }, { "epoch": 0.02575, "grad_norm": 2.234259605407715, "grad_norm_var": 0.38456120947827777, "learning_rate": 0.0001, "loss": 1.0392, "loss/crossentropy": 2.3816347122192383, "loss/hidden": 0.8359375, "loss/logits": 0.14315146207809448, "loss/reg": 0.0060149249620735645, "step": 206 }, { "epoch": 0.025875, "grad_norm": 2.810352325439453, "grad_norm_var": 0.19522907990381644, "learning_rate": 0.0001, "loss": 1.1385, "loss/crossentropy": 2.6245384216308594, "loss/hidden": 0.90625, "loss/logits": 0.17206540703773499, "loss/reg": 0.006013684440404177, "step": 207 }, { "epoch": 0.026, "grad_norm": 2.198707342147827, "grad_norm_var": 0.21847125065788287, "learning_rate": 0.0001, "loss": 0.9762, "loss/crossentropy": 2.3812787532806396, "loss/hidden": 0.796875, "loss/logits": 0.119233138859272, "loss/reg": 0.006012204568833113, "step": 208 }, { "epoch": 0.026125, "grad_norm": 2.5001378059387207, "grad_norm_var": 0.22083751043745087, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.5999109745025635, "loss/hidden": 0.984375, "loss/logits": 0.20815744996070862, "loss/reg": 0.006010920740664005, "step": 209 }, { "epoch": 0.02625, "grad_norm": 3.175185203552246, "grad_norm_var": 0.20582482438127556, "learning_rate": 0.0001, "loss": 1.239, "loss/crossentropy": 2.3893682956695557, "loss/hidden": 1.0234375, "loss/logits": 0.15550163388252258, "loss/reg": 0.006009369157254696, "step": 210 }, { "epoch": 0.026375, "grad_norm": 3.482342481613159, "grad_norm_var": 0.19031657232839597, "learning_rate": 0.0001, "loss": 1.1572, "loss/crossentropy": 2.382542848587036, "loss/hidden": 0.94921875, "loss/logits": 0.14788678288459778, "loss/reg": 0.006007815711200237, "step": 211 }, { "epoch": 0.0265, "grad_norm": 2.285135507583618, "grad_norm_var": 0.19168098803167197, "learning_rate": 0.0001, "loss": 0.9667, "loss/crossentropy": 2.552724838256836, "loss/hidden": 0.78125, "loss/logits": 0.1254206746816635, "loss/reg": 0.006006232462823391, "step": 212 }, { "epoch": 0.026625, "grad_norm": 2.991971969604492, "grad_norm_var": 0.1888233667670041, "learning_rate": 0.0001, "loss": 1.1472, "loss/crossentropy": 2.472437620162964, "loss/hidden": 0.9296875, "loss/logits": 0.15750399231910706, "loss/reg": 0.0060045006684958935, "step": 213 }, { "epoch": 0.02675, "grad_norm": 2.3775179386138916, "grad_norm_var": 0.1665701003974154, "learning_rate": 0.0001, "loss": 1.1938, "loss/crossentropy": 2.294337749481201, "loss/hidden": 0.95703125, "loss/logits": 0.17671090364456177, "loss/reg": 0.006002978887408972, "step": 214 }, { "epoch": 0.026875, "grad_norm": 2.2992701530456543, "grad_norm_var": 0.17463199132661936, "learning_rate": 0.0001, "loss": 1.2097, "loss/crossentropy": 2.3843300342559814, "loss/hidden": 0.9609375, "loss/logits": 0.18876615166664124, "loss/reg": 0.006001432426273823, "step": 215 }, { "epoch": 0.027, "grad_norm": 2.4926228523254395, "grad_norm_var": 0.17347807328228151, "learning_rate": 0.0001, "loss": 1.3156, "loss/crossentropy": 2.326836585998535, "loss/hidden": 1.0625, "loss/logits": 0.19308596849441528, "loss/reg": 0.005999880842864513, "step": 216 }, { "epoch": 0.027125, "grad_norm": 2.552459478378296, "grad_norm_var": 0.16193263198218044, "learning_rate": 0.0001, "loss": 1.1424, "loss/crossentropy": 2.6629388332366943, "loss/hidden": 0.91015625, "loss/logits": 0.1722826063632965, "loss/reg": 0.005998372100293636, "step": 217 }, { "epoch": 0.02725, "grad_norm": 2.866387128829956, "grad_norm_var": 0.16409192036900605, "learning_rate": 0.0001, "loss": 1.0142, "loss/crossentropy": 2.8154890537261963, "loss/hidden": 0.80078125, "loss/logits": 0.15349115431308746, "loss/reg": 0.005996840540319681, "step": 218 }, { "epoch": 0.027375, "grad_norm": 2.77524471282959, "grad_norm_var": 0.12966566207502767, "learning_rate": 0.0001, "loss": 1.4111, "loss/crossentropy": 2.4509928226470947, "loss/hidden": 1.1015625, "loss/logits": 0.249616801738739, "loss/reg": 0.005995343904942274, "step": 219 }, { "epoch": 0.0275, "grad_norm": 2.887923240661621, "grad_norm_var": 0.13285907347625023, "learning_rate": 0.0001, "loss": 1.2886, "loss/crossentropy": 2.4280507564544678, "loss/hidden": 1.0234375, "loss/logits": 0.20519307255744934, "loss/reg": 0.005993579979985952, "step": 220 }, { "epoch": 0.027625, "grad_norm": 2.5383920669555664, "grad_norm_var": 0.1337457284607846, "learning_rate": 0.0001, "loss": 1.3292, "loss/crossentropy": 2.0803585052490234, "loss/hidden": 1.09375, "loss/logits": 0.17551109194755554, "loss/reg": 0.005991705227643251, "step": 221 }, { "epoch": 0.02775, "grad_norm": 2.639490842819214, "grad_norm_var": 0.12131687494494538, "learning_rate": 0.0001, "loss": 1.0593, "loss/crossentropy": 2.293325901031494, "loss/hidden": 0.8515625, "loss/logits": 0.14782238006591797, "loss/reg": 0.005989882629364729, "step": 222 }, { "epoch": 0.027875, "grad_norm": 2.4396984577178955, "grad_norm_var": 0.12344012810124999, "learning_rate": 0.0001, "loss": 1.0587, "loss/crossentropy": 2.7268667221069336, "loss/hidden": 0.84765625, "loss/logits": 0.15114662051200867, "loss/reg": 0.0059883627109229565, "step": 223 }, { "epoch": 0.028, "grad_norm": 2.227886438369751, "grad_norm_var": 0.12171264621671582, "learning_rate": 0.0001, "loss": 1.0087, "loss/crossentropy": 2.4431943893432617, "loss/hidden": 0.81640625, "loss/logits": 0.13243696093559265, "loss/reg": 0.005986812058836222, "step": 224 }, { "epoch": 0.028125, "grad_norm": 3.690627098083496, "grad_norm_var": 0.18519755428341872, "learning_rate": 0.0001, "loss": 1.0732, "loss/crossentropy": 2.4630942344665527, "loss/hidden": 0.875, "loss/logits": 0.13830721378326416, "loss/reg": 0.005985158029943705, "step": 225 }, { "epoch": 0.02825, "grad_norm": 3.377890110015869, "grad_norm_var": 0.19972658805784155, "learning_rate": 0.0001, "loss": 1.1848, "loss/crossentropy": 2.2899203300476074, "loss/hidden": 0.9609375, "loss/logits": 0.16401749849319458, "loss/reg": 0.005983633920550346, "step": 226 }, { "epoch": 0.028375, "grad_norm": 2.7600386142730713, "grad_norm_var": 0.16135214723361363, "learning_rate": 0.0001, "loss": 1.0223, "loss/crossentropy": 2.8077659606933594, "loss/hidden": 0.8203125, "loss/logits": 0.14218226075172424, "loss/reg": 0.005982026923447847, "step": 227 }, { "epoch": 0.0285, "grad_norm": 2.3397345542907715, "grad_norm_var": 0.15851713921701366, "learning_rate": 0.0001, "loss": 1.077, "loss/crossentropy": 2.438030958175659, "loss/hidden": 0.875, "loss/logits": 0.14217695593833923, "loss/reg": 0.005980519577860832, "step": 228 }, { "epoch": 0.028625, "grad_norm": 2.744401216506958, "grad_norm_var": 0.15282793193407448, "learning_rate": 0.0001, "loss": 1.1967, "loss/crossentropy": 2.557457447052002, "loss/hidden": 0.97265625, "loss/logits": 0.16425767540931702, "loss/reg": 0.005979116074740887, "step": 229 }, { "epoch": 0.02875, "grad_norm": 2.4241418838500977, "grad_norm_var": 0.15103305834679168, "learning_rate": 0.0001, "loss": 1.0402, "loss/crossentropy": 2.743885040283203, "loss/hidden": 0.828125, "loss/logits": 0.15231972932815552, "loss/reg": 0.005977709777653217, "step": 230 }, { "epoch": 0.028875, "grad_norm": 2.0828442573547363, "grad_norm_var": 0.16526500993595217, "learning_rate": 0.0001, "loss": 0.9747, "loss/crossentropy": 2.719327688217163, "loss/hidden": 0.78125, "loss/logits": 0.133681058883667, "loss/reg": 0.005976095795631409, "step": 231 }, { "epoch": 0.029, "grad_norm": 2.127495527267456, "grad_norm_var": 0.18259721536013085, "learning_rate": 0.0001, "loss": 1.0588, "loss/crossentropy": 2.8147058486938477, "loss/hidden": 0.85546875, "loss/logits": 0.14354225993156433, "loss/reg": 0.005974431522190571, "step": 232 }, { "epoch": 0.029125, "grad_norm": 4.263195991516113, "grad_norm_var": 0.34219781045772657, "learning_rate": 0.0001, "loss": 1.1724, "loss/crossentropy": 2.5414481163024902, "loss/hidden": 0.96484375, "loss/logits": 0.1478062868118286, "loss/reg": 0.005972826853394508, "step": 233 }, { "epoch": 0.02925, "grad_norm": 2.9974324703216553, "grad_norm_var": 0.34510225788824467, "learning_rate": 0.0001, "loss": 1.3152, "loss/crossentropy": 2.697648763656616, "loss/hidden": 1.0546875, "loss/logits": 0.20080995559692383, "loss/reg": 0.005971227772533894, "step": 234 }, { "epoch": 0.029375, "grad_norm": 3.4798855781555176, "grad_norm_var": 0.37664835069757197, "learning_rate": 0.0001, "loss": 1.2096, "loss/crossentropy": 2.3990559577941895, "loss/hidden": 0.95703125, "loss/logits": 0.19287389516830444, "loss/reg": 0.005969603545963764, "step": 235 }, { "epoch": 0.0295, "grad_norm": 2.43911075592041, "grad_norm_var": 0.3848032740432508, "learning_rate": 0.0001, "loss": 1.0658, "loss/crossentropy": 1.966374158859253, "loss/hidden": 0.875, "loss/logits": 0.13115233182907104, "loss/reg": 0.005967943929135799, "step": 236 }, { "epoch": 0.029625, "grad_norm": 3.7423646450042725, "grad_norm_var": 0.4356891905379257, "learning_rate": 0.0001, "loss": 1.2397, "loss/crossentropy": 2.718675374984741, "loss/hidden": 0.9921875, "loss/logits": 0.18789833784103394, "loss/reg": 0.00596608454361558, "step": 237 }, { "epoch": 0.02975, "grad_norm": 3.328033924102783, "grad_norm_var": 0.4449827328026664, "learning_rate": 0.0001, "loss": 1.5581, "loss/crossentropy": 2.272303819656372, "loss/hidden": 1.2421875, "loss/logits": 0.2562662661075592, "loss/reg": 0.005964066833257675, "step": 238 }, { "epoch": 0.029875, "grad_norm": 2.8761045932769775, "grad_norm_var": 0.42986649641521024, "learning_rate": 0.0001, "loss": 1.1392, "loss/crossentropy": 2.6973013877868652, "loss/hidden": 0.91796875, "loss/logits": 0.16159963607788086, "loss/reg": 0.005962541792541742, "step": 239 }, { "epoch": 0.03, "grad_norm": 2.4458563327789307, "grad_norm_var": 0.4123921579785623, "learning_rate": 0.0001, "loss": 1.178, "loss/crossentropy": 2.5731561183929443, "loss/hidden": 0.9375, "loss/logits": 0.18093177676200867, "loss/reg": 0.005961006972938776, "step": 240 }, { "epoch": 0.030125, "grad_norm": 2.4645614624023438, "grad_norm_var": 0.3844441578530656, "learning_rate": 0.0001, "loss": 1.0932, "loss/crossentropy": 2.648738145828247, "loss/hidden": 0.890625, "loss/logits": 0.14302745461463928, "loss/reg": 0.005959144793450832, "step": 241 }, { "epoch": 0.03025, "grad_norm": 3.0715034008026123, "grad_norm_var": 0.3694944025754277, "learning_rate": 0.0001, "loss": 1.1916, "loss/crossentropy": 2.4820139408111572, "loss/hidden": 0.94921875, "loss/logits": 0.18278783559799194, "loss/reg": 0.005957332905381918, "step": 242 }, { "epoch": 0.030375, "grad_norm": 2.479677677154541, "grad_norm_var": 0.37773887013444374, "learning_rate": 0.0001, "loss": 1.0787, "loss/crossentropy": 2.614309549331665, "loss/hidden": 0.87109375, "loss/logits": 0.14808647334575653, "loss/reg": 0.005955492611974478, "step": 243 }, { "epoch": 0.0305, "grad_norm": 3.0970399379730225, "grad_norm_var": 0.36391299171458796, "learning_rate": 0.0001, "loss": 1.1987, "loss/crossentropy": 2.2731809616088867, "loss/hidden": 0.95703125, "loss/logits": 0.18210504949092865, "loss/reg": 0.00595364673063159, "step": 244 }, { "epoch": 0.030625, "grad_norm": 2.388214588165283, "grad_norm_var": 0.37823356386532864, "learning_rate": 0.0001, "loss": 1.1283, "loss/crossentropy": 2.532259225845337, "loss/hidden": 0.91015625, "loss/logits": 0.15858401358127594, "loss/reg": 0.005952049978077412, "step": 245 }, { "epoch": 0.03075, "grad_norm": 2.97310733795166, "grad_norm_var": 0.36540629077152076, "learning_rate": 0.0001, "loss": 1.1177, "loss/crossentropy": 2.5206258296966553, "loss/hidden": 0.89453125, "loss/logits": 0.16365137696266174, "loss/reg": 0.005950110498815775, "step": 246 }, { "epoch": 0.030875, "grad_norm": 2.15498423576355, "grad_norm_var": 0.3579579158371985, "learning_rate": 0.0001, "loss": 1.1046, "loss/crossentropy": 2.478773832321167, "loss/hidden": 0.8828125, "loss/logits": 0.162343829870224, "loss/reg": 0.005948282778263092, "step": 247 }, { "epoch": 0.031, "grad_norm": 2.3404128551483154, "grad_norm_var": 0.338987407645584, "learning_rate": 0.0001, "loss": 1.1555, "loss/crossentropy": 2.1949751377105713, "loss/hidden": 0.93359375, "loss/logits": 0.1624409407377243, "loss/reg": 0.005946675315499306, "step": 248 }, { "epoch": 0.031125, "grad_norm": 2.8813085556030273, "grad_norm_var": 0.20879640313171802, "learning_rate": 0.0001, "loss": 1.1599, "loss/crossentropy": 2.556128978729248, "loss/hidden": 0.9296875, "loss/logits": 0.1707805097103119, "loss/reg": 0.005944731179624796, "step": 249 }, { "epoch": 0.03125, "grad_norm": 3.309937000274658, "grad_norm_var": 0.22219010027481143, "learning_rate": 0.0001, "loss": 1.0939, "loss/crossentropy": 2.4590022563934326, "loss/hidden": 0.88671875, "loss/logits": 0.14774294197559357, "loss/reg": 0.005942681338638067, "step": 250 }, { "epoch": 0.031375, "grad_norm": 3.1676676273345947, "grad_norm_var": 0.201728293925846, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.419811487197876, "loss/hidden": 1.015625, "loss/logits": 0.24120670557022095, "loss/reg": 0.005940672475844622, "step": 251 }, { "epoch": 0.0315, "grad_norm": 2.6006832122802734, "grad_norm_var": 0.1951007002723287, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.170666456222534, "loss/hidden": 1.140625, "loss/logits": 0.19024603068828583, "loss/reg": 0.005938523914664984, "step": 252 }, { "epoch": 0.031625, "grad_norm": 2.4954755306243896, "grad_norm_var": 0.14101991304577552, "learning_rate": 0.0001, "loss": 1.1465, "loss/crossentropy": 2.262831449508667, "loss/hidden": 0.93359375, "loss/logits": 0.1535283327102661, "loss/reg": 0.00593681400641799, "step": 253 }, { "epoch": 0.03175, "grad_norm": 2.339406728744507, "grad_norm_var": 0.12652605714113535, "learning_rate": 0.0001, "loss": 0.984, "loss/crossentropy": 2.2793617248535156, "loss/hidden": 0.796875, "loss/logits": 0.12778240442276, "loss/reg": 0.005935273133218288, "step": 254 }, { "epoch": 0.031875, "grad_norm": 2.3391647338867188, "grad_norm_var": 0.131427049667937, "learning_rate": 0.0001, "loss": 1.0622, "loss/crossentropy": 2.4579379558563232, "loss/hidden": 0.83984375, "loss/logits": 0.16299216449260712, "loss/reg": 0.0059331608936190605, "step": 255 }, { "epoch": 0.032, "grad_norm": 2.3896231651306152, "grad_norm_var": 0.13322512800125588, "learning_rate": 0.0001, "loss": 1.057, "loss/crossentropy": 2.8022475242614746, "loss/hidden": 0.85546875, "loss/logits": 0.14219465851783752, "loss/reg": 0.005931555759161711, "step": 256 }, { "epoch": 0.032125, "grad_norm": 2.125249147415161, "grad_norm_var": 0.14907278605534582, "learning_rate": 0.0001, "loss": 1.0611, "loss/crossentropy": 2.33644700050354, "loss/hidden": 0.8515625, "loss/logits": 0.15020999312400818, "loss/reg": 0.005930029321461916, "step": 257 }, { "epoch": 0.03225, "grad_norm": 2.521933078765869, "grad_norm_var": 0.13593429417580463, "learning_rate": 0.0001, "loss": 1.0436, "loss/crossentropy": 2.512619733810425, "loss/hidden": 0.8203125, "loss/logits": 0.16396166384220123, "loss/reg": 0.00592817785218358, "step": 258 }, { "epoch": 0.032375, "grad_norm": 2.5966317653656006, "grad_norm_var": 0.13490910688263208, "learning_rate": 0.0001, "loss": 1.1331, "loss/crossentropy": 2.248013734817505, "loss/hidden": 0.91015625, "loss/logits": 0.16364812850952148, "loss/reg": 0.00592625979334116, "step": 259 }, { "epoch": 0.0325, "grad_norm": 2.2045137882232666, "grad_norm_var": 0.12644607438415487, "learning_rate": 0.0001, "loss": 1.0015, "loss/crossentropy": 2.3253698348999023, "loss/hidden": 0.796875, "loss/logits": 0.14540287852287292, "loss/reg": 0.005924653727561235, "step": 260 }, { "epoch": 0.032625, "grad_norm": 2.4450156688690186, "grad_norm_var": 0.1254090419850094, "learning_rate": 0.0001, "loss": 0.9932, "loss/crossentropy": 2.2374210357666016, "loss/hidden": 0.80078125, "loss/logits": 0.13316848874092102, "loss/reg": 0.005922792013734579, "step": 261 }, { "epoch": 0.03275, "grad_norm": 7.747511863708496, "grad_norm_var": 1.8160510254643325, "learning_rate": 0.0001, "loss": 1.2542, "loss/crossentropy": 2.8747429847717285, "loss/hidden": 1.0234375, "loss/logits": 0.17151576280593872, "loss/reg": 0.005921173375099897, "step": 262 }, { "epoch": 0.032875, "grad_norm": 2.1854233741760254, "grad_norm_var": 1.8132730792650582, "learning_rate": 0.0001, "loss": 1.0069, "loss/crossentropy": 2.4989960193634033, "loss/hidden": 0.8125, "loss/logits": 0.13518914580345154, "loss/reg": 0.005919379647821188, "step": 263 }, { "epoch": 0.033, "grad_norm": 3.5132219791412354, "grad_norm_var": 1.8186749991604263, "learning_rate": 0.0001, "loss": 1.054, "loss/crossentropy": 2.497178316116333, "loss/hidden": 0.84765625, "loss/logits": 0.1471494734287262, "loss/reg": 0.005917761009186506, "step": 264 }, { "epoch": 0.033125, "grad_norm": 4.302145481109619, "grad_norm_var": 1.9358282916849012, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.1725542545318604, "loss/hidden": 1.0859375, "loss/logits": 0.16722658276557922, "loss/reg": 0.0059160212986171246, "step": 265 }, { "epoch": 0.03325, "grad_norm": 2.3225510120391846, "grad_norm_var": 1.9582913809461102, "learning_rate": 0.0001, "loss": 1.0153, "loss/crossentropy": 2.6670029163360596, "loss/hidden": 0.80859375, "loss/logits": 0.1475904881954193, "loss/reg": 0.0059142098762094975, "step": 266 }, { "epoch": 0.033375, "grad_norm": 5.196990013122559, "grad_norm_var": 2.27294427304937, "learning_rate": 0.0001, "loss": 1.1665, "loss/crossentropy": 2.6792731285095215, "loss/hidden": 0.94140625, "loss/logits": 0.1659836769104004, "loss/reg": 0.00591221172362566, "step": 267 }, { "epoch": 0.0335, "grad_norm": 3.5144336223602295, "grad_norm_var": 2.26638445070385, "learning_rate": 0.0001, "loss": 1.2502, "loss/crossentropy": 2.2949023246765137, "loss/hidden": 1.0234375, "loss/logits": 0.1677004098892212, "loss/reg": 0.005910532083362341, "step": 268 }, { "epoch": 0.033625, "grad_norm": 2.861222267150879, "grad_norm_var": 2.2433162495019436, "learning_rate": 0.0001, "loss": 1.3308, "loss/crossentropy": 2.5955142974853516, "loss/hidden": 1.0703125, "loss/logits": 0.2013990730047226, "loss/reg": 0.005908492021262646, "step": 269 }, { "epoch": 0.03375, "grad_norm": 2.964390754699707, "grad_norm_var": 2.1991134738974947, "learning_rate": 0.0001, "loss": 1.0975, "loss/crossentropy": 2.483924150466919, "loss/hidden": 0.8828125, "loss/logits": 0.15562227368354797, "loss/reg": 0.005906403064727783, "step": 270 }, { "epoch": 0.033875, "grad_norm": 2.75604510307312, "grad_norm_var": 2.1620222961988325, "learning_rate": 0.0001, "loss": 1.2196, "loss/crossentropy": 2.39125394821167, "loss/hidden": 0.9765625, "loss/logits": 0.18403753638267517, "loss/reg": 0.00590470340102911, "step": 271 }, { "epoch": 0.034, "grad_norm": 2.360309362411499, "grad_norm_var": 2.165352535939727, "learning_rate": 0.0001, "loss": 1.0194, "loss/crossentropy": 2.530670404434204, "loss/hidden": 0.8046875, "loss/logits": 0.15565866231918335, "loss/reg": 0.005902664735913277, "step": 272 }, { "epoch": 0.034125, "grad_norm": 2.496027946472168, "grad_norm_var": 2.1195219252368287, "learning_rate": 0.0001, "loss": 1.2228, "loss/crossentropy": 2.7535252571105957, "loss/hidden": 0.9609375, "loss/logits": 0.20284873247146606, "loss/reg": 0.005900639574974775, "step": 273 }, { "epoch": 0.03425, "grad_norm": 2.854250431060791, "grad_norm_var": 2.0941964139517344, "learning_rate": 0.0001, "loss": 1.1387, "loss/crossentropy": 2.134964942932129, "loss/hidden": 0.9296875, "loss/logits": 0.15002194046974182, "loss/reg": 0.005898929201066494, "step": 274 }, { "epoch": 0.034375, "grad_norm": 4.497798442840576, "grad_norm_var": 2.149396374832277, "learning_rate": 0.0001, "loss": 1.2312, "loss/crossentropy": 2.3270835876464844, "loss/hidden": 0.99609375, "loss/logits": 0.17617599666118622, "loss/reg": 0.0058972095139324665, "step": 275 }, { "epoch": 0.0345, "grad_norm": 2.321152448654175, "grad_norm_var": 2.1318278315927155, "learning_rate": 0.0001, "loss": 1.1523, "loss/crossentropy": 1.858445644378662, "loss/hidden": 0.94921875, "loss/logits": 0.14408603310585022, "loss/reg": 0.005895303096622229, "step": 276 }, { "epoch": 0.034625, "grad_norm": 2.4426257610321045, "grad_norm_var": 2.1321312734782243, "learning_rate": 0.0001, "loss": 1.0267, "loss/crossentropy": 2.4483628273010254, "loss/hidden": 0.82421875, "loss/logits": 0.1435263752937317, "loss/reg": 0.005893299821764231, "step": 277 }, { "epoch": 0.03475, "grad_norm": 2.144637107849121, "grad_norm_var": 0.843351985629086, "learning_rate": 0.0001, "loss": 1.0517, "loss/crossentropy": 2.237915277481079, "loss/hidden": 0.8515625, "loss/logits": 0.14119011163711548, "loss/reg": 0.005891298409551382, "step": 278 }, { "epoch": 0.034875, "grad_norm": 2.32000732421875, "grad_norm_var": 0.8290445100225684, "learning_rate": 0.0001, "loss": 1.0462, "loss/crossentropy": 2.6588850021362305, "loss/hidden": 0.83203125, "loss/logits": 0.1552983820438385, "loss/reg": 0.0058892290107905865, "step": 279 }, { "epoch": 0.035, "grad_norm": 3.3390939235687256, "grad_norm_var": 0.820283282746707, "learning_rate": 0.0001, "loss": 1.1937, "loss/crossentropy": 2.5243186950683594, "loss/hidden": 0.953125, "loss/logits": 0.1817275732755661, "loss/reg": 0.00588742271065712, "step": 280 }, { "epoch": 0.035125, "grad_norm": 3.1800894737243652, "grad_norm_var": 0.7106469411621028, "learning_rate": 0.0001, "loss": 1.1937, "loss/crossentropy": 2.556126832962036, "loss/hidden": 0.953125, "loss/logits": 0.18167603015899658, "loss/reg": 0.005885709077119827, "step": 281 }, { "epoch": 0.03525, "grad_norm": 4.466390132904053, "grad_norm_var": 0.8119073339313209, "learning_rate": 0.0001, "loss": 1.27, "loss/crossentropy": 2.5671539306640625, "loss/hidden": 0.984375, "loss/logits": 0.2267427146434784, "loss/reg": 0.0058837407268583775, "step": 282 }, { "epoch": 0.035375, "grad_norm": 3.2809953689575195, "grad_norm_var": 0.5074810718943117, "learning_rate": 0.0001, "loss": 1.1245, "loss/crossentropy": 2.1554338932037354, "loss/hidden": 0.9140625, "loss/logits": 0.1516391634941101, "loss/reg": 0.005881770513951778, "step": 283 }, { "epoch": 0.0355, "grad_norm": 2.9982316493988037, "grad_norm_var": 0.48786559613454966, "learning_rate": 0.0001, "loss": 1.1286, "loss/crossentropy": 2.6773006916046143, "loss/hidden": 0.90625, "loss/logits": 0.1635606288909912, "loss/reg": 0.005880062934011221, "step": 284 }, { "epoch": 0.035625, "grad_norm": 2.387657880783081, "grad_norm_var": 0.5078162485774572, "learning_rate": 0.0001, "loss": 1.1214, "loss/crossentropy": 2.4741320610046387, "loss/hidden": 0.8984375, "loss/logits": 0.1641697734594345, "loss/reg": 0.0058782072737813, "step": 285 }, { "epoch": 0.03575, "grad_norm": 271.6628112792969, "grad_norm_var": 4514.324895160767, "learning_rate": 0.0001, "loss": 1.6171, "loss/crossentropy": 2.5766143798828125, "loss/hidden": 1.375, "loss/logits": 0.1833469420671463, "loss/reg": 0.005876271054148674, "step": 286 }, { "epoch": 0.035875, "grad_norm": 3.545677900314331, "grad_norm_var": 4512.577903953303, "learning_rate": 0.0001, "loss": 1.1466, "loss/crossentropy": 2.5389881134033203, "loss/hidden": 0.88671875, "loss/logits": 0.20117658376693726, "loss/reg": 0.005874336697161198, "step": 287 }, { "epoch": 0.036, "grad_norm": 2.9219233989715576, "grad_norm_var": 4511.294050983276, "learning_rate": 0.0001, "loss": 1.1121, "loss/crossentropy": 2.3270509243011475, "loss/hidden": 0.8828125, "loss/logits": 0.17058232426643372, "loss/reg": 0.005872361361980438, "step": 288 }, { "epoch": 0.036125, "grad_norm": 2.831878423690796, "grad_norm_var": 4510.526061571783, "learning_rate": 0.0001, "loss": 1.148, "loss/crossentropy": 2.4853744506835938, "loss/hidden": 0.91796875, "loss/logits": 0.17128118872642517, "loss/reg": 0.005870639346539974, "step": 289 }, { "epoch": 0.03625, "grad_norm": 2.284134864807129, "grad_norm_var": 4511.83639181831, "learning_rate": 0.0001, "loss": 1.0599, "loss/crossentropy": 2.3107759952545166, "loss/hidden": 0.8515625, "loss/logits": 0.14969472587108612, "loss/reg": 0.005868903826922178, "step": 290 }, { "epoch": 0.036375, "grad_norm": 2.2008161544799805, "grad_norm_var": 4516.84932017332, "learning_rate": 0.0001, "loss": 1.0902, "loss/crossentropy": 2.4265358448028564, "loss/hidden": 0.86328125, "loss/logits": 0.1682073473930359, "loss/reg": 0.0058671231381595135, "step": 291 }, { "epoch": 0.0365, "grad_norm": 2.6285743713378906, "grad_norm_var": 4516.145108725088, "learning_rate": 0.0001, "loss": 1.2494, "loss/crossentropy": 2.372230291366577, "loss/hidden": 0.98046875, "loss/logits": 0.2102714478969574, "loss/reg": 0.005865375977009535, "step": 292 }, { "epoch": 0.036625, "grad_norm": 2.6784040927886963, "grad_norm_var": 4515.607170253259, "learning_rate": 0.0001, "loss": 1.0752, "loss/crossentropy": 2.6276440620422363, "loss/hidden": 0.875, "loss/logits": 0.14159329235553741, "loss/reg": 0.005863656289875507, "step": 293 }, { "epoch": 0.03675, "grad_norm": 2.6373047828674316, "grad_norm_var": 4514.470495103465, "learning_rate": 0.0001, "loss": 1.1694, "loss/crossentropy": 2.70892333984375, "loss/hidden": 0.9453125, "loss/logits": 0.16546514630317688, "loss/reg": 0.005862091202288866, "step": 294 }, { "epoch": 0.036875, "grad_norm": 2.384430170059204, "grad_norm_var": 4514.321377312488, "learning_rate": 0.0001, "loss": 1.2472, "loss/crossentropy": 2.1273090839385986, "loss/hidden": 1.0, "loss/logits": 0.18860690295696259, "loss/reg": 0.005860424134880304, "step": 295 }, { "epoch": 0.037, "grad_norm": 2.5959692001342773, "grad_norm_var": 4515.978398966678, "learning_rate": 0.0001, "loss": 1.0376, "loss/crossentropy": 2.7293522357940674, "loss/hidden": 0.8203125, "loss/logits": 0.1587076485157013, "loss/reg": 0.0058588446117937565, "step": 296 }, { "epoch": 0.037125, "grad_norm": 2.2753238677978516, "grad_norm_var": 4518.0185669920775, "learning_rate": 0.0001, "loss": 1.0063, "loss/crossentropy": 2.4602949619293213, "loss/hidden": 0.8125, "loss/logits": 0.13525693118572235, "loss/reg": 0.005857320036739111, "step": 297 }, { "epoch": 0.03725, "grad_norm": 3.009300708770752, "grad_norm_var": 4521.093589717446, "learning_rate": 0.0001, "loss": 1.2573, "loss/crossentropy": 2.8883349895477295, "loss/hidden": 0.9921875, "loss/logits": 0.20657645165920258, "loss/reg": 0.005855792202055454, "step": 298 }, { "epoch": 0.037375, "grad_norm": 2.700221538543701, "grad_norm_var": 4522.372179334166, "learning_rate": 0.0001, "loss": 1.1557, "loss/crossentropy": 2.5446314811706543, "loss/hidden": 0.90234375, "loss/logits": 0.19479964673519135, "loss/reg": 0.005854278337210417, "step": 299 }, { "epoch": 0.0375, "grad_norm": 2.3786559104919434, "grad_norm_var": 4523.758055495688, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.469960927963257, "loss/hidden": 0.90234375, "loss/logits": 0.16156738996505737, "loss/reg": 0.00585273839533329, "step": 300 }, { "epoch": 0.037625, "grad_norm": 2.7032158374786377, "grad_norm_var": 4523.046593599144, "learning_rate": 0.0001, "loss": 1.1947, "loss/crossentropy": 2.7451162338256836, "loss/hidden": 0.94140625, "loss/logits": 0.19476984441280365, "loss/reg": 0.0058509958907961845, "step": 301 }, { "epoch": 0.03775, "grad_norm": 2.507664442062378, "grad_norm_var": 0.11250867537391755, "learning_rate": 0.0001, "loss": 0.9899, "loss/crossentropy": 2.53341007232666, "loss/hidden": 0.796875, "loss/logits": 0.1345081329345703, "loss/reg": 0.005849248263984919, "step": 302 }, { "epoch": 0.037875, "grad_norm": 3.027892589569092, "grad_norm_var": 0.06692647718721882, "learning_rate": 0.0001, "loss": 1.0973, "loss/crossentropy": 2.7899296283721924, "loss/hidden": 0.890625, "loss/logits": 0.1482122391462326, "loss/reg": 0.005847662687301636, "step": 303 }, { "epoch": 0.038, "grad_norm": 2.1617183685302734, "grad_norm_var": 0.07146536810277529, "learning_rate": 0.0001, "loss": 0.969, "loss/crossentropy": 2.4700305461883545, "loss/hidden": 0.78125, "loss/logits": 0.12925508618354797, "loss/reg": 0.005846073850989342, "step": 304 }, { "epoch": 0.038125, "grad_norm": 2.3791332244873047, "grad_norm_var": 0.06803597239225306, "learning_rate": 0.0001, "loss": 1.1912, "loss/crossentropy": 2.4171202182769775, "loss/hidden": 0.9453125, "loss/logits": 0.18739524483680725, "loss/reg": 0.005844476167112589, "step": 305 }, { "epoch": 0.03825, "grad_norm": 2.7622976303100586, "grad_norm_var": 0.06636088237049004, "learning_rate": 0.0001, "loss": 1.0808, "loss/crossentropy": 2.5030367374420166, "loss/hidden": 0.8359375, "loss/logits": 0.18643516302108765, "loss/reg": 0.005842759273946285, "step": 306 }, { "epoch": 0.038375, "grad_norm": 2.4079246520996094, "grad_norm_var": 0.059000676657357566, "learning_rate": 0.0001, "loss": 1.0359, "loss/crossentropy": 2.381542682647705, "loss/hidden": 0.828125, "loss/logits": 0.1493588387966156, "loss/reg": 0.0058412267826497555, "step": 307 }, { "epoch": 0.0385, "grad_norm": 2.5356478691101074, "grad_norm_var": 0.058906038923372726, "learning_rate": 0.0001, "loss": 1.087, "loss/crossentropy": 2.4928808212280273, "loss/hidden": 0.875, "loss/logits": 0.15363982319831848, "loss/reg": 0.0058394852094352245, "step": 308 }, { "epoch": 0.038625, "grad_norm": 2.4036688804626465, "grad_norm_var": 0.0597099908353601, "learning_rate": 0.0001, "loss": 0.986, "loss/crossentropy": 2.5816946029663086, "loss/hidden": 0.7890625, "loss/logits": 0.13851355016231537, "loss/reg": 0.005837727338075638, "step": 309 }, { "epoch": 0.03875, "grad_norm": 2.630572557449341, "grad_norm_var": 0.05963840398777146, "learning_rate": 0.0001, "loss": 1.0333, "loss/crossentropy": 2.140015125274658, "loss/hidden": 0.828125, "loss/logits": 0.14680367708206177, "loss/reg": 0.005835913587361574, "step": 310 }, { "epoch": 0.038875, "grad_norm": 2.3641905784606934, "grad_norm_var": 0.06012154861927167, "learning_rate": 0.0001, "loss": 1.0947, "loss/crossentropy": 2.3300833702087402, "loss/hidden": 0.8828125, "loss/logits": 0.15358075499534607, "loss/reg": 0.005834224168211222, "step": 311 }, { "epoch": 0.039, "grad_norm": 2.215728759765625, "grad_norm_var": 0.06696490679455162, "learning_rate": 0.0001, "loss": 1.1411, "loss/crossentropy": 2.4583277702331543, "loss/hidden": 0.9140625, "loss/logits": 0.1687404215335846, "loss/reg": 0.005832599475979805, "step": 312 }, { "epoch": 0.039125, "grad_norm": 2.8934550285339355, "grad_norm_var": 0.06994228066174794, "learning_rate": 0.0001, "loss": 1.2763, "loss/crossentropy": 2.409702777862549, "loss/hidden": 1.0390625, "loss/logits": 0.17889352142810822, "loss/reg": 0.005831001792103052, "step": 313 }, { "epoch": 0.03925, "grad_norm": 8.741681098937988, "grad_norm_var": 2.4613182467650705, "learning_rate": 0.0001, "loss": 1.1972, "loss/crossentropy": 2.3858492374420166, "loss/hidden": 0.96875, "loss/logits": 0.1701970100402832, "loss/reg": 0.005829236935824156, "step": 314 }, { "epoch": 0.039375, "grad_norm": 7.412417411804199, "grad_norm_var": 3.707354176329111, "learning_rate": 0.0001, "loss": 1.3096, "loss/crossentropy": 2.3804125785827637, "loss/hidden": 1.1015625, "loss/logits": 0.149795800447464, "loss/reg": 0.005827469285577536, "step": 315 }, { "epoch": 0.0395, "grad_norm": 3.1443870067596436, "grad_norm_var": 3.6580641482995806, "learning_rate": 0.0001, "loss": 1.1365, "loss/crossentropy": 2.481820583343506, "loss/hidden": 0.90234375, "loss/logits": 0.1759084165096283, "loss/reg": 0.005825776606798172, "step": 316 }, { "epoch": 0.039625, "grad_norm": 2.8567562103271484, "grad_norm_var": 3.6479706732170993, "learning_rate": 0.0001, "loss": 1.0023, "loss/crossentropy": 2.5141823291778564, "loss/hidden": 0.80078125, "loss/logits": 0.14331723749637604, "loss/reg": 0.005824015475809574, "step": 317 }, { "epoch": 0.03975, "grad_norm": 2.2817444801330566, "grad_norm_var": 3.674359828489624, "learning_rate": 0.0001, "loss": 1.0893, "loss/crossentropy": 2.184128999710083, "loss/hidden": 0.875, "loss/logits": 0.15605026483535767, "loss/reg": 0.00582248717546463, "step": 318 }, { "epoch": 0.039875, "grad_norm": 2.249969005584717, "grad_norm_var": 3.736641439481692, "learning_rate": 0.0001, "loss": 1.008, "loss/crossentropy": 2.768484354019165, "loss/hidden": 0.80078125, "loss/logits": 0.14897163212299347, "loss/reg": 0.00582079216837883, "step": 319 }, { "epoch": 0.04, "grad_norm": 2.6358306407928467, "grad_norm_var": 3.684102068428194, "learning_rate": 0.0001, "loss": 1.3237, "loss/crossentropy": 2.301954507827759, "loss/hidden": 1.015625, "loss/logits": 0.24987459182739258, "loss/reg": 0.005819002632051706, "step": 320 }, { "epoch": 0.040125, "grad_norm": 2.353457450866699, "grad_norm_var": 3.6871065280104496, "learning_rate": 0.0001, "loss": 1.1095, "loss/crossentropy": 2.379765272140503, "loss/hidden": 0.89453125, "loss/logits": 0.15680107474327087, "loss/reg": 0.005817302968353033, "step": 321 }, { "epoch": 0.04025, "grad_norm": 2.4568967819213867, "grad_norm_var": 3.712514538750317, "learning_rate": 0.0001, "loss": 0.9706, "loss/crossentropy": 2.380795955657959, "loss/hidden": 0.77734375, "loss/logits": 0.13508911430835724, "loss/reg": 0.005815597716718912, "step": 322 }, { "epoch": 0.040375, "grad_norm": 3.207794189453125, "grad_norm_var": 3.6654654630236734, "learning_rate": 0.0001, "loss": 1.3668, "loss/crossentropy": 1.949703574180603, "loss/hidden": 1.1171875, "loss/logits": 0.19150257110595703, "loss/reg": 0.005813860800117254, "step": 323 }, { "epoch": 0.0405, "grad_norm": 3.156318187713623, "grad_norm_var": 3.6284383166396252, "learning_rate": 0.0001, "loss": 1.2742, "loss/crossentropy": 2.1970410346984863, "loss/hidden": 1.0, "loss/logits": 0.21606677770614624, "loss/reg": 0.005812041461467743, "step": 324 }, { "epoch": 0.040625, "grad_norm": 2.556889533996582, "grad_norm_var": 3.611332493108523, "learning_rate": 0.0001, "loss": 0.9529, "loss/crossentropy": 2.7647974491119385, "loss/hidden": 0.7578125, "loss/logits": 0.1369488537311554, "loss/reg": 0.00581031059846282, "step": 325 }, { "epoch": 0.04075, "grad_norm": 2.2634167671203613, "grad_norm_var": 3.653624545749698, "learning_rate": 0.0001, "loss": 1.0757, "loss/crossentropy": 2.334134340286255, "loss/hidden": 0.859375, "loss/logits": 0.1581987738609314, "loss/reg": 0.005808570422232151, "step": 326 }, { "epoch": 0.040875, "grad_norm": 2.3521125316619873, "grad_norm_var": 3.6551397839485555, "learning_rate": 0.0001, "loss": 0.9965, "loss/crossentropy": 2.78828763961792, "loss/hidden": 0.79296875, "loss/logits": 0.1454332172870636, "loss/reg": 0.005806888919323683, "step": 327 }, { "epoch": 0.041, "grad_norm": 3.0836093425750732, "grad_norm_var": 3.5768996944618254, "learning_rate": 0.0001, "loss": 1.1938, "loss/crossentropy": 2.2781612873077393, "loss/hidden": 0.9609375, "loss/logits": 0.1747758537530899, "loss/reg": 0.005805303808301687, "step": 328 }, { "epoch": 0.041125, "grad_norm": 3.6110970973968506, "grad_norm_var": 3.5651235487558246, "learning_rate": 0.0001, "loss": 1.1693, "loss/crossentropy": 2.812913417816162, "loss/hidden": 0.9375, "loss/logits": 0.17377659678459167, "loss/reg": 0.005803780164569616, "step": 329 }, { "epoch": 0.04125, "grad_norm": 2.5020155906677246, "grad_norm_var": 1.552569952590708, "learning_rate": 0.0001, "loss": 1.0862, "loss/crossentropy": 2.6585140228271484, "loss/hidden": 0.86328125, "loss/logits": 0.16489718854427338, "loss/reg": 0.005802258383482695, "step": 330 }, { "epoch": 0.041375, "grad_norm": 2.383924961090088, "grad_norm_var": 0.17978007457456116, "learning_rate": 0.0001, "loss": 1.1592, "loss/crossentropy": 2.4862210750579834, "loss/hidden": 0.94921875, "loss/logits": 0.15199331939220428, "loss/reg": 0.005800731014460325, "step": 331 }, { "epoch": 0.0415, "grad_norm": 2.187321424484253, "grad_norm_var": 0.17949311071790794, "learning_rate": 0.0001, "loss": 1.0507, "loss/crossentropy": 2.6380603313446045, "loss/hidden": 0.84765625, "loss/logits": 0.14507073163986206, "loss/reg": 0.005798923317342997, "step": 332 }, { "epoch": 0.041625, "grad_norm": 2.21768856048584, "grad_norm_var": 0.18601193201957902, "learning_rate": 0.0001, "loss": 1.1027, "loss/crossentropy": 2.3925793170928955, "loss/hidden": 0.875, "loss/logits": 0.16972869634628296, "loss/reg": 0.00579707371070981, "step": 333 }, { "epoch": 0.04175, "grad_norm": 2.682497262954712, "grad_norm_var": 0.17937770683656615, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.3586106300354004, "loss/hidden": 1.078125, "loss/logits": 0.1911502480506897, "loss/reg": 0.005795224104076624, "step": 334 }, { "epoch": 0.041875, "grad_norm": 3.0983307361602783, "grad_norm_var": 0.1826395003188658, "learning_rate": 0.0001, "loss": 1.1675, "loss/crossentropy": 2.436326265335083, "loss/hidden": 0.91796875, "loss/logits": 0.1915540099143982, "loss/reg": 0.005793258547782898, "step": 335 }, { "epoch": 0.042, "grad_norm": 6.251674652099609, "grad_norm_var": 0.982431631272856, "learning_rate": 0.0001, "loss": 1.6879, "loss/crossentropy": 2.3841142654418945, "loss/hidden": 1.265625, "loss/logits": 0.3643344044685364, "loss/reg": 0.0057912725023925304, "step": 336 }, { "epoch": 0.042125, "grad_norm": 3.0111782550811768, "grad_norm_var": 0.9617308564996427, "learning_rate": 0.0001, "loss": 1.3497, "loss/crossentropy": 2.430532217025757, "loss/hidden": 1.0703125, "loss/logits": 0.2214677333831787, "loss/reg": 0.00578899122774601, "step": 337 }, { "epoch": 0.04225, "grad_norm": 2.4221205711364746, "grad_norm_var": 0.9640415151512265, "learning_rate": 0.0001, "loss": 1.0955, "loss/crossentropy": 2.4376015663146973, "loss/hidden": 0.890625, "loss/logits": 0.1470467746257782, "loss/reg": 0.005786662455648184, "step": 338 }, { "epoch": 0.042375, "grad_norm": 2.615758180618286, "grad_norm_var": 0.9645524062068328, "learning_rate": 0.0001, "loss": 1.0887, "loss/crossentropy": 2.5318005084991455, "loss/hidden": 0.875, "loss/logits": 0.15580901503562927, "loss/reg": 0.0057848175056278706, "step": 339 }, { "epoch": 0.0425, "grad_norm": 2.857177972793579, "grad_norm_var": 0.9599117798964886, "learning_rate": 0.0001, "loss": 1.1153, "loss/crossentropy": 2.4260058403015137, "loss/hidden": 0.89453125, "loss/logits": 0.16291844844818115, "loss/reg": 0.005782809574157, "step": 340 }, { "epoch": 0.042625, "grad_norm": 2.4030630588531494, "grad_norm_var": 0.9680393035693963, "learning_rate": 0.0001, "loss": 1.2054, "loss/crossentropy": 2.3009443283081055, "loss/hidden": 0.953125, "loss/logits": 0.194431871175766, "loss/reg": 0.005780525505542755, "step": 341 }, { "epoch": 0.04275, "grad_norm": 2.264251470565796, "grad_norm_var": 0.9679716782722624, "learning_rate": 0.0001, "loss": 1.0227, "loss/crossentropy": 2.597288131713867, "loss/hidden": 0.8203125, "loss/logits": 0.14457917213439941, "loss/reg": 0.005778233055025339, "step": 342 }, { "epoch": 0.042875, "grad_norm": 2.2368180751800537, "grad_norm_var": 0.9767866404468121, "learning_rate": 0.0001, "loss": 0.943, "loss/crossentropy": 2.4534237384796143, "loss/hidden": 0.7578125, "loss/logits": 0.12742644548416138, "loss/reg": 0.005776000674813986, "step": 343 }, { "epoch": 0.043, "grad_norm": 2.469120979309082, "grad_norm_var": 0.9824165851632264, "learning_rate": 0.0001, "loss": 1.0531, "loss/crossentropy": 2.793834686279297, "loss/hidden": 0.83984375, "loss/logits": 0.15554235875606537, "loss/reg": 0.005774145945906639, "step": 344 }, { "epoch": 0.043125, "grad_norm": 2.8334686756134033, "grad_norm_var": 0.9387961568478952, "learning_rate": 0.0001, "loss": 0.9467, "loss/crossentropy": 2.678666830062866, "loss/hidden": 0.7578125, "loss/logits": 0.13116785883903503, "loss/reg": 0.005771928001195192, "step": 345 }, { "epoch": 0.04325, "grad_norm": 7.863356590270996, "grad_norm_var": 2.5385263322105893, "learning_rate": 0.0001, "loss": 1.4695, "loss/crossentropy": 2.613318920135498, "loss/hidden": 1.2734375, "loss/logits": 0.13832132518291473, "loss/reg": 0.005770097486674786, "step": 346 }, { "epoch": 0.043375, "grad_norm": 2.763582468032837, "grad_norm_var": 2.510660987467067, "learning_rate": 0.0001, "loss": 1.1302, "loss/crossentropy": 2.846453905105591, "loss/hidden": 0.90625, "loss/logits": 0.166295126080513, "loss/reg": 0.0057678911834955215, "step": 347 }, { "epoch": 0.0435, "grad_norm": 3.600456714630127, "grad_norm_var": 2.4567056984087676, "learning_rate": 0.0001, "loss": 1.2108, "loss/crossentropy": 2.515092372894287, "loss/hidden": 0.96875, "loss/logits": 0.18436874449253082, "loss/reg": 0.005765695124864578, "step": 348 }, { "epoch": 0.043625, "grad_norm": 4.2698073387146, "grad_norm_var": 2.4444505062987636, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.3673834800720215, "loss/hidden": 0.8984375, "loss/logits": 0.16628439724445343, "loss/reg": 0.005763507913798094, "step": 349 }, { "epoch": 0.04375, "grad_norm": 2.962045192718506, "grad_norm_var": 2.42435544256402, "learning_rate": 0.0001, "loss": 1.079, "loss/crossentropy": 2.9470205307006836, "loss/hidden": 0.83203125, "loss/logits": 0.1893935650587082, "loss/reg": 0.005761242005974054, "step": 350 }, { "epoch": 0.043875, "grad_norm": 3.0306880474090576, "grad_norm_var": 2.427092851603572, "learning_rate": 0.0001, "loss": 1.0201, "loss/crossentropy": 2.3637542724609375, "loss/hidden": 0.83203125, "loss/logits": 0.13047108054161072, "loss/reg": 0.0057592191733419895, "step": 351 }, { "epoch": 0.044, "grad_norm": 2.599585771560669, "grad_norm_var": 1.855493477227511, "learning_rate": 0.0001, "loss": 0.9429, "loss/crossentropy": 2.9222559928894043, "loss/hidden": 0.7578125, "loss/logits": 0.12747693061828613, "loss/reg": 0.005757040809839964, "step": 352 }, { "epoch": 0.044125, "grad_norm": 2.4723081588745117, "grad_norm_var": 1.882729557078295, "learning_rate": 0.0001, "loss": 1.2276, "loss/crossentropy": 2.5835001468658447, "loss/hidden": 0.94921875, "loss/logits": 0.220790833234787, "loss/reg": 0.005754764657467604, "step": 353 }, { "epoch": 0.04425, "grad_norm": 2.5266165733337402, "grad_norm_var": 1.873911870827686, "learning_rate": 0.0001, "loss": 1.1879, "loss/crossentropy": 2.4273722171783447, "loss/hidden": 0.97265625, "loss/logits": 0.15772980451583862, "loss/reg": 0.005752884317189455, "step": 354 }, { "epoch": 0.044375, "grad_norm": 2.8139867782592773, "grad_norm_var": 1.8632913443851133, "learning_rate": 0.0001, "loss": 1.2803, "loss/crossentropy": 2.591078996658325, "loss/hidden": 1.0234375, "loss/logits": 0.19931599497795105, "loss/reg": 0.0057507967576384544, "step": 355 }, { "epoch": 0.0445, "grad_norm": 2.0173490047454834, "grad_norm_var": 1.9371277324683585, "learning_rate": 0.0001, "loss": 1.0066, "loss/crossentropy": 2.415416955947876, "loss/hidden": 0.80859375, "loss/logits": 0.14050991833209991, "loss/reg": 0.005748571362346411, "step": 356 }, { "epoch": 0.044625, "grad_norm": 3.5304269790649414, "grad_norm_var": 1.916250206343263, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.7149741649627686, "loss/hidden": 1.0390625, "loss/logits": 0.16997796297073364, "loss/reg": 0.005746254697442055, "step": 357 }, { "epoch": 0.04475, "grad_norm": 47.96537399291992, "grad_norm_var": 127.11164707702224, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.7637100219726562, "loss/hidden": 1.2265625, "loss/logits": 0.17390823364257812, "loss/reg": 0.005744417663663626, "step": 358 }, { "epoch": 0.044875, "grad_norm": 2.253833055496216, "grad_norm_var": 127.10313415769795, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.3016419410705566, "loss/hidden": 0.9140625, "loss/logits": 0.16676074266433716, "loss/reg": 0.005742207169532776, "step": 359 }, { "epoch": 0.045, "grad_norm": 3.2059576511383057, "grad_norm_var": 126.79034824550331, "learning_rate": 0.0001, "loss": 1.2389, "loss/crossentropy": 2.624589204788208, "loss/hidden": 1.0, "loss/logits": 0.18154433369636536, "loss/reg": 0.005740353371948004, "step": 360 }, { "epoch": 0.045125, "grad_norm": 2.456129789352417, "grad_norm_var": 126.9607902891753, "learning_rate": 0.0001, "loss": 1.0342, "loss/crossentropy": 2.500290870666504, "loss/hidden": 0.83203125, "loss/logits": 0.14475134015083313, "loss/reg": 0.005738324951380491, "step": 361 }, { "epoch": 0.04525, "grad_norm": 3.081372022628784, "grad_norm_var": 127.21513938268541, "learning_rate": 0.0001, "loss": 1.1093, "loss/crossentropy": 2.3305118083953857, "loss/hidden": 0.8984375, "loss/logits": 0.15346962213516235, "loss/reg": 0.0057361493818461895, "step": 362 }, { "epoch": 0.045375, "grad_norm": 2.2634801864624023, "grad_norm_var": 127.4280286195785, "learning_rate": 0.0001, "loss": 1.0956, "loss/crossentropy": 2.4553990364074707, "loss/hidden": 0.875, "loss/logits": 0.16324618458747864, "loss/reg": 0.005734298378229141, "step": 363 }, { "epoch": 0.0455, "grad_norm": 3.9597907066345215, "grad_norm_var": 127.3359579534097, "learning_rate": 0.0001, "loss": 1.3557, "loss/crossentropy": 2.6449685096740723, "loss/hidden": 1.078125, "loss/logits": 0.2202637791633606, "loss/reg": 0.005732398014515638, "step": 364 }, { "epoch": 0.045625, "grad_norm": 2.7794013023376465, "grad_norm_var": 127.76159157574789, "learning_rate": 0.0001, "loss": 1.0787, "loss/crossentropy": 2.3118059635162354, "loss/hidden": 0.86328125, "loss/logits": 0.1581302285194397, "loss/reg": 0.005730301141738892, "step": 365 }, { "epoch": 0.04575, "grad_norm": 4.7589192390441895, "grad_norm_var": 127.32661229099328, "learning_rate": 0.0001, "loss": 1.3244, "loss/crossentropy": 2.5914306640625, "loss/hidden": 1.078125, "loss/logits": 0.18898184597492218, "loss/reg": 0.005728167947381735, "step": 366 }, { "epoch": 0.045875, "grad_norm": 4.024761199951172, "grad_norm_var": 127.03030673720949, "learning_rate": 0.0001, "loss": 1.421, "loss/crossentropy": 2.083667755126953, "loss/hidden": 1.1640625, "loss/logits": 0.1997053027153015, "loss/reg": 0.005726283416152, "step": 367 }, { "epoch": 0.046, "grad_norm": 2.9291043281555176, "grad_norm_var": 126.89672944049376, "learning_rate": 0.0001, "loss": 1.1321, "loss/crossentropy": 2.7017500400543213, "loss/hidden": 0.90625, "loss/logits": 0.1686232089996338, "loss/reg": 0.005724436603486538, "step": 368 }, { "epoch": 0.046125, "grad_norm": 2.289379119873047, "grad_norm_var": 126.98034912166224, "learning_rate": 0.0001, "loss": 1.0433, "loss/crossentropy": 2.404045581817627, "loss/hidden": 0.8359375, "loss/logits": 0.1501048356294632, "loss/reg": 0.005722455680370331, "step": 369 }, { "epoch": 0.04625, "grad_norm": 2.5955307483673096, "grad_norm_var": 126.95053618311779, "learning_rate": 0.0001, "loss": 1.1052, "loss/crossentropy": 2.555497407913208, "loss/hidden": 0.87890625, "loss/logits": 0.16912290453910828, "loss/reg": 0.0057206167839467525, "step": 370 }, { "epoch": 0.046375, "grad_norm": 2.5631515979766846, "grad_norm_var": 127.05459572518181, "learning_rate": 0.0001, "loss": 1.0105, "loss/crossentropy": 2.3253824710845947, "loss/hidden": 0.80859375, "loss/logits": 0.14470672607421875, "loss/reg": 0.005718756001442671, "step": 371 }, { "epoch": 0.0465, "grad_norm": 2.8995003700256348, "grad_norm_var": 126.65924311218065, "learning_rate": 0.0001, "loss": 1.0727, "loss/crossentropy": 2.5171523094177246, "loss/hidden": 0.859375, "loss/logits": 0.15616215765476227, "loss/reg": 0.005716769490391016, "step": 372 }, { "epoch": 0.046625, "grad_norm": 2.4674322605133057, "grad_norm_var": 127.0582358856119, "learning_rate": 0.0001, "loss": 0.9544, "loss/crossentropy": 2.426679849624634, "loss/hidden": 0.765625, "loss/logits": 0.13166998326778412, "loss/reg": 0.005714884493499994, "step": 373 }, { "epoch": 0.04675, "grad_norm": 2.1486146450042725, "grad_norm_var": 0.5554253140062239, "learning_rate": 0.0001, "loss": 1.0123, "loss/crossentropy": 2.3567564487457275, "loss/hidden": 0.8203125, "loss/logits": 0.1348218023777008, "loss/reg": 0.0057129692286252975, "step": 374 }, { "epoch": 0.046875, "grad_norm": 2.4249770641326904, "grad_norm_var": 0.5421168003854054, "learning_rate": 0.0001, "loss": 1.0005, "loss/crossentropy": 2.575383424758911, "loss/hidden": 0.80078125, "loss/logits": 0.1425924003124237, "loss/reg": 0.005710979457944632, "step": 375 }, { "epoch": 0.047, "grad_norm": 3.9449760913848877, "grad_norm_var": 0.6036429091311817, "learning_rate": 0.0001, "loss": 1.1428, "loss/crossentropy": 2.5839173793792725, "loss/hidden": 0.94921875, "loss/logits": 0.13653349876403809, "loss/reg": 0.0057089440524578094, "step": 376 }, { "epoch": 0.047125, "grad_norm": 2.3119592666625977, "grad_norm_var": 0.6148998912723904, "learning_rate": 0.0001, "loss": 1.088, "loss/crossentropy": 2.492663860321045, "loss/hidden": 0.859375, "loss/logits": 0.1715661883354187, "loss/reg": 0.005707095842808485, "step": 377 }, { "epoch": 0.04725, "grad_norm": 3.586817979812622, "grad_norm_var": 0.6386998540868449, "learning_rate": 0.0001, "loss": 1.0907, "loss/crossentropy": 2.8210177421569824, "loss/hidden": 0.87890625, "loss/logits": 0.15476316213607788, "loss/reg": 0.005705154500901699, "step": 378 }, { "epoch": 0.047375, "grad_norm": 2.805647850036621, "grad_norm_var": 0.6040650287121667, "learning_rate": 0.0001, "loss": 1.0792, "loss/crossentropy": 2.54019832611084, "loss/hidden": 0.859375, "loss/logits": 0.16280022263526917, "loss/reg": 0.005703243892639875, "step": 379 }, { "epoch": 0.0475, "grad_norm": 2.7932748794555664, "grad_norm_var": 0.5445939245804574, "learning_rate": 0.0001, "loss": 1.4621, "loss/crossentropy": 2.2343437671661377, "loss/hidden": 1.1953125, "loss/logits": 0.20978981256484985, "loss/reg": 0.005701290909200907, "step": 380 }, { "epoch": 0.047625, "grad_norm": 2.661917209625244, "grad_norm_var": 0.5482497924242672, "learning_rate": 0.0001, "loss": 0.9746, "loss/crossentropy": 2.782052516937256, "loss/hidden": 0.78125, "loss/logits": 0.13640211522579193, "loss/reg": 0.0056994096376001835, "step": 381 }, { "epoch": 0.04775, "grad_norm": 2.4914302825927734, "grad_norm_var": 0.3228126995822395, "learning_rate": 0.0001, "loss": 1.126, "loss/crossentropy": 2.166295051574707, "loss/hidden": 0.91015625, "loss/logits": 0.1589164137840271, "loss/reg": 0.005697426851838827, "step": 382 }, { "epoch": 0.047875, "grad_norm": 2.961653709411621, "grad_norm_var": 0.22106978564282992, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.5477302074432373, "loss/hidden": 0.8828125, "loss/logits": 0.16730068624019623, "loss/reg": 0.005695413798093796, "step": 383 }, { "epoch": 0.048, "grad_norm": 2.9396286010742188, "grad_norm_var": 0.22133896443579198, "learning_rate": 0.0001, "loss": 1.0254, "loss/crossentropy": 2.555258274078369, "loss/hidden": 0.828125, "loss/logits": 0.1403425633907318, "loss/reg": 0.005693417973816395, "step": 384 }, { "epoch": 0.048125, "grad_norm": 2.8298912048339844, "grad_norm_var": 0.20691636732209961, "learning_rate": 0.0001, "loss": 1.195, "loss/crossentropy": 2.472844362258911, "loss/hidden": 0.984375, "loss/logits": 0.15367946028709412, "loss/reg": 0.005691539496183395, "step": 385 }, { "epoch": 0.04825, "grad_norm": 15.47062873840332, "grad_norm_var": 10.256501481265339, "learning_rate": 0.0001, "loss": 1.4448, "loss/crossentropy": 2.521524667739868, "loss/hidden": 1.203125, "loss/logits": 0.1847420334815979, "loss/reg": 0.005689616315066814, "step": 386 }, { "epoch": 0.048375, "grad_norm": 2.455294370651245, "grad_norm_var": 10.271871141002237, "learning_rate": 0.0001, "loss": 1.1018, "loss/crossentropy": 2.309390068054199, "loss/hidden": 0.89453125, "loss/logits": 0.15039557218551636, "loss/reg": 0.005687698721885681, "step": 387 }, { "epoch": 0.0485, "grad_norm": 3.23420786857605, "grad_norm_var": 10.248744715041969, "learning_rate": 0.0001, "loss": 1.2879, "loss/crossentropy": 2.4902544021606445, "loss/hidden": 1.015625, "loss/logits": 0.2154603898525238, "loss/reg": 0.005685731768608093, "step": 388 }, { "epoch": 0.048625, "grad_norm": 2.660858631134033, "grad_norm_var": 10.221989434520331, "learning_rate": 0.0001, "loss": 1.025, "loss/crossentropy": 2.31535267829895, "loss/hidden": 0.8359375, "loss/logits": 0.13224059343338013, "loss/reg": 0.005683773662894964, "step": 389 }, { "epoch": 0.04875, "grad_norm": 2.4209847450256348, "grad_norm_var": 10.173641089965429, "learning_rate": 0.0001, "loss": 0.9974, "loss/crossentropy": 2.1761093139648438, "loss/hidden": 0.8125, "loss/logits": 0.12805956602096558, "loss/reg": 0.005681932438164949, "step": 390 }, { "epoch": 0.048875, "grad_norm": 3.108008623123169, "grad_norm_var": 10.09354551501582, "learning_rate": 0.0001, "loss": 0.979, "loss/crossentropy": 2.721165657043457, "loss/hidden": 0.78125, "loss/logits": 0.14099523425102234, "loss/reg": 0.005679869093000889, "step": 391 }, { "epoch": 0.049, "grad_norm": 2.6531527042388916, "grad_norm_var": 10.150022289467502, "learning_rate": 0.0001, "loss": 1.1723, "loss/crossentropy": 2.518146514892578, "loss/hidden": 0.9375, "loss/logits": 0.17805764079093933, "loss/reg": 0.005677856504917145, "step": 392 }, { "epoch": 0.049125, "grad_norm": 2.2534499168395996, "grad_norm_var": 10.160179916565673, "learning_rate": 0.0001, "loss": 1.1292, "loss/crossentropy": 2.633385181427002, "loss/hidden": 0.91015625, "loss/logits": 0.16230204701423645, "loss/reg": 0.005675735417753458, "step": 393 }, { "epoch": 0.04925, "grad_norm": 2.9424333572387695, "grad_norm_var": 10.185797665159741, "learning_rate": 0.0001, "loss": 1.4214, "loss/crossentropy": 2.62923002243042, "loss/hidden": 1.15625, "loss/logits": 0.20838308334350586, "loss/reg": 0.00567356962710619, "step": 394 }, { "epoch": 0.049375, "grad_norm": 2.622178792953491, "grad_norm_var": 10.20593051221178, "learning_rate": 0.0001, "loss": 0.9697, "loss/crossentropy": 2.5544826984405518, "loss/hidden": 0.78125, "loss/logits": 0.13172510266304016, "loss/reg": 0.005671407096087933, "step": 395 }, { "epoch": 0.0495, "grad_norm": 2.635505199432373, "grad_norm_var": 10.223008906743342, "learning_rate": 0.0001, "loss": 0.933, "loss/crossentropy": 2.5959105491638184, "loss/hidden": 0.75390625, "loss/logits": 0.12239634245634079, "loss/reg": 0.0056692929938435555, "step": 396 }, { "epoch": 0.049625, "grad_norm": 2.6063406467437744, "grad_norm_var": 10.229570355797922, "learning_rate": 0.0001, "loss": 1.0478, "loss/crossentropy": 2.719916343688965, "loss/hidden": 0.83984375, "loss/logits": 0.15127256512641907, "loss/reg": 0.0056673381477594376, "step": 397 }, { "epoch": 0.04975, "grad_norm": 2.589893102645874, "grad_norm_var": 10.216701025853546, "learning_rate": 0.0001, "loss": 1.1265, "loss/crossentropy": 2.3730130195617676, "loss/hidden": 0.90234375, "loss/logits": 0.16749918460845947, "loss/reg": 0.0056652189232409, "step": 398 }, { "epoch": 0.049875, "grad_norm": 2.1503751277923584, "grad_norm_var": 10.318666846324161, "learning_rate": 0.0001, "loss": 1.1685, "loss/crossentropy": 2.2147741317749023, "loss/hidden": 0.92578125, "loss/logits": 0.1860472559928894, "loss/reg": 0.005663097370415926, "step": 399 }, { "epoch": 0.05, "grad_norm": 3.6945109367370605, "grad_norm_var": 10.300567557859127, "learning_rate": 0.0001, "loss": 1.1272, "loss/crossentropy": 2.4212143421173096, "loss/hidden": 0.921875, "loss/logits": 0.1487593650817871, "loss/reg": 0.005661314353346825, "step": 400 }, { "epoch": 0.050125, "grad_norm": 3.7444777488708496, "grad_norm_var": 10.268632820538057, "learning_rate": 0.0001, "loss": 1.1221, "loss/crossentropy": 2.5369904041290283, "loss/hidden": 0.90625, "loss/logits": 0.15929211676120758, "loss/reg": 0.005659462418407202, "step": 401 }, { "epoch": 0.05025, "grad_norm": 5.121776580810547, "grad_norm_var": 0.5518050614602837, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.2371129989624023, "loss/hidden": 1.2109375, "loss/logits": 0.19960111379623413, "loss/reg": 0.005657529458403587, "step": 402 }, { "epoch": 0.050375, "grad_norm": 28.607572555541992, "grad_norm_var": 41.63994308721723, "learning_rate": 0.0001, "loss": 1.1515, "loss/crossentropy": 2.84385347366333, "loss/hidden": 0.90234375, "loss/logits": 0.19263674318790436, "loss/reg": 0.005655454937368631, "step": 403 }, { "epoch": 0.0505, "grad_norm": 2.38948655128479, "grad_norm_var": 41.834466994087045, "learning_rate": 0.0001, "loss": 1.0929, "loss/crossentropy": 2.2518088817596436, "loss/hidden": 0.8984375, "loss/logits": 0.13791221380233765, "loss/reg": 0.005653408356010914, "step": 404 }, { "epoch": 0.050625, "grad_norm": 6.887917518615723, "grad_norm_var": 41.907583648135414, "learning_rate": 0.0001, "loss": 1.2522, "loss/crossentropy": 2.8729405403137207, "loss/hidden": 1.046875, "loss/logits": 0.14880970120429993, "loss/reg": 0.005651514511555433, "step": 405 }, { "epoch": 0.05075, "grad_norm": 3.2420449256896973, "grad_norm_var": 41.69182027548524, "learning_rate": 0.0001, "loss": 1.2031, "loss/crossentropy": 2.598705530166626, "loss/hidden": 0.98046875, "loss/logits": 0.16617505252361298, "loss/reg": 0.005649634636938572, "step": 406 }, { "epoch": 0.050875, "grad_norm": 2.3294692039489746, "grad_norm_var": 41.9082544413822, "learning_rate": 0.0001, "loss": 1.0316, "loss/crossentropy": 2.7743589878082275, "loss/hidden": 0.84375, "loss/logits": 0.13134868443012238, "loss/reg": 0.005647764541208744, "step": 407 }, { "epoch": 0.051, "grad_norm": 2.3849406242370605, "grad_norm_var": 41.988788990047645, "learning_rate": 0.0001, "loss": 1.1579, "loss/crossentropy": 2.2934722900390625, "loss/hidden": 0.9375, "loss/logits": 0.16397064924240112, "loss/reg": 0.00564591446891427, "step": 408 }, { "epoch": 0.051125, "grad_norm": 2.616523504257202, "grad_norm_var": 41.875558070811756, "learning_rate": 0.0001, "loss": 0.9281, "loss/crossentropy": 2.617312431335449, "loss/hidden": 0.7734375, "loss/logits": 0.09819567203521729, "loss/reg": 0.005644225515425205, "step": 409 }, { "epoch": 0.05125, "grad_norm": 2.302281141281128, "grad_norm_var": 42.058469053043055, "learning_rate": 0.0001, "loss": 1.0583, "loss/crossentropy": 2.8029561042785645, "loss/hidden": 0.859375, "loss/logits": 0.14253735542297363, "loss/reg": 0.005642317235469818, "step": 410 }, { "epoch": 0.051375, "grad_norm": 2.1521739959716797, "grad_norm_var": 42.20532780726832, "learning_rate": 0.0001, "loss": 0.996, "loss/crossentropy": 2.5798304080963135, "loss/hidden": 0.80078125, "loss/logits": 0.13881272077560425, "loss/reg": 0.005640234332531691, "step": 411 }, { "epoch": 0.0515, "grad_norm": 4.3292155265808105, "grad_norm_var": 41.914794683811124, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.4219868183135986, "loss/hidden": 1.0390625, "loss/logits": 0.2562292516231537, "loss/reg": 0.005638125352561474, "step": 412 }, { "epoch": 0.051625, "grad_norm": 19.01975440979004, "grad_norm_var": 53.903843358167165, "learning_rate": 0.0001, "loss": 1.3283, "loss/crossentropy": 2.2926077842712402, "loss/hidden": 1.078125, "loss/logits": 0.19380658864974976, "loss/reg": 0.005636140704154968, "step": 413 }, { "epoch": 0.05175, "grad_norm": 2.859027862548828, "grad_norm_var": 53.791467006877085, "learning_rate": 0.0001, "loss": 1.1115, "loss/crossentropy": 2.429117441177368, "loss/hidden": 0.90234375, "loss/logits": 0.1528070569038391, "loss/reg": 0.005634027067571878, "step": 414 }, { "epoch": 0.051875, "grad_norm": 2.385204792022705, "grad_norm_var": 53.67862289213027, "learning_rate": 0.0001, "loss": 1.0186, "loss/crossentropy": 2.710325002670288, "loss/hidden": 0.81640625, "loss/logits": 0.1458669900894165, "loss/reg": 0.005631967913359404, "step": 415 }, { "epoch": 0.052, "grad_norm": 2.3011677265167236, "grad_norm_var": 54.20582073402194, "learning_rate": 0.0001, "loss": 1.0843, "loss/crossentropy": 2.485734701156616, "loss/hidden": 0.87109375, "loss/logits": 0.1569264829158783, "loss/reg": 0.0056300037540495396, "step": 416 }, { "epoch": 0.052125, "grad_norm": 2.7714357376098633, "grad_norm_var": 54.53064815195892, "learning_rate": 0.0001, "loss": 1.0741, "loss/crossentropy": 2.6249403953552246, "loss/hidden": 0.85546875, "loss/logits": 0.1623522937297821, "loss/reg": 0.0056281075812876225, "step": 417 }, { "epoch": 0.05225, "grad_norm": 2.376473903656006, "grad_norm_var": 55.22478277620113, "learning_rate": 0.0001, "loss": 1.2116, "loss/crossentropy": 2.5150105953216553, "loss/hidden": 0.95703125, "loss/logits": 0.19830524921417236, "loss/reg": 0.005626222584396601, "step": 418 }, { "epoch": 0.052375, "grad_norm": 2.6247470378875732, "grad_norm_var": 17.572360223815615, "learning_rate": 0.0001, "loss": 1.172, "loss/crossentropy": 2.7201685905456543, "loss/hidden": 0.9453125, "loss/logits": 0.17042091488838196, "loss/reg": 0.005624283570796251, "step": 419 }, { "epoch": 0.0525, "grad_norm": 49.02815628051758, "grad_norm_var": 143.90483482694842, "learning_rate": 0.0001, "loss": 5.3824, "loss/crossentropy": 2.692047357559204, "loss/hidden": 4.84375, "loss/logits": 0.48245739936828613, "loss/reg": 0.005622203927487135, "step": 420 }, { "epoch": 0.052625, "grad_norm": 2.6867082118988037, "grad_norm_var": 144.9870986829453, "learning_rate": 0.0001, "loss": 1.2507, "loss/crossentropy": 2.404517412185669, "loss/hidden": 1.0, "loss/logits": 0.19445687532424927, "loss/reg": 0.005620268173515797, "step": 421 }, { "epoch": 0.05275, "grad_norm": 4.397704124450684, "grad_norm_var": 144.55498651709914, "learning_rate": 0.0001, "loss": 1.4596, "loss/crossentropy": 2.1510226726531982, "loss/hidden": 1.2109375, "loss/logits": 0.19246640801429749, "loss/reg": 0.005618296563625336, "step": 422 }, { "epoch": 0.052875, "grad_norm": 4.239573955535889, "grad_norm_var": 143.68003611616095, "learning_rate": 0.0001, "loss": 1.3275, "loss/crossentropy": 2.686849355697632, "loss/hidden": 1.09375, "loss/logits": 0.17758557200431824, "loss/reg": 0.005616751033812761, "step": 423 }, { "epoch": 0.053, "grad_norm": 2.749202251434326, "grad_norm_var": 143.4748837350726, "learning_rate": 0.0001, "loss": 1.0827, "loss/crossentropy": 2.8104846477508545, "loss/hidden": 0.8828125, "loss/logits": 0.1437493860721588, "loss/reg": 0.005615332629531622, "step": 424 }, { "epoch": 0.053125, "grad_norm": 2.459291458129883, "grad_norm_var": 143.5641839570371, "learning_rate": 0.0001, "loss": 1.0548, "loss/crossentropy": 2.5806379318237305, "loss/hidden": 0.8515625, "loss/logits": 0.14714661240577698, "loss/reg": 0.005613364279270172, "step": 425 }, { "epoch": 0.05325, "grad_norm": 2.294171094894409, "grad_norm_var": 143.56904366210821, "learning_rate": 0.0001, "loss": 1.1486, "loss/crossentropy": 2.6366002559661865, "loss/hidden": 0.90234375, "loss/logits": 0.19014191627502441, "loss/reg": 0.005611394997686148, "step": 426 }, { "epoch": 0.053375, "grad_norm": 2.2255382537841797, "grad_norm_var": 143.52399251007708, "learning_rate": 0.0001, "loss": 1.0752, "loss/crossentropy": 2.542306661605835, "loss/hidden": 0.875, "loss/logits": 0.14408408105373383, "loss/reg": 0.005609368905425072, "step": 427 }, { "epoch": 0.0535, "grad_norm": 3.5708723068237305, "grad_norm_var": 143.80942972780392, "learning_rate": 0.0001, "loss": 1.0863, "loss/crossentropy": 2.2636356353759766, "loss/hidden": 0.8828125, "loss/logits": 0.14744916558265686, "loss/reg": 0.005607361439615488, "step": 428 }, { "epoch": 0.053625, "grad_norm": 2.9189610481262207, "grad_norm_var": 133.66980873374825, "learning_rate": 0.0001, "loss": 0.9895, "loss/crossentropy": 2.7651426792144775, "loss/hidden": 0.78515625, "loss/logits": 0.1482805609703064, "loss/reg": 0.005605428479611874, "step": 429 }, { "epoch": 0.05375, "grad_norm": 3.2735564708709717, "grad_norm_var": 133.5211490137515, "learning_rate": 0.0001, "loss": 1.2363, "loss/crossentropy": 2.248082399368286, "loss/hidden": 0.98046875, "loss/logits": 0.19977417588233948, "loss/reg": 0.0056034415028989315, "step": 430 }, { "epoch": 0.053875, "grad_norm": 3.5670769214630127, "grad_norm_var": 133.0752341056661, "learning_rate": 0.0001, "loss": 1.2766, "loss/crossentropy": 2.500338554382324, "loss/hidden": 1.0234375, "loss/logits": 0.19719059765338898, "loss/reg": 0.005601502023637295, "step": 431 }, { "epoch": 0.054, "grad_norm": 2.2697787284851074, "grad_norm_var": 133.0901180807591, "learning_rate": 0.0001, "loss": 0.9931, "loss/crossentropy": 2.6418793201446533, "loss/hidden": 0.7890625, "loss/logits": 0.14799568057060242, "loss/reg": 0.005599519703537226, "step": 432 }, { "epoch": 0.054125, "grad_norm": 3.220383405685425, "grad_norm_var": 132.91898234062202, "learning_rate": 0.0001, "loss": 1.2515, "loss/crossentropy": 2.5073025226593018, "loss/hidden": 1.0390625, "loss/logits": 0.15643876791000366, "loss/reg": 0.005597477313131094, "step": 433 }, { "epoch": 0.05425, "grad_norm": 3.2845206260681152, "grad_norm_var": 132.5476800488924, "learning_rate": 0.0001, "loss": 1.1441, "loss/crossentropy": 2.509037971496582, "loss/hidden": 0.9296875, "loss/logits": 0.15849418938159943, "loss/reg": 0.005595567170530558, "step": 434 }, { "epoch": 0.054375, "grad_norm": 2.254239320755005, "grad_norm_var": 132.71932731242507, "learning_rate": 0.0001, "loss": 0.9815, "loss/crossentropy": 2.567584991455078, "loss/hidden": 0.78125, "loss/logits": 0.14433184266090393, "loss/reg": 0.005593593697994947, "step": 435 }, { "epoch": 0.0545, "grad_norm": 3.2273480892181396, "grad_norm_var": 0.4676980414191933, "learning_rate": 0.0001, "loss": 1.1645, "loss/crossentropy": 2.3639349937438965, "loss/hidden": 0.94921875, "loss/logits": 0.15934088826179504, "loss/reg": 0.0055916691198945045, "step": 436 }, { "epoch": 0.054625, "grad_norm": 2.6044058799743652, "grad_norm_var": 0.47199755801328347, "learning_rate": 0.0001, "loss": 1.1033, "loss/crossentropy": 2.539247989654541, "loss/hidden": 0.8984375, "loss/logits": 0.14898554980754852, "loss/reg": 0.005589775741100311, "step": 437 }, { "epoch": 0.05475, "grad_norm": 2.9674391746520996, "grad_norm_var": 0.3399405404704983, "learning_rate": 0.0001, "loss": 1.252, "loss/crossentropy": 2.5642499923706055, "loss/hidden": 0.9921875, "loss/logits": 0.20391228795051575, "loss/reg": 0.005587900057435036, "step": 438 }, { "epoch": 0.054875, "grad_norm": 2.4164047241210938, "grad_norm_var": 0.23308679379454797, "learning_rate": 0.0001, "loss": 1.1939, "loss/crossentropy": 2.3462696075439453, "loss/hidden": 0.93359375, "loss/logits": 0.2044137418270111, "loss/reg": 0.005585688166320324, "step": 439 }, { "epoch": 0.055, "grad_norm": 2.7590599060058594, "grad_norm_var": 0.2329847653181711, "learning_rate": 0.0001, "loss": 1.0377, "loss/crossentropy": 2.775485038757324, "loss/hidden": 0.84375, "loss/logits": 0.13808496296405792, "loss/reg": 0.0055835009552538395, "step": 440 }, { "epoch": 0.055125, "grad_norm": 2.7251267433166504, "grad_norm_var": 0.224188675724659, "learning_rate": 0.0001, "loss": 1.0001, "loss/crossentropy": 2.4934420585632324, "loss/hidden": 0.80859375, "loss/logits": 0.1357189267873764, "loss/reg": 0.005581483710557222, "step": 441 }, { "epoch": 0.05525, "grad_norm": 2.4774584770202637, "grad_norm_var": 0.21273704839308963, "learning_rate": 0.0001, "loss": 1.2166, "loss/crossentropy": 2.426271438598633, "loss/hidden": 0.95703125, "loss/logits": 0.20375394821166992, "loss/reg": 0.0055792308412492275, "step": 442 }, { "epoch": 0.055375, "grad_norm": 3.2236833572387695, "grad_norm_var": 0.1905493662305197, "learning_rate": 0.0001, "loss": 1.1724, "loss/crossentropy": 2.9799797534942627, "loss/hidden": 0.92578125, "loss/logits": 0.19083930552005768, "loss/reg": 0.005577271804213524, "step": 443 }, { "epoch": 0.0555, "grad_norm": 2.5997183322906494, "grad_norm_var": 0.16554225723918894, "learning_rate": 0.0001, "loss": 1.126, "loss/crossentropy": 2.2098257541656494, "loss/hidden": 0.92578125, "loss/logits": 0.14447355270385742, "loss/reg": 0.005575183313339949, "step": 444 }, { "epoch": 0.055625, "grad_norm": 2.5179152488708496, "grad_norm_var": 0.1725392629592297, "learning_rate": 0.0001, "loss": 1.2018, "loss/crossentropy": 2.0029213428497314, "loss/hidden": 0.98046875, "loss/logits": 0.1655960977077484, "loss/reg": 0.005572900176048279, "step": 445 }, { "epoch": 0.05575, "grad_norm": 2.5075204372406006, "grad_norm_var": 0.16460110044899826, "learning_rate": 0.0001, "loss": 1.0614, "loss/crossentropy": 2.3672924041748047, "loss/hidden": 0.85546875, "loss/logits": 0.15021467208862305, "loss/reg": 0.005570439621806145, "step": 446 }, { "epoch": 0.055875, "grad_norm": 2.441183567047119, "grad_norm_var": 0.12700610259855102, "learning_rate": 0.0001, "loss": 0.9323, "loss/crossentropy": 2.311056137084961, "loss/hidden": 0.7578125, "loss/logits": 0.11881721019744873, "loss/reg": 0.00556844100356102, "step": 447 }, { "epoch": 0.056, "grad_norm": 2.6724319458007812, "grad_norm_var": 0.11304803744365562, "learning_rate": 0.0001, "loss": 1.0937, "loss/crossentropy": 2.562101364135742, "loss/hidden": 0.8671875, "loss/logits": 0.1708334982395172, "loss/reg": 0.005566492676734924, "step": 448 }, { "epoch": 0.056125, "grad_norm": 2.196300506591797, "grad_norm_var": 0.11350312697665288, "learning_rate": 0.0001, "loss": 0.9882, "loss/crossentropy": 2.4227116107940674, "loss/hidden": 0.80078125, "loss/logits": 0.13182450830936432, "loss/reg": 0.00556437112390995, "step": 449 }, { "epoch": 0.05625, "grad_norm": 2.912667989730835, "grad_norm_var": 0.0921566818687341, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 1.9439491033554077, "loss/hidden": 1.109375, "loss/logits": 0.2070913016796112, "loss/reg": 0.0055623650550842285, "step": 450 }, { "epoch": 0.056375, "grad_norm": 2.011991500854492, "grad_norm_var": 0.10881512213368959, "learning_rate": 0.0001, "loss": 1.0172, "loss/crossentropy": 2.498812675476074, "loss/hidden": 0.81640625, "loss/logits": 0.14521706104278564, "loss/reg": 0.005560221150517464, "step": 451 }, { "epoch": 0.0565, "grad_norm": 2.2709267139434814, "grad_norm_var": 0.0912508163184422, "learning_rate": 0.0001, "loss": 1.1384, "loss/crossentropy": 2.320579767227173, "loss/hidden": 0.9140625, "loss/logits": 0.16879746317863464, "loss/reg": 0.005558326840400696, "step": 452 }, { "epoch": 0.056625, "grad_norm": 2.954127788543701, "grad_norm_var": 0.09996231296479816, "learning_rate": 0.0001, "loss": 1.2415, "loss/crossentropy": 2.483376979827881, "loss/hidden": 0.99609375, "loss/logits": 0.18988527357578278, "loss/reg": 0.005556488875299692, "step": 453 }, { "epoch": 0.05675, "grad_norm": 2.442729949951172, "grad_norm_var": 0.0916992305907788, "learning_rate": 0.0001, "loss": 1.0533, "loss/crossentropy": 2.414472818374634, "loss/hidden": 0.84765625, "loss/logits": 0.1501239389181137, "loss/reg": 0.005554646719247103, "step": 454 }, { "epoch": 0.056875, "grad_norm": 2.598292589187622, "grad_norm_var": 0.09002796513685567, "learning_rate": 0.0001, "loss": 0.9797, "loss/crossentropy": 2.8175811767578125, "loss/hidden": 0.78515625, "loss/logits": 0.13899990916252136, "loss/reg": 0.005552831571549177, "step": 455 }, { "epoch": 0.057, "grad_norm": 2.284618616104126, "grad_norm_var": 0.09289234998963139, "learning_rate": 0.0001, "loss": 1.1767, "loss/crossentropy": 2.5178730487823486, "loss/hidden": 0.953125, "loss/logits": 0.1680239588022232, "loss/reg": 0.005550856236368418, "step": 456 }, { "epoch": 0.057125, "grad_norm": 2.9749691486358643, "grad_norm_var": 0.10255115779464533, "learning_rate": 0.0001, "loss": 1.146, "loss/crossentropy": 2.6965036392211914, "loss/hidden": 0.89453125, "loss/logits": 0.19602364301681519, "loss/reg": 0.005548745859414339, "step": 457 }, { "epoch": 0.05725, "grad_norm": 2.4419991970062256, "grad_norm_var": 0.10305738190390912, "learning_rate": 0.0001, "loss": 1.0782, "loss/crossentropy": 2.507200241088867, "loss/hidden": 0.87890625, "loss/logits": 0.14385411143302917, "loss/reg": 0.005546758882701397, "step": 458 }, { "epoch": 0.057375, "grad_norm": 2.41898250579834, "grad_norm_var": 0.07293072023693033, "learning_rate": 0.0001, "loss": 1.0665, "loss/crossentropy": 2.4068796634674072, "loss/hidden": 0.87109375, "loss/logits": 0.13996180891990662, "loss/reg": 0.005544655025005341, "step": 459 }, { "epoch": 0.0575, "grad_norm": 3.584895372390747, "grad_norm_var": 0.1446675774892469, "learning_rate": 0.0001, "loss": 1.419, "loss/crossentropy": 2.4029970169067383, "loss/hidden": 1.15625, "loss/logits": 0.20734865963459015, "loss/reg": 0.005542535334825516, "step": 460 }, { "epoch": 0.057625, "grad_norm": 2.5190699100494385, "grad_norm_var": 0.14465856873481447, "learning_rate": 0.0001, "loss": 1.0687, "loss/crossentropy": 2.632817268371582, "loss/hidden": 0.84375, "loss/logits": 0.16959112882614136, "loss/reg": 0.005540382582694292, "step": 461 }, { "epoch": 0.05775, "grad_norm": 3.293412446975708, "grad_norm_var": 0.1759751166057581, "learning_rate": 0.0001, "loss": 1.2079, "loss/crossentropy": 1.8526346683502197, "loss/hidden": 0.984375, "loss/logits": 0.16817334294319153, "loss/reg": 0.005538390018045902, "step": 462 }, { "epoch": 0.057875, "grad_norm": 2.090097665786743, "grad_norm_var": 0.1923380804679141, "learning_rate": 0.0001, "loss": 1.0403, "loss/crossentropy": 2.7256767749786377, "loss/hidden": 0.83984375, "loss/logits": 0.14509689807891846, "loss/reg": 0.005536381620913744, "step": 463 }, { "epoch": 0.058, "grad_norm": 2.367372751235962, "grad_norm_var": 0.19537989350592183, "learning_rate": 0.0001, "loss": 0.967, "loss/crossentropy": 2.440683603286743, "loss/hidden": 0.78125, "loss/logits": 0.13041679561138153, "loss/reg": 0.005534291733056307, "step": 464 }, { "epoch": 0.058125, "grad_norm": 2.5434730052948, "grad_norm_var": 0.18491306851457617, "learning_rate": 0.0001, "loss": 1.1396, "loss/crossentropy": 2.811406373977661, "loss/hidden": 0.91015625, "loss/logits": 0.1740744560956955, "loss/reg": 0.005532294511795044, "step": 465 }, { "epoch": 0.05825, "grad_norm": 2.613758087158203, "grad_norm_var": 0.17830906169392974, "learning_rate": 0.0001, "loss": 1.0313, "loss/crossentropy": 2.5138356685638428, "loss/hidden": 0.828125, "loss/logits": 0.1479034125804901, "loss/reg": 0.005530340131372213, "step": 466 }, { "epoch": 0.058375, "grad_norm": 3.6053991317749023, "grad_norm_var": 0.21458171164135606, "learning_rate": 0.0001, "loss": 1.2109, "loss/crossentropy": 1.9949983358383179, "loss/hidden": 1.0, "loss/logits": 0.155661940574646, "loss/reg": 0.0055284383706748486, "step": 467 }, { "epoch": 0.0585, "grad_norm": 2.2574644088745117, "grad_norm_var": 0.21534123971961966, "learning_rate": 0.0001, "loss": 1.08, "loss/crossentropy": 2.514662981033325, "loss/hidden": 0.859375, "loss/logits": 0.16538314521312714, "loss/reg": 0.005526562221348286, "step": 468 }, { "epoch": 0.058625, "grad_norm": 2.2614095211029053, "grad_norm_var": 0.2206521084247221, "learning_rate": 0.0001, "loss": 1.2297, "loss/crossentropy": 2.4910507202148438, "loss/hidden": 0.98046875, "loss/logits": 0.19400066137313843, "loss/reg": 0.005524714011698961, "step": 469 }, { "epoch": 0.05875, "grad_norm": 3.083524465560913, "grad_norm_var": 0.22915168035201153, "learning_rate": 0.0001, "loss": 1.1725, "loss/crossentropy": 2.5548853874206543, "loss/hidden": 0.92578125, "loss/logits": 0.19151920080184937, "loss/reg": 0.005522689316421747, "step": 470 }, { "epoch": 0.058875, "grad_norm": 2.6530709266662598, "grad_norm_var": 0.2287156357176549, "learning_rate": 0.0001, "loss": 0.9819, "loss/crossentropy": 2.5769848823547363, "loss/hidden": 0.79296875, "loss/logits": 0.1337730437517166, "loss/reg": 0.00552078802138567, "step": 471 }, { "epoch": 0.059, "grad_norm": 2.857489585876465, "grad_norm_var": 0.21848469951039154, "learning_rate": 0.0001, "loss": 1.2335, "loss/crossentropy": 2.6933629512786865, "loss/hidden": 0.98828125, "loss/logits": 0.19003306329250336, "loss/reg": 0.005518974736332893, "step": 472 }, { "epoch": 0.059125, "grad_norm": 1.960106372833252, "grad_norm_var": 0.24874750636482734, "learning_rate": 0.0001, "loss": 0.9776, "loss/crossentropy": 2.534855365753174, "loss/hidden": 0.7890625, "loss/logits": 0.13338381052017212, "loss/reg": 0.005517229437828064, "step": 473 }, { "epoch": 0.05925, "grad_norm": 2.787822961807251, "grad_norm_var": 0.24619457779295406, "learning_rate": 0.0001, "loss": 1.0858, "loss/crossentropy": 2.396390438079834, "loss/hidden": 0.88671875, "loss/logits": 0.14397624135017395, "loss/reg": 0.005515479948371649, "step": 474 }, { "epoch": 0.059375, "grad_norm": 2.3396122455596924, "grad_norm_var": 0.24936205040752385, "learning_rate": 0.0001, "loss": 1.0392, "loss/crossentropy": 2.6306259632110596, "loss/hidden": 0.83984375, "loss/logits": 0.14426180720329285, "loss/reg": 0.005513759795576334, "step": 475 }, { "epoch": 0.0595, "grad_norm": 2.367551803588867, "grad_norm_var": 0.19447740210993794, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.342672348022461, "loss/hidden": 0.890625, "loss/logits": 0.16136375069618225, "loss/reg": 0.0055120959877967834, "step": 476 }, { "epoch": 0.059625, "grad_norm": 2.3029873371124268, "grad_norm_var": 0.19972845357339655, "learning_rate": 0.0001, "loss": 0.9785, "loss/crossentropy": 2.725276231765747, "loss/hidden": 0.796875, "loss/logits": 0.12647491693496704, "loss/reg": 0.0055101178586483, "step": 477 }, { "epoch": 0.05975, "grad_norm": 2.3109138011932373, "grad_norm_var": 0.1674590503375268, "learning_rate": 0.0001, "loss": 1.012, "loss/crossentropy": 2.6665799617767334, "loss/hidden": 0.81640625, "loss/logits": 0.14054208993911743, "loss/reg": 0.005508116912096739, "step": 478 }, { "epoch": 0.059875, "grad_norm": 2.8778023719787598, "grad_norm_var": 0.1605488706137739, "learning_rate": 0.0001, "loss": 1.0028, "loss/crossentropy": 2.599010705947876, "loss/hidden": 0.80078125, "loss/logits": 0.14697444438934326, "loss/reg": 0.0055063748732209206, "step": 479 }, { "epoch": 0.06, "grad_norm": 2.7762978076934814, "grad_norm_var": 0.15971446982347573, "learning_rate": 0.0001, "loss": 1.1492, "loss/crossentropy": 2.6345436573028564, "loss/hidden": 0.9296875, "loss/logits": 0.1645045280456543, "loss/reg": 0.005504653323441744, "step": 480 }, { "epoch": 0.060125, "grad_norm": 3.0745112895965576, "grad_norm_var": 0.1733429982183973, "learning_rate": 0.0001, "loss": 1.2914, "loss/crossentropy": 2.1021008491516113, "loss/hidden": 1.0546875, "loss/logits": 0.18168240785598755, "loss/reg": 0.005502650048583746, "step": 481 }, { "epoch": 0.06025, "grad_norm": 2.5635828971862793, "grad_norm_var": 0.17362979402171655, "learning_rate": 0.0001, "loss": 1.1746, "loss/crossentropy": 2.599754810333252, "loss/hidden": 0.9453125, "loss/logits": 0.1743006557226181, "loss/reg": 0.005500909872353077, "step": 482 }, { "epoch": 0.060375, "grad_norm": 2.982170343399048, "grad_norm_var": 0.11685041441696337, "learning_rate": 0.0001, "loss": 1.084, "loss/crossentropy": 2.780411958694458, "loss/hidden": 0.875, "loss/logits": 0.15399503707885742, "loss/reg": 0.005499421618878841, "step": 483 }, { "epoch": 0.0605, "grad_norm": 6.475743770599365, "grad_norm_var": 1.0413639393420129, "learning_rate": 0.0001, "loss": 2.1473, "loss/crossentropy": 2.3867931365966797, "loss/hidden": 1.703125, "loss/logits": 0.38922837376594543, "loss/reg": 0.005497433710843325, "step": 484 }, { "epoch": 0.060625, "grad_norm": 2.522434711456299, "grad_norm_var": 1.024975132918582, "learning_rate": 0.0001, "loss": 1.0915, "loss/crossentropy": 2.741684675216675, "loss/hidden": 0.88671875, "loss/logits": 0.14987404644489288, "loss/reg": 0.0054954588413238525, "step": 485 }, { "epoch": 0.06075, "grad_norm": 2.6852359771728516, "grad_norm_var": 1.0236023483547378, "learning_rate": 0.0001, "loss": 1.0905, "loss/crossentropy": 2.2552525997161865, "loss/hidden": 0.8984375, "loss/logits": 0.13711076974868774, "loss/reg": 0.005493887234479189, "step": 486 }, { "epoch": 0.060875, "grad_norm": 6.048346996307373, "grad_norm_var": 1.65671866064532, "learning_rate": 0.0001, "loss": 1.4058, "loss/crossentropy": 3.1526873111724854, "loss/hidden": 1.0625, "loss/logits": 0.2884060740470886, "loss/reg": 0.005492268595844507, "step": 487 }, { "epoch": 0.061, "grad_norm": 5.24729061126709, "grad_norm_var": 1.9496829900519608, "learning_rate": 0.0001, "loss": 1.5487, "loss/crossentropy": 2.391798496246338, "loss/hidden": 1.234375, "loss/logits": 0.2594112157821655, "loss/reg": 0.0054903156124055386, "step": 488 }, { "epoch": 0.061125, "grad_norm": 3.4879932403564453, "grad_norm_var": 1.8414378354073275, "learning_rate": 0.0001, "loss": 1.2408, "loss/crossentropy": 2.3853161334991455, "loss/hidden": 1.015625, "loss/logits": 0.1702655553817749, "loss/reg": 0.005488729570060968, "step": 489 }, { "epoch": 0.06125, "grad_norm": 2.416243076324463, "grad_norm_var": 1.875598350696971, "learning_rate": 0.0001, "loss": 1.0646, "loss/crossentropy": 2.310605049133301, "loss/hidden": 0.86328125, "loss/logits": 0.146418958902359, "loss/reg": 0.005487216170877218, "step": 490 }, { "epoch": 0.061375, "grad_norm": 2.9619152545928955, "grad_norm_var": 1.8217813283025472, "learning_rate": 0.0001, "loss": 1.2577, "loss/crossentropy": 2.3735132217407227, "loss/hidden": 1.015625, "loss/logits": 0.18721503019332886, "loss/reg": 0.005485245026648045, "step": 491 }, { "epoch": 0.0615, "grad_norm": 2.9602112770080566, "grad_norm_var": 1.7685642295810833, "learning_rate": 0.0001, "loss": 1.1274, "loss/crossentropy": 2.6420083045959473, "loss/hidden": 0.90234375, "loss/logits": 0.17025524377822876, "loss/reg": 0.005483296699821949, "step": 492 }, { "epoch": 0.061625, "grad_norm": 2.5772223472595215, "grad_norm_var": 1.7347667738241757, "learning_rate": 0.0001, "loss": 1.1004, "loss/crossentropy": 2.4166319370269775, "loss/hidden": 0.890625, "loss/logits": 0.15491390228271484, "loss/reg": 0.005481342785060406, "step": 493 }, { "epoch": 0.06175, "grad_norm": 2.6494603157043457, "grad_norm_var": 1.693988292922673, "learning_rate": 0.0001, "loss": 1.0762, "loss/crossentropy": 2.7021005153656006, "loss/hidden": 0.8671875, "loss/logits": 0.1542307734489441, "loss/reg": 0.005479689687490463, "step": 494 }, { "epoch": 0.061875, "grad_norm": 2.065351963043213, "grad_norm_var": 1.7911776893626628, "learning_rate": 0.0001, "loss": 1.015, "loss/crossentropy": 2.4842755794525146, "loss/hidden": 0.8203125, "loss/logits": 0.13995476067066193, "loss/reg": 0.005478002596646547, "step": 495 }, { "epoch": 0.062, "grad_norm": 2.650660753250122, "grad_norm_var": 1.8016636980513454, "learning_rate": 0.0001, "loss": 1.1699, "loss/crossentropy": 2.3899097442626953, "loss/hidden": 0.94921875, "loss/logits": 0.16591498255729675, "loss/reg": 0.005476430524140596, "step": 496 }, { "epoch": 0.062125, "grad_norm": 3.412050724029541, "grad_norm_var": 1.7970375838694677, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.4459383487701416, "loss/hidden": 0.94140625, "loss/logits": 0.20212361216545105, "loss/reg": 0.005474465899169445, "step": 497 }, { "epoch": 0.06225, "grad_norm": 2.7389674186706543, "grad_norm_var": 1.7804152177025587, "learning_rate": 0.0001, "loss": 1.1076, "loss/crossentropy": 2.6794888973236084, "loss/hidden": 0.90625, "loss/logits": 0.1465749740600586, "loss/reg": 0.005472847726196051, "step": 498 }, { "epoch": 0.062375, "grad_norm": 20.56003761291504, "grad_norm_var": 20.18846043733062, "learning_rate": 0.0001, "loss": 1.0568, "loss/crossentropy": 2.527268409729004, "loss/hidden": 0.859375, "loss/logits": 0.14275437593460083, "loss/reg": 0.005471326876431704, "step": 499 }, { "epoch": 0.0625, "grad_norm": 2.9909119606018066, "grad_norm_var": 20.013739807194945, "learning_rate": 0.0001, "loss": 1.0002, "loss/crossentropy": 2.311053991317749, "loss/hidden": 0.80859375, "loss/logits": 0.13688521087169647, "loss/reg": 0.005469587165862322, "step": 500 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.2202930782208e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }