| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.0625, |
| "eval_steps": 250, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.000125, |
| "grad_norm": 4.097814559936523, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.1655, |
| "loss/crossentropy": 2.343535900115967, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.17379230260849, |
| "loss/reg": 0.006198255345225334, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00025, |
| "grad_norm": 3.662576913833618, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.4973, |
| "loss/crossentropy": 2.318769931793213, |
| "loss/hidden": 1.1875, |
| "loss/logits": 0.24786217510700226, |
| "loss/reg": 0.006198255345225334, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.000375, |
| "grad_norm": 2.8296749591827393, |
| "learning_rate": 3e-06, |
| "loss": 1.2258, |
| "loss/crossentropy": 2.4907937049865723, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.19112952053546906, |
| "loss/reg": 0.006198245566338301, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 3.057624578475952, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.1136, |
| "loss/crossentropy": 2.744520902633667, |
| "loss/hidden": 0.890625, |
| "loss/logits": 0.16101403534412384, |
| "loss/reg": 0.006198232993483543, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.000625, |
| "grad_norm": 2.7055587768554688, |
| "learning_rate": 5e-06, |
| "loss": 1.1943, |
| "loss/crossentropy": 2.5722062587738037, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.18310005962848663, |
| "loss/reg": 0.0061982134357094765, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 3.789276361465454, |
| "learning_rate": 6e-06, |
| "loss": 1.247, |
| "loss/crossentropy": 2.613312005996704, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.17725251615047455, |
| "loss/reg": 0.006198191549628973, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.000875, |
| "grad_norm": 3.997910499572754, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 1.4206, |
| "loss/crossentropy": 2.4207534790039062, |
| "loss/hidden": 1.125, |
| "loss/logits": 0.2336406409740448, |
| "loss/reg": 0.006198164541274309, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 2.5986244678497314, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.0878, |
| "loss/crossentropy": 2.536424160003662, |
| "loss/hidden": 0.8671875, |
| "loss/logits": 0.1585812270641327, |
| "loss/reg": 0.006198132876306772, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.001125, |
| "grad_norm": 2.2757976055145264, |
| "learning_rate": 9e-06, |
| "loss": 1.1175, |
| "loss/crossentropy": 2.745281219482422, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.16094230115413666, |
| "loss/reg": 0.006198094692081213, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 2.261094808578491, |
| "learning_rate": 1e-05, |
| "loss": 1.0803, |
| "loss/crossentropy": 2.3173577785491943, |
| "loss/hidden": 0.8671875, |
| "loss/logits": 0.15108685195446014, |
| "loss/reg": 0.0061980499885976315, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.001375, |
| "grad_norm": 21.777265548706055, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 2.0501, |
| "loss/crossentropy": 3.2122714519500732, |
| "loss/hidden": 1.7109375, |
| "loss/logits": 0.27713608741760254, |
| "loss/reg": 0.006198008079081774, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 2.5655505657196045, |
| "learning_rate": 1.2e-05, |
| "loss": 1.151, |
| "loss/crossentropy": 2.706430196762085, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.19056561589241028, |
| "loss/reg": 0.0061979577876627445, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.001625, |
| "grad_norm": 2.403053045272827, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 1.0719, |
| "loss/crossentropy": 2.0466296672821045, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.12316589802503586, |
| "loss/reg": 0.0061978911980986595, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 3.840881586074829, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 1.5441, |
| "loss/crossentropy": 2.3191423416137695, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.24779079854488373, |
| "loss/reg": 0.00619781669229269, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.001875, |
| "grad_norm": 2.557331085205078, |
| "learning_rate": 1.5e-05, |
| "loss": 0.9444, |
| "loss/crossentropy": 2.6370084285736084, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.11287336051464081, |
| "loss/reg": 0.006197733338922262, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 3.1850404739379883, |
| "grad_norm_var": 22.31061335402559, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.3213, |
| "loss/crossentropy": 2.676577091217041, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.2046227753162384, |
| "loss/reg": 0.006197639741003513, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.002125, |
| "grad_norm": 2.2587289810180664, |
| "grad_norm_var": 22.553268201402446, |
| "learning_rate": 1.7000000000000003e-05, |
| "loss": 1.0312, |
| "loss/crossentropy": 2.4961040019989014, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.148894801735878, |
| "loss/reg": 0.006197560112923384, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 3.3259811401367188, |
| "grad_norm_var": 22.58044614452358, |
| "learning_rate": 1.8e-05, |
| "loss": 1.3626, |
| "loss/crossentropy": 2.5914387702941895, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.25370728969573975, |
| "loss/reg": 0.006197475362569094, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.002375, |
| "grad_norm": 2.468914747238159, |
| "grad_norm_var": 22.649171856957494, |
| "learning_rate": 1.9e-05, |
| "loss": 1.1683, |
| "loss/crossentropy": 2.6096584796905518, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.18447336554527283, |
| "loss/reg": 0.00619738781824708, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 2.3097646236419678, |
| "grad_norm_var": 22.784756315801523, |
| "learning_rate": 2e-05, |
| "loss": 1.1605, |
| "loss/crossentropy": 2.299048662185669, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.16106057167053223, |
| "loss/reg": 0.006197274662554264, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.002625, |
| "grad_norm": 2.1111207008361816, |
| "grad_norm_var": 22.911025462198744, |
| "learning_rate": 2.1e-05, |
| "loss": 0.939, |
| "loss/crossentropy": 2.547258138656616, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.12698382139205933, |
| "loss/reg": 0.006197154987603426, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 2.4918222427368164, |
| "grad_norm_var": 23.049732177187614, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 1.2047, |
| "loss/crossentropy": 2.2802374362945557, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.18965375423431396, |
| "loss/reg": 0.006197045091539621, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.002875, |
| "grad_norm": 3.3273494243621826, |
| "grad_norm_var": 23.069242834486193, |
| "learning_rate": 2.3000000000000003e-05, |
| "loss": 1.2554, |
| "loss/crossentropy": 2.3062734603881836, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.18566077947616577, |
| "loss/reg": 0.006196921691298485, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 2.5644068717956543, |
| "grad_norm_var": 23.075070365271714, |
| "learning_rate": 2.4e-05, |
| "loss": 1.2266, |
| "loss/crossentropy": 2.460878372192383, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.18418912589550018, |
| "loss/reg": 0.006196786183863878, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.003125, |
| "grad_norm": 2.3506264686584473, |
| "grad_norm_var": 23.059636834121356, |
| "learning_rate": 2.5e-05, |
| "loss": 1.0205, |
| "loss/crossentropy": 2.4281811714172363, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.13434948027133942, |
| "loss/reg": 0.0061966474168002605, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 2.25004506111145, |
| "grad_norm_var": 23.062003716592635, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 1.1133, |
| "loss/crossentropy": 2.326843500137329, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.13725802302360535, |
| "loss/reg": 0.006196498870849609, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.003375, |
| "grad_norm": 2.283770799636841, |
| "grad_norm_var": 0.2469546323472817, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 1.1459, |
| "loss/crossentropy": 2.3002493381500244, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.16987068951129913, |
| "loss/reg": 0.006196335889399052, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 2.805088758468628, |
| "grad_norm_var": 0.24805442740468303, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 1.0272, |
| "loss/crossentropy": 2.510472536087036, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.12927240133285522, |
| "loss/reg": 0.006196176633238792, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.003625, |
| "grad_norm": 2.0331132411956787, |
| "grad_norm_var": 0.2692014993258605, |
| "learning_rate": 2.9e-05, |
| "loss": 1.0913, |
| "loss/crossentropy": 2.51584529876709, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.15820594131946564, |
| "loss/reg": 0.006195997819304466, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 2.1523566246032715, |
| "grad_norm_var": 0.17596421900176604, |
| "learning_rate": 3e-05, |
| "loss": 1.0026, |
| "loss/crossentropy": 2.704220771789551, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.14372289180755615, |
| "loss/reg": 0.0061958180740475655, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.003875, |
| "grad_norm": 2.6658694744110107, |
| "grad_norm_var": 0.1771001402109505, |
| "learning_rate": 3.1e-05, |
| "loss": 1.122, |
| "loss/crossentropy": 2.4840426445007324, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.1655040979385376, |
| "loss/reg": 0.006195634603500366, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 2.813079595565796, |
| "grad_norm_var": 0.153583095436327, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 1.0653, |
| "loss/crossentropy": 2.442962646484375, |
| "loss/hidden": 0.859375, |
| "loss/logits": 0.14400474727153778, |
| "loss/reg": 0.00619542459025979, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.004125, |
| "grad_norm": 2.4273953437805176, |
| "grad_norm_var": 0.1496371777315666, |
| "learning_rate": 3.3e-05, |
| "loss": 1.1025, |
| "loss/crossentropy": 2.515721559524536, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.1460331827402115, |
| "loss/reg": 0.006195210851728916, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 2.0594100952148438, |
| "grad_norm_var": 0.11442956053255457, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 1.118, |
| "loss/crossentropy": 2.5347506999969482, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.15760375559329987, |
| "loss/reg": 0.006195001769810915, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.004375, |
| "grad_norm": 2.497893810272217, |
| "grad_norm_var": 0.11457586733464495, |
| "learning_rate": 3.5e-05, |
| "loss": 1.2359, |
| "loss/crossentropy": 1.7681002616882324, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.13490143418312073, |
| "loss/reg": 0.006194803398102522, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 3.3231709003448486, |
| "grad_norm_var": 0.16029457606237638, |
| "learning_rate": 3.6e-05, |
| "loss": 1.3588, |
| "loss/crossentropy": 2.729518175125122, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.20313453674316406, |
| "loss/reg": 0.00619460316374898, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.004625, |
| "grad_norm": 2.5542962551116943, |
| "grad_norm_var": 0.14901290879942408, |
| "learning_rate": 3.7e-05, |
| "loss": 1.1671, |
| "loss/crossentropy": 2.3359429836273193, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.17546769976615906, |
| "loss/reg": 0.006194361485540867, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 3.5138309001922607, |
| "grad_norm_var": 0.2080724542279834, |
| "learning_rate": 3.8e-05, |
| "loss": 1.2044, |
| "loss/crossentropy": 2.447890520095825, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.17756858468055725, |
| "loss/reg": 0.0061941081658005714, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.004875, |
| "grad_norm": 3.813410758972168, |
| "grad_norm_var": 0.2698887106917669, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 1.0819, |
| "loss/crossentropy": 2.766765832901001, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.13325469195842743, |
| "loss/reg": 0.006193886045366526, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 3.1502718925476074, |
| "grad_norm_var": 0.2860816910243668, |
| "learning_rate": 4e-05, |
| "loss": 1.3622, |
| "loss/crossentropy": 2.3325388431549072, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.19087004661560059, |
| "loss/reg": 0.006193609442561865, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.005125, |
| "grad_norm": 2.422366142272949, |
| "grad_norm_var": 0.28336421674108553, |
| "learning_rate": 4.1e-05, |
| "loss": 1.2212, |
| "loss/crossentropy": 2.3002498149871826, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.19054222106933594, |
| "loss/reg": 0.00619333703070879, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 2.7353622913360596, |
| "grad_norm_var": 0.2707266796228128, |
| "learning_rate": 4.2e-05, |
| "loss": 1.0549, |
| "loss/crossentropy": 2.0319221019744873, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.1140664741396904, |
| "loss/reg": 0.006193041335791349, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.005375, |
| "grad_norm": 1.9425387382507324, |
| "grad_norm_var": 0.2970857034274398, |
| "learning_rate": 4.3e-05, |
| "loss": 1.0366, |
| "loss/crossentropy": 2.431666374206543, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.1426728069782257, |
| "loss/reg": 0.006192733999341726, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 2.7009642124176025, |
| "grad_norm_var": 0.2960522402202514, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 0.9824, |
| "loss/crossentropy": 2.391608476638794, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.13533324003219604, |
| "loss/reg": 0.006192411296069622, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.005625, |
| "grad_norm": 2.6632983684539795, |
| "grad_norm_var": 0.2669107471214488, |
| "learning_rate": 4.5e-05, |
| "loss": 1.1067, |
| "loss/crossentropy": 2.7733116149902344, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.1736893653869629, |
| "loss/reg": 0.006192059256136417, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 2.1037468910217285, |
| "grad_norm_var": 0.2707032714108967, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 0.9831, |
| "loss/crossentropy": 2.4606895446777344, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.13213258981704712, |
| "loss/reg": 0.006191718857735395, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.005875, |
| "grad_norm": 2.1911983489990234, |
| "grad_norm_var": 0.28768473978113296, |
| "learning_rate": 4.7e-05, |
| "loss": 0.9509, |
| "loss/crossentropy": 2.6825270652770996, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.11942489445209503, |
| "loss/reg": 0.006191306747496128, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 3.2640700340270996, |
| "grad_norm_var": 0.30827796768009724, |
| "learning_rate": 4.8e-05, |
| "loss": 1.0346, |
| "loss/crossentropy": 2.3665199279785156, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.14068934321403503, |
| "loss/reg": 0.0061909533105790615, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.006125, |
| "grad_norm": 2.259894847869873, |
| "grad_norm_var": 0.3163475179157634, |
| "learning_rate": 4.9e-05, |
| "loss": 0.9647, |
| "loss/crossentropy": 2.4414587020874023, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.10987477004528046, |
| "loss/reg": 0.0061905342154204845, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 2.7616565227508545, |
| "grad_norm_var": 0.28721415330329, |
| "learning_rate": 5e-05, |
| "loss": 1.019, |
| "loss/crossentropy": 2.0829460620880127, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.11724002659320831, |
| "loss/reg": 0.0061900559812784195, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.006375, |
| "grad_norm": 2.7897861003875732, |
| "grad_norm_var": 0.28297568806904866, |
| "learning_rate": 5.1000000000000006e-05, |
| "loss": 0.853, |
| "loss/crossentropy": 2.5636909008026123, |
| "loss/hidden": 0.6953125, |
| "loss/logits": 0.09577471762895584, |
| "loss/reg": 0.00618965458124876, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 2.3134403228759766, |
| "grad_norm_var": 0.2711290924819705, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 1.0497, |
| "loss/crossentropy": 2.440258026123047, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.14791719615459442, |
| "loss/reg": 0.006189141888171434, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.006625, |
| "grad_norm": 2.2032997608184814, |
| "grad_norm_var": 0.2855897568404882, |
| "learning_rate": 5.300000000000001e-05, |
| "loss": 0.9934, |
| "loss/crossentropy": 2.4747955799102783, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.13461169600486755, |
| "loss/reg": 0.006188610102981329, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 2.267400026321411, |
| "grad_norm_var": 0.24358579758792467, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 1.1149, |
| "loss/crossentropy": 2.705127477645874, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.1585235595703125, |
| "loss/reg": 0.0061880191788077354, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.006875, |
| "grad_norm": 2.281036853790283, |
| "grad_norm_var": 0.14220569464836952, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 0.9642, |
| "loss/crossentropy": 2.545010805130005, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.11717304587364197, |
| "loss/reg": 0.006187579594552517, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 4.942420959472656, |
| "grad_norm_var": 0.4975759650139497, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 1.1237, |
| "loss/crossentropy": 2.7698795795440674, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.14385326206684113, |
| "loss/reg": 0.006187067367136478, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.007125, |
| "grad_norm": 2.4213955402374268, |
| "grad_norm_var": 0.4976009733976563, |
| "learning_rate": 5.6999999999999996e-05, |
| "loss": 1.0386, |
| "loss/crossentropy": 2.572023868560791, |
| "loss/hidden": 0.84765625, |
| "loss/logits": 0.12909512221813202, |
| "loss/reg": 0.006186594720929861, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 2.15891695022583, |
| "grad_norm_var": 0.5091253321428854, |
| "learning_rate": 5.8e-05, |
| "loss": 0.961, |
| "loss/crossentropy": 2.283557415008545, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.12568500638008118, |
| "loss/reg": 0.006185955833643675, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.007375, |
| "grad_norm": 2.36811900138855, |
| "grad_norm_var": 0.48432608682591366, |
| "learning_rate": 5.9e-05, |
| "loss": 0.8386, |
| "loss/crossentropy": 2.453810453414917, |
| "loss/hidden": 0.6796875, |
| "loss/logits": 0.09709502756595612, |
| "loss/reg": 0.0061853062361478806, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 2.591327667236328, |
| "grad_norm_var": 0.4836842483889178, |
| "learning_rate": 6e-05, |
| "loss": 1.033, |
| "loss/crossentropy": 2.8110511302948, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.1547423005104065, |
| "loss/reg": 0.006184632424265146, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.007625, |
| "grad_norm": 2.0103816986083984, |
| "grad_norm_var": 0.5047142009615214, |
| "learning_rate": 6.1e-05, |
| "loss": 0.9296, |
| "loss/crossentropy": 2.15134334564209, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.1099701076745987, |
| "loss/reg": 0.0061841062270104885, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 1.80124831199646, |
| "grad_norm_var": 0.5287549745746596, |
| "learning_rate": 6.2e-05, |
| "loss": 0.9266, |
| "loss/crossentropy": 2.7054479122161865, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.12253857403993607, |
| "loss/reg": 0.0061835781671106815, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.007875, |
| "grad_norm": 2.277440309524536, |
| "grad_norm_var": 0.5252193383179133, |
| "learning_rate": 6.3e-05, |
| "loss": 0.914, |
| "loss/crossentropy": 2.6631381511688232, |
| "loss/hidden": 0.734375, |
| "loss/logits": 0.1177992895245552, |
| "loss/reg": 0.0061830319464206696, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 3.3314151763916016, |
| "grad_norm_var": 0.531964164332922, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 1.29, |
| "loss/crossentropy": 2.1269633769989014, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.16565865278244019, |
| "loss/reg": 0.006182366982102394, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.008125, |
| "grad_norm": 4.333358287811279, |
| "grad_norm_var": 0.7208240839518936, |
| "learning_rate": 6.500000000000001e-05, |
| "loss": 1.1615, |
| "loss/crossentropy": 2.714442491531372, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.15825161337852478, |
| "loss/reg": 0.006181675940752029, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 2.853740930557251, |
| "grad_norm_var": 0.7223776199927481, |
| "learning_rate": 6.6e-05, |
| "loss": 1.062, |
| "loss/crossentropy": 2.2147135734558105, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.14859826862812042, |
| "loss/reg": 0.006180979777127504, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.008375, |
| "grad_norm": 2.8853657245635986, |
| "grad_norm_var": 0.7242961395218184, |
| "learning_rate": 6.7e-05, |
| "loss": 0.9533, |
| "loss/crossentropy": 2.619598388671875, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.11804014444351196, |
| "loss/reg": 0.006180332973599434, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 2.725229501724243, |
| "grad_norm_var": 0.7142181363616674, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 1.1308, |
| "loss/crossentropy": 2.4091367721557617, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.16662752628326416, |
| "loss/reg": 0.006179714575409889, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.008625, |
| "grad_norm": 2.93643856048584, |
| "grad_norm_var": 0.6977178730278022, |
| "learning_rate": 6.9e-05, |
| "loss": 1.1414, |
| "loss/crossentropy": 2.509793281555176, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.17730477452278137, |
| "loss/reg": 0.0061789220198988914, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 2.4086973667144775, |
| "grad_norm_var": 0.6896555586144653, |
| "learning_rate": 7e-05, |
| "loss": 0.9852, |
| "loss/crossentropy": 2.7080371379852295, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.1343374401330948, |
| "loss/reg": 0.0061781019903719425, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.008875, |
| "grad_norm": 1.9355547428131104, |
| "grad_norm_var": 0.7196579708330165, |
| "learning_rate": 7.1e-05, |
| "loss": 0.9176, |
| "loss/crossentropy": 2.451488494873047, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.11365102231502533, |
| "loss/reg": 0.006177456583827734, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 2.273902654647827, |
| "grad_norm_var": 0.38422972669649574, |
| "learning_rate": 7.2e-05, |
| "loss": 1.0112, |
| "loss/crossentropy": 2.4479947090148926, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.13690924644470215, |
| "loss/reg": 0.006176764145493507, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.009125, |
| "grad_norm": 3.385849952697754, |
| "grad_norm_var": 0.4217084598233742, |
| "learning_rate": 7.3e-05, |
| "loss": 1.3992, |
| "loss/crossentropy": 2.3916804790496826, |
| "loss/hidden": 1.1484375, |
| "loss/logits": 0.18896484375, |
| "loss/reg": 0.006176079623401165, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 1.893932580947876, |
| "grad_norm_var": 0.44317594415441114, |
| "learning_rate": 7.4e-05, |
| "loss": 0.9357, |
| "loss/crossentropy": 2.3809518814086914, |
| "loss/hidden": 0.74609375, |
| "loss/logits": 0.12787015736103058, |
| "loss/reg": 0.00617539556697011, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.009375, |
| "grad_norm": 2.431032657623291, |
| "grad_norm_var": 0.4412621914582907, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 1.0796, |
| "loss/crossentropy": 2.5346295833587646, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.1545613557100296, |
| "loss/reg": 0.006174764130264521, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 2.2421321868896484, |
| "grad_norm_var": 0.45066905079875685, |
| "learning_rate": 7.6e-05, |
| "loss": 0.9869, |
| "loss/crossentropy": 2.756843090057373, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.1282375454902649, |
| "loss/reg": 0.006174163427203894, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.009625, |
| "grad_norm": 2.7022979259490967, |
| "grad_norm_var": 0.4254703741989109, |
| "learning_rate": 7.7e-05, |
| "loss": 1.2503, |
| "loss/crossentropy": 2.0696699619293213, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1729813814163208, |
| "loss/reg": 0.006173421163111925, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 2.501106023788452, |
| "grad_norm_var": 0.37677934250983375, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 1.0516, |
| "loss/crossentropy": 2.629380941390991, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.15003597736358643, |
| "loss/reg": 0.006172672379761934, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.009875, |
| "grad_norm": 2.137601137161255, |
| "grad_norm_var": 0.3857841035513881, |
| "learning_rate": 7.900000000000001e-05, |
| "loss": 0.9388, |
| "loss/crossentropy": 2.6841280460357666, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.12706515192985535, |
| "loss/reg": 0.006171974819153547, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 4.655951976776123, |
| "grad_norm_var": 0.6093991769416703, |
| "learning_rate": 8e-05, |
| "loss": 1.2659, |
| "loss/crossentropy": 2.4634439945220947, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16511483490467072, |
| "loss/reg": 0.006171175744384527, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.010125, |
| "grad_norm": 2.2418179512023926, |
| "grad_norm_var": 0.44652068466097317, |
| "learning_rate": 8.1e-05, |
| "loss": 1.0773, |
| "loss/crossentropy": 2.479743480682373, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.1366729438304901, |
| "loss/reg": 0.006170437205582857, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 2.0470192432403564, |
| "grad_norm_var": 0.4640077865797357, |
| "learning_rate": 8.2e-05, |
| "loss": 0.8599, |
| "loss/crossentropy": 2.440803050994873, |
| "loss/hidden": 0.68359375, |
| "loss/logits": 0.11458206921815872, |
| "loss/reg": 0.0061697582714259624, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.010375, |
| "grad_norm": 2.0131125450134277, |
| "grad_norm_var": 0.47694604476552793, |
| "learning_rate": 8.3e-05, |
| "loss": 0.8585, |
| "loss/crossentropy": 2.480877637863159, |
| "loss/hidden": 0.6875, |
| "loss/logits": 0.10927767306566238, |
| "loss/reg": 0.006169027183204889, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 2.2644267082214355, |
| "grad_norm_var": 0.47842071328175656, |
| "learning_rate": 8.4e-05, |
| "loss": 0.8351, |
| "loss/crossentropy": 2.693246841430664, |
| "loss/hidden": 0.67578125, |
| "loss/logits": 0.09764716029167175, |
| "loss/reg": 0.006168315652757883, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.010625, |
| "grad_norm": 3.1729207038879395, |
| "grad_norm_var": 0.4955376038232837, |
| "learning_rate": 8.5e-05, |
| "loss": 1.2314, |
| "loss/crossentropy": 2.3339309692382812, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.15408015251159668, |
| "loss/reg": 0.006167604587972164, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 2.281872510910034, |
| "grad_norm_var": 0.4984116504809473, |
| "learning_rate": 8.6e-05, |
| "loss": 1.1113, |
| "loss/crossentropy": 2.410794258117676, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.16686803102493286, |
| "loss/reg": 0.0061669000424444675, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.010875, |
| "grad_norm": 2.701244354248047, |
| "grad_norm_var": 0.4762769450482454, |
| "learning_rate": 8.7e-05, |
| "loss": 0.9115, |
| "loss/crossentropy": 2.5270962715148926, |
| "loss/hidden": 0.73046875, |
| "loss/logits": 0.11935658752918243, |
| "loss/reg": 0.0061660343781113625, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 2.0738677978515625, |
| "grad_norm_var": 0.4863854399313406, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 0.9634, |
| "loss/crossentropy": 2.625903844833374, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.12826378643512726, |
| "loss/reg": 0.006165289785712957, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.011125, |
| "grad_norm": 2.827744245529175, |
| "grad_norm_var": 0.44340376520124375, |
| "learning_rate": 8.900000000000001e-05, |
| "loss": 1.0134, |
| "loss/crossentropy": 2.2436654567718506, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.15097512304782867, |
| "loss/reg": 0.006164397578686476, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 2.412203788757324, |
| "grad_norm_var": 0.4174983019540292, |
| "learning_rate": 9e-05, |
| "loss": 0.9541, |
| "loss/crossentropy": 2.4847052097320557, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.10735376924276352, |
| "loss/reg": 0.006163434591144323, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.011375, |
| "grad_norm": 2.385309934616089, |
| "grad_norm_var": 0.41831854842319344, |
| "learning_rate": 9.1e-05, |
| "loss": 1.0455, |
| "loss/crossentropy": 2.1011688709259033, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.15577414631843567, |
| "loss/reg": 0.0061626131646335125, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 2.779266595840454, |
| "grad_norm_var": 0.4149256226543306, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 0.9782, |
| "loss/crossentropy": 2.770954132080078, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.13530117273330688, |
| "loss/reg": 0.006161784287542105, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.011625, |
| "grad_norm": 2.816206216812134, |
| "grad_norm_var": 0.41767206123470924, |
| "learning_rate": 9.300000000000001e-05, |
| "loss": 1.2584, |
| "loss/crossentropy": 2.4919488430023193, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.17335021495819092, |
| "loss/reg": 0.006160792429000139, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 2.1000349521636963, |
| "grad_norm_var": 0.4320504871954351, |
| "learning_rate": 9.4e-05, |
| "loss": 0.9293, |
| "loss/crossentropy": 2.6951355934143066, |
| "loss/hidden": 0.7421875, |
| "loss/logits": 0.12551091611385345, |
| "loss/reg": 0.006159830838441849, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.011875, |
| "grad_norm": 2.6696228981018066, |
| "grad_norm_var": 0.4199965621062515, |
| "learning_rate": 9.5e-05, |
| "loss": 1.0491, |
| "loss/crossentropy": 2.6532485485076904, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.14771661162376404, |
| "loss/reg": 0.006158801261335611, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 2.308758020401001, |
| "grad_norm_var": 0.11782165750081125, |
| "learning_rate": 9.6e-05, |
| "loss": 1.1178, |
| "loss/crossentropy": 2.38185977935791, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.1499352604150772, |
| "loss/reg": 0.006157839670777321, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.012125, |
| "grad_norm": 2.4204304218292236, |
| "grad_norm_var": 0.11501335190634426, |
| "learning_rate": 9.7e-05, |
| "loss": 1.092, |
| "loss/crossentropy": 2.4358534812927246, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.16712763905525208, |
| "loss/reg": 0.006156752817332745, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 3.7184524536132812, |
| "grad_norm_var": 0.198780236272727, |
| "learning_rate": 9.8e-05, |
| "loss": 1.4311, |
| "loss/crossentropy": 2.1283679008483887, |
| "loss/hidden": 1.171875, |
| "loss/logits": 0.1976230889558792, |
| "loss/reg": 0.006155804730951786, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.012375, |
| "grad_norm": 3.2656571865081787, |
| "grad_norm_var": 0.20565265002658914, |
| "learning_rate": 9.900000000000001e-05, |
| "loss": 1.017, |
| "loss/crossentropy": 2.6715664863586426, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.15465494990348816, |
| "loss/reg": 0.006154791917651892, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 2.915663719177246, |
| "grad_norm_var": 0.19977570339779593, |
| "learning_rate": 0.0001, |
| "loss": 0.98, |
| "loss/crossentropy": 2.5455305576324463, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.1410846710205078, |
| "loss/reg": 0.0061536673456430435, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.012625, |
| "grad_norm": 3.3153059482574463, |
| "grad_norm_var": 0.2104372314148539, |
| "learning_rate": 0.0001, |
| "loss": 1.1039, |
| "loss/crossentropy": 2.455479621887207, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.13615351915359497, |
| "loss/reg": 0.0061526307836174965, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.01275, |
| "grad_norm": 2.40315318107605, |
| "grad_norm_var": 0.20480568897691, |
| "learning_rate": 0.0001, |
| "loss": 0.9588, |
| "loss/crossentropy": 2.6359853744506836, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.1277719885110855, |
| "loss/reg": 0.006151493173092604, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.012875, |
| "grad_norm": 3.625624895095825, |
| "grad_norm_var": 0.25903479701245613, |
| "learning_rate": 0.0001, |
| "loss": 1.2481, |
| "loss/crossentropy": 2.0148656368255615, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.13969773054122925, |
| "loss/reg": 0.006150420755147934, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 2.497906446456909, |
| "grad_norm_var": 0.23191354079432358, |
| "learning_rate": 0.0001, |
| "loss": 1.0603, |
| "loss/crossentropy": 2.3493525981903076, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.13548779487609863, |
| "loss/reg": 0.006149281747639179, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.013125, |
| "grad_norm": 3.258059501647949, |
| "grad_norm_var": 0.24629299643454275, |
| "learning_rate": 0.0001, |
| "loss": 0.9497, |
| "loss/crossentropy": 2.6988418102264404, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.11473990976810455, |
| "loss/reg": 0.006148339249193668, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.01325, |
| "grad_norm": 3.1279666423797607, |
| "grad_norm_var": 0.24075672502018505, |
| "learning_rate": 0.0001, |
| "loss": 1.1195, |
| "loss/crossentropy": 2.578716278076172, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.18304204940795898, |
| "loss/reg": 0.006147205363959074, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.013375, |
| "grad_norm": 2.760901927947998, |
| "grad_norm_var": 0.22627915570051277, |
| "learning_rate": 0.0001, |
| "loss": 0.9369, |
| "loss/crossentropy": 2.5835328102111816, |
| "loss/hidden": 0.75, |
| "loss/logits": 0.12544697523117065, |
| "loss/reg": 0.006146106868982315, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 3.2917559146881104, |
| "grad_norm_var": 0.23622539643692994, |
| "learning_rate": 0.0001, |
| "loss": 1.1437, |
| "loss/crossentropy": 2.6001460552215576, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.16428819298744202, |
| "loss/reg": 0.006144997663795948, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.013625, |
| "grad_norm": 3.3908517360687256, |
| "grad_norm_var": 0.2499864352593607, |
| "learning_rate": 0.0001, |
| "loss": 1.0747, |
| "loss/crossentropy": 2.6003377437591553, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.14213082194328308, |
| "loss/reg": 0.00614393362775445, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 2.7455620765686035, |
| "grad_norm_var": 0.2035723185991922, |
| "learning_rate": 0.0001, |
| "loss": 1.1844, |
| "loss/crossentropy": 2.446432113647461, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.17372827231884003, |
| "loss/reg": 0.00614282488822937, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.013875, |
| "grad_norm": 2.899392604827881, |
| "grad_norm_var": 0.1972949454934593, |
| "learning_rate": 0.0001, |
| "loss": 1.0314, |
| "loss/crossentropy": 2.4233920574188232, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.13018067181110382, |
| "loss/reg": 0.00614172825589776, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 2.204866647720337, |
| "grad_norm_var": 0.20749751086427656, |
| "learning_rate": 0.0001, |
| "loss": 0.9867, |
| "loss/crossentropy": 2.4006736278533936, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.13233302533626556, |
| "loss/reg": 0.006140332669019699, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.014125, |
| "grad_norm": 2.5094263553619385, |
| "grad_norm_var": 0.20123279411857975, |
| "learning_rate": 0.0001, |
| "loss": 1.2429, |
| "loss/crossentropy": 2.2730560302734375, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.1737476885318756, |
| "loss/reg": 0.006138913799077272, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.01425, |
| "grad_norm": 2.590543031692505, |
| "grad_norm_var": 0.17204464736018749, |
| "learning_rate": 0.0001, |
| "loss": 1.0086, |
| "loss/crossentropy": 2.5709896087646484, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.1542350947856903, |
| "loss/reg": 0.0061377594247460365, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.014375, |
| "grad_norm": 2.5024876594543457, |
| "grad_norm_var": 0.17379926494707643, |
| "learning_rate": 0.0001, |
| "loss": 1.0309, |
| "loss/crossentropy": 2.539165496826172, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.14142319560050964, |
| "loss/reg": 0.006136584095656872, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 3.2216732501983643, |
| "grad_norm_var": 0.18121036366206128, |
| "learning_rate": 0.0001, |
| "loss": 0.9404, |
| "loss/crossentropy": 2.7685325145721436, |
| "loss/hidden": 0.765625, |
| "loss/logits": 0.1133967787027359, |
| "loss/reg": 0.006135319825261831, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.014625, |
| "grad_norm": 2.3834009170532227, |
| "grad_norm_var": 0.18346146088524526, |
| "learning_rate": 0.0001, |
| "loss": 1.1432, |
| "loss/crossentropy": 2.4507999420166016, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.1561031937599182, |
| "loss/reg": 0.006133983377367258, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.01475, |
| "grad_norm": 2.4703636169433594, |
| "grad_norm_var": 0.17984383474256424, |
| "learning_rate": 0.0001, |
| "loss": 1.0541, |
| "loss/crossentropy": 2.3506076335906982, |
| "loss/hidden": 0.84765625, |
| "loss/logits": 0.14511807262897491, |
| "loss/reg": 0.006132753100246191, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.014875, |
| "grad_norm": 2.5960817337036133, |
| "grad_norm_var": 0.13859654880591943, |
| "learning_rate": 0.0001, |
| "loss": 1.2156, |
| "loss/crossentropy": 2.427006244659424, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.1855170726776123, |
| "loss/reg": 0.006131566129624844, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 2.908734083175659, |
| "grad_norm_var": 0.13379147574996655, |
| "learning_rate": 0.0001, |
| "loss": 1.0136, |
| "loss/crossentropy": 2.4075210094451904, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.13592825829982758, |
| "loss/reg": 0.006130332592874765, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.015125, |
| "grad_norm": 3.450002670288086, |
| "grad_norm_var": 0.147717685364636, |
| "learning_rate": 0.0001, |
| "loss": 1.1584, |
| "loss/crossentropy": 2.446925640106201, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.17129938304424286, |
| "loss/reg": 0.0061291721649467945, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.01525, |
| "grad_norm": 2.941195011138916, |
| "grad_norm_var": 0.14212594790061886, |
| "learning_rate": 0.0001, |
| "loss": 1.0996, |
| "loss/crossentropy": 2.5499086380004883, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.1672220528125763, |
| "loss/reg": 0.006127914879471064, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.015375, |
| "grad_norm": 2.951799154281616, |
| "grad_norm_var": 0.14330143067309015, |
| "learning_rate": 0.0001, |
| "loss": 1.0862, |
| "loss/crossentropy": 2.654383420944214, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.15379250049591064, |
| "loss/reg": 0.006126696243882179, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 2.5093131065368652, |
| "grad_norm_var": 0.13194533540905293, |
| "learning_rate": 0.0001, |
| "loss": 1.0905, |
| "loss/crossentropy": 2.4646618366241455, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.15029752254486084, |
| "loss/reg": 0.006125394720584154, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.015625, |
| "grad_norm": 2.357142448425293, |
| "grad_norm_var": 0.11277765633995311, |
| "learning_rate": 0.0001, |
| "loss": 1.0794, |
| "loss/crossentropy": 2.4590322971343994, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.1471107453107834, |
| "loss/reg": 0.0061240773648023605, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.01575, |
| "grad_norm": 2.0443954467773438, |
| "grad_norm_var": 0.13949059079901172, |
| "learning_rate": 0.0001, |
| "loss": 1.0064, |
| "loss/crossentropy": 2.6105568408966064, |
| "loss/hidden": 0.80859375, |
| "loss/logits": 0.13658249378204346, |
| "loss/reg": 0.006122750695794821, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.015875, |
| "grad_norm": 2.334003448486328, |
| "grad_norm_var": 0.1413326038540049, |
| "learning_rate": 0.0001, |
| "loss": 1.128, |
| "loss/crossentropy": 2.3226428031921387, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.16836631298065186, |
| "loss/reg": 0.006121381651610136, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 2.6693766117095947, |
| "grad_norm_var": 0.12889249481462456, |
| "learning_rate": 0.0001, |
| "loss": 1.0478, |
| "loss/crossentropy": 2.5844597816467285, |
| "loss/hidden": 0.84765625, |
| "loss/logits": 0.1388963758945465, |
| "loss/reg": 0.006120136007666588, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.016125, |
| "grad_norm": 3.935439348220825, |
| "grad_norm_var": 0.22878447427120438, |
| "learning_rate": 0.0001, |
| "loss": 1.1726, |
| "loss/crossentropy": 2.7213780879974365, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.1738772690296173, |
| "loss/reg": 0.006118897348642349, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 3.463432788848877, |
| "grad_norm_var": 0.25882213944617144, |
| "learning_rate": 0.0001, |
| "loss": 1.0898, |
| "loss/crossentropy": 2.3635873794555664, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.1457763910293579, |
| "loss/reg": 0.006117486394941807, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.016375, |
| "grad_norm": 3.779526948928833, |
| "grad_norm_var": 0.31074183113488135, |
| "learning_rate": 0.0001, |
| "loss": 1.2078, |
| "loss/crossentropy": 2.316762924194336, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.16614478826522827, |
| "loss/reg": 0.006116243079304695, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 2.7554008960723877, |
| "grad_norm_var": 0.3028391023812749, |
| "learning_rate": 0.0001, |
| "loss": 0.9769, |
| "loss/crossentropy": 2.458954095840454, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.12667913734912872, |
| "loss/reg": 0.006114880088716745, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.016625, |
| "grad_norm": 2.342526435852051, |
| "grad_norm_var": 0.30546929082944035, |
| "learning_rate": 0.0001, |
| "loss": 1.1137, |
| "loss/crossentropy": 2.6329517364501953, |
| "loss/hidden": 0.890625, |
| "loss/logits": 0.161947563290596, |
| "loss/reg": 0.0061136274598538876, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.01675, |
| "grad_norm": 2.2754058837890625, |
| "grad_norm_var": 0.31756495416411024, |
| "learning_rate": 0.0001, |
| "loss": 1.1703, |
| "loss/crossentropy": 2.2747550010681152, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.15994513034820557, |
| "loss/reg": 0.006112351547926664, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.016875, |
| "grad_norm": 3.1313912868499756, |
| "grad_norm_var": 0.3186282278045513, |
| "learning_rate": 0.0001, |
| "loss": 1.2333, |
| "loss/crossentropy": 2.4932894706726074, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.17612434923648834, |
| "loss/reg": 0.006111042574048042, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 3.960482358932495, |
| "grad_norm_var": 0.39381746513703864, |
| "learning_rate": 0.0001, |
| "loss": 1.3101, |
| "loss/crossentropy": 2.581660747528076, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.18646802008152008, |
| "loss/reg": 0.006109676789492369, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.017125, |
| "grad_norm": 2.7605810165405273, |
| "grad_norm_var": 0.37584340109069647, |
| "learning_rate": 0.0001, |
| "loss": 0.8792, |
| "loss/crossentropy": 2.6490936279296875, |
| "loss/hidden": 0.703125, |
| "loss/logits": 0.1150316372513771, |
| "loss/reg": 0.006108277477324009, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.01725, |
| "grad_norm": 2.6196203231811523, |
| "grad_norm_var": 0.38003486499210315, |
| "learning_rate": 0.0001, |
| "loss": 0.955, |
| "loss/crossentropy": 2.633441209793091, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.1244344562292099, |
| "loss/reg": 0.006106934975832701, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.017375, |
| "grad_norm": 4.534512519836426, |
| "grad_norm_var": 0.554255985026353, |
| "learning_rate": 0.0001, |
| "loss": 1.4104, |
| "loss/crossentropy": 2.2204151153564453, |
| "loss/hidden": 1.1796875, |
| "loss/logits": 0.1696874350309372, |
| "loss/reg": 0.0061056241393089294, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 2.192370653152466, |
| "grad_norm_var": 0.5798771099829023, |
| "learning_rate": 0.0001, |
| "loss": 1.1299, |
| "loss/crossentropy": 2.375506639480591, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.14694982767105103, |
| "loss/reg": 0.0061043244786560535, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.017625, |
| "grad_norm": 4.368403911590576, |
| "grad_norm_var": 0.6744588881998081, |
| "learning_rate": 0.0001, |
| "loss": 1.278, |
| "loss/crossentropy": 2.3692545890808105, |
| "loss/hidden": 1.03125, |
| "loss/logits": 0.18568292260169983, |
| "loss/reg": 0.006102937273681164, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.01775, |
| "grad_norm": 2.2753779888153076, |
| "grad_norm_var": 0.6461169960118004, |
| "learning_rate": 0.0001, |
| "loss": 1.0276, |
| "loss/crossentropy": 2.470676898956299, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.14231771230697632, |
| "loss/reg": 0.006101653911173344, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.017875, |
| "grad_norm": 2.6550562381744385, |
| "grad_norm_var": 0.6203099666067883, |
| "learning_rate": 0.0001, |
| "loss": 0.8712, |
| "loss/crossentropy": 2.8198063373565674, |
| "loss/hidden": 0.69921875, |
| "loss/logits": 0.11099085956811905, |
| "loss/reg": 0.006100376136600971, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 2.8701858520507812, |
| "grad_norm_var": 0.6111015072729884, |
| "learning_rate": 0.0001, |
| "loss": 1.1794, |
| "loss/crossentropy": 2.413463830947876, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.15351834893226624, |
| "loss/reg": 0.006099053658545017, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.018125, |
| "grad_norm": 2.2347958087921143, |
| "grad_norm_var": 0.6069563505613275, |
| "learning_rate": 0.0001, |
| "loss": 1.0832, |
| "loss/crossentropy": 2.446056604385376, |
| "loss/hidden": 0.8671875, |
| "loss/logits": 0.1550455242395401, |
| "loss/reg": 0.006097796373069286, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.01825, |
| "grad_norm": 2.60143780708313, |
| "grad_norm_var": 0.6017061449507364, |
| "learning_rate": 0.0001, |
| "loss": 1.1216, |
| "loss/crossentropy": 2.2890260219573975, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.16223573684692383, |
| "loss/reg": 0.006096460856497288, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.018375, |
| "grad_norm": 3.656100273132324, |
| "grad_norm_var": 0.5891684064627459, |
| "learning_rate": 0.0001, |
| "loss": 1.2759, |
| "loss/crossentropy": 2.2077646255493164, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.16024138033390045, |
| "loss/reg": 0.006095105782151222, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 2.8190999031066895, |
| "grad_norm_var": 0.5877513730221795, |
| "learning_rate": 0.0001, |
| "loss": 1.1416, |
| "loss/crossentropy": 2.4892842769622803, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.1665700376033783, |
| "loss/reg": 0.0060938019305467606, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.018625, |
| "grad_norm": 2.6578848361968994, |
| "grad_norm_var": 0.568168306773175, |
| "learning_rate": 0.0001, |
| "loss": 1.1443, |
| "loss/crossentropy": 2.3138527870178223, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.14977282285690308, |
| "loss/reg": 0.006092346739023924, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 2.656559944152832, |
| "grad_norm_var": 0.5416540961853636, |
| "learning_rate": 0.0001, |
| "loss": 0.9868, |
| "loss/crossentropy": 2.7701377868652344, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.12901648879051208, |
| "loss/reg": 0.006090943701565266, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.018875, |
| "grad_norm": 1.9359983205795288, |
| "grad_norm_var": 0.6099613145708634, |
| "learning_rate": 0.0001, |
| "loss": 0.9127, |
| "loss/crossentropy": 2.55560040473938, |
| "loss/hidden": 0.73828125, |
| "loss/logits": 0.11351295560598373, |
| "loss/reg": 0.00608965614810586, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 3.7978732585906982, |
| "grad_norm_var": 0.5891613317586338, |
| "learning_rate": 0.0001, |
| "loss": 1.2275, |
| "loss/crossentropy": 2.4227731227874756, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.17836451530456543, |
| "loss/reg": 0.006088252179324627, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.019125, |
| "grad_norm": 2.8193647861480713, |
| "grad_norm_var": 0.588169020521083, |
| "learning_rate": 0.0001, |
| "loss": 0.9739, |
| "loss/crossentropy": 2.474368095397949, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.11225409805774689, |
| "loss/reg": 0.006086937617510557, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.01925, |
| "grad_norm": 2.2882325649261475, |
| "grad_norm_var": 0.6082348956957436, |
| "learning_rate": 0.0001, |
| "loss": 1.0395, |
| "loss/crossentropy": 2.3776350021362305, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.15443992614746094, |
| "loss/reg": 0.0060854703187942505, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.019375, |
| "grad_norm": 2.006150245666504, |
| "grad_norm_var": 0.4559805309993303, |
| "learning_rate": 0.0001, |
| "loss": 0.9762, |
| "loss/crossentropy": 2.7556076049804688, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.13019207119941711, |
| "loss/reg": 0.006084186024963856, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 2.8143231868743896, |
| "grad_norm_var": 0.43477030174237014, |
| "learning_rate": 0.0001, |
| "loss": 1.1927, |
| "loss/crossentropy": 2.652045249938965, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.19042611122131348, |
| "loss/reg": 0.00608274107798934, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.019625, |
| "grad_norm": 2.957540988922119, |
| "grad_norm_var": 0.2601037584282233, |
| "learning_rate": 0.0001, |
| "loss": 1.0641, |
| "loss/crossentropy": 2.546213150024414, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.14000022411346436, |
| "loss/reg": 0.006081291940063238, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.01975, |
| "grad_norm": 2.625493288040161, |
| "grad_norm_var": 0.24839219907499052, |
| "learning_rate": 0.0001, |
| "loss": 1.012, |
| "loss/crossentropy": 2.5120432376861572, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.13474689424037933, |
| "loss/reg": 0.006079958751797676, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.019875, |
| "grad_norm": 2.6614878177642822, |
| "grad_norm_var": 0.2483457330217589, |
| "learning_rate": 0.0001, |
| "loss": 0.9873, |
| "loss/crossentropy": 2.312061071395874, |
| "loss/hidden": 0.80859375, |
| "loss/logits": 0.11790065467357635, |
| "loss/reg": 0.006078665144741535, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 2.6204919815063477, |
| "grad_norm_var": 0.24699792562249925, |
| "learning_rate": 0.0001, |
| "loss": 1.0488, |
| "loss/crossentropy": 2.505072593688965, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.14428117871284485, |
| "loss/reg": 0.006077310536056757, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.020125, |
| "grad_norm": 3.107072591781616, |
| "grad_norm_var": 0.24079003208151678, |
| "learning_rate": 0.0001, |
| "loss": 1.1736, |
| "loss/crossentropy": 2.6514599323272705, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.1480400413274765, |
| "loss/reg": 0.006076075602322817, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.02025, |
| "grad_norm": 2.669001817703247, |
| "grad_norm_var": 0.23972287159530806, |
| "learning_rate": 0.0001, |
| "loss": 1.1966, |
| "loss/crossentropy": 2.4616479873657227, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.15933012962341309, |
| "loss/reg": 0.006074720993638039, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.020375, |
| "grad_norm": 2.5872421264648438, |
| "grad_norm_var": 0.1828196031273113, |
| "learning_rate": 0.0001, |
| "loss": 1.0551, |
| "loss/crossentropy": 2.5483999252319336, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.1544739305973053, |
| "loss/reg": 0.006073469761759043, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 2.3342509269714355, |
| "grad_norm_var": 0.1891007671877621, |
| "learning_rate": 0.0001, |
| "loss": 1.1418, |
| "loss/crossentropy": 2.610344171524048, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.17876723408699036, |
| "loss/reg": 0.006072178483009338, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.020625, |
| "grad_norm": 2.548274278640747, |
| "grad_norm_var": 0.18986337395058156, |
| "learning_rate": 0.0001, |
| "loss": 0.9512, |
| "loss/crossentropy": 2.747725009918213, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.11706214398145676, |
| "loss/reg": 0.00607073912397027, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.02075, |
| "grad_norm": 2.666066884994507, |
| "grad_norm_var": 0.18987501227134793, |
| "learning_rate": 0.0001, |
| "loss": 1.0557, |
| "loss/crossentropy": 2.3086578845977783, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.1551416665315628, |
| "loss/reg": 0.006069260183721781, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.020875, |
| "grad_norm": 3.363084554672241, |
| "grad_norm_var": 0.18083982986582872, |
| "learning_rate": 0.0001, |
| "loss": 0.9886, |
| "loss/crossentropy": 2.7422661781311035, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.13497118651866913, |
| "loss/reg": 0.006067754700779915, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 2.717400550842285, |
| "grad_norm_var": 0.10163689874761227, |
| "learning_rate": 0.0001, |
| "loss": 1.2413, |
| "loss/crossentropy": 2.341296672821045, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.17277640104293823, |
| "loss/reg": 0.006066245958209038, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.021125, |
| "grad_norm": 2.2773897647857666, |
| "grad_norm_var": 0.10949759007257095, |
| "learning_rate": 0.0001, |
| "loss": 0.9531, |
| "loss/crossentropy": 2.492532968521118, |
| "loss/hidden": 0.76953125, |
| "loss/logits": 0.12295819818973541, |
| "loss/reg": 0.006064848508685827, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 2.7625067234039307, |
| "grad_norm_var": 0.1012976809853086, |
| "learning_rate": 0.0001, |
| "loss": 1.0102, |
| "loss/crossentropy": 2.3799381256103516, |
| "loss/hidden": 0.80859375, |
| "loss/logits": 0.140989288687706, |
| "loss/reg": 0.0060633583925664425, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.021375, |
| "grad_norm": 3.713162899017334, |
| "grad_norm_var": 0.1323542313667114, |
| "learning_rate": 0.0001, |
| "loss": 1.0173, |
| "loss/crossentropy": 2.7296385765075684, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.1559314727783203, |
| "loss/reg": 0.006062004715204239, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 2.8448026180267334, |
| "grad_norm_var": 0.13256580340874963, |
| "learning_rate": 0.0001, |
| "loss": 1.0945, |
| "loss/crossentropy": 2.211848497390747, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.15503031015396118, |
| "loss/reg": 0.006060663145035505, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.021625, |
| "grad_norm": 2.951566696166992, |
| "grad_norm_var": 0.13242537871232402, |
| "learning_rate": 0.0001, |
| "loss": 1.243, |
| "loss/crossentropy": 2.6379833221435547, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.21754613518714905, |
| "loss/reg": 0.00605935649946332, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.02175, |
| "grad_norm": 2.6862404346466064, |
| "grad_norm_var": 0.13142011502921586, |
| "learning_rate": 0.0001, |
| "loss": 1.0053, |
| "loss/crossentropy": 2.3807766437530518, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.14393460750579834, |
| "loss/reg": 0.006058130878955126, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.021875, |
| "grad_norm": 2.5145609378814697, |
| "grad_norm_var": 0.13512780159794507, |
| "learning_rate": 0.0001, |
| "loss": 1.0609, |
| "loss/crossentropy": 2.4608380794525146, |
| "loss/hidden": 0.85546875, |
| "loss/logits": 0.14485566318035126, |
| "loss/reg": 0.006056922487914562, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 3.23178768157959, |
| "grad_norm_var": 0.14607750168249728, |
| "learning_rate": 0.0001, |
| "loss": 1.1294, |
| "loss/crossentropy": 2.9791719913482666, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.1508345603942871, |
| "loss/reg": 0.006055623292922974, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.022125, |
| "grad_norm": 2.7397234439849854, |
| "grad_norm_var": 0.14000512423072375, |
| "learning_rate": 0.0001, |
| "loss": 1.0578, |
| "loss/crossentropy": 2.4559919834136963, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.1340080350637436, |
| "loss/reg": 0.0060544307343661785, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.02225, |
| "grad_norm": 2.6637048721313477, |
| "grad_norm_var": 0.14009088002925954, |
| "learning_rate": 0.0001, |
| "loss": 1.076, |
| "loss/crossentropy": 2.3794586658477783, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.15214313566684723, |
| "loss/reg": 0.0060530174523591995, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.022375, |
| "grad_norm": 2.0105221271514893, |
| "grad_norm_var": 0.17628626628935157, |
| "learning_rate": 0.0001, |
| "loss": 0.9703, |
| "loss/crossentropy": 2.3926336765289307, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.13244566321372986, |
| "loss/reg": 0.0060517978854477406, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 2.571902275085449, |
| "grad_norm_var": 0.16659277386996318, |
| "learning_rate": 0.0001, |
| "loss": 1.0739, |
| "loss/crossentropy": 2.7502923011779785, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.16181406378746033, |
| "loss/reg": 0.006050686351954937, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.022625, |
| "grad_norm": 2.700366973876953, |
| "grad_norm_var": 0.1636147823311904, |
| "learning_rate": 0.0001, |
| "loss": 1.0113, |
| "loss/crossentropy": 2.502389669418335, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.138347327709198, |
| "loss/reg": 0.006049246061593294, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.02275, |
| "grad_norm": 2.7259435653686523, |
| "grad_norm_var": 0.1629618050893432, |
| "learning_rate": 0.0001, |
| "loss": 1.0192, |
| "loss/crossentropy": 2.2493560314178467, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.1344609260559082, |
| "loss/reg": 0.006048021838068962, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.022875, |
| "grad_norm": 4.930091857910156, |
| "grad_norm_var": 0.43832731745023895, |
| "learning_rate": 0.0001, |
| "loss": 1.1874, |
| "loss/crossentropy": 2.649231433868408, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.1855432242155075, |
| "loss/reg": 0.006046844646334648, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 2.288604259490967, |
| "grad_norm_var": 0.4589782783160859, |
| "learning_rate": 0.0001, |
| "loss": 1.0354, |
| "loss/crossentropy": 3.0482568740844727, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.15461647510528564, |
| "loss/reg": 0.006045445334166288, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.023125, |
| "grad_norm": 2.7902991771698, |
| "grad_norm_var": 0.4362058684835667, |
| "learning_rate": 0.0001, |
| "loss": 1.0744, |
| "loss/crossentropy": 2.726069211959839, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.17799492180347443, |
| "loss/reg": 0.006044231820851564, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 3.597017526626587, |
| "grad_norm_var": 0.46633972017124825, |
| "learning_rate": 0.0001, |
| "loss": 1.0985, |
| "loss/crossentropy": 2.200692892074585, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.13961729407310486, |
| "loss/reg": 0.006042772904038429, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.023375, |
| "grad_norm": 2.969062566757202, |
| "grad_norm_var": 0.42374272593361867, |
| "learning_rate": 0.0001, |
| "loss": 1.2314, |
| "loss/crossentropy": 2.3744540214538574, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.20225511491298676, |
| "loss/reg": 0.006041594315320253, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 3.2257020473480225, |
| "grad_norm_var": 0.4305906329857976, |
| "learning_rate": 0.0001, |
| "loss": 1.0982, |
| "loss/crossentropy": 2.442505121231079, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.16284233331680298, |
| "loss/reg": 0.006040407810360193, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.023625, |
| "grad_norm": 3.670443058013916, |
| "grad_norm_var": 0.4666515285365591, |
| "learning_rate": 0.0001, |
| "loss": 1.2391, |
| "loss/crossentropy": 2.533158540725708, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.19827201962471008, |
| "loss/reg": 0.0060392809100449085, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 7.53206729888916, |
| "grad_norm_var": 1.7591779439754056, |
| "learning_rate": 0.0001, |
| "loss": 1.1689, |
| "loss/crossentropy": 2.3104734420776367, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.13976144790649414, |
| "loss/reg": 0.006038178689777851, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.023875, |
| "grad_norm": 4.658889293670654, |
| "grad_norm_var": 1.833400975261701, |
| "learning_rate": 0.0001, |
| "loss": 1.3266, |
| "loss/crossentropy": 2.286229133605957, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.16465552151203156, |
| "loss/reg": 0.006036726757884026, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 3.2109904289245605, |
| "grad_norm_var": 1.8338781863373583, |
| "learning_rate": 0.0001, |
| "loss": 1.278, |
| "loss/crossentropy": 2.5849151611328125, |
| "loss/hidden": 1.0078125, |
| "loss/logits": 0.20983844995498657, |
| "loss/reg": 0.006035543512552977, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.024125, |
| "grad_norm": 2.556408643722534, |
| "grad_norm_var": 1.8519417466969637, |
| "learning_rate": 0.0001, |
| "loss": 1.0335, |
| "loss/crossentropy": 2.635669231414795, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.13721294701099396, |
| "loss/reg": 0.006034051068127155, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 3.4185855388641357, |
| "grad_norm_var": 1.8153229069184569, |
| "learning_rate": 0.0001, |
| "loss": 1.0115, |
| "loss/crossentropy": 2.3127341270446777, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.12303752452135086, |
| "loss/reg": 0.00603274954482913, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.024375, |
| "grad_norm": 3.639681816101074, |
| "grad_norm_var": 1.6731808292397734, |
| "learning_rate": 0.0001, |
| "loss": 1.2374, |
| "loss/crossentropy": 2.4363749027252197, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.19659578800201416, |
| "loss/reg": 0.006031363736838102, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 3.266385078430176, |
| "grad_norm_var": 1.614572274352353, |
| "learning_rate": 0.0001, |
| "loss": 1.19, |
| "loss/crossentropy": 2.2824337482452393, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.16878634691238403, |
| "loss/reg": 0.006029782351106405, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.024625, |
| "grad_norm": 3.0692105293273926, |
| "grad_norm_var": 1.5801212385016838, |
| "learning_rate": 0.0001, |
| "loss": 1.1495, |
| "loss/crossentropy": 2.518056631088257, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.16731634736061096, |
| "loss/reg": 0.006028252653777599, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 3.390202283859253, |
| "grad_norm_var": 1.530565626963321, |
| "learning_rate": 0.0001, |
| "loss": 1.1783, |
| "loss/crossentropy": 2.3565316200256348, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.18055224418640137, |
| "loss/reg": 0.006026738323271275, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.024875, |
| "grad_norm": 2.524461030960083, |
| "grad_norm_var": 1.4779304822181976, |
| "learning_rate": 0.0001, |
| "loss": 1.095, |
| "loss/crossentropy": 2.3489255905151367, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.1480264812707901, |
| "loss/reg": 0.006025230046361685, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 2.8753433227539062, |
| "grad_norm_var": 1.4056158732497617, |
| "learning_rate": 0.0001, |
| "loss": 1.1396, |
| "loss/crossentropy": 2.379971504211426, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.17312359809875488, |
| "loss/reg": 0.0060236188583076, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.025125, |
| "grad_norm": 2.2297983169555664, |
| "grad_norm_var": 1.4801331513155804, |
| "learning_rate": 0.0001, |
| "loss": 1.1642, |
| "loss/crossentropy": 2.401499032974243, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.1743072271347046, |
| "loss/reg": 0.006021994166076183, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.02525, |
| "grad_norm": 2.7430193424224854, |
| "grad_norm_var": 1.5134885749372204, |
| "learning_rate": 0.0001, |
| "loss": 1.3503, |
| "loss/crossentropy": 2.3397345542907715, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1963859498500824, |
| "loss/reg": 0.006020485423505306, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.025375, |
| "grad_norm": 3.3862688541412354, |
| "grad_norm_var": 1.4983780502999742, |
| "learning_rate": 0.0001, |
| "loss": 1.3154, |
| "loss/crossentropy": 2.3259048461914062, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.1614416241645813, |
| "loss/reg": 0.0060190120711922646, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 2.554938316345215, |
| "grad_norm_var": 1.547662147741073, |
| "learning_rate": 0.0001, |
| "loss": 1.1147, |
| "loss/crossentropy": 2.559544801712036, |
| "loss/hidden": 0.890625, |
| "loss/logits": 0.16388913989067078, |
| "loss/reg": 0.006017730105668306, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.025625, |
| "grad_norm": 2.6290361881256104, |
| "grad_norm_var": 1.5807281675134672, |
| "learning_rate": 0.0001, |
| "loss": 1.049, |
| "loss/crossentropy": 2.7080090045928955, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.16068041324615479, |
| "loss/reg": 0.006016433704644442, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.02575, |
| "grad_norm": 2.234259605407715, |
| "grad_norm_var": 0.38456120947827777, |
| "learning_rate": 0.0001, |
| "loss": 1.0392, |
| "loss/crossentropy": 2.3816347122192383, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.14315146207809448, |
| "loss/reg": 0.0060149249620735645, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.025875, |
| "grad_norm": 2.810352325439453, |
| "grad_norm_var": 0.19522907990381644, |
| "learning_rate": 0.0001, |
| "loss": 1.1385, |
| "loss/crossentropy": 2.6245384216308594, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.17206540703773499, |
| "loss/reg": 0.006013684440404177, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 2.198707342147827, |
| "grad_norm_var": 0.21847125065788287, |
| "learning_rate": 0.0001, |
| "loss": 0.9762, |
| "loss/crossentropy": 2.3812787532806396, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.119233138859272, |
| "loss/reg": 0.006012204568833113, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.026125, |
| "grad_norm": 2.5001378059387207, |
| "grad_norm_var": 0.22083751043745087, |
| "learning_rate": 0.0001, |
| "loss": 1.2526, |
| "loss/crossentropy": 2.5999109745025635, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.20815744996070862, |
| "loss/reg": 0.006010920740664005, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 3.175185203552246, |
| "grad_norm_var": 0.20582482438127556, |
| "learning_rate": 0.0001, |
| "loss": 1.239, |
| "loss/crossentropy": 2.3893682956695557, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.15550163388252258, |
| "loss/reg": 0.006009369157254696, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.026375, |
| "grad_norm": 3.482342481613159, |
| "grad_norm_var": 0.19031657232839597, |
| "learning_rate": 0.0001, |
| "loss": 1.1572, |
| "loss/crossentropy": 2.382542848587036, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.14788678288459778, |
| "loss/reg": 0.006007815711200237, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 2.285135507583618, |
| "grad_norm_var": 0.19168098803167197, |
| "learning_rate": 0.0001, |
| "loss": 0.9667, |
| "loss/crossentropy": 2.552724838256836, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.1254206746816635, |
| "loss/reg": 0.006006232462823391, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.026625, |
| "grad_norm": 2.991971969604492, |
| "grad_norm_var": 0.1888233667670041, |
| "learning_rate": 0.0001, |
| "loss": 1.1472, |
| "loss/crossentropy": 2.472437620162964, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.15750399231910706, |
| "loss/reg": 0.0060045006684958935, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.02675, |
| "grad_norm": 2.3775179386138916, |
| "grad_norm_var": 0.1665701003974154, |
| "learning_rate": 0.0001, |
| "loss": 1.1938, |
| "loss/crossentropy": 2.294337749481201, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.17671090364456177, |
| "loss/reg": 0.006002978887408972, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.026875, |
| "grad_norm": 2.2992701530456543, |
| "grad_norm_var": 0.17463199132661936, |
| "learning_rate": 0.0001, |
| "loss": 1.2097, |
| "loss/crossentropy": 2.3843300342559814, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.18876615166664124, |
| "loss/reg": 0.006001432426273823, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 2.4926228523254395, |
| "grad_norm_var": 0.17347807328228151, |
| "learning_rate": 0.0001, |
| "loss": 1.3156, |
| "loss/crossentropy": 2.326836585998535, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.19308596849441528, |
| "loss/reg": 0.005999880842864513, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.027125, |
| "grad_norm": 2.552459478378296, |
| "grad_norm_var": 0.16193263198218044, |
| "learning_rate": 0.0001, |
| "loss": 1.1424, |
| "loss/crossentropy": 2.6629388332366943, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.1722826063632965, |
| "loss/reg": 0.005998372100293636, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.02725, |
| "grad_norm": 2.866387128829956, |
| "grad_norm_var": 0.16409192036900605, |
| "learning_rate": 0.0001, |
| "loss": 1.0142, |
| "loss/crossentropy": 2.8154890537261963, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.15349115431308746, |
| "loss/reg": 0.005996840540319681, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.027375, |
| "grad_norm": 2.77524471282959, |
| "grad_norm_var": 0.12966566207502767, |
| "learning_rate": 0.0001, |
| "loss": 1.4111, |
| "loss/crossentropy": 2.4509928226470947, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.249616801738739, |
| "loss/reg": 0.005995343904942274, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 2.887923240661621, |
| "grad_norm_var": 0.13285907347625023, |
| "learning_rate": 0.0001, |
| "loss": 1.2886, |
| "loss/crossentropy": 2.4280507564544678, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.20519307255744934, |
| "loss/reg": 0.005993579979985952, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.027625, |
| "grad_norm": 2.5383920669555664, |
| "grad_norm_var": 0.1337457284607846, |
| "learning_rate": 0.0001, |
| "loss": 1.3292, |
| "loss/crossentropy": 2.0803585052490234, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.17551109194755554, |
| "loss/reg": 0.005991705227643251, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.02775, |
| "grad_norm": 2.639490842819214, |
| "grad_norm_var": 0.12131687494494538, |
| "learning_rate": 0.0001, |
| "loss": 1.0593, |
| "loss/crossentropy": 2.293325901031494, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.14782238006591797, |
| "loss/reg": 0.005989882629364729, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.027875, |
| "grad_norm": 2.4396984577178955, |
| "grad_norm_var": 0.12344012810124999, |
| "learning_rate": 0.0001, |
| "loss": 1.0587, |
| "loss/crossentropy": 2.7268667221069336, |
| "loss/hidden": 0.84765625, |
| "loss/logits": 0.15114662051200867, |
| "loss/reg": 0.0059883627109229565, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 2.227886438369751, |
| "grad_norm_var": 0.12171264621671582, |
| "learning_rate": 0.0001, |
| "loss": 1.0087, |
| "loss/crossentropy": 2.4431943893432617, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.13243696093559265, |
| "loss/reg": 0.005986812058836222, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.028125, |
| "grad_norm": 3.690627098083496, |
| "grad_norm_var": 0.18519755428341872, |
| "learning_rate": 0.0001, |
| "loss": 1.0732, |
| "loss/crossentropy": 2.4630942344665527, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.13830721378326416, |
| "loss/reg": 0.005985158029943705, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.02825, |
| "grad_norm": 3.377890110015869, |
| "grad_norm_var": 0.19972658805784155, |
| "learning_rate": 0.0001, |
| "loss": 1.1848, |
| "loss/crossentropy": 2.2899203300476074, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.16401749849319458, |
| "loss/reg": 0.005983633920550346, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.028375, |
| "grad_norm": 2.7600386142730713, |
| "grad_norm_var": 0.16135214723361363, |
| "learning_rate": 0.0001, |
| "loss": 1.0223, |
| "loss/crossentropy": 2.8077659606933594, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.14218226075172424, |
| "loss/reg": 0.005982026923447847, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 2.3397345542907715, |
| "grad_norm_var": 0.15851713921701366, |
| "learning_rate": 0.0001, |
| "loss": 1.077, |
| "loss/crossentropy": 2.438030958175659, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.14217695593833923, |
| "loss/reg": 0.005980519577860832, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.028625, |
| "grad_norm": 2.744401216506958, |
| "grad_norm_var": 0.15282793193407448, |
| "learning_rate": 0.0001, |
| "loss": 1.1967, |
| "loss/crossentropy": 2.557457447052002, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.16425767540931702, |
| "loss/reg": 0.005979116074740887, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 2.4241418838500977, |
| "grad_norm_var": 0.15103305834679168, |
| "learning_rate": 0.0001, |
| "loss": 1.0402, |
| "loss/crossentropy": 2.743885040283203, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.15231972932815552, |
| "loss/reg": 0.005977709777653217, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.028875, |
| "grad_norm": 2.0828442573547363, |
| "grad_norm_var": 0.16526500993595217, |
| "learning_rate": 0.0001, |
| "loss": 0.9747, |
| "loss/crossentropy": 2.719327688217163, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.133681058883667, |
| "loss/reg": 0.005976095795631409, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 2.127495527267456, |
| "grad_norm_var": 0.18259721536013085, |
| "learning_rate": 0.0001, |
| "loss": 1.0588, |
| "loss/crossentropy": 2.8147058486938477, |
| "loss/hidden": 0.85546875, |
| "loss/logits": 0.14354225993156433, |
| "loss/reg": 0.005974431522190571, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.029125, |
| "grad_norm": 4.263195991516113, |
| "grad_norm_var": 0.34219781045772657, |
| "learning_rate": 0.0001, |
| "loss": 1.1724, |
| "loss/crossentropy": 2.5414481163024902, |
| "loss/hidden": 0.96484375, |
| "loss/logits": 0.1478062868118286, |
| "loss/reg": 0.005972826853394508, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.02925, |
| "grad_norm": 2.9974324703216553, |
| "grad_norm_var": 0.34510225788824467, |
| "learning_rate": 0.0001, |
| "loss": 1.3152, |
| "loss/crossentropy": 2.697648763656616, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.20080995559692383, |
| "loss/reg": 0.005971227772533894, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.029375, |
| "grad_norm": 3.4798855781555176, |
| "grad_norm_var": 0.37664835069757197, |
| "learning_rate": 0.0001, |
| "loss": 1.2096, |
| "loss/crossentropy": 2.3990559577941895, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.19287389516830444, |
| "loss/reg": 0.005969603545963764, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 2.43911075592041, |
| "grad_norm_var": 0.3848032740432508, |
| "learning_rate": 0.0001, |
| "loss": 1.0658, |
| "loss/crossentropy": 1.966374158859253, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.13115233182907104, |
| "loss/reg": 0.005967943929135799, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.029625, |
| "grad_norm": 3.7423646450042725, |
| "grad_norm_var": 0.4356891905379257, |
| "learning_rate": 0.0001, |
| "loss": 1.2397, |
| "loss/crossentropy": 2.718675374984741, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.18789833784103394, |
| "loss/reg": 0.00596608454361558, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.02975, |
| "grad_norm": 3.328033924102783, |
| "grad_norm_var": 0.4449827328026664, |
| "learning_rate": 0.0001, |
| "loss": 1.5581, |
| "loss/crossentropy": 2.272303819656372, |
| "loss/hidden": 1.2421875, |
| "loss/logits": 0.2562662661075592, |
| "loss/reg": 0.005964066833257675, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.029875, |
| "grad_norm": 2.8761045932769775, |
| "grad_norm_var": 0.42986649641521024, |
| "learning_rate": 0.0001, |
| "loss": 1.1392, |
| "loss/crossentropy": 2.6973013877868652, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.16159963607788086, |
| "loss/reg": 0.005962541792541742, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 2.4458563327789307, |
| "grad_norm_var": 0.4123921579785623, |
| "learning_rate": 0.0001, |
| "loss": 1.178, |
| "loss/crossentropy": 2.5731561183929443, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.18093177676200867, |
| "loss/reg": 0.005961006972938776, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.030125, |
| "grad_norm": 2.4645614624023438, |
| "grad_norm_var": 0.3844441578530656, |
| "learning_rate": 0.0001, |
| "loss": 1.0932, |
| "loss/crossentropy": 2.648738145828247, |
| "loss/hidden": 0.890625, |
| "loss/logits": 0.14302745461463928, |
| "loss/reg": 0.005959144793450832, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.03025, |
| "grad_norm": 3.0715034008026123, |
| "grad_norm_var": 0.3694944025754277, |
| "learning_rate": 0.0001, |
| "loss": 1.1916, |
| "loss/crossentropy": 2.4820139408111572, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.18278783559799194, |
| "loss/reg": 0.005957332905381918, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.030375, |
| "grad_norm": 2.479677677154541, |
| "grad_norm_var": 0.37773887013444374, |
| "learning_rate": 0.0001, |
| "loss": 1.0787, |
| "loss/crossentropy": 2.614309549331665, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.14808647334575653, |
| "loss/reg": 0.005955492611974478, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 3.0970399379730225, |
| "grad_norm_var": 0.36391299171458796, |
| "learning_rate": 0.0001, |
| "loss": 1.1987, |
| "loss/crossentropy": 2.2731809616088867, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.18210504949092865, |
| "loss/reg": 0.00595364673063159, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.030625, |
| "grad_norm": 2.388214588165283, |
| "grad_norm_var": 0.37823356386532864, |
| "learning_rate": 0.0001, |
| "loss": 1.1283, |
| "loss/crossentropy": 2.532259225845337, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.15858401358127594, |
| "loss/reg": 0.005952049978077412, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.03075, |
| "grad_norm": 2.97310733795166, |
| "grad_norm_var": 0.36540629077152076, |
| "learning_rate": 0.0001, |
| "loss": 1.1177, |
| "loss/crossentropy": 2.5206258296966553, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.16365137696266174, |
| "loss/reg": 0.005950110498815775, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.030875, |
| "grad_norm": 2.15498423576355, |
| "grad_norm_var": 0.3579579158371985, |
| "learning_rate": 0.0001, |
| "loss": 1.1046, |
| "loss/crossentropy": 2.478773832321167, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.162343829870224, |
| "loss/reg": 0.005948282778263092, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 2.3404128551483154, |
| "grad_norm_var": 0.338987407645584, |
| "learning_rate": 0.0001, |
| "loss": 1.1555, |
| "loss/crossentropy": 2.1949751377105713, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.1624409407377243, |
| "loss/reg": 0.005946675315499306, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.031125, |
| "grad_norm": 2.8813085556030273, |
| "grad_norm_var": 0.20879640313171802, |
| "learning_rate": 0.0001, |
| "loss": 1.1599, |
| "loss/crossentropy": 2.556128978729248, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.1707805097103119, |
| "loss/reg": 0.005944731179624796, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 3.309937000274658, |
| "grad_norm_var": 0.22219010027481143, |
| "learning_rate": 0.0001, |
| "loss": 1.0939, |
| "loss/crossentropy": 2.4590022563934326, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.14774294197559357, |
| "loss/reg": 0.005942681338638067, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.031375, |
| "grad_norm": 3.1676676273345947, |
| "grad_norm_var": 0.201728293925846, |
| "learning_rate": 0.0001, |
| "loss": 1.3162, |
| "loss/crossentropy": 2.419811487197876, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.24120670557022095, |
| "loss/reg": 0.005940672475844622, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 2.6006832122802734, |
| "grad_norm_var": 0.1951007002723287, |
| "learning_rate": 0.0001, |
| "loss": 1.3903, |
| "loss/crossentropy": 2.170666456222534, |
| "loss/hidden": 1.140625, |
| "loss/logits": 0.19024603068828583, |
| "loss/reg": 0.005938523914664984, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.031625, |
| "grad_norm": 2.4954755306243896, |
| "grad_norm_var": 0.14101991304577552, |
| "learning_rate": 0.0001, |
| "loss": 1.1465, |
| "loss/crossentropy": 2.262831449508667, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.1535283327102661, |
| "loss/reg": 0.00593681400641799, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.03175, |
| "grad_norm": 2.339406728744507, |
| "grad_norm_var": 0.12652605714113535, |
| "learning_rate": 0.0001, |
| "loss": 0.984, |
| "loss/crossentropy": 2.2793617248535156, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.12778240442276, |
| "loss/reg": 0.005935273133218288, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.031875, |
| "grad_norm": 2.3391647338867188, |
| "grad_norm_var": 0.131427049667937, |
| "learning_rate": 0.0001, |
| "loss": 1.0622, |
| "loss/crossentropy": 2.4579379558563232, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.16299216449260712, |
| "loss/reg": 0.0059331608936190605, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 2.3896231651306152, |
| "grad_norm_var": 0.13322512800125588, |
| "learning_rate": 0.0001, |
| "loss": 1.057, |
| "loss/crossentropy": 2.8022475242614746, |
| "loss/hidden": 0.85546875, |
| "loss/logits": 0.14219465851783752, |
| "loss/reg": 0.005931555759161711, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.032125, |
| "grad_norm": 2.125249147415161, |
| "grad_norm_var": 0.14907278605534582, |
| "learning_rate": 0.0001, |
| "loss": 1.0611, |
| "loss/crossentropy": 2.33644700050354, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.15020999312400818, |
| "loss/reg": 0.005930029321461916, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.03225, |
| "grad_norm": 2.521933078765869, |
| "grad_norm_var": 0.13593429417580463, |
| "learning_rate": 0.0001, |
| "loss": 1.0436, |
| "loss/crossentropy": 2.512619733810425, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.16396166384220123, |
| "loss/reg": 0.00592817785218358, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.032375, |
| "grad_norm": 2.5966317653656006, |
| "grad_norm_var": 0.13490910688263208, |
| "learning_rate": 0.0001, |
| "loss": 1.1331, |
| "loss/crossentropy": 2.248013734817505, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.16364812850952148, |
| "loss/reg": 0.00592625979334116, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 2.2045137882232666, |
| "grad_norm_var": 0.12644607438415487, |
| "learning_rate": 0.0001, |
| "loss": 1.0015, |
| "loss/crossentropy": 2.3253698348999023, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.14540287852287292, |
| "loss/reg": 0.005924653727561235, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.032625, |
| "grad_norm": 2.4450156688690186, |
| "grad_norm_var": 0.1254090419850094, |
| "learning_rate": 0.0001, |
| "loss": 0.9932, |
| "loss/crossentropy": 2.2374210357666016, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.13316848874092102, |
| "loss/reg": 0.005922792013734579, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.03275, |
| "grad_norm": 7.747511863708496, |
| "grad_norm_var": 1.8160510254643325, |
| "learning_rate": 0.0001, |
| "loss": 1.2542, |
| "loss/crossentropy": 2.8747429847717285, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.17151576280593872, |
| "loss/reg": 0.005921173375099897, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.032875, |
| "grad_norm": 2.1854233741760254, |
| "grad_norm_var": 1.8132730792650582, |
| "learning_rate": 0.0001, |
| "loss": 1.0069, |
| "loss/crossentropy": 2.4989960193634033, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.13518914580345154, |
| "loss/reg": 0.005919379647821188, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 3.5132219791412354, |
| "grad_norm_var": 1.8186749991604263, |
| "learning_rate": 0.0001, |
| "loss": 1.054, |
| "loss/crossentropy": 2.497178316116333, |
| "loss/hidden": 0.84765625, |
| "loss/logits": 0.1471494734287262, |
| "loss/reg": 0.005917761009186506, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.033125, |
| "grad_norm": 4.302145481109619, |
| "grad_norm_var": 1.9358282916849012, |
| "learning_rate": 0.0001, |
| "loss": 1.3123, |
| "loss/crossentropy": 2.1725542545318604, |
| "loss/hidden": 1.0859375, |
| "loss/logits": 0.16722658276557922, |
| "loss/reg": 0.0059160212986171246, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.03325, |
| "grad_norm": 2.3225510120391846, |
| "grad_norm_var": 1.9582913809461102, |
| "learning_rate": 0.0001, |
| "loss": 1.0153, |
| "loss/crossentropy": 2.6670029163360596, |
| "loss/hidden": 0.80859375, |
| "loss/logits": 0.1475904881954193, |
| "loss/reg": 0.0059142098762094975, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.033375, |
| "grad_norm": 5.196990013122559, |
| "grad_norm_var": 2.27294427304937, |
| "learning_rate": 0.0001, |
| "loss": 1.1665, |
| "loss/crossentropy": 2.6792731285095215, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.1659836769104004, |
| "loss/reg": 0.00591221172362566, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 3.5144336223602295, |
| "grad_norm_var": 2.26638445070385, |
| "learning_rate": 0.0001, |
| "loss": 1.2502, |
| "loss/crossentropy": 2.2949023246765137, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.1677004098892212, |
| "loss/reg": 0.005910532083362341, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.033625, |
| "grad_norm": 2.861222267150879, |
| "grad_norm_var": 2.2433162495019436, |
| "learning_rate": 0.0001, |
| "loss": 1.3308, |
| "loss/crossentropy": 2.5955142974853516, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.2013990730047226, |
| "loss/reg": 0.005908492021262646, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 2.964390754699707, |
| "grad_norm_var": 2.1991134738974947, |
| "learning_rate": 0.0001, |
| "loss": 1.0975, |
| "loss/crossentropy": 2.483924150466919, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.15562227368354797, |
| "loss/reg": 0.005906403064727783, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.033875, |
| "grad_norm": 2.75604510307312, |
| "grad_norm_var": 2.1620222961988325, |
| "learning_rate": 0.0001, |
| "loss": 1.2196, |
| "loss/crossentropy": 2.39125394821167, |
| "loss/hidden": 0.9765625, |
| "loss/logits": 0.18403753638267517, |
| "loss/reg": 0.00590470340102911, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 2.360309362411499, |
| "grad_norm_var": 2.165352535939727, |
| "learning_rate": 0.0001, |
| "loss": 1.0194, |
| "loss/crossentropy": 2.530670404434204, |
| "loss/hidden": 0.8046875, |
| "loss/logits": 0.15565866231918335, |
| "loss/reg": 0.005902664735913277, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.034125, |
| "grad_norm": 2.496027946472168, |
| "grad_norm_var": 2.1195219252368287, |
| "learning_rate": 0.0001, |
| "loss": 1.2228, |
| "loss/crossentropy": 2.7535252571105957, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.20284873247146606, |
| "loss/reg": 0.005900639574974775, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.03425, |
| "grad_norm": 2.854250431060791, |
| "grad_norm_var": 2.0941964139517344, |
| "learning_rate": 0.0001, |
| "loss": 1.1387, |
| "loss/crossentropy": 2.134964942932129, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.15002194046974182, |
| "loss/reg": 0.005898929201066494, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.034375, |
| "grad_norm": 4.497798442840576, |
| "grad_norm_var": 2.149396374832277, |
| "learning_rate": 0.0001, |
| "loss": 1.2312, |
| "loss/crossentropy": 2.3270835876464844, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.17617599666118622, |
| "loss/reg": 0.0058972095139324665, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 2.321152448654175, |
| "grad_norm_var": 2.1318278315927155, |
| "learning_rate": 0.0001, |
| "loss": 1.1523, |
| "loss/crossentropy": 1.858445644378662, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.14408603310585022, |
| "loss/reg": 0.005895303096622229, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.034625, |
| "grad_norm": 2.4426257610321045, |
| "grad_norm_var": 2.1321312734782243, |
| "learning_rate": 0.0001, |
| "loss": 1.0267, |
| "loss/crossentropy": 2.4483628273010254, |
| "loss/hidden": 0.82421875, |
| "loss/logits": 0.1435263752937317, |
| "loss/reg": 0.005893299821764231, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.03475, |
| "grad_norm": 2.144637107849121, |
| "grad_norm_var": 0.843351985629086, |
| "learning_rate": 0.0001, |
| "loss": 1.0517, |
| "loss/crossentropy": 2.237915277481079, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.14119011163711548, |
| "loss/reg": 0.005891298409551382, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.034875, |
| "grad_norm": 2.32000732421875, |
| "grad_norm_var": 0.8290445100225684, |
| "learning_rate": 0.0001, |
| "loss": 1.0462, |
| "loss/crossentropy": 2.6588850021362305, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.1552983820438385, |
| "loss/reg": 0.0058892290107905865, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 3.3390939235687256, |
| "grad_norm_var": 0.820283282746707, |
| "learning_rate": 0.0001, |
| "loss": 1.1937, |
| "loss/crossentropy": 2.5243186950683594, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.1817275732755661, |
| "loss/reg": 0.00588742271065712, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.035125, |
| "grad_norm": 3.1800894737243652, |
| "grad_norm_var": 0.7106469411621028, |
| "learning_rate": 0.0001, |
| "loss": 1.1937, |
| "loss/crossentropy": 2.556126832962036, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.18167603015899658, |
| "loss/reg": 0.005885709077119827, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.03525, |
| "grad_norm": 4.466390132904053, |
| "grad_norm_var": 0.8119073339313209, |
| "learning_rate": 0.0001, |
| "loss": 1.27, |
| "loss/crossentropy": 2.5671539306640625, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.2267427146434784, |
| "loss/reg": 0.0058837407268583775, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.035375, |
| "grad_norm": 3.2809953689575195, |
| "grad_norm_var": 0.5074810718943117, |
| "learning_rate": 0.0001, |
| "loss": 1.1245, |
| "loss/crossentropy": 2.1554338932037354, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.1516391634941101, |
| "loss/reg": 0.005881770513951778, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 2.9982316493988037, |
| "grad_norm_var": 0.48786559613454966, |
| "learning_rate": 0.0001, |
| "loss": 1.1286, |
| "loss/crossentropy": 2.6773006916046143, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.1635606288909912, |
| "loss/reg": 0.005880062934011221, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.035625, |
| "grad_norm": 2.387657880783081, |
| "grad_norm_var": 0.5078162485774572, |
| "learning_rate": 0.0001, |
| "loss": 1.1214, |
| "loss/crossentropy": 2.4741320610046387, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.1641697734594345, |
| "loss/reg": 0.0058782072737813, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.03575, |
| "grad_norm": 271.6628112792969, |
| "grad_norm_var": 4514.324895160767, |
| "learning_rate": 0.0001, |
| "loss": 1.6171, |
| "loss/crossentropy": 2.5766143798828125, |
| "loss/hidden": 1.375, |
| "loss/logits": 0.1833469420671463, |
| "loss/reg": 0.005876271054148674, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.035875, |
| "grad_norm": 3.545677900314331, |
| "grad_norm_var": 4512.577903953303, |
| "learning_rate": 0.0001, |
| "loss": 1.1466, |
| "loss/crossentropy": 2.5389881134033203, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.20117658376693726, |
| "loss/reg": 0.005874336697161198, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 2.9219233989715576, |
| "grad_norm_var": 4511.294050983276, |
| "learning_rate": 0.0001, |
| "loss": 1.1121, |
| "loss/crossentropy": 2.3270509243011475, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.17058232426643372, |
| "loss/reg": 0.005872361361980438, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.036125, |
| "grad_norm": 2.831878423690796, |
| "grad_norm_var": 4510.526061571783, |
| "learning_rate": 0.0001, |
| "loss": 1.148, |
| "loss/crossentropy": 2.4853744506835938, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.17128118872642517, |
| "loss/reg": 0.005870639346539974, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 2.284134864807129, |
| "grad_norm_var": 4511.83639181831, |
| "learning_rate": 0.0001, |
| "loss": 1.0599, |
| "loss/crossentropy": 2.3107759952545166, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.14969472587108612, |
| "loss/reg": 0.005868903826922178, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.036375, |
| "grad_norm": 2.2008161544799805, |
| "grad_norm_var": 4516.84932017332, |
| "learning_rate": 0.0001, |
| "loss": 1.0902, |
| "loss/crossentropy": 2.4265358448028564, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.1682073473930359, |
| "loss/reg": 0.0058671231381595135, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 2.6285743713378906, |
| "grad_norm_var": 4516.145108725088, |
| "learning_rate": 0.0001, |
| "loss": 1.2494, |
| "loss/crossentropy": 2.372230291366577, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.2102714478969574, |
| "loss/reg": 0.005865375977009535, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.036625, |
| "grad_norm": 2.6784040927886963, |
| "grad_norm_var": 4515.607170253259, |
| "learning_rate": 0.0001, |
| "loss": 1.0752, |
| "loss/crossentropy": 2.6276440620422363, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.14159329235553741, |
| "loss/reg": 0.005863656289875507, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.03675, |
| "grad_norm": 2.6373047828674316, |
| "grad_norm_var": 4514.470495103465, |
| "learning_rate": 0.0001, |
| "loss": 1.1694, |
| "loss/crossentropy": 2.70892333984375, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.16546514630317688, |
| "loss/reg": 0.005862091202288866, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.036875, |
| "grad_norm": 2.384430170059204, |
| "grad_norm_var": 4514.321377312488, |
| "learning_rate": 0.0001, |
| "loss": 1.2472, |
| "loss/crossentropy": 2.1273090839385986, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.18860690295696259, |
| "loss/reg": 0.005860424134880304, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 2.5959692001342773, |
| "grad_norm_var": 4515.978398966678, |
| "learning_rate": 0.0001, |
| "loss": 1.0376, |
| "loss/crossentropy": 2.7293522357940674, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.1587076485157013, |
| "loss/reg": 0.0058588446117937565, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.037125, |
| "grad_norm": 2.2753238677978516, |
| "grad_norm_var": 4518.0185669920775, |
| "learning_rate": 0.0001, |
| "loss": 1.0063, |
| "loss/crossentropy": 2.4602949619293213, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.13525693118572235, |
| "loss/reg": 0.005857320036739111, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.03725, |
| "grad_norm": 3.009300708770752, |
| "grad_norm_var": 4521.093589717446, |
| "learning_rate": 0.0001, |
| "loss": 1.2573, |
| "loss/crossentropy": 2.8883349895477295, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.20657645165920258, |
| "loss/reg": 0.005855792202055454, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.037375, |
| "grad_norm": 2.700221538543701, |
| "grad_norm_var": 4522.372179334166, |
| "learning_rate": 0.0001, |
| "loss": 1.1557, |
| "loss/crossentropy": 2.5446314811706543, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.19479964673519135, |
| "loss/reg": 0.005854278337210417, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 2.3786559104919434, |
| "grad_norm_var": 4523.758055495688, |
| "learning_rate": 0.0001, |
| "loss": 1.1224, |
| "loss/crossentropy": 2.469960927963257, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.16156738996505737, |
| "loss/reg": 0.00585273839533329, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.037625, |
| "grad_norm": 2.7032158374786377, |
| "grad_norm_var": 4523.046593599144, |
| "learning_rate": 0.0001, |
| "loss": 1.1947, |
| "loss/crossentropy": 2.7451162338256836, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.19476984441280365, |
| "loss/reg": 0.0058509958907961845, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.03775, |
| "grad_norm": 2.507664442062378, |
| "grad_norm_var": 0.11250867537391755, |
| "learning_rate": 0.0001, |
| "loss": 0.9899, |
| "loss/crossentropy": 2.53341007232666, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.1345081329345703, |
| "loss/reg": 0.005849248263984919, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.037875, |
| "grad_norm": 3.027892589569092, |
| "grad_norm_var": 0.06692647718721882, |
| "learning_rate": 0.0001, |
| "loss": 1.0973, |
| "loss/crossentropy": 2.7899296283721924, |
| "loss/hidden": 0.890625, |
| "loss/logits": 0.1482122391462326, |
| "loss/reg": 0.005847662687301636, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 2.1617183685302734, |
| "grad_norm_var": 0.07146536810277529, |
| "learning_rate": 0.0001, |
| "loss": 0.969, |
| "loss/crossentropy": 2.4700305461883545, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.12925508618354797, |
| "loss/reg": 0.005846073850989342, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.038125, |
| "grad_norm": 2.3791332244873047, |
| "grad_norm_var": 0.06803597239225306, |
| "learning_rate": 0.0001, |
| "loss": 1.1912, |
| "loss/crossentropy": 2.4171202182769775, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.18739524483680725, |
| "loss/reg": 0.005844476167112589, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.03825, |
| "grad_norm": 2.7622976303100586, |
| "grad_norm_var": 0.06636088237049004, |
| "learning_rate": 0.0001, |
| "loss": 1.0808, |
| "loss/crossentropy": 2.5030367374420166, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.18643516302108765, |
| "loss/reg": 0.005842759273946285, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.038375, |
| "grad_norm": 2.4079246520996094, |
| "grad_norm_var": 0.059000676657357566, |
| "learning_rate": 0.0001, |
| "loss": 1.0359, |
| "loss/crossentropy": 2.381542682647705, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.1493588387966156, |
| "loss/reg": 0.0058412267826497555, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 2.5356478691101074, |
| "grad_norm_var": 0.058906038923372726, |
| "learning_rate": 0.0001, |
| "loss": 1.087, |
| "loss/crossentropy": 2.4928808212280273, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.15363982319831848, |
| "loss/reg": 0.0058394852094352245, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.038625, |
| "grad_norm": 2.4036688804626465, |
| "grad_norm_var": 0.0597099908353601, |
| "learning_rate": 0.0001, |
| "loss": 0.986, |
| "loss/crossentropy": 2.5816946029663086, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.13851355016231537, |
| "loss/reg": 0.005837727338075638, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 2.630572557449341, |
| "grad_norm_var": 0.05963840398777146, |
| "learning_rate": 0.0001, |
| "loss": 1.0333, |
| "loss/crossentropy": 2.140015125274658, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.14680367708206177, |
| "loss/reg": 0.005835913587361574, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.038875, |
| "grad_norm": 2.3641905784606934, |
| "grad_norm_var": 0.06012154861927167, |
| "learning_rate": 0.0001, |
| "loss": 1.0947, |
| "loss/crossentropy": 2.3300833702087402, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.15358075499534607, |
| "loss/reg": 0.005834224168211222, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 2.215728759765625, |
| "grad_norm_var": 0.06696490679455162, |
| "learning_rate": 0.0001, |
| "loss": 1.1411, |
| "loss/crossentropy": 2.4583277702331543, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.1687404215335846, |
| "loss/reg": 0.005832599475979805, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.039125, |
| "grad_norm": 2.8934550285339355, |
| "grad_norm_var": 0.06994228066174794, |
| "learning_rate": 0.0001, |
| "loss": 1.2763, |
| "loss/crossentropy": 2.409702777862549, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.17889352142810822, |
| "loss/reg": 0.005831001792103052, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.03925, |
| "grad_norm": 8.741681098937988, |
| "grad_norm_var": 2.4613182467650705, |
| "learning_rate": 0.0001, |
| "loss": 1.1972, |
| "loss/crossentropy": 2.3858492374420166, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.1701970100402832, |
| "loss/reg": 0.005829236935824156, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.039375, |
| "grad_norm": 7.412417411804199, |
| "grad_norm_var": 3.707354176329111, |
| "learning_rate": 0.0001, |
| "loss": 1.3096, |
| "loss/crossentropy": 2.3804125785827637, |
| "loss/hidden": 1.1015625, |
| "loss/logits": 0.149795800447464, |
| "loss/reg": 0.005827469285577536, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 3.1443870067596436, |
| "grad_norm_var": 3.6580641482995806, |
| "learning_rate": 0.0001, |
| "loss": 1.1365, |
| "loss/crossentropy": 2.481820583343506, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.1759084165096283, |
| "loss/reg": 0.005825776606798172, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.039625, |
| "grad_norm": 2.8567562103271484, |
| "grad_norm_var": 3.6479706732170993, |
| "learning_rate": 0.0001, |
| "loss": 1.0023, |
| "loss/crossentropy": 2.5141823291778564, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.14331723749637604, |
| "loss/reg": 0.005824015475809574, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.03975, |
| "grad_norm": 2.2817444801330566, |
| "grad_norm_var": 3.674359828489624, |
| "learning_rate": 0.0001, |
| "loss": 1.0893, |
| "loss/crossentropy": 2.184128999710083, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.15605026483535767, |
| "loss/reg": 0.00582248717546463, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.039875, |
| "grad_norm": 2.249969005584717, |
| "grad_norm_var": 3.736641439481692, |
| "learning_rate": 0.0001, |
| "loss": 1.008, |
| "loss/crossentropy": 2.768484354019165, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.14897163212299347, |
| "loss/reg": 0.00582079216837883, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 2.6358306407928467, |
| "grad_norm_var": 3.684102068428194, |
| "learning_rate": 0.0001, |
| "loss": 1.3237, |
| "loss/crossentropy": 2.301954507827759, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.24987459182739258, |
| "loss/reg": 0.005819002632051706, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.040125, |
| "grad_norm": 2.353457450866699, |
| "grad_norm_var": 3.6871065280104496, |
| "learning_rate": 0.0001, |
| "loss": 1.1095, |
| "loss/crossentropy": 2.379765272140503, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.15680107474327087, |
| "loss/reg": 0.005817302968353033, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.04025, |
| "grad_norm": 2.4568967819213867, |
| "grad_norm_var": 3.712514538750317, |
| "learning_rate": 0.0001, |
| "loss": 0.9706, |
| "loss/crossentropy": 2.380795955657959, |
| "loss/hidden": 0.77734375, |
| "loss/logits": 0.13508911430835724, |
| "loss/reg": 0.005815597716718912, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.040375, |
| "grad_norm": 3.207794189453125, |
| "grad_norm_var": 3.6654654630236734, |
| "learning_rate": 0.0001, |
| "loss": 1.3668, |
| "loss/crossentropy": 1.949703574180603, |
| "loss/hidden": 1.1171875, |
| "loss/logits": 0.19150257110595703, |
| "loss/reg": 0.005813860800117254, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 3.156318187713623, |
| "grad_norm_var": 3.6284383166396252, |
| "learning_rate": 0.0001, |
| "loss": 1.2742, |
| "loss/crossentropy": 2.1970410346984863, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.21606677770614624, |
| "loss/reg": 0.005812041461467743, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.040625, |
| "grad_norm": 2.556889533996582, |
| "grad_norm_var": 3.611332493108523, |
| "learning_rate": 0.0001, |
| "loss": 0.9529, |
| "loss/crossentropy": 2.7647974491119385, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.1369488537311554, |
| "loss/reg": 0.00581031059846282, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.04075, |
| "grad_norm": 2.2634167671203613, |
| "grad_norm_var": 3.653624545749698, |
| "learning_rate": 0.0001, |
| "loss": 1.0757, |
| "loss/crossentropy": 2.334134340286255, |
| "loss/hidden": 0.859375, |
| "loss/logits": 0.1581987738609314, |
| "loss/reg": 0.005808570422232151, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.040875, |
| "grad_norm": 2.3521125316619873, |
| "grad_norm_var": 3.6551397839485555, |
| "learning_rate": 0.0001, |
| "loss": 0.9965, |
| "loss/crossentropy": 2.78828763961792, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.1454332172870636, |
| "loss/reg": 0.005806888919323683, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 3.0836093425750732, |
| "grad_norm_var": 3.5768996944618254, |
| "learning_rate": 0.0001, |
| "loss": 1.1938, |
| "loss/crossentropy": 2.2781612873077393, |
| "loss/hidden": 0.9609375, |
| "loss/logits": 0.1747758537530899, |
| "loss/reg": 0.005805303808301687, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.041125, |
| "grad_norm": 3.6110970973968506, |
| "grad_norm_var": 3.5651235487558246, |
| "learning_rate": 0.0001, |
| "loss": 1.1693, |
| "loss/crossentropy": 2.812913417816162, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.17377659678459167, |
| "loss/reg": 0.005803780164569616, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 2.5020155906677246, |
| "grad_norm_var": 1.552569952590708, |
| "learning_rate": 0.0001, |
| "loss": 1.0862, |
| "loss/crossentropy": 2.6585140228271484, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.16489718854427338, |
| "loss/reg": 0.005802258383482695, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.041375, |
| "grad_norm": 2.383924961090088, |
| "grad_norm_var": 0.17978007457456116, |
| "learning_rate": 0.0001, |
| "loss": 1.1592, |
| "loss/crossentropy": 2.4862210750579834, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.15199331939220428, |
| "loss/reg": 0.005800731014460325, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 2.187321424484253, |
| "grad_norm_var": 0.17949311071790794, |
| "learning_rate": 0.0001, |
| "loss": 1.0507, |
| "loss/crossentropy": 2.6380603313446045, |
| "loss/hidden": 0.84765625, |
| "loss/logits": 0.14507073163986206, |
| "loss/reg": 0.005798923317342997, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.041625, |
| "grad_norm": 2.21768856048584, |
| "grad_norm_var": 0.18601193201957902, |
| "learning_rate": 0.0001, |
| "loss": 1.1027, |
| "loss/crossentropy": 2.3925793170928955, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.16972869634628296, |
| "loss/reg": 0.00579707371070981, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.04175, |
| "grad_norm": 2.682497262954712, |
| "grad_norm_var": 0.17937770683656615, |
| "learning_rate": 0.0001, |
| "loss": 1.3272, |
| "loss/crossentropy": 2.3586106300354004, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.1911502480506897, |
| "loss/reg": 0.005795224104076624, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.041875, |
| "grad_norm": 3.0983307361602783, |
| "grad_norm_var": 0.1826395003188658, |
| "learning_rate": 0.0001, |
| "loss": 1.1675, |
| "loss/crossentropy": 2.436326265335083, |
| "loss/hidden": 0.91796875, |
| "loss/logits": 0.1915540099143982, |
| "loss/reg": 0.005793258547782898, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 6.251674652099609, |
| "grad_norm_var": 0.982431631272856, |
| "learning_rate": 0.0001, |
| "loss": 1.6879, |
| "loss/crossentropy": 2.3841142654418945, |
| "loss/hidden": 1.265625, |
| "loss/logits": 0.3643344044685364, |
| "loss/reg": 0.0057912725023925304, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.042125, |
| "grad_norm": 3.0111782550811768, |
| "grad_norm_var": 0.9617308564996427, |
| "learning_rate": 0.0001, |
| "loss": 1.3497, |
| "loss/crossentropy": 2.430532217025757, |
| "loss/hidden": 1.0703125, |
| "loss/logits": 0.2214677333831787, |
| "loss/reg": 0.00578899122774601, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.04225, |
| "grad_norm": 2.4221205711364746, |
| "grad_norm_var": 0.9640415151512265, |
| "learning_rate": 0.0001, |
| "loss": 1.0955, |
| "loss/crossentropy": 2.4376015663146973, |
| "loss/hidden": 0.890625, |
| "loss/logits": 0.1470467746257782, |
| "loss/reg": 0.005786662455648184, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.042375, |
| "grad_norm": 2.615758180618286, |
| "grad_norm_var": 0.9645524062068328, |
| "learning_rate": 0.0001, |
| "loss": 1.0887, |
| "loss/crossentropy": 2.5318005084991455, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.15580901503562927, |
| "loss/reg": 0.0057848175056278706, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 2.857177972793579, |
| "grad_norm_var": 0.9599117798964886, |
| "learning_rate": 0.0001, |
| "loss": 1.1153, |
| "loss/crossentropy": 2.4260058403015137, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.16291844844818115, |
| "loss/reg": 0.005782809574157, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.042625, |
| "grad_norm": 2.4030630588531494, |
| "grad_norm_var": 0.9680393035693963, |
| "learning_rate": 0.0001, |
| "loss": 1.2054, |
| "loss/crossentropy": 2.3009443283081055, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.194431871175766, |
| "loss/reg": 0.005780525505542755, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.04275, |
| "grad_norm": 2.264251470565796, |
| "grad_norm_var": 0.9679716782722624, |
| "learning_rate": 0.0001, |
| "loss": 1.0227, |
| "loss/crossentropy": 2.597288131713867, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.14457917213439941, |
| "loss/reg": 0.005778233055025339, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.042875, |
| "grad_norm": 2.2368180751800537, |
| "grad_norm_var": 0.9767866404468121, |
| "learning_rate": 0.0001, |
| "loss": 0.943, |
| "loss/crossentropy": 2.4534237384796143, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.12742644548416138, |
| "loss/reg": 0.005776000674813986, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 2.469120979309082, |
| "grad_norm_var": 0.9824165851632264, |
| "learning_rate": 0.0001, |
| "loss": 1.0531, |
| "loss/crossentropy": 2.793834686279297, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.15554235875606537, |
| "loss/reg": 0.005774145945906639, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.043125, |
| "grad_norm": 2.8334686756134033, |
| "grad_norm_var": 0.9387961568478952, |
| "learning_rate": 0.0001, |
| "loss": 0.9467, |
| "loss/crossentropy": 2.678666830062866, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.13116785883903503, |
| "loss/reg": 0.005771928001195192, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.04325, |
| "grad_norm": 7.863356590270996, |
| "grad_norm_var": 2.5385263322105893, |
| "learning_rate": 0.0001, |
| "loss": 1.4695, |
| "loss/crossentropy": 2.613318920135498, |
| "loss/hidden": 1.2734375, |
| "loss/logits": 0.13832132518291473, |
| "loss/reg": 0.005770097486674786, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.043375, |
| "grad_norm": 2.763582468032837, |
| "grad_norm_var": 2.510660987467067, |
| "learning_rate": 0.0001, |
| "loss": 1.1302, |
| "loss/crossentropy": 2.846453905105591, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.166295126080513, |
| "loss/reg": 0.0057678911834955215, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 3.600456714630127, |
| "grad_norm_var": 2.4567056984087676, |
| "learning_rate": 0.0001, |
| "loss": 1.2108, |
| "loss/crossentropy": 2.515092372894287, |
| "loss/hidden": 0.96875, |
| "loss/logits": 0.18436874449253082, |
| "loss/reg": 0.005765695124864578, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.043625, |
| "grad_norm": 4.2698073387146, |
| "grad_norm_var": 2.4444505062987636, |
| "learning_rate": 0.0001, |
| "loss": 1.1224, |
| "loss/crossentropy": 2.3673834800720215, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.16628439724445343, |
| "loss/reg": 0.005763507913798094, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 2.962045192718506, |
| "grad_norm_var": 2.42435544256402, |
| "learning_rate": 0.0001, |
| "loss": 1.079, |
| "loss/crossentropy": 2.9470205307006836, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.1893935650587082, |
| "loss/reg": 0.005761242005974054, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.043875, |
| "grad_norm": 3.0306880474090576, |
| "grad_norm_var": 2.427092851603572, |
| "learning_rate": 0.0001, |
| "loss": 1.0201, |
| "loss/crossentropy": 2.3637542724609375, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.13047108054161072, |
| "loss/reg": 0.0057592191733419895, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 2.599585771560669, |
| "grad_norm_var": 1.855493477227511, |
| "learning_rate": 0.0001, |
| "loss": 0.9429, |
| "loss/crossentropy": 2.9222559928894043, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.12747693061828613, |
| "loss/reg": 0.005757040809839964, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.044125, |
| "grad_norm": 2.4723081588745117, |
| "grad_norm_var": 1.882729557078295, |
| "learning_rate": 0.0001, |
| "loss": 1.2276, |
| "loss/crossentropy": 2.5835001468658447, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.220790833234787, |
| "loss/reg": 0.005754764657467604, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.04425, |
| "grad_norm": 2.5266165733337402, |
| "grad_norm_var": 1.873911870827686, |
| "learning_rate": 0.0001, |
| "loss": 1.1879, |
| "loss/crossentropy": 2.4273722171783447, |
| "loss/hidden": 0.97265625, |
| "loss/logits": 0.15772980451583862, |
| "loss/reg": 0.005752884317189455, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.044375, |
| "grad_norm": 2.8139867782592773, |
| "grad_norm_var": 1.8632913443851133, |
| "learning_rate": 0.0001, |
| "loss": 1.2803, |
| "loss/crossentropy": 2.591078996658325, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.19931599497795105, |
| "loss/reg": 0.0057507967576384544, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 2.0173490047454834, |
| "grad_norm_var": 1.9371277324683585, |
| "learning_rate": 0.0001, |
| "loss": 1.0066, |
| "loss/crossentropy": 2.415416955947876, |
| "loss/hidden": 0.80859375, |
| "loss/logits": 0.14050991833209991, |
| "loss/reg": 0.005748571362346411, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.044625, |
| "grad_norm": 3.5304269790649414, |
| "grad_norm_var": 1.916250206343263, |
| "learning_rate": 0.0001, |
| "loss": 1.2665, |
| "loss/crossentropy": 2.7149741649627686, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.16997796297073364, |
| "loss/reg": 0.005746254697442055, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.04475, |
| "grad_norm": 47.96537399291992, |
| "grad_norm_var": 127.11164707702224, |
| "learning_rate": 0.0001, |
| "loss": 1.4579, |
| "loss/crossentropy": 2.7637100219726562, |
| "loss/hidden": 1.2265625, |
| "loss/logits": 0.17390823364257812, |
| "loss/reg": 0.005744417663663626, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.044875, |
| "grad_norm": 2.253833055496216, |
| "grad_norm_var": 127.10313415769795, |
| "learning_rate": 0.0001, |
| "loss": 1.1382, |
| "loss/crossentropy": 2.3016419410705566, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.16676074266433716, |
| "loss/reg": 0.005742207169532776, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 3.2059576511383057, |
| "grad_norm_var": 126.79034824550331, |
| "learning_rate": 0.0001, |
| "loss": 1.2389, |
| "loss/crossentropy": 2.624589204788208, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.18154433369636536, |
| "loss/reg": 0.005740353371948004, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.045125, |
| "grad_norm": 2.456129789352417, |
| "grad_norm_var": 126.9607902891753, |
| "learning_rate": 0.0001, |
| "loss": 1.0342, |
| "loss/crossentropy": 2.500290870666504, |
| "loss/hidden": 0.83203125, |
| "loss/logits": 0.14475134015083313, |
| "loss/reg": 0.005738324951380491, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.04525, |
| "grad_norm": 3.081372022628784, |
| "grad_norm_var": 127.21513938268541, |
| "learning_rate": 0.0001, |
| "loss": 1.1093, |
| "loss/crossentropy": 2.3305118083953857, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.15346962213516235, |
| "loss/reg": 0.0057361493818461895, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.045375, |
| "grad_norm": 2.2634801864624023, |
| "grad_norm_var": 127.4280286195785, |
| "learning_rate": 0.0001, |
| "loss": 1.0956, |
| "loss/crossentropy": 2.4553990364074707, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.16324618458747864, |
| "loss/reg": 0.005734298378229141, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 3.9597907066345215, |
| "grad_norm_var": 127.3359579534097, |
| "learning_rate": 0.0001, |
| "loss": 1.3557, |
| "loss/crossentropy": 2.6449685096740723, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.2202637791633606, |
| "loss/reg": 0.005732398014515638, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.045625, |
| "grad_norm": 2.7794013023376465, |
| "grad_norm_var": 127.76159157574789, |
| "learning_rate": 0.0001, |
| "loss": 1.0787, |
| "loss/crossentropy": 2.3118059635162354, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.1581302285194397, |
| "loss/reg": 0.005730301141738892, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.04575, |
| "grad_norm": 4.7589192390441895, |
| "grad_norm_var": 127.32661229099328, |
| "learning_rate": 0.0001, |
| "loss": 1.3244, |
| "loss/crossentropy": 2.5914306640625, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.18898184597492218, |
| "loss/reg": 0.005728167947381735, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.045875, |
| "grad_norm": 4.024761199951172, |
| "grad_norm_var": 127.03030673720949, |
| "learning_rate": 0.0001, |
| "loss": 1.421, |
| "loss/crossentropy": 2.083667755126953, |
| "loss/hidden": 1.1640625, |
| "loss/logits": 0.1997053027153015, |
| "loss/reg": 0.005726283416152, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 2.9291043281555176, |
| "grad_norm_var": 126.89672944049376, |
| "learning_rate": 0.0001, |
| "loss": 1.1321, |
| "loss/crossentropy": 2.7017500400543213, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.1686232089996338, |
| "loss/reg": 0.005724436603486538, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.046125, |
| "grad_norm": 2.289379119873047, |
| "grad_norm_var": 126.98034912166224, |
| "learning_rate": 0.0001, |
| "loss": 1.0433, |
| "loss/crossentropy": 2.404045581817627, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.1501048356294632, |
| "loss/reg": 0.005722455680370331, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 2.5955307483673096, |
| "grad_norm_var": 126.95053618311779, |
| "learning_rate": 0.0001, |
| "loss": 1.1052, |
| "loss/crossentropy": 2.555497407913208, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.16912290453910828, |
| "loss/reg": 0.0057206167839467525, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.046375, |
| "grad_norm": 2.5631515979766846, |
| "grad_norm_var": 127.05459572518181, |
| "learning_rate": 0.0001, |
| "loss": 1.0105, |
| "loss/crossentropy": 2.3253824710845947, |
| "loss/hidden": 0.80859375, |
| "loss/logits": 0.14470672607421875, |
| "loss/reg": 0.005718756001442671, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 2.8995003700256348, |
| "grad_norm_var": 126.65924311218065, |
| "learning_rate": 0.0001, |
| "loss": 1.0727, |
| "loss/crossentropy": 2.5171523094177246, |
| "loss/hidden": 0.859375, |
| "loss/logits": 0.15616215765476227, |
| "loss/reg": 0.005716769490391016, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.046625, |
| "grad_norm": 2.4674322605133057, |
| "grad_norm_var": 127.0582358856119, |
| "learning_rate": 0.0001, |
| "loss": 0.9544, |
| "loss/crossentropy": 2.426679849624634, |
| "loss/hidden": 0.765625, |
| "loss/logits": 0.13166998326778412, |
| "loss/reg": 0.005714884493499994, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.04675, |
| "grad_norm": 2.1486146450042725, |
| "grad_norm_var": 0.5554253140062239, |
| "learning_rate": 0.0001, |
| "loss": 1.0123, |
| "loss/crossentropy": 2.3567564487457275, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.1348218023777008, |
| "loss/reg": 0.0057129692286252975, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.046875, |
| "grad_norm": 2.4249770641326904, |
| "grad_norm_var": 0.5421168003854054, |
| "learning_rate": 0.0001, |
| "loss": 1.0005, |
| "loss/crossentropy": 2.575383424758911, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.1425924003124237, |
| "loss/reg": 0.005710979457944632, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 3.9449760913848877, |
| "grad_norm_var": 0.6036429091311817, |
| "learning_rate": 0.0001, |
| "loss": 1.1428, |
| "loss/crossentropy": 2.5839173793792725, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.13653349876403809, |
| "loss/reg": 0.0057089440524578094, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.047125, |
| "grad_norm": 2.3119592666625977, |
| "grad_norm_var": 0.6148998912723904, |
| "learning_rate": 0.0001, |
| "loss": 1.088, |
| "loss/crossentropy": 2.492663860321045, |
| "loss/hidden": 0.859375, |
| "loss/logits": 0.1715661883354187, |
| "loss/reg": 0.005707095842808485, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.04725, |
| "grad_norm": 3.586817979812622, |
| "grad_norm_var": 0.6386998540868449, |
| "learning_rate": 0.0001, |
| "loss": 1.0907, |
| "loss/crossentropy": 2.8210177421569824, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.15476316213607788, |
| "loss/reg": 0.005705154500901699, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.047375, |
| "grad_norm": 2.805647850036621, |
| "grad_norm_var": 0.6040650287121667, |
| "learning_rate": 0.0001, |
| "loss": 1.0792, |
| "loss/crossentropy": 2.54019832611084, |
| "loss/hidden": 0.859375, |
| "loss/logits": 0.16280022263526917, |
| "loss/reg": 0.005703243892639875, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 2.7932748794555664, |
| "grad_norm_var": 0.5445939245804574, |
| "learning_rate": 0.0001, |
| "loss": 1.4621, |
| "loss/crossentropy": 2.2343437671661377, |
| "loss/hidden": 1.1953125, |
| "loss/logits": 0.20978981256484985, |
| "loss/reg": 0.005701290909200907, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.047625, |
| "grad_norm": 2.661917209625244, |
| "grad_norm_var": 0.5482497924242672, |
| "learning_rate": 0.0001, |
| "loss": 0.9746, |
| "loss/crossentropy": 2.782052516937256, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.13640211522579193, |
| "loss/reg": 0.0056994096376001835, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.04775, |
| "grad_norm": 2.4914302825927734, |
| "grad_norm_var": 0.3228126995822395, |
| "learning_rate": 0.0001, |
| "loss": 1.126, |
| "loss/crossentropy": 2.166295051574707, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.1589164137840271, |
| "loss/reg": 0.005697426851838827, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.047875, |
| "grad_norm": 2.961653709411621, |
| "grad_norm_var": 0.22106978564282992, |
| "learning_rate": 0.0001, |
| "loss": 1.1071, |
| "loss/crossentropy": 2.5477302074432373, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.16730068624019623, |
| "loss/reg": 0.005695413798093796, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 2.9396286010742188, |
| "grad_norm_var": 0.22133896443579198, |
| "learning_rate": 0.0001, |
| "loss": 1.0254, |
| "loss/crossentropy": 2.555258274078369, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.1403425633907318, |
| "loss/reg": 0.005693417973816395, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.048125, |
| "grad_norm": 2.8298912048339844, |
| "grad_norm_var": 0.20691636732209961, |
| "learning_rate": 0.0001, |
| "loss": 1.195, |
| "loss/crossentropy": 2.472844362258911, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.15367946028709412, |
| "loss/reg": 0.005691539496183395, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.04825, |
| "grad_norm": 15.47062873840332, |
| "grad_norm_var": 10.256501481265339, |
| "learning_rate": 0.0001, |
| "loss": 1.4448, |
| "loss/crossentropy": 2.521524667739868, |
| "loss/hidden": 1.203125, |
| "loss/logits": 0.1847420334815979, |
| "loss/reg": 0.005689616315066814, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.048375, |
| "grad_norm": 2.455294370651245, |
| "grad_norm_var": 10.271871141002237, |
| "learning_rate": 0.0001, |
| "loss": 1.1018, |
| "loss/crossentropy": 2.309390068054199, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.15039557218551636, |
| "loss/reg": 0.005687698721885681, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 3.23420786857605, |
| "grad_norm_var": 10.248744715041969, |
| "learning_rate": 0.0001, |
| "loss": 1.2879, |
| "loss/crossentropy": 2.4902544021606445, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.2154603898525238, |
| "loss/reg": 0.005685731768608093, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.048625, |
| "grad_norm": 2.660858631134033, |
| "grad_norm_var": 10.221989434520331, |
| "learning_rate": 0.0001, |
| "loss": 1.025, |
| "loss/crossentropy": 2.31535267829895, |
| "loss/hidden": 0.8359375, |
| "loss/logits": 0.13224059343338013, |
| "loss/reg": 0.005683773662894964, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 2.4209847450256348, |
| "grad_norm_var": 10.173641089965429, |
| "learning_rate": 0.0001, |
| "loss": 0.9974, |
| "loss/crossentropy": 2.1761093139648438, |
| "loss/hidden": 0.8125, |
| "loss/logits": 0.12805956602096558, |
| "loss/reg": 0.005681932438164949, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.048875, |
| "grad_norm": 3.108008623123169, |
| "grad_norm_var": 10.09354551501582, |
| "learning_rate": 0.0001, |
| "loss": 0.979, |
| "loss/crossentropy": 2.721165657043457, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.14099523425102234, |
| "loss/reg": 0.005679869093000889, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 2.6531527042388916, |
| "grad_norm_var": 10.150022289467502, |
| "learning_rate": 0.0001, |
| "loss": 1.1723, |
| "loss/crossentropy": 2.518146514892578, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.17805764079093933, |
| "loss/reg": 0.005677856504917145, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.049125, |
| "grad_norm": 2.2534499168395996, |
| "grad_norm_var": 10.160179916565673, |
| "learning_rate": 0.0001, |
| "loss": 1.1292, |
| "loss/crossentropy": 2.633385181427002, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.16230204701423645, |
| "loss/reg": 0.005675735417753458, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.04925, |
| "grad_norm": 2.9424333572387695, |
| "grad_norm_var": 10.185797665159741, |
| "learning_rate": 0.0001, |
| "loss": 1.4214, |
| "loss/crossentropy": 2.62923002243042, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20838308334350586, |
| "loss/reg": 0.00567356962710619, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.049375, |
| "grad_norm": 2.622178792953491, |
| "grad_norm_var": 10.20593051221178, |
| "learning_rate": 0.0001, |
| "loss": 0.9697, |
| "loss/crossentropy": 2.5544826984405518, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.13172510266304016, |
| "loss/reg": 0.005671407096087933, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 2.635505199432373, |
| "grad_norm_var": 10.223008906743342, |
| "learning_rate": 0.0001, |
| "loss": 0.933, |
| "loss/crossentropy": 2.5959105491638184, |
| "loss/hidden": 0.75390625, |
| "loss/logits": 0.12239634245634079, |
| "loss/reg": 0.0056692929938435555, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.049625, |
| "grad_norm": 2.6063406467437744, |
| "grad_norm_var": 10.229570355797922, |
| "learning_rate": 0.0001, |
| "loss": 1.0478, |
| "loss/crossentropy": 2.719916343688965, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.15127256512641907, |
| "loss/reg": 0.0056673381477594376, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.04975, |
| "grad_norm": 2.589893102645874, |
| "grad_norm_var": 10.216701025853546, |
| "learning_rate": 0.0001, |
| "loss": 1.1265, |
| "loss/crossentropy": 2.3730130195617676, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.16749918460845947, |
| "loss/reg": 0.0056652189232409, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.049875, |
| "grad_norm": 2.1503751277923584, |
| "grad_norm_var": 10.318666846324161, |
| "learning_rate": 0.0001, |
| "loss": 1.1685, |
| "loss/crossentropy": 2.2147741317749023, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.1860472559928894, |
| "loss/reg": 0.005663097370415926, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 3.6945109367370605, |
| "grad_norm_var": 10.300567557859127, |
| "learning_rate": 0.0001, |
| "loss": 1.1272, |
| "loss/crossentropy": 2.4212143421173096, |
| "loss/hidden": 0.921875, |
| "loss/logits": 0.1487593650817871, |
| "loss/reg": 0.005661314353346825, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.050125, |
| "grad_norm": 3.7444777488708496, |
| "grad_norm_var": 10.268632820538057, |
| "learning_rate": 0.0001, |
| "loss": 1.1221, |
| "loss/crossentropy": 2.5369904041290283, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.15929211676120758, |
| "loss/reg": 0.005659462418407202, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.05025, |
| "grad_norm": 5.121776580810547, |
| "grad_norm_var": 0.5518050614602837, |
| "learning_rate": 0.0001, |
| "loss": 1.4671, |
| "loss/crossentropy": 2.2371129989624023, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.19960111379623413, |
| "loss/reg": 0.005657529458403587, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.050375, |
| "grad_norm": 28.607572555541992, |
| "grad_norm_var": 41.63994308721723, |
| "learning_rate": 0.0001, |
| "loss": 1.1515, |
| "loss/crossentropy": 2.84385347366333, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.19263674318790436, |
| "loss/reg": 0.005655454937368631, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 2.38948655128479, |
| "grad_norm_var": 41.834466994087045, |
| "learning_rate": 0.0001, |
| "loss": 1.0929, |
| "loss/crossentropy": 2.2518088817596436, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.13791221380233765, |
| "loss/reg": 0.005653408356010914, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.050625, |
| "grad_norm": 6.887917518615723, |
| "grad_norm_var": 41.907583648135414, |
| "learning_rate": 0.0001, |
| "loss": 1.2522, |
| "loss/crossentropy": 2.8729405403137207, |
| "loss/hidden": 1.046875, |
| "loss/logits": 0.14880970120429993, |
| "loss/reg": 0.005651514511555433, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.05075, |
| "grad_norm": 3.2420449256896973, |
| "grad_norm_var": 41.69182027548524, |
| "learning_rate": 0.0001, |
| "loss": 1.2031, |
| "loss/crossentropy": 2.598705530166626, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.16617505252361298, |
| "loss/reg": 0.005649634636938572, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.050875, |
| "grad_norm": 2.3294692039489746, |
| "grad_norm_var": 41.9082544413822, |
| "learning_rate": 0.0001, |
| "loss": 1.0316, |
| "loss/crossentropy": 2.7743589878082275, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.13134868443012238, |
| "loss/reg": 0.005647764541208744, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 2.3849406242370605, |
| "grad_norm_var": 41.988788990047645, |
| "learning_rate": 0.0001, |
| "loss": 1.1579, |
| "loss/crossentropy": 2.2934722900390625, |
| "loss/hidden": 0.9375, |
| "loss/logits": 0.16397064924240112, |
| "loss/reg": 0.00564591446891427, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.051125, |
| "grad_norm": 2.616523504257202, |
| "grad_norm_var": 41.875558070811756, |
| "learning_rate": 0.0001, |
| "loss": 0.9281, |
| "loss/crossentropy": 2.617312431335449, |
| "loss/hidden": 0.7734375, |
| "loss/logits": 0.09819567203521729, |
| "loss/reg": 0.005644225515425205, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 2.302281141281128, |
| "grad_norm_var": 42.058469053043055, |
| "learning_rate": 0.0001, |
| "loss": 1.0583, |
| "loss/crossentropy": 2.8029561042785645, |
| "loss/hidden": 0.859375, |
| "loss/logits": 0.14253735542297363, |
| "loss/reg": 0.005642317235469818, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.051375, |
| "grad_norm": 2.1521739959716797, |
| "grad_norm_var": 42.20532780726832, |
| "learning_rate": 0.0001, |
| "loss": 0.996, |
| "loss/crossentropy": 2.5798304080963135, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.13881272077560425, |
| "loss/reg": 0.005640234332531691, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 4.3292155265808105, |
| "grad_norm_var": 41.914794683811124, |
| "learning_rate": 0.0001, |
| "loss": 1.3517, |
| "loss/crossentropy": 2.4219868183135986, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.2562292516231537, |
| "loss/reg": 0.005638125352561474, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.051625, |
| "grad_norm": 19.01975440979004, |
| "grad_norm_var": 53.903843358167165, |
| "learning_rate": 0.0001, |
| "loss": 1.3283, |
| "loss/crossentropy": 2.2926077842712402, |
| "loss/hidden": 1.078125, |
| "loss/logits": 0.19380658864974976, |
| "loss/reg": 0.005636140704154968, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.05175, |
| "grad_norm": 2.859027862548828, |
| "grad_norm_var": 53.791467006877085, |
| "learning_rate": 0.0001, |
| "loss": 1.1115, |
| "loss/crossentropy": 2.429117441177368, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.1528070569038391, |
| "loss/reg": 0.005634027067571878, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.051875, |
| "grad_norm": 2.385204792022705, |
| "grad_norm_var": 53.67862289213027, |
| "learning_rate": 0.0001, |
| "loss": 1.0186, |
| "loss/crossentropy": 2.710325002670288, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.1458669900894165, |
| "loss/reg": 0.005631967913359404, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 2.3011677265167236, |
| "grad_norm_var": 54.20582073402194, |
| "learning_rate": 0.0001, |
| "loss": 1.0843, |
| "loss/crossentropy": 2.485734701156616, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.1569264829158783, |
| "loss/reg": 0.0056300037540495396, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.052125, |
| "grad_norm": 2.7714357376098633, |
| "grad_norm_var": 54.53064815195892, |
| "learning_rate": 0.0001, |
| "loss": 1.0741, |
| "loss/crossentropy": 2.6249403953552246, |
| "loss/hidden": 0.85546875, |
| "loss/logits": 0.1623522937297821, |
| "loss/reg": 0.0056281075812876225, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.05225, |
| "grad_norm": 2.376473903656006, |
| "grad_norm_var": 55.22478277620113, |
| "learning_rate": 0.0001, |
| "loss": 1.2116, |
| "loss/crossentropy": 2.5150105953216553, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.19830524921417236, |
| "loss/reg": 0.005626222584396601, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.052375, |
| "grad_norm": 2.6247470378875732, |
| "grad_norm_var": 17.572360223815615, |
| "learning_rate": 0.0001, |
| "loss": 1.172, |
| "loss/crossentropy": 2.7201685905456543, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.17042091488838196, |
| "loss/reg": 0.005624283570796251, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 49.02815628051758, |
| "grad_norm_var": 143.90483482694842, |
| "learning_rate": 0.0001, |
| "loss": 5.3824, |
| "loss/crossentropy": 2.692047357559204, |
| "loss/hidden": 4.84375, |
| "loss/logits": 0.48245739936828613, |
| "loss/reg": 0.005622203927487135, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.052625, |
| "grad_norm": 2.6867082118988037, |
| "grad_norm_var": 144.9870986829453, |
| "learning_rate": 0.0001, |
| "loss": 1.2507, |
| "loss/crossentropy": 2.404517412185669, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.19445687532424927, |
| "loss/reg": 0.005620268173515797, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.05275, |
| "grad_norm": 4.397704124450684, |
| "grad_norm_var": 144.55498651709914, |
| "learning_rate": 0.0001, |
| "loss": 1.4596, |
| "loss/crossentropy": 2.1510226726531982, |
| "loss/hidden": 1.2109375, |
| "loss/logits": 0.19246640801429749, |
| "loss/reg": 0.005618296563625336, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.052875, |
| "grad_norm": 4.239573955535889, |
| "grad_norm_var": 143.68003611616095, |
| "learning_rate": 0.0001, |
| "loss": 1.3275, |
| "loss/crossentropy": 2.686849355697632, |
| "loss/hidden": 1.09375, |
| "loss/logits": 0.17758557200431824, |
| "loss/reg": 0.005616751033812761, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 2.749202251434326, |
| "grad_norm_var": 143.4748837350726, |
| "learning_rate": 0.0001, |
| "loss": 1.0827, |
| "loss/crossentropy": 2.8104846477508545, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.1437493860721588, |
| "loss/reg": 0.005615332629531622, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.053125, |
| "grad_norm": 2.459291458129883, |
| "grad_norm_var": 143.5641839570371, |
| "learning_rate": 0.0001, |
| "loss": 1.0548, |
| "loss/crossentropy": 2.5806379318237305, |
| "loss/hidden": 0.8515625, |
| "loss/logits": 0.14714661240577698, |
| "loss/reg": 0.005613364279270172, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.05325, |
| "grad_norm": 2.294171094894409, |
| "grad_norm_var": 143.56904366210821, |
| "learning_rate": 0.0001, |
| "loss": 1.1486, |
| "loss/crossentropy": 2.6366002559661865, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.19014191627502441, |
| "loss/reg": 0.005611394997686148, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.053375, |
| "grad_norm": 2.2255382537841797, |
| "grad_norm_var": 143.52399251007708, |
| "learning_rate": 0.0001, |
| "loss": 1.0752, |
| "loss/crossentropy": 2.542306661605835, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.14408408105373383, |
| "loss/reg": 0.005609368905425072, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 3.5708723068237305, |
| "grad_norm_var": 143.80942972780392, |
| "learning_rate": 0.0001, |
| "loss": 1.0863, |
| "loss/crossentropy": 2.2636356353759766, |
| "loss/hidden": 0.8828125, |
| "loss/logits": 0.14744916558265686, |
| "loss/reg": 0.005607361439615488, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.053625, |
| "grad_norm": 2.9189610481262207, |
| "grad_norm_var": 133.66980873374825, |
| "learning_rate": 0.0001, |
| "loss": 0.9895, |
| "loss/crossentropy": 2.7651426792144775, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.1482805609703064, |
| "loss/reg": 0.005605428479611874, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 3.2735564708709717, |
| "grad_norm_var": 133.5211490137515, |
| "learning_rate": 0.0001, |
| "loss": 1.2363, |
| "loss/crossentropy": 2.248082399368286, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.19977417588233948, |
| "loss/reg": 0.0056034415028989315, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.053875, |
| "grad_norm": 3.5670769214630127, |
| "grad_norm_var": 133.0752341056661, |
| "learning_rate": 0.0001, |
| "loss": 1.2766, |
| "loss/crossentropy": 2.500338554382324, |
| "loss/hidden": 1.0234375, |
| "loss/logits": 0.19719059765338898, |
| "loss/reg": 0.005601502023637295, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 2.2697787284851074, |
| "grad_norm_var": 133.0901180807591, |
| "learning_rate": 0.0001, |
| "loss": 0.9931, |
| "loss/crossentropy": 2.6418793201446533, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.14799568057060242, |
| "loss/reg": 0.005599519703537226, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.054125, |
| "grad_norm": 3.220383405685425, |
| "grad_norm_var": 132.91898234062202, |
| "learning_rate": 0.0001, |
| "loss": 1.2515, |
| "loss/crossentropy": 2.5073025226593018, |
| "loss/hidden": 1.0390625, |
| "loss/logits": 0.15643876791000366, |
| "loss/reg": 0.005597477313131094, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.05425, |
| "grad_norm": 3.2845206260681152, |
| "grad_norm_var": 132.5476800488924, |
| "learning_rate": 0.0001, |
| "loss": 1.1441, |
| "loss/crossentropy": 2.509037971496582, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.15849418938159943, |
| "loss/reg": 0.005595567170530558, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.054375, |
| "grad_norm": 2.254239320755005, |
| "grad_norm_var": 132.71932731242507, |
| "learning_rate": 0.0001, |
| "loss": 0.9815, |
| "loss/crossentropy": 2.567584991455078, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.14433184266090393, |
| "loss/reg": 0.005593593697994947, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 3.2273480892181396, |
| "grad_norm_var": 0.4676980414191933, |
| "learning_rate": 0.0001, |
| "loss": 1.1645, |
| "loss/crossentropy": 2.3639349937438965, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.15934088826179504, |
| "loss/reg": 0.0055916691198945045, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.054625, |
| "grad_norm": 2.6044058799743652, |
| "grad_norm_var": 0.47199755801328347, |
| "learning_rate": 0.0001, |
| "loss": 1.1033, |
| "loss/crossentropy": 2.539247989654541, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.14898554980754852, |
| "loss/reg": 0.005589775741100311, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.05475, |
| "grad_norm": 2.9674391746520996, |
| "grad_norm_var": 0.3399405404704983, |
| "learning_rate": 0.0001, |
| "loss": 1.252, |
| "loss/crossentropy": 2.5642499923706055, |
| "loss/hidden": 0.9921875, |
| "loss/logits": 0.20391228795051575, |
| "loss/reg": 0.005587900057435036, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.054875, |
| "grad_norm": 2.4164047241210938, |
| "grad_norm_var": 0.23308679379454797, |
| "learning_rate": 0.0001, |
| "loss": 1.1939, |
| "loss/crossentropy": 2.3462696075439453, |
| "loss/hidden": 0.93359375, |
| "loss/logits": 0.2044137418270111, |
| "loss/reg": 0.005585688166320324, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 2.7590599060058594, |
| "grad_norm_var": 0.2329847653181711, |
| "learning_rate": 0.0001, |
| "loss": 1.0377, |
| "loss/crossentropy": 2.775485038757324, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.13808496296405792, |
| "loss/reg": 0.0055835009552538395, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.055125, |
| "grad_norm": 2.7251267433166504, |
| "grad_norm_var": 0.224188675724659, |
| "learning_rate": 0.0001, |
| "loss": 1.0001, |
| "loss/crossentropy": 2.4934420585632324, |
| "loss/hidden": 0.80859375, |
| "loss/logits": 0.1357189267873764, |
| "loss/reg": 0.005581483710557222, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.05525, |
| "grad_norm": 2.4774584770202637, |
| "grad_norm_var": 0.21273704839308963, |
| "learning_rate": 0.0001, |
| "loss": 1.2166, |
| "loss/crossentropy": 2.426271438598633, |
| "loss/hidden": 0.95703125, |
| "loss/logits": 0.20375394821166992, |
| "loss/reg": 0.0055792308412492275, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.055375, |
| "grad_norm": 3.2236833572387695, |
| "grad_norm_var": 0.1905493662305197, |
| "learning_rate": 0.0001, |
| "loss": 1.1724, |
| "loss/crossentropy": 2.9799797534942627, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.19083930552005768, |
| "loss/reg": 0.005577271804213524, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 2.5997183322906494, |
| "grad_norm_var": 0.16554225723918894, |
| "learning_rate": 0.0001, |
| "loss": 1.126, |
| "loss/crossentropy": 2.2098257541656494, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.14447355270385742, |
| "loss/reg": 0.005575183313339949, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.055625, |
| "grad_norm": 2.5179152488708496, |
| "grad_norm_var": 0.1725392629592297, |
| "learning_rate": 0.0001, |
| "loss": 1.2018, |
| "loss/crossentropy": 2.0029213428497314, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.1655960977077484, |
| "loss/reg": 0.005572900176048279, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.05575, |
| "grad_norm": 2.5075204372406006, |
| "grad_norm_var": 0.16460110044899826, |
| "learning_rate": 0.0001, |
| "loss": 1.0614, |
| "loss/crossentropy": 2.3672924041748047, |
| "loss/hidden": 0.85546875, |
| "loss/logits": 0.15021467208862305, |
| "loss/reg": 0.005570439621806145, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.055875, |
| "grad_norm": 2.441183567047119, |
| "grad_norm_var": 0.12700610259855102, |
| "learning_rate": 0.0001, |
| "loss": 0.9323, |
| "loss/crossentropy": 2.311056137084961, |
| "loss/hidden": 0.7578125, |
| "loss/logits": 0.11881721019744873, |
| "loss/reg": 0.00556844100356102, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 2.6724319458007812, |
| "grad_norm_var": 0.11304803744365562, |
| "learning_rate": 0.0001, |
| "loss": 1.0937, |
| "loss/crossentropy": 2.562101364135742, |
| "loss/hidden": 0.8671875, |
| "loss/logits": 0.1708334982395172, |
| "loss/reg": 0.005566492676734924, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.056125, |
| "grad_norm": 2.196300506591797, |
| "grad_norm_var": 0.11350312697665288, |
| "learning_rate": 0.0001, |
| "loss": 0.9882, |
| "loss/crossentropy": 2.4227116107940674, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.13182450830936432, |
| "loss/reg": 0.00556437112390995, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 2.912667989730835, |
| "grad_norm_var": 0.0921566818687341, |
| "learning_rate": 0.0001, |
| "loss": 1.3721, |
| "loss/crossentropy": 1.9439491033554077, |
| "loss/hidden": 1.109375, |
| "loss/logits": 0.2070913016796112, |
| "loss/reg": 0.0055623650550842285, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.056375, |
| "grad_norm": 2.011991500854492, |
| "grad_norm_var": 0.10881512213368959, |
| "learning_rate": 0.0001, |
| "loss": 1.0172, |
| "loss/crossentropy": 2.498812675476074, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.14521706104278564, |
| "loss/reg": 0.005560221150517464, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 2.2709267139434814, |
| "grad_norm_var": 0.0912508163184422, |
| "learning_rate": 0.0001, |
| "loss": 1.1384, |
| "loss/crossentropy": 2.320579767227173, |
| "loss/hidden": 0.9140625, |
| "loss/logits": 0.16879746317863464, |
| "loss/reg": 0.005558326840400696, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.056625, |
| "grad_norm": 2.954127788543701, |
| "grad_norm_var": 0.09996231296479816, |
| "learning_rate": 0.0001, |
| "loss": 1.2415, |
| "loss/crossentropy": 2.483376979827881, |
| "loss/hidden": 0.99609375, |
| "loss/logits": 0.18988527357578278, |
| "loss/reg": 0.005556488875299692, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.05675, |
| "grad_norm": 2.442729949951172, |
| "grad_norm_var": 0.0916992305907788, |
| "learning_rate": 0.0001, |
| "loss": 1.0533, |
| "loss/crossentropy": 2.414472818374634, |
| "loss/hidden": 0.84765625, |
| "loss/logits": 0.1501239389181137, |
| "loss/reg": 0.005554646719247103, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.056875, |
| "grad_norm": 2.598292589187622, |
| "grad_norm_var": 0.09002796513685567, |
| "learning_rate": 0.0001, |
| "loss": 0.9797, |
| "loss/crossentropy": 2.8175811767578125, |
| "loss/hidden": 0.78515625, |
| "loss/logits": 0.13899990916252136, |
| "loss/reg": 0.005552831571549177, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 2.284618616104126, |
| "grad_norm_var": 0.09289234998963139, |
| "learning_rate": 0.0001, |
| "loss": 1.1767, |
| "loss/crossentropy": 2.5178730487823486, |
| "loss/hidden": 0.953125, |
| "loss/logits": 0.1680239588022232, |
| "loss/reg": 0.005550856236368418, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.057125, |
| "grad_norm": 2.9749691486358643, |
| "grad_norm_var": 0.10255115779464533, |
| "learning_rate": 0.0001, |
| "loss": 1.146, |
| "loss/crossentropy": 2.6965036392211914, |
| "loss/hidden": 0.89453125, |
| "loss/logits": 0.19602364301681519, |
| "loss/reg": 0.005548745859414339, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.05725, |
| "grad_norm": 2.4419991970062256, |
| "grad_norm_var": 0.10305738190390912, |
| "learning_rate": 0.0001, |
| "loss": 1.0782, |
| "loss/crossentropy": 2.507200241088867, |
| "loss/hidden": 0.87890625, |
| "loss/logits": 0.14385411143302917, |
| "loss/reg": 0.005546758882701397, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.057375, |
| "grad_norm": 2.41898250579834, |
| "grad_norm_var": 0.07293072023693033, |
| "learning_rate": 0.0001, |
| "loss": 1.0665, |
| "loss/crossentropy": 2.4068796634674072, |
| "loss/hidden": 0.87109375, |
| "loss/logits": 0.13996180891990662, |
| "loss/reg": 0.005544655025005341, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 3.584895372390747, |
| "grad_norm_var": 0.1446675774892469, |
| "learning_rate": 0.0001, |
| "loss": 1.419, |
| "loss/crossentropy": 2.4029970169067383, |
| "loss/hidden": 1.15625, |
| "loss/logits": 0.20734865963459015, |
| "loss/reg": 0.005542535334825516, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.057625, |
| "grad_norm": 2.5190699100494385, |
| "grad_norm_var": 0.14465856873481447, |
| "learning_rate": 0.0001, |
| "loss": 1.0687, |
| "loss/crossentropy": 2.632817268371582, |
| "loss/hidden": 0.84375, |
| "loss/logits": 0.16959112882614136, |
| "loss/reg": 0.005540382582694292, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.05775, |
| "grad_norm": 3.293412446975708, |
| "grad_norm_var": 0.1759751166057581, |
| "learning_rate": 0.0001, |
| "loss": 1.2079, |
| "loss/crossentropy": 1.8526346683502197, |
| "loss/hidden": 0.984375, |
| "loss/logits": 0.16817334294319153, |
| "loss/reg": 0.005538390018045902, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.057875, |
| "grad_norm": 2.090097665786743, |
| "grad_norm_var": 0.1923380804679141, |
| "learning_rate": 0.0001, |
| "loss": 1.0403, |
| "loss/crossentropy": 2.7256767749786377, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.14509689807891846, |
| "loss/reg": 0.005536381620913744, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 2.367372751235962, |
| "grad_norm_var": 0.19537989350592183, |
| "learning_rate": 0.0001, |
| "loss": 0.967, |
| "loss/crossentropy": 2.440683603286743, |
| "loss/hidden": 0.78125, |
| "loss/logits": 0.13041679561138153, |
| "loss/reg": 0.005534291733056307, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.058125, |
| "grad_norm": 2.5434730052948, |
| "grad_norm_var": 0.18491306851457617, |
| "learning_rate": 0.0001, |
| "loss": 1.1396, |
| "loss/crossentropy": 2.811406373977661, |
| "loss/hidden": 0.91015625, |
| "loss/logits": 0.1740744560956955, |
| "loss/reg": 0.005532294511795044, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.05825, |
| "grad_norm": 2.613758087158203, |
| "grad_norm_var": 0.17830906169392974, |
| "learning_rate": 0.0001, |
| "loss": 1.0313, |
| "loss/crossentropy": 2.5138356685638428, |
| "loss/hidden": 0.828125, |
| "loss/logits": 0.1479034125804901, |
| "loss/reg": 0.005530340131372213, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.058375, |
| "grad_norm": 3.6053991317749023, |
| "grad_norm_var": 0.21458171164135606, |
| "learning_rate": 0.0001, |
| "loss": 1.2109, |
| "loss/crossentropy": 1.9949983358383179, |
| "loss/hidden": 1.0, |
| "loss/logits": 0.155661940574646, |
| "loss/reg": 0.0055284383706748486, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 2.2574644088745117, |
| "grad_norm_var": 0.21534123971961966, |
| "learning_rate": 0.0001, |
| "loss": 1.08, |
| "loss/crossentropy": 2.514662981033325, |
| "loss/hidden": 0.859375, |
| "loss/logits": 0.16538314521312714, |
| "loss/reg": 0.005526562221348286, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.058625, |
| "grad_norm": 2.2614095211029053, |
| "grad_norm_var": 0.2206521084247221, |
| "learning_rate": 0.0001, |
| "loss": 1.2297, |
| "loss/crossentropy": 2.4910507202148438, |
| "loss/hidden": 0.98046875, |
| "loss/logits": 0.19400066137313843, |
| "loss/reg": 0.005524714011698961, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 3.083524465560913, |
| "grad_norm_var": 0.22915168035201153, |
| "learning_rate": 0.0001, |
| "loss": 1.1725, |
| "loss/crossentropy": 2.5548853874206543, |
| "loss/hidden": 0.92578125, |
| "loss/logits": 0.19151920080184937, |
| "loss/reg": 0.005522689316421747, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.058875, |
| "grad_norm": 2.6530709266662598, |
| "grad_norm_var": 0.2287156357176549, |
| "learning_rate": 0.0001, |
| "loss": 0.9819, |
| "loss/crossentropy": 2.5769848823547363, |
| "loss/hidden": 0.79296875, |
| "loss/logits": 0.1337730437517166, |
| "loss/reg": 0.00552078802138567, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 2.857489585876465, |
| "grad_norm_var": 0.21848469951039154, |
| "learning_rate": 0.0001, |
| "loss": 1.2335, |
| "loss/crossentropy": 2.6933629512786865, |
| "loss/hidden": 0.98828125, |
| "loss/logits": 0.19003306329250336, |
| "loss/reg": 0.005518974736332893, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.059125, |
| "grad_norm": 1.960106372833252, |
| "grad_norm_var": 0.24874750636482734, |
| "learning_rate": 0.0001, |
| "loss": 0.9776, |
| "loss/crossentropy": 2.534855365753174, |
| "loss/hidden": 0.7890625, |
| "loss/logits": 0.13338381052017212, |
| "loss/reg": 0.005517229437828064, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.05925, |
| "grad_norm": 2.787822961807251, |
| "grad_norm_var": 0.24619457779295406, |
| "learning_rate": 0.0001, |
| "loss": 1.0858, |
| "loss/crossentropy": 2.396390438079834, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.14397624135017395, |
| "loss/reg": 0.005515479948371649, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.059375, |
| "grad_norm": 2.3396122455596924, |
| "grad_norm_var": 0.24936205040752385, |
| "learning_rate": 0.0001, |
| "loss": 1.0392, |
| "loss/crossentropy": 2.6306259632110596, |
| "loss/hidden": 0.83984375, |
| "loss/logits": 0.14426180720329285, |
| "loss/reg": 0.005513759795576334, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 2.367551803588867, |
| "grad_norm_var": 0.19447740210993794, |
| "learning_rate": 0.0001, |
| "loss": 1.1071, |
| "loss/crossentropy": 2.342672348022461, |
| "loss/hidden": 0.890625, |
| "loss/logits": 0.16136375069618225, |
| "loss/reg": 0.0055120959877967834, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.059625, |
| "grad_norm": 2.3029873371124268, |
| "grad_norm_var": 0.19972845357339655, |
| "learning_rate": 0.0001, |
| "loss": 0.9785, |
| "loss/crossentropy": 2.725276231765747, |
| "loss/hidden": 0.796875, |
| "loss/logits": 0.12647491693496704, |
| "loss/reg": 0.0055101178586483, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.05975, |
| "grad_norm": 2.3109138011932373, |
| "grad_norm_var": 0.1674590503375268, |
| "learning_rate": 0.0001, |
| "loss": 1.012, |
| "loss/crossentropy": 2.6665799617767334, |
| "loss/hidden": 0.81640625, |
| "loss/logits": 0.14054208993911743, |
| "loss/reg": 0.005508116912096739, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.059875, |
| "grad_norm": 2.8778023719787598, |
| "grad_norm_var": 0.1605488706137739, |
| "learning_rate": 0.0001, |
| "loss": 1.0028, |
| "loss/crossentropy": 2.599010705947876, |
| "loss/hidden": 0.80078125, |
| "loss/logits": 0.14697444438934326, |
| "loss/reg": 0.0055063748732209206, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 2.7762978076934814, |
| "grad_norm_var": 0.15971446982347573, |
| "learning_rate": 0.0001, |
| "loss": 1.1492, |
| "loss/crossentropy": 2.6345436573028564, |
| "loss/hidden": 0.9296875, |
| "loss/logits": 0.1645045280456543, |
| "loss/reg": 0.005504653323441744, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.060125, |
| "grad_norm": 3.0745112895965576, |
| "grad_norm_var": 0.1733429982183973, |
| "learning_rate": 0.0001, |
| "loss": 1.2914, |
| "loss/crossentropy": 2.1021008491516113, |
| "loss/hidden": 1.0546875, |
| "loss/logits": 0.18168240785598755, |
| "loss/reg": 0.005502650048583746, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.06025, |
| "grad_norm": 2.5635828971862793, |
| "grad_norm_var": 0.17362979402171655, |
| "learning_rate": 0.0001, |
| "loss": 1.1746, |
| "loss/crossentropy": 2.599754810333252, |
| "loss/hidden": 0.9453125, |
| "loss/logits": 0.1743006557226181, |
| "loss/reg": 0.005500909872353077, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.060375, |
| "grad_norm": 2.982170343399048, |
| "grad_norm_var": 0.11685041441696337, |
| "learning_rate": 0.0001, |
| "loss": 1.084, |
| "loss/crossentropy": 2.780411958694458, |
| "loss/hidden": 0.875, |
| "loss/logits": 0.15399503707885742, |
| "loss/reg": 0.005499421618878841, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 6.475743770599365, |
| "grad_norm_var": 1.0413639393420129, |
| "learning_rate": 0.0001, |
| "loss": 2.1473, |
| "loss/crossentropy": 2.3867931365966797, |
| "loss/hidden": 1.703125, |
| "loss/logits": 0.38922837376594543, |
| "loss/reg": 0.005497433710843325, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.060625, |
| "grad_norm": 2.522434711456299, |
| "grad_norm_var": 1.024975132918582, |
| "learning_rate": 0.0001, |
| "loss": 1.0915, |
| "loss/crossentropy": 2.741684675216675, |
| "loss/hidden": 0.88671875, |
| "loss/logits": 0.14987404644489288, |
| "loss/reg": 0.0054954588413238525, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.06075, |
| "grad_norm": 2.6852359771728516, |
| "grad_norm_var": 1.0236023483547378, |
| "learning_rate": 0.0001, |
| "loss": 1.0905, |
| "loss/crossentropy": 2.2552525997161865, |
| "loss/hidden": 0.8984375, |
| "loss/logits": 0.13711076974868774, |
| "loss/reg": 0.005493887234479189, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.060875, |
| "grad_norm": 6.048346996307373, |
| "grad_norm_var": 1.65671866064532, |
| "learning_rate": 0.0001, |
| "loss": 1.4058, |
| "loss/crossentropy": 3.1526873111724854, |
| "loss/hidden": 1.0625, |
| "loss/logits": 0.2884060740470886, |
| "loss/reg": 0.005492268595844507, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 5.24729061126709, |
| "grad_norm_var": 1.9496829900519608, |
| "learning_rate": 0.0001, |
| "loss": 1.5487, |
| "loss/crossentropy": 2.391798496246338, |
| "loss/hidden": 1.234375, |
| "loss/logits": 0.2594112157821655, |
| "loss/reg": 0.0054903156124055386, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.061125, |
| "grad_norm": 3.4879932403564453, |
| "grad_norm_var": 1.8414378354073275, |
| "learning_rate": 0.0001, |
| "loss": 1.2408, |
| "loss/crossentropy": 2.3853161334991455, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.1702655553817749, |
| "loss/reg": 0.005488729570060968, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 2.416243076324463, |
| "grad_norm_var": 1.875598350696971, |
| "learning_rate": 0.0001, |
| "loss": 1.0646, |
| "loss/crossentropy": 2.310605049133301, |
| "loss/hidden": 0.86328125, |
| "loss/logits": 0.146418958902359, |
| "loss/reg": 0.005487216170877218, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.061375, |
| "grad_norm": 2.9619152545928955, |
| "grad_norm_var": 1.8217813283025472, |
| "learning_rate": 0.0001, |
| "loss": 1.2577, |
| "loss/crossentropy": 2.3735132217407227, |
| "loss/hidden": 1.015625, |
| "loss/logits": 0.18721503019332886, |
| "loss/reg": 0.005485245026648045, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 2.9602112770080566, |
| "grad_norm_var": 1.7685642295810833, |
| "learning_rate": 0.0001, |
| "loss": 1.1274, |
| "loss/crossentropy": 2.6420083045959473, |
| "loss/hidden": 0.90234375, |
| "loss/logits": 0.17025524377822876, |
| "loss/reg": 0.005483296699821949, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.061625, |
| "grad_norm": 2.5772223472595215, |
| "grad_norm_var": 1.7347667738241757, |
| "learning_rate": 0.0001, |
| "loss": 1.1004, |
| "loss/crossentropy": 2.4166319370269775, |
| "loss/hidden": 0.890625, |
| "loss/logits": 0.15491390228271484, |
| "loss/reg": 0.005481342785060406, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.06175, |
| "grad_norm": 2.6494603157043457, |
| "grad_norm_var": 1.693988292922673, |
| "learning_rate": 0.0001, |
| "loss": 1.0762, |
| "loss/crossentropy": 2.7021005153656006, |
| "loss/hidden": 0.8671875, |
| "loss/logits": 0.1542307734489441, |
| "loss/reg": 0.005479689687490463, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.061875, |
| "grad_norm": 2.065351963043213, |
| "grad_norm_var": 1.7911776893626628, |
| "learning_rate": 0.0001, |
| "loss": 1.015, |
| "loss/crossentropy": 2.4842755794525146, |
| "loss/hidden": 0.8203125, |
| "loss/logits": 0.13995476067066193, |
| "loss/reg": 0.005478002596646547, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 2.650660753250122, |
| "grad_norm_var": 1.8016636980513454, |
| "learning_rate": 0.0001, |
| "loss": 1.1699, |
| "loss/crossentropy": 2.3899097442626953, |
| "loss/hidden": 0.94921875, |
| "loss/logits": 0.16591498255729675, |
| "loss/reg": 0.005476430524140596, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.062125, |
| "grad_norm": 3.412050724029541, |
| "grad_norm_var": 1.7970375838694677, |
| "learning_rate": 0.0001, |
| "loss": 1.1983, |
| "loss/crossentropy": 2.4459383487701416, |
| "loss/hidden": 0.94140625, |
| "loss/logits": 0.20212361216545105, |
| "loss/reg": 0.005474465899169445, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.06225, |
| "grad_norm": 2.7389674186706543, |
| "grad_norm_var": 1.7804152177025587, |
| "learning_rate": 0.0001, |
| "loss": 1.1076, |
| "loss/crossentropy": 2.6794888973236084, |
| "loss/hidden": 0.90625, |
| "loss/logits": 0.1465749740600586, |
| "loss/reg": 0.005472847726196051, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.062375, |
| "grad_norm": 20.56003761291504, |
| "grad_norm_var": 20.18846043733062, |
| "learning_rate": 0.0001, |
| "loss": 1.0568, |
| "loss/crossentropy": 2.527268409729004, |
| "loss/hidden": 0.859375, |
| "loss/logits": 0.14275437593460083, |
| "loss/reg": 0.005471326876431704, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 2.9909119606018066, |
| "grad_norm_var": 20.013739807194945, |
| "learning_rate": 0.0001, |
| "loss": 1.0002, |
| "loss/crossentropy": 2.311053991317749, |
| "loss/hidden": 0.80859375, |
| "loss/logits": 0.13688521087169647, |
| "loss/reg": 0.005469587165862322, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 8000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.2202930782208e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |